linux-kernel.vger.kernel.org archive mirror
* [GIT PATCH core/percpu] x86,percpu: fix pageattr handling with remap allocator, take#3
@ 2009-06-01  6:34 Tejun Heo
  2009-06-01  6:34 ` [PATCH 1/5] x86: prepare setup_pcpu_remap() for pageattr fix Tejun Heo
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Tejun Heo @ 2009-06-01  6:34 UTC (permalink / raw)
  To: JBeulich, andi, mingo, hpa, tglx, linux-kernel, x86

Hello,

Upon ack, please pull from the following git tree.

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu-fix-remap

This is the third take of the x86-percpu-fix-pageattr patchset.
Changes from the last take[L] are...

* Rebased to reflect changes in linus#master.

* After fixing the remap pageattr bug, re-enable the remap allocator
  which was disabled upstream.

This patchset contains the following five patches.

  0001-x86-prepare-setup_pcpu_remap-for-pageattr-fix.patch
  0002-x86-reorganize-cpa_process_alias.patch
  0003-x86-fix-pageattr-handling-for-remap-percpu-allocato.patch
  0004-x86-implement-percpu_alloc-kernel-parameter.patch
  0005-x86-ensure-percpu-remap-doesn-t-consume-too-much-vm.patch

This patchset is on top of

core/percpu (e1b9aa3f47242e757c776a3771bb6613e675bf9c)
+ linus-2.6#master (3218911f839b6c85acbf872ad264ea69aa4d89ad)

and contains the following changes.

 Documentation/kernel-parameters.txt |    6 +
 arch/x86/include/asm/percpu.h       |    9 +
 arch/x86/kernel/setup_percpu.c      |  201 +++++++++++++++++++++++++++---------
 arch/x86/mm/pageattr.c              |   70 ++++++++----
 mm/percpu.c                         |   13 +-
 5 files changed, 222 insertions(+), 77 deletions(-)

Thanks.

--
tejun


* [PATCH 1/5] x86: prepare setup_pcpu_remap() for pageattr fix
  2009-06-01  6:34 [GIT PATCH core/percpu] x86,percpu: fix pageattr handling with remap allocator, take#3 Tejun Heo
@ 2009-06-01  6:34 ` Tejun Heo
  2009-06-01  6:34 ` [PATCH 2/5] x86: reorganize cpa_process_alias() Tejun Heo
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-06-01  6:34 UTC (permalink / raw)
  To: JBeulich, andi, mingo, hpa, tglx, linux-kernel, x86; +Cc: Tejun Heo

Make the following changes in preparation for the coming pageattr updates.

* Define and use an array of struct pcpur_ent instead of an array of
  pointers.  The only difference is the ->cpu field, which is set but
  not used yet.

* Rename variables according to the above change.

* Rename the local variable vm to pcpur_vm and move it out of the
  function.

[ Impact: no functional difference ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c |   58 ++++++++++++++++++++++-----------------
 1 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8f0e13b..65e5ade 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,8 +137,14 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
  * better than only using 4k mappings while still being NUMA friendly.
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
+struct pcpur_ent {
+	unsigned int	cpu;
+	void		*ptr;
+};
+
 static size_t pcpur_size __initdata;
-static void **pcpur_ptrs __initdata;
+static struct pcpur_ent *pcpur_map __initdata;
+static struct vm_struct pcpur_vm;
 
 static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 {
@@ -147,13 +153,12 @@ static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 	if (off >= pcpur_size)
 		return NULL;
 
-	return virt_to_page(pcpur_ptrs[cpu] + off);
+	return virt_to_page(pcpur_map[cpu].ptr + off);
 }
 
 static ssize_t __init setup_pcpu_remap(size_t static_size)
 {
-	static struct vm_struct vm;
-	size_t ptrs_size, dyn_size;
+	size_t map_size, dyn_size;
 	unsigned int cpu;
 	ssize_t ret;
 
@@ -180,12 +185,14 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 
 	/* allocate pointer array and alloc large pages */
-	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
-	pcpur_ptrs = alloc_bootmem(ptrs_size);
+	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_map[0]));
+	pcpur_map = alloc_bootmem(map_size);
 
 	for_each_possible_cpu(cpu) {
-		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
-		if (!pcpur_ptrs[cpu])
+		pcpur_map[cpu].cpu = cpu;
+		pcpur_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
+							PMD_SIZE);
+		if (!pcpur_map[cpu].ptr)
 			goto enomem;
 
 		/*
@@ -196,42 +203,43 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 		 * not well-specified to have a PAT-incompatible area
 		 * (unmapped RAM, device memory, etc.) in that hole.
 		 */
-		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+		free_bootmem(__pa(pcpur_map[cpu].ptr + pcpur_size),
 			     PMD_SIZE - pcpur_size);
 
-		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+		memcpy(pcpur_map[cpu].ptr, __per_cpu_load, static_size);
 	}
 
 	/* allocate address and map */
-	vm.flags = VM_ALLOC;
-	vm.size = num_possible_cpus() * PMD_SIZE;
-	vm_area_register_early(&vm, PMD_SIZE);
+	pcpur_vm.flags = VM_ALLOC;
+	pcpur_vm.size = num_possible_cpus() * PMD_SIZE;
+	vm_area_register_early(&pcpur_vm, PMD_SIZE);
 
 	for_each_possible_cpu(cpu) {
-		pmd_t *pmd;
+		pmd_t *pmd, pmd_v;
 
-		pmd = populate_extra_pmd((unsigned long)vm.addr
-					 + cpu * PMD_SIZE);
-		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
-				     PAGE_KERNEL_LARGE));
+		pmd = populate_extra_pmd((unsigned long)pcpur_vm.addr +
+					 cpu * PMD_SIZE);
+		pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpur_map[cpu].ptr)),
+				PAGE_KERNEL_LARGE);
+		set_pmd(pmd, pmd_v);
 	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
-		"%zu bytes\n", vm.addr, static_size);
+		"%zu bytes\n", pcpur_vm.addr, static_size);
 
 	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
 				     PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-				     PMD_SIZE, vm.addr, NULL);
-	goto out_free_ar;
+				     PMD_SIZE, pcpur_vm.addr, NULL);
+	goto out_free_map;
 
 enomem:
 	for_each_possible_cpu(cpu)
-		if (pcpur_ptrs[cpu])
-			free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
+		if (pcpur_map[cpu].ptr)
+			free_bootmem(__pa(pcpur_map[cpu].ptr), PMD_SIZE);
 	ret = -ENOMEM;
-out_free_ar:
-	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+out_free_map:
+	free_bootmem(__pa(pcpur_map), map_size);
 	return ret;
 }
 #else
-- 
1.6.0.2



* [PATCH 2/5] x86: reorganize cpa_process_alias()
  2009-06-01  6:34 [GIT PATCH core/percpu] x86,percpu: fix pageattr handling with remap allocator, take#3 Tejun Heo
  2009-06-01  6:34 ` [PATCH 1/5] x86: prepare setup_pcpu_remap() for pageattr fix Tejun Heo
@ 2009-06-01  6:34 ` Tejun Heo
  2009-06-01  6:34 ` [PATCH 3/5] x86: fix pageattr handling for remap percpu allocator and re-enable it Tejun Heo
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-06-01  6:34 UTC (permalink / raw)
  To: JBeulich, andi, mingo, hpa, tglx, linux-kernel, x86; +Cc: Tejun Heo

Reorganize cpa_process_alias() so that a new alias condition can be
added easily.
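
After the reorganization, each alias is handled in its own
self-contained block; a schematic sketch of the resulting pattern (not
the literal patched code, which follows below):

	if (alias condition) {
		alias_cpa = *cpa;
		alias_cpa.vaddr = &alias_address;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
		ret = __change_page_attr_set_clr(&alias_cpa, 0);
		if (ret)
			return ret;
	}

A new alias condition can then be appended as another such block.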

Jan Beulich spotted a problem in the original cleanup, which
incorrectly assumed that the two existing conditions were mutually
exclusive.

[ Impact: code reorganization ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c |   50 ++++++++++++++++++++++-------------------------
 1 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e17efed..dca77d0 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -681,8 +681,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
 static int cpa_process_alias(struct cpa_data *cpa)
 {
 	struct cpa_data alias_cpa;
-	int ret = 0;
-	unsigned long temp_cpa_vaddr, vaddr;
+	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
+	unsigned long vaddr;
+	int ret;
 
 	if (cpa->pfn >= max_pfn_mapped)
 		return 0;
@@ -706,42 +707,37 @@ static int cpa_process_alias(struct cpa_data *cpa)
 		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
 
 		alias_cpa = *cpa;
-		temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
-		alias_cpa.vaddr = &temp_cpa_vaddr;
+		alias_cpa.vaddr = &laddr;
 		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
 
-
 		ret = __change_page_attr_set_clr(&alias_cpa, 0);
+		if (ret)
+			return ret;
 	}
 
 #ifdef CONFIG_X86_64
-	if (ret)
-		return ret;
 	/*
-	 * No need to redo, when the primary call touched the high
-	 * mapping already:
-	 */
-	if (within(vaddr, (unsigned long) _text, _brk_end))
-		return 0;
-
-	/*
-	 * If the physical address is inside the kernel map, we need
+	 * If the primary call didn't touch the high mapping already
+	 * and the physical address is inside the kernel map, we need
 	 * to touch the high mapped kernel as well:
 	 */
-	if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
-		return 0;
-
-	alias_cpa = *cpa;
-	temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
-	alias_cpa.vaddr = &temp_cpa_vaddr;
-	alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
+	    within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
+		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
+					       __START_KERNEL_map - phys_base;
+		alias_cpa = *cpa;
+		alias_cpa.vaddr = &temp_cpa_vaddr;
+		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
 
-	/*
-	 * The high mapping range is imprecise, so ignore the return value.
-	 */
-	__change_page_attr_set_clr(&alias_cpa, 0);
+		/*
+		 * The high mapping range is imprecise, so ignore the
+		 * return value.
+		 */
+		__change_page_attr_set_clr(&alias_cpa, 0);
+	}
 #endif
-	return ret;
+
+	return 0;
 }
 
 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
-- 
1.6.0.2



* [PATCH 3/5] x86: fix pageattr handling for remap percpu allocator and re-enable it
  2009-06-01  6:34 [GIT PATCH core/percpu] x86,percpu: fix pageattr handling with remap allocator, take#3 Tejun Heo
  2009-06-01  6:34 ` [PATCH 1/5] x86: prepare setup_pcpu_remap() for pageattr fix Tejun Heo
  2009-06-01  6:34 ` [PATCH 2/5] x86: reorganize cpa_process_alias() Tejun Heo
@ 2009-06-01  6:34 ` Tejun Heo
  2009-06-01  6:34 ` [PATCH 4/5] x86: implement percpu_alloc kernel parameter Tejun Heo
  2009-06-01  6:34 ` [PATCH 5/5] x86: ensure percpu remap doesn't consume too much vmalloc space Tejun Heo
  4 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-06-01  6:34 UTC (permalink / raw)
  To: JBeulich, andi, mingo, hpa, tglx, linux-kernel, x86; +Cc: Tejun Heo

The remap allocator aliases a PMD page for each cpu and returns
whatever is unused to the page allocator.  When the pageattr of the
recycled pages is changed, the two aliases end up pointing to
overlapping regions with different attributes, which isn't allowed and
is known to cause subtle data corruption in certain cases.

This can be handled in a manner similar to the x86_64 highmap alias:
the pageattr code should detect whether the target pages have a PMD
alias and, if so, split the PMD alias and synchronize the attributes.

The pcpur allocator is updated to keep the map of allocated PMD pages
sorted in ascending address order and to provide the
pcpu_pmd_remapped() function, which binary-searches the map to
determine whether a given address is aliased and, if so, to which
address.  pageattr is updated to use pcpu_pmd_remapped() to detect the
PMD alias and split it up as necessary from cpa_process_alias().
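
As a hypothetical walk-through of the lookup (PTR, LADDR and N are
illustrative placeholders; the other names are from the patch): if cpu
N's PMD page starts at linear address PTR, the tail
[PTR + pcpur_size, PTR + PMD_SIZE) is handed back to the page
allocator.  When pageattr later operates on a linear address LADDR in
that tail, pcpu_pmd_remapped() finds PTR = LADDR & PMD_MASK in the
sorted map and returns the aliasing percpu address

	pcpur_vm.addr + N * PMD_SIZE + (LADDR & ~PMD_MASK)

which cpa_process_alias() then splits and updates along with the
original target.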

Jan Beulich spotted the original problem and the incorrect use of
vaddr instead of laddr for the lookup.

With this, the remap percpu allocator should work correctly.
Re-enable it.

[ Impact: fix subtle remap pageattr bug and re-enable remap ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/percpu.h  |    9 +++++
 arch/x86/kernel/setup_percpu.c |   72 +++++++++++++++++++++++++++++++++++-----
 arch/x86/mm/pageattr.c         |   26 ++++++++++++++-
 3 files changed, 97 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 02ecb30..7a785c6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -155,6 +155,15 @@ do {							\
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+void *pcpu_pmd_remapped(void *kaddr);
+#else
+static inline void *pcpu_pmd_remapped(void *kaddr)
+{
+	return NULL;
+}
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 65e5ade..dd567a7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -142,8 +142,8 @@ struct pcpur_ent {
 	void		*ptr;
 };
 
-static size_t pcpur_size __initdata;
-static struct pcpur_ent *pcpur_map __initdata;
+static size_t pcpur_size;
+static struct pcpur_ent *pcpur_map;
 static struct vm_struct pcpur_vm;
 
 static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
@@ -160,15 +160,14 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 {
 	size_t map_size, dyn_size;
 	unsigned int cpu;
+	int i, j;
 	ssize_t ret;
 
 	/*
 	 * If large page isn't supported, there's no benefit in doing
 	 * this.  Also, on non-NUMA, embedding is better.
-	 *
-	 * NOTE: disabled for now.
 	 */
-	if (true || !cpu_has_pse || !pcpu_need_numa())
+	if (!cpu_has_pse || !pcpu_need_numa())
 		return -EINVAL;
 
 	/*
@@ -231,16 +230,71 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
 				     PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
 				     PMD_SIZE, pcpur_vm.addr, NULL);
-	goto out_free_map;
+
+	/* sort pcpur_map array for pcpu_pmd_remapped() */
+	for (i = 0; i < num_possible_cpus() - 1; i++)
+		for (j = i + 1; j < num_possible_cpus(); j++)
+			if (pcpur_map[i].ptr > pcpur_map[j].ptr) {
+				struct pcpur_ent tmp = pcpur_map[i];
+				pcpur_map[i] = pcpur_map[j];
+				pcpur_map[j] = tmp;
+			}
+
+	return ret;
 
 enomem:
 	for_each_possible_cpu(cpu)
 		if (pcpur_map[cpu].ptr)
 			free_bootmem(__pa(pcpur_map[cpu].ptr), PMD_SIZE);
-	ret = -ENOMEM;
-out_free_map:
 	free_bootmem(__pa(pcpur_map), map_size);
-	return ret;
+	return -ENOMEM;
+}
+
+/**
+ * pcpu_pmd_remapped - determine whether a kaddr is in pcpur recycled area
+ * @kaddr: the kernel address in question
+ *
+ * Determine whether @kaddr falls in the pcpur recycled area.  This is
+ * used by pageattr to detect VM aliases and break up the pcpu PMD
+ * mapping such that the same physical page is not mapped under
+ * different attributes.
+ *
+ * The recycled area is always at the tail of a partially used PMD
+ * page.
+ *
+ * RETURNS:
+ * Address of corresponding remapped pcpu address if match is found;
+ * otherwise, NULL.
+ */
+void *pcpu_pmd_remapped(void *kaddr)
+{
+	void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
+	unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
+	int left = 0, right = num_possible_cpus() - 1;
+	int pos;
+
+	/* pcpur in use at all? */
+	if (!pcpur_map)
+		return NULL;
+
+	/* okay, perform binary search */
+	while (left <= right) {
+		pos = (left + right) / 2;
+
+		if (pcpur_map[pos].ptr < pmd_addr)
+			left = pos + 1;
+		else if (pcpur_map[pos].ptr > pmd_addr)
+			right = pos - 1;
+		else {
+			/* it shouldn't be in the area for the first chunk */
+			WARN_ON(offset < pcpur_size);
+
+			return pcpur_vm.addr +
+				pcpur_map[pos].cpu * PMD_SIZE + offset;
+		}
+	}
+
+	return NULL;
 }
 #else
 static ssize_t __init setup_pcpu_remap(size_t static_size)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dca77d0..489b9c3 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -11,6 +11,7 @@
 #include <linux/interrupt.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include <linux/pfn.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -682,7 +683,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 {
 	struct cpa_data alias_cpa;
 	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
-	unsigned long vaddr;
+	unsigned long vaddr, remapped;
 	int ret;
 
 	if (cpa->pfn >= max_pfn_mapped)
@@ -737,6 +738,29 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	}
 #endif
 
+	/*
+	 * If the PMD page was partially used for per-cpu remapping,
+	 * the remapped area needs to be split and modified.  Note
+	 * that the partial recycling only happens at the tail of a
+	 * partially used PMD page, so touching single PMD page is
+	 * always enough.
+	 *
+	 * Look up alias using the linear address to detect, for
+	 * example, recycled pages which got vmapped.
+	 */
+	remapped = (unsigned long)pcpu_pmd_remapped((void *)laddr);
+	if (remapped) {
+		int max_pages = PFN_DOWN(PMD_SIZE - (vaddr & ~PMD_MASK));
+
+		alias_cpa = *cpa;
+		alias_cpa.vaddr = &remapped;
+		alias_cpa.numpages = min(alias_cpa.numpages, max_pages);
+		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+		ret = __change_page_attr_set_clr(&alias_cpa, 0);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
-- 
1.6.0.2



* [PATCH 4/5] x86: implement percpu_alloc kernel parameter
  2009-06-01  6:34 [GIT PATCH core/percpu] x86,percpu: fix pageattr handling with remap allocator, take#3 Tejun Heo
                   ` (2 preceding siblings ...)
  2009-06-01  6:34 ` [PATCH 3/5] x86: fix pageattr handling for remap percpu allocator and re-enable it Tejun Heo
@ 2009-06-01  6:34 ` Tejun Heo
  2009-06-01  6:34 ` [PATCH 5/5] x86: ensure percpu remap doesn't consume too much vmalloc space Tejun Heo
  4 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-06-01  6:34 UTC (permalink / raw)
  To: JBeulich, andi, mingo, hpa, tglx, linux-kernel, x86; +Cc: Tejun Heo

According to Andi, it isn't clear whether the remap allocator is worth
the trouble, as there are many processors where PMD TLB entries are
far scarcer than PTE TLB entries.  The advantage or disadvantage
probably depends on the actual size of the percpu area and the
specific processor.  As performance degradation due to TLB pressure
tends to be highly workload-specific and subtle, it is difficult to
decide which way to go without more data.

This patch implements the percpu_alloc kernel parameter, which allows
selecting the first chunk allocator to use, to ease debugging and
testing.

While at it, make sure all the failure paths report why something
failed, to help determine why a certain allocator isn't working.
Also, kill the "Great future plan" comment, which was realized quite
some time ago.
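
For example, booting with the following command-line fragment (a usage
sketch of the parameter added here) forces the embedding allocator;
should it fail, the kernel still falls back to the 4k allocator:

	percpu_alloc=embed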

[ Impact: allow explicit percpu first chunk allocator selection ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt |    6 +++
 arch/x86/kernel/setup_percpu.c      |   69 +++++++++++++++++++++++++----------
 mm/percpu.c                         |   13 +++++--
 3 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd5cac0..7523207 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1841,6 +1841,12 @@ and is between 256 and 4096 characters. It is defined in the file
 			Format: { 0 | 1 }
 			See arch/parisc/kernel/pdc_chassis.c
 
+	percpu_alloc=	[X86] Select which percpu first chunk allocator to use.
+			Allowed values are one of "remap", "embed" and "4k".
+			See comments in arch/x86/kernel/setup_percpu.c for
+			details on each allocator.  This parameter is primarily
+			for debugging and performance comparison.
+
 	pf.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index dd567a7..29be178 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -156,18 +156,21 @@ static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 	return virt_to_page(pcpur_map[cpu].ptr + off);
 }
 
-static ssize_t __init setup_pcpu_remap(size_t static_size)
+static ssize_t __init setup_pcpu_remap(size_t static_size, bool chosen)
 {
 	size_t map_size, dyn_size;
 	unsigned int cpu;
 	int i, j;
 	ssize_t ret;
 
-	/*
-	 * If large page isn't supported, there's no benefit in doing
-	 * this.  Also, on non-NUMA, embedding is better.
-	 */
-	if (!cpu_has_pse || !pcpu_need_numa())
+	/* need PSE */
+	if (!cpu_has_pse) {
+		pr_warning("PERCPU: remap allocator requires PSE\n");
+		return -EINVAL;
+	}
+
+	/* on non-NUMA, embedding is better */
+	if (!chosen && !pcpu_need_numa())
 		return -EINVAL;
 
 	/*
@@ -191,8 +194,11 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 		pcpur_map[cpu].cpu = cpu;
 		pcpur_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
 							PMD_SIZE);
-		if (!pcpur_map[cpu].ptr)
+		if (!pcpur_map[cpu].ptr) {
+			pr_warning("PERCPU: failed to allocate large page "
+				   "for cpu%u\n", cpu);
 			goto enomem;
+		}
 
 		/*
 		 * Only use pcpur_size bytes and give back the rest.
@@ -297,7 +303,7 @@ void *pcpu_pmd_remapped(void *kaddr)
 	return NULL;
 }
 #else
-static ssize_t __init setup_pcpu_remap(size_t static_size)
+static ssize_t __init setup_pcpu_remap(size_t static_size, bool chosen)
 {
 	return -EINVAL;
 }
@@ -311,7 +317,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
  * mapping so that it can use PMD mapping without additional TLB
  * pressure.
  */
-static ssize_t __init setup_pcpu_embed(size_t static_size)
+static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 
@@ -320,7 +326,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 	 * this.  Also, embedding allocation doesn't play well with
 	 * NUMA.
 	 */
-	if (!cpu_has_pse || pcpu_need_numa())
+	if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
 		return -EINVAL;
 
 	return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
@@ -370,8 +376,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 			void *ptr;
 
 			ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
-			if (!ptr)
+			if (!ptr) {
+				pr_warning("PERCPU: failed to allocate "
+					   "4k page for cpu%u\n", cpu);
 				goto enomem;
+			}
 
 			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
 			pcpu4k_pages[j++] = virt_to_page(ptr);
@@ -395,6 +404,16 @@ out_free_ar:
 	return ret;
 }
 
+/* for explicit first chunk allocator selection */
+static char pcpu_chosen_alloc[16] __initdata;
+
+static int __init percpu_alloc_setup(char *str)
+{
+	strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
+	return 0;
+}
+early_param("percpu_alloc", percpu_alloc_setup);
+
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -408,11 +427,6 @@ static inline void setup_percpu_segment(int cpu)
 #endif
 }
 
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
 void __init setup_per_cpu_areas(void)
 {
 	size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -429,9 +443,26 @@ void __init setup_per_cpu_areas(void)
 	 * of large page mappings.  Please read comments on top of
 	 * each allocator for details.
 	 */
-	ret = setup_pcpu_remap(static_size);
-	if (ret < 0)
-		ret = setup_pcpu_embed(static_size);
+	ret = -EINVAL;
+	if (strlen(pcpu_chosen_alloc)) {
+		if (strcmp(pcpu_chosen_alloc, "4k")) {
+			if (!strcmp(pcpu_chosen_alloc, "remap"))
+				ret = setup_pcpu_remap(static_size, true);
+			else if (!strcmp(pcpu_chosen_alloc, "embed"))
+				ret = setup_pcpu_embed(static_size, true);
+			else
+				pr_warning("PERCPU: unknown allocator %s "
+					   "specified\n", pcpu_chosen_alloc);
+			if (ret < 0)
+				pr_warning("PERCPU: %s allocator failed (%zd), "
+					   "falling back to 4k\n",
+					   pcpu_chosen_alloc, ret);
+		}
+	} else {
+		ret = setup_pcpu_remap(static_size, false);
+		if (ret < 0)
+			ret = setup_pcpu_embed(static_size, false);
+	}
 	if (ret < 0)
 		ret = setup_pcpu_4k(static_size);
 	if (ret < 0)
diff --git a/mm/percpu.c b/mm/percpu.c
index c0b2c1a..f780bee 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1234,6 +1234,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
 ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 				      ssize_t dyn_size, ssize_t unit_size)
 {
+	size_t chunk_size;
 	unsigned int cpu;
 
 	/* determine parameters and allocate */
@@ -1248,11 +1249,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	} else
 		pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 
-	pcpue_ptr = __alloc_bootmem_nopanic(
-					num_possible_cpus() * pcpue_unit_size,
-					PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
-	if (!pcpue_ptr)
+	chunk_size = pcpue_unit_size * num_possible_cpus();
+
+	pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
+					    __pa(MAX_DMA_ADDRESS));
+	if (!pcpue_ptr) {
+		pr_warning("PERCPU: failed to allocate %zu bytes for "
+			   "embedding\n", chunk_size);
 		return -ENOMEM;
+	}
 
 	/* return the leftover and copy */
 	for_each_possible_cpu(cpu) {
-- 
1.6.0.2



* [PATCH 5/5] x86: ensure percpu remap doesn't consume too much vmalloc space
  2009-06-01  6:34 [GIT PATCH core/percpu] x86,percpu: fix pageattr handling with remap allocator, take#3 Tejun Heo
                   ` (3 preceding siblings ...)
  2009-06-01  6:34 ` [PATCH 4/5] x86: implement percpu_alloc kernel parameter Tejun Heo
@ 2009-06-01  6:34 ` Tejun Heo
  4 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-06-01  6:34 UTC (permalink / raw)
  To: JBeulich, andi, mingo, hpa, tglx, linux-kernel, x86; +Cc: Tejun Heo

On extreme configurations (e.g. a 32bit 32-way NUMA machine), the
remap percpu first chunk allocator can consume too much vmalloc space.
Make it fall back to the 4k allocator if the consumption goes over 20%.
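
A rough worked example, assuming the common 128MB vmalloc area on
32bit and 2MB PMD pages (PAE): 32 possible cpus would need
32 * 2MB = 64MB of remap space, i.e. 50% of the vmalloc area, so the
new check rejects the remap allocator and a fallback allocator is
used instead.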

[ Impact: add sanity check for remap percpu first chunk allocator ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c |   18 +++++++++++++++---
 1 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 29be178..1f28574 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -169,9 +169,21 @@ static ssize_t __init setup_pcpu_remap(size_t static_size, bool chosen)
 		return -EINVAL;
 	}
 
-	/* on non-NUMA, embedding is better */
-	if (!chosen && !pcpu_need_numa())
-		return -EINVAL;
+	if (!chosen) {
+		size_t vm_size = VMALLOC_END - VMALLOC_START;
+		size_t tot_size = num_possible_cpus() * PMD_SIZE;
+
+		/* on non-NUMA, embedding is better */
+		if (!pcpu_need_numa())
+			return -EINVAL;
+
+		/* don't consume more than 20% of vmalloc area */
+		if (tot_size > vm_size / 5) {
+			pr_info("PERCPU: too large chunk size %zuMB for "
+				"large page remap\n", tot_size >> 20);
+			return -EINVAL;
+		}
+	}
 
 	/*
 	 * Currently supports only single page.  Supporting multiple
-- 
1.6.0.2



* [PATCH 1/5] x86: prepare setup_pcpu_remap() for pageattr fix
  2009-05-15  4:28 [GIT PATCH 2.6.30-rc5] x86,percpu: fix pageattr handling with remap allocator, take#2 Tejun Heo
@ 2009-05-15  4:28 ` Tejun Heo
  0 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-05-15  4:28 UTC (permalink / raw)
  To: JBeulich, andi, mingo, linux-kernel-owner, hpa, tglx, linux-kernel
  Cc: Tejun Heo

Make the following changes in preparation for the coming pageattr updates.

* Define and use an array of struct pcpur_ent instead of an array of
  pointers.  The only difference is the ->cpu field, which is set but
  not used yet.

* Rename variables according to the above change.

* Rename the local variable vm to pcpur_vm and move it out of the
  function.

[ Impact: no functional difference ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c |   58 ++++++++++++++++++++++-----------------
 1 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 3a97a4c..c17059c 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,8 +137,14 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
  * better than only using 4k mappings while still being NUMA friendly.
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
+struct pcpur_ent {
+	unsigned int	cpu;
+	void		*ptr;
+};
+
 static size_t pcpur_size __initdata;
-static void **pcpur_ptrs __initdata;
+static struct pcpur_ent *pcpur_map __initdata;
+static struct vm_struct pcpur_vm;
 
 static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 {
@@ -147,13 +153,12 @@ static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 	if (off >= pcpur_size)
 		return NULL;
 
-	return virt_to_page(pcpur_ptrs[cpu] + off);
+	return virt_to_page(pcpur_map[cpu].ptr + off);
 }
 
 static ssize_t __init setup_pcpu_remap(size_t static_size)
 {
-	static struct vm_struct vm;
-	size_t ptrs_size, dyn_size;
+	size_t map_size, dyn_size;
 	unsigned int cpu;
 	ssize_t ret;
 
@@ -178,12 +183,14 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 
 	/* allocate pointer array and alloc large pages */
-	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
-	pcpur_ptrs = alloc_bootmem(ptrs_size);
+	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_map[0]));
+	pcpur_map = alloc_bootmem(map_size);
 
 	for_each_possible_cpu(cpu) {
-		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
-		if (!pcpur_ptrs[cpu])
+		pcpur_map[cpu].cpu = cpu;
+		pcpur_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
+							PMD_SIZE);
+		if (!pcpur_map[cpu].ptr)
 			goto enomem;
 
 		/*
@@ -194,42 +201,43 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 		 * not well-specified to have a PAT-incompatible area
 		 * (unmapped RAM, device memory, etc.) in that hole.
 		 */
-		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+		free_bootmem(__pa(pcpur_map[cpu].ptr + pcpur_size),
 			     PMD_SIZE - pcpur_size);
 
-		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+		memcpy(pcpur_map[cpu].ptr, __per_cpu_load, static_size);
 	}
 
 	/* allocate address and map */
-	vm.flags = VM_ALLOC;
-	vm.size = num_possible_cpus() * PMD_SIZE;
-	vm_area_register_early(&vm, PMD_SIZE);
+	pcpur_vm.flags = VM_ALLOC;
+	pcpur_vm.size = num_possible_cpus() * PMD_SIZE;
+	vm_area_register_early(&pcpur_vm, PMD_SIZE);
 
 	for_each_possible_cpu(cpu) {
-		pmd_t *pmd;
+		pmd_t *pmd, pmd_v;
 
-		pmd = populate_extra_pmd((unsigned long)vm.addr
-					 + cpu * PMD_SIZE);
-		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
-				     PAGE_KERNEL_LARGE));
+		pmd = populate_extra_pmd((unsigned long)pcpur_vm.addr +
+					 cpu * PMD_SIZE);
+		pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpur_map[cpu].ptr)),
+				PAGE_KERNEL_LARGE);
+		set_pmd(pmd, pmd_v);
 	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
-		"%zu bytes\n", vm.addr, static_size);
+		"%zu bytes\n", pcpur_vm.addr, static_size);
 
 	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
 				     PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-				     PMD_SIZE, vm.addr, NULL);
-	goto out_free_ar;
+				     PMD_SIZE, pcpur_vm.addr, NULL);
+	goto out_free_map;
 
 enomem:
 	for_each_possible_cpu(cpu)
-		if (pcpur_ptrs[cpu])
-			free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
+		if (pcpur_map[cpu].ptr)
+			free_bootmem(__pa(pcpur_map[cpu].ptr), PMD_SIZE);
 	ret = -ENOMEM;
-out_free_ar:
-	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+out_free_map:
+	free_bootmem(__pa(pcpur_map), map_size);
 	return ret;
 }
 #else
-- 
1.6.0.2



end of thread

Thread overview: 7+ messages
2009-06-01  6:34 [GIT PATCH core/percpu] x86,percpu: fix pageattr handling with remap allocator, take#3 Tejun Heo
2009-06-01  6:34 ` [PATCH 1/5] x86: prepare setup_pcpu_remap() for pageattr fix Tejun Heo
2009-06-01  6:34 ` [PATCH 2/5] x86: reorganize cpa_process_alias() Tejun Heo
2009-06-01  6:34 ` [PATCH 3/5] x86: fix pageattr handling for remap percpu allocator and re-enable it Tejun Heo
2009-06-01  6:34 ` [PATCH 4/5] x86: implement percpu_alloc kernel parameter Tejun Heo
2009-06-01  6:34 ` [PATCH 5/5] x86: ensure percpu remap doesn't consume too much vmalloc space Tejun Heo
  -- strict thread matches above, loose matches on Subject: below --
2009-05-15  4:28 [GIT PATCH 2.6.30-rc5] x86,percpu: fix pageattr handling with remap allocator, take#2 Tejun Heo
2009-05-15  4:28 ` [PATCH 1/5] x86: prepare setup_pcpu_remap() for pageattr fix Tejun Heo
