From: Tejun Heo <tj@kernel.org>
To: Sachin Sant <sachinp@in.ibm.com>
Cc: Linux/PPC Development <linuxppc-dev@ozlabs.org>,
David Miller <davem@davemloft.net>
Subject: Re: 2.6.31-git5 kernel boot hangs on powerpc
Date: Wed, 23 Sep 2009 23:17:06 +0900
Message-ID: <4ABA2DE2.6000601@kernel.org>
In-Reply-To: <4AB9DD8F.1040305@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 605 bytes --]
Tejun Heo wrote:
>> One workaround I have found for this problem is to disable IPv6.
>> With IPv6 disabled the machine boots OK. Until a reliable solution
>> is available for this issue, I will keep IPv6 disabled in my configs.
>
> I think it's most likely caused by some code accessing an invalid
> percpu address. I'm currently writing up an access validator; it
> should be done in several hours. So, ipv6 it is. I couldn't
> reproduce your problem here, but I'll give ipv6 a shot.
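Most of the arch and softirq changes in the attached patch just move
array indexing out of the percpu accessor macros - per_cpu(var[i], cpu)
becomes per_cpu(var, cpu)[i] - so that the macro sees the percpu
variable itself and can hand its address to the verifier. As an
illustration only (mirroring the stab_cache hunk below, not something
to apply separately):

	/* old form: the accessor operates on &per_cpu__stab_cache[offset],
	 * i.e. a pointer into the middle of the variable */
	__get_cpu_var(stab_cache[offset]) = stab_entry;

	/* new form: the accessor operates on the variable itself, so
	 * pcpu_verify_access() sees its base address; the index is
	 * applied to the resulting pointer */
	__get_cpu_var(stab_cache)[offset] = stab_entry;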
Can you please apply the attached patch and see whether anything
interesting shows up in the kernel log?
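The verifier is behind a new config option; something like the
following should enable it (assuming DEBUG_KERNEL and SMP are already
enabled in your config):

	CONFIG_DEBUG_VERIFY_PER_CPU=y

Any warnings will be printed with a "PERCPU:" prefix followed by a
stack dump.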
Thanks.
--
tejun
[-- Attachment #2: pcpu-verify --]
[-- Type: text/plain, Size: 13367 bytes --]
---
arch/ia64/include/asm/sn/arch.h | 4 -
arch/powerpc/mm/stab.c | 4 -
arch/x86/kernel/cpu/cpu_debug.c | 12 ++--
arch/x86/kernel/cpu/perf_event.c | 4 -
include/asm-generic/percpu.h | 23 +++++---
include/linux/percpu-defs.h | 6 ++
include/linux/percpu.h | 7 ++
kernel/softirq.c | 8 +-
lib/Kconfig.debug | 17 +++++
mm/percpu.c | 112 +++++++++++++++++++++++++++++++++++++++
10 files changed, 173 insertions(+), 24 deletions(-)
Index: work/arch/ia64/include/asm/sn/arch.h
===================================================================
--- work.orig/arch/ia64/include/asm/sn/arch.h
+++ work/arch/ia64/include/asm/sn/arch.h
@@ -71,8 +71,8 @@ DECLARE_PER_CPU(struct sn_hub_info_s, __
* Compact node ID to nasid mappings kept in the per-cpu data areas of each
* cpu.
*/
-DECLARE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]);
-#define sn_cnodeid_to_nasid (&__get_cpu_var(__sn_cnodeid_to_nasid[0]))
+DECLARE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid);
+#define sn_cnodeid_to_nasid (&__get_cpu_var(__sn_cnodeid_to_nasid)[0])
extern u8 sn_partition_id;
Index: work/arch/powerpc/mm/stab.c
===================================================================
--- work.orig/arch/powerpc/mm/stab.c
+++ work/arch/powerpc/mm/stab.c
@@ -138,7 +138,7 @@ static int __ste_allocate(unsigned long
if (!is_kernel_addr(ea)) {
offset = __get_cpu_var(stab_cache_ptr);
if (offset < NR_STAB_CACHE_ENTRIES)
- __get_cpu_var(stab_cache[offset++]) = stab_entry;
+ __get_cpu_var(stab_cache)[offset++] = stab_entry;
else
offset = NR_STAB_CACHE_ENTRIES+1;
__get_cpu_var(stab_cache_ptr) = offset;
@@ -185,7 +185,7 @@ void switch_stab(struct task_struct *tsk
int i;
for (i = 0; i < offset; i++) {
- ste = stab + __get_cpu_var(stab_cache[i]);
+ ste = stab + __get_cpu_var(stab_cache)[i];
ste->esid_data = 0; /* invalidate entry */
}
} else {
Index: work/arch/x86/kernel/cpu/cpu_debug.c
===================================================================
--- work.orig/arch/x86/kernel/cpu/cpu_debug.c
+++ work/arch/x86/kernel/cpu/cpu_debug.c
@@ -531,7 +531,7 @@ static int cpu_create_file(unsigned cpu,
/* Already intialized */
if (file == CPU_INDEX_BIT)
- if (per_cpu(cpu_arr[type].init, cpu))
+ if (per_cpu(cpu_arr, cpu)[type].init)
return 0;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -543,7 +543,7 @@ static int cpu_create_file(unsigned cpu,
priv->reg = reg;
priv->file = file;
mutex_lock(&cpu_debug_lock);
- per_cpu(priv_arr[type], cpu) = priv;
+ per_cpu(priv_arr, cpu)[type] = priv;
per_cpu(cpu_priv_count, cpu)++;
mutex_unlock(&cpu_debug_lock);
@@ -552,10 +552,10 @@ static int cpu_create_file(unsigned cpu,
dentry, (void *)priv, &cpu_fops);
else {
debugfs_create_file(cpu_base[type].name, S_IRUGO,
- per_cpu(cpu_arr[type].dentry, cpu),
+ per_cpu(cpu_arr, cpu)[type].dentry,
(void *)priv, &cpu_fops);
mutex_lock(&cpu_debug_lock);
- per_cpu(cpu_arr[type].init, cpu) = 1;
+ per_cpu(cpu_arr, cpu)[type].init = 1;
mutex_unlock(&cpu_debug_lock);
}
@@ -615,7 +615,7 @@ static int cpu_init_allreg(unsigned cpu,
if (!is_typeflag_valid(cpu, cpu_base[type].flag))
continue;
cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
- per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+ per_cpu(cpu_arr, cpu)[type].dentry = cpu_dentry;
if (type < CPU_TSS_BIT)
err = cpu_init_msr(cpu, type, cpu_dentry);
@@ -677,7 +677,7 @@ static void __exit cpu_debug_exit(void)
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
- kfree(per_cpu(priv_arr[i], cpu));
+ kfree(per_cpu(priv_arr, cpu)[i]);
}
module_init(cpu_debug_init);
Index: work/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- work.orig/arch/x86/kernel/cpu/perf_event.c
+++ work/arch/x86/kernel/cpu/perf_event.c
@@ -1253,7 +1253,7 @@ x86_perf_event_set_period(struct perf_ev
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+ per_cpu(pmc_prev_left, smp_processor_id())[idx] = left;
/*
* The hw event starts counting from this event offset,
@@ -1470,7 +1470,7 @@ void perf_event_print_debug(void)
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
- prev_left = per_cpu(pmc_prev_left[idx], cpu);
+ prev_left = per_cpu(pmc_prev_left, cpu)[idx];
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
Index: work/include/asm-generic/percpu.h
===================================================================
--- work.orig/include/asm-generic/percpu.h
+++ work/include/asm-generic/percpu.h
@@ -49,13 +49,22 @@ extern unsigned long __per_cpu_offset[NR
* established ways to produce a usable pointer from the percpu variable
* offset.
*/
-#define per_cpu(var, cpu) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
-#define __get_cpu_var(var) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), my_cpu_offset))
-#define __raw_get_cpu_var(var) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset))
-
+#define per_cpu(var, cpu) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ unsigned int __pcpu_cpu__ = (cpu); \
+ pcpu_verify_access(__pcpu_ptr__, __pcpu_cpu__); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, per_cpu_offset(__pcpu_cpu__)); \
+}))
+#define __get_cpu_var(var) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ pcpu_verify_access(__pcpu_ptr__, NR_CPUS); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, my_cpu_offset); \
+}))
+#define __raw_get_cpu_var(var) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ pcpu_verify_access(__pcpu_ptr__, NR_CPUS); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, __my_cpu_offset); \
+}))
#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
extern void setup_per_cpu_areas(void);
Index: work/include/linux/percpu-defs.h
===================================================================
--- work.orig/include/linux/percpu-defs.h
+++ work/include/linux/percpu-defs.h
@@ -7,6 +7,12 @@
*/
#define per_cpu_var(var) per_cpu__##var
+#ifdef CONFIG_DEBUG_VERIFY_PER_CPU
+extern void pcpu_verify_access(void *ptr, unsigned int cpu);
+#else
+#define pcpu_verify_access(ptr, cpu) do {} while (0)
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
Index: work/include/linux/percpu.h
===================================================================
--- work.orig/include/linux/percpu.h
+++ work/include/linux/percpu.h
@@ -127,7 +127,12 @@ extern int __init pcpu_page_first_chunk(
* dynamically allocated. Non-atomic access to the current CPU's
* version should probably be combined with get_cpu()/put_cpu().
*/
-#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+#define per_cpu_ptr(ptr, cpu) ({ \
+ typeof(ptr) __pcpu_ptr__ = (ptr); \
+ unsigned int __pcpu_cpu__ = (cpu); \
+ pcpu_verify_access(__pcpu_ptr__, __pcpu_cpu__); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, per_cpu_offset((__pcpu_cpu__))); \
+})
extern void *__alloc_reserved_percpu(size_t size, size_t align);
Index: work/kernel/softirq.c
===================================================================
--- work.orig/kernel/softirq.c
+++ work/kernel/softirq.c
@@ -560,7 +560,7 @@ EXPORT_PER_CPU_SYMBOL(softirq_work_list)
static void __local_trigger(struct call_single_data *cp, int softirq)
{
- struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+ struct list_head *head = &__get_cpu_var(softirq_work_list)[softirq];
list_add_tail(&cp->list, head);
@@ -656,13 +656,13 @@ static int __cpuinit remote_softirq_cpu_
local_irq_disable();
for (i = 0; i < NR_SOFTIRQS; i++) {
- struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+ struct list_head *head = &per_cpu(softirq_work_list, cpu)[i];
struct list_head *local_head;
if (list_empty(head))
continue;
- local_head = &__get_cpu_var(softirq_work_list[i]);
+ local_head = &__get_cpu_var(softirq_work_list)[i];
list_splice_init(head, local_head);
raise_softirq_irqoff(i);
}
@@ -688,7 +688,7 @@ void __init softirq_init(void)
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
for (i = 0; i < NR_SOFTIRQS; i++)
- INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
+ INIT_LIST_HEAD(&per_cpu(softirq_work_list, cpu)[i]);
}
register_hotcpu_notifier(&remote_softirq_cpu_notifier);
Index: work/lib/Kconfig.debug
===================================================================
--- work.orig/lib/Kconfig.debug
+++ work/lib/Kconfig.debug
@@ -805,6 +805,21 @@ config DEBUG_BLOCK_EXT_DEVT
Say N if you are unsure.
+config DEBUG_VERIFY_PER_CPU
+ bool "Verify per-cpu accesses"
+ depends on DEBUG_KERNEL
+ depends on SMP
+ help
+
+ This option makes the percpu access macros verify the
+ specified processor and the percpu variable offset on each
+ access. This helps catch percpu access bugs which can corrupt
+ unrelated memory regions and are otherwise very difficult to
+ track down, at the cost of making percpu accesses considerably
+ slower.
+
+ Say N if you are unsure.
+
config DEBUG_FORCE_WEAK_PER_CPU
bool "Force weak per-cpu definitions"
depends on DEBUG_KERNEL
@@ -820,6 +835,8 @@ config DEBUG_FORCE_WEAK_PER_CPU
To ensure that generic code follows the above rules, this
option forces all percpu variables to be defined as weak.
+ Say N if you are unsure.
+
config LKDTM
tristate "Linux Kernel Dump Test Tool Module"
depends on DEBUG_KERNEL
Index: work/mm/percpu.c
===================================================================
--- work.orig/mm/percpu.c
+++ work/mm/percpu.c
@@ -1241,6 +1241,118 @@ void free_percpu(void *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
+#ifdef CONFIG_DEBUG_VERIFY_PER_CPU
+static struct pcpu_chunk *pcpu_verify_match_chunk(void *addr)
+{
+ void *first_start = pcpu_first_chunk->base_addr;
+ struct pcpu_chunk *chunk;
+ int slot;
+
+ /* is it in the first chunk? */
+ if (addr >= first_start && addr < first_start + pcpu_unit_size) {
+ /* is it in the reserved area? */
+ if (addr < first_start + pcpu_reserved_chunk_limit)
+ return pcpu_reserved_chunk;
+ return pcpu_first_chunk;
+ }
+
+ /* walk each dynamic chunk */
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_slot[slot], list)
+ if (addr >= chunk->base_addr &&
+ addr < chunk->base_addr + pcpu_unit_size)
+ return chunk;
+ return NULL;
+}
+
+void pcpu_verify_access(void *ptr, unsigned int cpu)
+{
+ static bool verifying[NR_CPUS];
+ static int warn_limit = 10;
+ char cbuf[80], obuf[160];
+ void *addr = __pcpu_ptr_to_addr(ptr);
+ bool is_static = false;
+ struct pcpu_chunk *chunk;
+ unsigned long flags;
+ int i, addr_off, off, len, end;
+
+ /* not been initialized yet or whined enough already */
+ if (unlikely(!pcpu_first_chunk || !warn_limit))
+ return;
+
+ /* don't re-enter */
+ preempt_disable();
+ if (verifying[raw_smp_processor_id()]) {
+ preempt_enable_no_resched();
+ return;
+ }
+ verifying[raw_smp_processor_id()] = true;
+
+ cbuf[0] = '\0';
+ obuf[0] = '\0';
+
+ if (unlikely(cpu < NR_CPUS && !cpu_possible(cpu)) && warn_limit)
+ snprintf(cbuf, sizeof(cbuf), "invalid cpu %u", cpu);
+
+ /*
+ * We can enter this function from weird places and have no
+ * way to reliably avoid deadlock. If lock is available, grab
+ * it and verify. If not, just let it go through.
+ */
+ if (!spin_trylock_irqsave(&pcpu_lock, flags))
+ goto out;
+
+ chunk = pcpu_verify_match_chunk(addr);
+ if (!chunk) {
+ snprintf(obuf, sizeof(obuf),
+ "no matching chunk ptr=%p addr=%p", ptr, addr);
+ goto out_unlock;
+ }
+
+ addr_off = addr - chunk->base_addr;
+ if (chunk->base_addr == pcpu_first_chunk->base_addr)
+ if (chunk == pcpu_reserved_chunk || addr_off < -chunk->map[0])
+ is_static = true;
+
+ for (i = 0, off = 0; i < chunk->map_used; i++, off = end) {
+ len = chunk->map[i];
+ end = off + abs(len);
+
+ if (addr_off == off) {
+ if (unlikely(len > 0))
+ snprintf(obuf, sizeof(obuf),
+ "free area accessed ptr=%p addr=%p "
+ "off=%d len=%d", ptr, addr, off, len);
+ break;
+ }
+ if (!is_static && off < addr_off && addr_off < end) {
+ snprintf(obuf, sizeof(obuf),
+ "%sarea accessed in the middle ptr=%p "
+ "addr=%p:%d off=%d len=%d",
+ len > 0 ? "free " : "",
+ ptr, addr, addr_off, off, abs(len));
+ break;
+ }
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+out:
+ if (unlikely(cbuf[0] || obuf[0])) {
+ printk(KERN_ERR "PERCPU: %s%s%s\n",
+ cbuf, cbuf[0] ? ", " : "", obuf);
+ dump_stack();
+ if (!--warn_limit)
+ printk(KERN_WARNING "PERCPU: access warning limit "
+ "reached, turning off access validation\n");
+ }
+
+ verifying[raw_smp_processor_id()] = false;
+ preempt_enable_no_resched();
+}
+EXPORT_SYMBOL_GPL(pcpu_verify_access);
+#endif /* CONFIG_DEBUG_VERIFY_PER_CPU */
+
static inline size_t pcpu_calc_fc_sizes(size_t static_size,
size_t reserved_size,
ssize_t *dyn_sizep)