From: Tejun Heo <tj@kernel.org>
To: Sachin Sant <sachinp@in.ibm.com>
Cc: Linux/PPC Development <linuxppc-dev@ozlabs.org>,
David Miller <davem@davemloft.net>
Subject: Re: 2.6.31-git5 kernel boot hangs on powerpc
Date: Wed, 23 Sep 2009 23:17:06 +0900
Message-ID: <4ABA2DE2.6000601@kernel.org>
In-Reply-To: <4AB9DD8F.1040305@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 605 bytes --]
Tejun Heo wrote:
>> One workaround I have found for this problem is to disable IPv6.
>> With IPv6 disabled the machine boots OK. Until a reliable solution
>> is available for this issue, I will keep IPv6 disabled in my configs.
>
> I think it's most likely caused by some code accessing an invalid
> percpu address. I'm currently writing up an access validator; it
> should be done in several hours. So, ipv6 it is. I couldn't
> reproduce your problem here, but I'll give ipv6 a shot.
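Most of the arch and softirq changes in the attached patch just move
array indexing out of the percpu accessor macros - per_cpu(var[i], cpu)
becomes per_cpu(var, cpu)[i] - so that the macro sees the percpu
variable itself and can hand its address to the verifier. As an
illustration only (mirroring the stab_cache hunk below, not something
to apply separately):

	/* old form: the accessor operates on &per_cpu__stab_cache[offset],
	 * i.e. a pointer into the middle of the variable */
	__get_cpu_var(stab_cache[offset]) = stab_entry;

	/* new form: the accessor operates on the variable itself, so
	 * pcpu_verify_access() sees its base address; the index is
	 * applied to the resulting pointer */
	__get_cpu_var(stab_cache)[offset] = stab_entry;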
Can you please apply the attached patch and see whether anything
interesting shows up in the kernel log?
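The verifier is behind a new config option; something like the
following should enable it (assuming DEBUG_KERNEL and SMP are already
enabled in your config):

	CONFIG_DEBUG_VERIFY_PER_CPU=y

Any warnings will be printed with a "PERCPU:" prefix followed by a
stack dump.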
Thanks.
--
tejun
[-- Attachment #2: pcpu-verify --]
[-- Type: text/plain, Size: 13367 bytes --]
---
arch/ia64/include/asm/sn/arch.h | 4 -
arch/powerpc/mm/stab.c | 4 -
arch/x86/kernel/cpu/cpu_debug.c | 12 ++--
arch/x86/kernel/cpu/perf_event.c | 4 -
include/asm-generic/percpu.h | 23 +++++---
include/linux/percpu-defs.h | 6 ++
include/linux/percpu.h | 7 ++
kernel/softirq.c | 8 +-
lib/Kconfig.debug | 17 +++++
mm/percpu.c | 112 +++++++++++++++++++++++++++++++++++++++
10 files changed, 173 insertions(+), 24 deletions(-)
Index: work/arch/ia64/include/asm/sn/arch.h
===================================================================
--- work.orig/arch/ia64/include/asm/sn/arch.h
+++ work/arch/ia64/include/asm/sn/arch.h
@@ -71,8 +71,8 @@ DECLARE_PER_CPU(struct sn_hub_info_s, __
* Compact node ID to nasid mappings kept in the per-cpu data areas of each
* cpu.
*/
-DECLARE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]);
-#define sn_cnodeid_to_nasid (&__get_cpu_var(__sn_cnodeid_to_nasid[0]))
+DECLARE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid);
+#define sn_cnodeid_to_nasid (&__get_cpu_var(__sn_cnodeid_to_nasid)[0])
extern u8 sn_partition_id;
Index: work/arch/powerpc/mm/stab.c
===================================================================
--- work.orig/arch/powerpc/mm/stab.c
+++ work/arch/powerpc/mm/stab.c
@@ -138,7 +138,7 @@ static int __ste_allocate(unsigned long
if (!is_kernel_addr(ea)) {
offset = __get_cpu_var(stab_cache_ptr);
if (offset < NR_STAB_CACHE_ENTRIES)
- __get_cpu_var(stab_cache[offset++]) = stab_entry;
+ __get_cpu_var(stab_cache)[offset++] = stab_entry;
else
offset = NR_STAB_CACHE_ENTRIES+1;
__get_cpu_var(stab_cache_ptr) = offset;
@@ -185,7 +185,7 @@ void switch_stab(struct task_struct *tsk
int i;
for (i = 0; i < offset; i++) {
- ste = stab + __get_cpu_var(stab_cache[i]);
+ ste = stab + __get_cpu_var(stab_cache)[i];
ste->esid_data = 0; /* invalidate entry */
}
} else {
Index: work/arch/x86/kernel/cpu/cpu_debug.c
===================================================================
--- work.orig/arch/x86/kernel/cpu/cpu_debug.c
+++ work/arch/x86/kernel/cpu/cpu_debug.c
@@ -531,7 +531,7 @@ static int cpu_create_file(unsigned cpu,
/* Already intialized */
if (file == CPU_INDEX_BIT)
- if (per_cpu(cpu_arr[type].init, cpu))
+ if (per_cpu(cpu_arr, cpu)[type].init)
return 0;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -543,7 +543,7 @@ static int cpu_create_file(unsigned cpu,
priv->reg = reg;
priv->file = file;
mutex_lock(&cpu_debug_lock);
- per_cpu(priv_arr[type], cpu) = priv;
+ per_cpu(priv_arr, cpu)[type] = priv;
per_cpu(cpu_priv_count, cpu)++;
mutex_unlock(&cpu_debug_lock);
@@ -552,10 +552,10 @@ static int cpu_create_file(unsigned cpu,
dentry, (void *)priv, &cpu_fops);
else {
debugfs_create_file(cpu_base[type].name, S_IRUGO,
- per_cpu(cpu_arr[type].dentry, cpu),
+ per_cpu(cpu_arr, cpu)[type].dentry,
(void *)priv, &cpu_fops);
mutex_lock(&cpu_debug_lock);
- per_cpu(cpu_arr[type].init, cpu) = 1;
+ per_cpu(cpu_arr, cpu)[type].init = 1;
mutex_unlock(&cpu_debug_lock);
}
@@ -615,7 +615,7 @@ static int cpu_init_allreg(unsigned cpu,
if (!is_typeflag_valid(cpu, cpu_base[type].flag))
continue;
cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
- per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+ per_cpu(cpu_arr, cpu)[type].dentry = cpu_dentry;
if (type < CPU_TSS_BIT)
err = cpu_init_msr(cpu, type, cpu_dentry);
@@ -677,7 +677,7 @@ static void __exit cpu_debug_exit(void)
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
- kfree(per_cpu(priv_arr[i], cpu));
+ kfree(per_cpu(priv_arr, cpu)[i]);
}
module_init(cpu_debug_init);
Index: work/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- work.orig/arch/x86/kernel/cpu/perf_event.c
+++ work/arch/x86/kernel/cpu/perf_event.c
@@ -1253,7 +1253,7 @@ x86_perf_event_set_period(struct perf_ev
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+ per_cpu(pmc_prev_left, smp_processor_id())[idx] = left;
/*
* The hw event starts counting from this event offset,
@@ -1470,7 +1470,7 @@ void perf_event_print_debug(void)
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
- prev_left = per_cpu(pmc_prev_left[idx], cpu);
+ prev_left = per_cpu(pmc_prev_left, cpu)[idx];
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
Index: work/include/asm-generic/percpu.h
===================================================================
--- work.orig/include/asm-generic/percpu.h
+++ work/include/asm-generic/percpu.h
@@ -49,13 +49,22 @@ extern unsigned long __per_cpu_offset[NR
* established ways to produce a usable pointer from the percpu variable
* offset.
*/
-#define per_cpu(var, cpu) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
-#define __get_cpu_var(var) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), my_cpu_offset))
-#define __raw_get_cpu_var(var) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset))
-
+#define per_cpu(var, cpu) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ unsigned int __pcpu_cpu__ = (cpu); \
+ pcpu_verify_access(__pcpu_ptr__, __pcpu_cpu__); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, per_cpu_offset(__pcpu_cpu__)); \
+}))
+#define __get_cpu_var(var) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ pcpu_verify_access(__pcpu_ptr__, NR_CPUS); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, my_cpu_offset); \
+}))
+#define __raw_get_cpu_var(var) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ pcpu_verify_access(__pcpu_ptr__, NR_CPUS); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, __my_cpu_offset); \
+}))
#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
extern void setup_per_cpu_areas(void);
Index: work/include/linux/percpu-defs.h
===================================================================
--- work.orig/include/linux/percpu-defs.h
+++ work/include/linux/percpu-defs.h
@@ -7,6 +7,12 @@
*/
#define per_cpu_var(var) per_cpu__##var
+#ifdef CONFIG_DEBUG_VERIFY_PER_CPU
+extern void pcpu_verify_access(void *ptr, unsigned int cpu);
+#else
+#define pcpu_verify_access(ptr, cpu) do {} while (0)
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
Index: work/include/linux/percpu.h
===================================================================
--- work.orig/include/linux/percpu.h
+++ work/include/linux/percpu.h
@@ -127,7 +127,12 @@ extern int __init pcpu_page_first_chunk(
* dynamically allocated. Non-atomic access to the current CPU's
* version should probably be combined with get_cpu()/put_cpu().
*/
-#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+#define per_cpu_ptr(ptr, cpu) ({ \
+ typeof(ptr) __pcpu_ptr__ = (ptr); \
+ unsigned int __pcpu_cpu__ = (cpu); \
+ pcpu_verify_access(__pcpu_ptr__, __pcpu_cpu__); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, per_cpu_offset((__pcpu_cpu__))); \
+})
extern void *__alloc_reserved_percpu(size_t size, size_t align);
Index: work/kernel/softirq.c
===================================================================
--- work.orig/kernel/softirq.c
+++ work/kernel/softirq.c
@@ -560,7 +560,7 @@ EXPORT_PER_CPU_SYMBOL(softirq_work_list)
static void __local_trigger(struct call_single_data *cp, int softirq)
{
- struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+ struct list_head *head = &__get_cpu_var(softirq_work_list)[softirq];
list_add_tail(&cp->list, head);
@@ -656,13 +656,13 @@ static int __cpuinit remote_softirq_cpu_
local_irq_disable();
for (i = 0; i < NR_SOFTIRQS; i++) {
- struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+ struct list_head *head = &per_cpu(softirq_work_list, cpu)[i];
struct list_head *local_head;
if (list_empty(head))
continue;
- local_head = &__get_cpu_var(softirq_work_list[i]);
+ local_head = &__get_cpu_var(softirq_work_list)[i];
list_splice_init(head, local_head);
raise_softirq_irqoff(i);
}
@@ -688,7 +688,7 @@ void __init softirq_init(void)
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
for (i = 0; i < NR_SOFTIRQS; i++)
- INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
+ INIT_LIST_HEAD(&per_cpu(softirq_work_list, cpu)[i]);
}
register_hotcpu_notifier(&remote_softirq_cpu_notifier);
Index: work/lib/Kconfig.debug
===================================================================
--- work.orig/lib/Kconfig.debug
+++ work/lib/Kconfig.debug
@@ -805,6 +805,21 @@ config DEBUG_BLOCK_EXT_DEVT
Say N if you are unsure.
+config DEBUG_VERIFY_PER_CPU
+ bool "Verify per-cpu accesses"
+ depends on DEBUG_KERNEL
+ depends on SMP
+ help
+
+ This option makes the percpu access macros verify the
+ specified processor and the percpu variable offset on each
+ access. This helps catch percpu access bugs which can corrupt
+ unrelated memory regions and are otherwise very difficult to
+ track down, at the cost of making percpu accesses considerably
+ slower.
+
+ Say N if you are unsure.
+
config DEBUG_FORCE_WEAK_PER_CPU
bool "Force weak per-cpu definitions"
depends on DEBUG_KERNEL
@@ -820,6 +835,8 @@ config DEBUG_FORCE_WEAK_PER_CPU
To ensure that generic code follows the above rules, this
option forces all percpu variables to be defined as weak.
+ Say N if you are unsure.
+
config LKDTM
tristate "Linux Kernel Dump Test Tool Module"
depends on DEBUG_KERNEL
Index: work/mm/percpu.c
===================================================================
--- work.orig/mm/percpu.c
+++ work/mm/percpu.c
@@ -1241,6 +1241,118 @@ void free_percpu(void *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
+#ifdef CONFIG_DEBUG_VERIFY_PER_CPU
+static struct pcpu_chunk *pcpu_verify_match_chunk(void *addr)
+{
+ void *first_start = pcpu_first_chunk->base_addr;
+ struct pcpu_chunk *chunk;
+ int slot;
+
+ /* is it in the first chunk? */
+ if (addr >= first_start && addr < first_start + pcpu_unit_size) {
+ /* is it in the reserved area? */
+ if (addr < first_start + pcpu_reserved_chunk_limit)
+ return pcpu_reserved_chunk;
+ return pcpu_first_chunk;
+ }
+
+ /* walk each dynamic chunk */
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_slot[slot], list)
+ if (addr >= chunk->base_addr &&
+ addr < chunk->base_addr + pcpu_unit_size)
+ return chunk;
+ return NULL;
+}
+
+void pcpu_verify_access(void *ptr, unsigned int cpu)
+{
+ static bool verifying[NR_CPUS];
+ static int warn_limit = 10;
+ char cbuf[80], obuf[160];
+ void *addr = __pcpu_ptr_to_addr(ptr);
+ bool is_static = false;
+ struct pcpu_chunk *chunk;
+ unsigned long flags;
+ int i, addr_off, off, len, end;
+
+ /* not been initialized yet or whined enough already */
+ if (unlikely(!pcpu_first_chunk || !warn_limit))
+ return;
+
+ /* don't re-enter */
+ preempt_disable();
+ if (verifying[raw_smp_processor_id()]) {
+ preempt_enable_no_resched();
+ return;
+ }
+ verifying[raw_smp_processor_id()] = true;
+
+ cbuf[0] = '\0';
+ obuf[0] = '\0';
+
+ if (unlikely(cpu < NR_CPUS && !cpu_possible(cpu)) && warn_limit)
+ snprintf(cbuf, sizeof(cbuf), "invalid cpu %u", cpu);
+
+ /*
+ * We can enter this function from weird places and have no
+ * way to reliably avoid deadlock. If lock is available, grab
+ * it and verify. If not, just let it go through.
+ */
+ if (!spin_trylock_irqsave(&pcpu_lock, flags))
+ goto out;
+
+ chunk = pcpu_verify_match_chunk(addr);
+ if (!chunk) {
+ snprintf(obuf, sizeof(obuf),
+ "no matching chunk ptr=%p addr=%p", ptr, addr);
+ goto out_unlock;
+ }
+
+ addr_off = addr - chunk->base_addr;
+ if (chunk->base_addr == pcpu_first_chunk->base_addr)
+ if (chunk == pcpu_reserved_chunk || addr_off < -chunk->map[0])
+ is_static = true;
+
+ for (i = 0, off = 0; i < chunk->map_used; i++, off = end) {
+ len = chunk->map[i];
+ end = off + abs(len);
+
+ if (addr_off == off) {
+ if (unlikely(len > 0))
+ snprintf(obuf, sizeof(obuf),
+ "free area accessed ptr=%p addr=%p "
+ "off=%d len=%d", ptr, addr, off, len);
+ break;
+ }
+ if (!is_static && off < addr_off && addr_off < end) {
+ snprintf(obuf, sizeof(obuf),
+ "%sarea accessed in the middle ptr=%p "
+ "addr=%p:%d off=%d len=%d",
+ len > 0 ? "free " : "",
+ ptr, addr, addr_off, off, abs(len));
+ break;
+ }
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+out:
+ if (unlikely(cbuf[0] || obuf[0])) {
+ printk(KERN_ERR "PERCPU: %s%s%s\n",
+ cbuf, cbuf[0] ? ", " : "", obuf);
+ dump_stack();
+ if (!--warn_limit)
+ printk(KERN_WARNING "PERCPU: access warning limit "
+ "reached, turning off access validation\n");
+ }
+
+ verifying[raw_smp_processor_id()] = false;
+ preempt_enable_no_resched();
+}
+EXPORT_SYMBOL_GPL(pcpu_verify_access);
+#endif /* CONFIG_DEBUG_VERIFY_PER_CPU */
+
static inline size_t pcpu_calc_fc_sizes(size_t static_size,
size_t reserved_size,
ssize_t *dyn_sizep)