On Tue, Sep 03, 2019 at 08:15:56PM +1000, Oliver O'Halloran wrote: > Currently we print a stack trace in the event handler to help with > debugging EEH issues. In the case of suprise hot-unplug this is unneeded, > so we want to prevent printing the stack trace unless we know it's due to > an actual device error. To accomplish this, we can save a stack trace at > the point of detection and only print it once the EEH recovery handler has > determined the freeze was due to an actual error. > > Since the whole point of this is to prevent spurious EEH output we also > move a few prints out of the detection thread, or mark them as pr_debug > so anyone interested can get output from the eeh_check_dev_failure() > if they want. > > Signed-off-by: Oliver O'Halloran I think this is a good change, and even in the normal case it will place the stacktrace closer to the rest of the recovery information. But, I think it would make more sense to put the stacktrace into the struct eeh_event, rather than the struct eeh_pe. Is there some reason we can't do that? (It would save a fair bit of memory!) Cheers, Sam. > --- > arch/powerpc/include/asm/eeh.h | 11 +++++++++ > arch/powerpc/kernel/eeh.c | 15 ++++--------- > arch/powerpc/kernel/eeh_driver.c | 38 +++++++++++++++++++++++++++++++- > arch/powerpc/kernel/eeh_event.c | 26 ++++++++++------------ > 4 files changed, 64 insertions(+), 26 deletions(-) > > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h > index c13119a5e69b..9d0e1694a94d 100644 > --- a/arch/powerpc/include/asm/eeh.h > +++ b/arch/powerpc/include/asm/eeh.h > @@ -88,6 +88,17 @@ struct eeh_pe { > struct list_head child_list; /* List of PEs below this PE */ > struct list_head child; /* Memb. child_list/eeh_phb_pe */ > struct list_head edevs; /* List of eeh_dev in this PE */ > + > + /* > + * Saved stack trace. When we find a PE freeze in eeh_dev_check_failure > + * the stack trace is saved here so we can print it in the recovery > + * thread if it turns out to due to a real problem rather than > + * a hot-remove. > + * > + * A max of 64 entries might be overkill, but it also might not be. > + */ > + unsigned long stack_trace[64]; > + int trace_entries; > }; > > #define eeh_pe_for_each_dev(pe, edev, tmp) \ > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c > index 9c468e79d13c..46d17817b438 100644 > --- a/arch/powerpc/kernel/eeh.c > +++ b/arch/powerpc/kernel/eeh.c > @@ -420,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) > eeh_pe_mark_isolated(phb_pe); > eeh_serialize_unlock(flags); > > - pr_err("EEH: PHB#%x failure detected, location: %s\n", > + pr_debug("EEH: PHB#%x failure detected, location: %s\n", > phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); > - dump_stack(); > eeh_send_failure_event(phb_pe); > - > return 1; > out: > eeh_serialize_unlock(flags); > @@ -451,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) > unsigned long flags; > struct device_node *dn; > struct pci_dev *dev; > - struct eeh_pe *pe, *parent_pe, *phb_pe; > + struct eeh_pe *pe, *parent_pe; > int rc = 0; > const char *location = NULL; > > @@ -581,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev) > * a stack trace will help the device-driver authors figure > * out what happened. So print that out. > */ > - phb_pe = eeh_phb_pe_get(pe->phb); > - pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", > - pe->phb->global_number, pe->addr); > - pr_err("EEH: PE location: %s, PHB location: %s\n", > - eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); > - dump_stack(); > - > + pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n", > + __func__, pe->phb->global_number, pe->addr); > eeh_send_failure_event(pe); > > return 1; > diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c > index 52ce7584af43..0d34cc12c529 100644 > --- a/arch/powerpc/kernel/eeh_driver.c > +++ b/arch/powerpc/kernel/eeh_driver.c > @@ -863,8 +863,44 @@ void eeh_handle_normal_event(struct eeh_pe *pe) > if (eeh_slot_presence_check(edev->pdev)) > devices++; > > - if (!devices) > + if (!devices) { > + pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", > + pe->phb->global_number, pe->addr); > goto out; /* nothing to recover */ > + } > + > + /* Log the event */ > + if (pe->type & EEH_PE_PHB) { > + pr_err("EEH: PHB#%x failure detected, location: %s\n", > + pe->phb->global_number, eeh_pe_loc_get(pe)); > + } else { > + struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb); > + > + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", > + pe->phb->global_number, pe->addr); > + pr_err("EEH: PE location: %s, PHB location: %s\n", > + eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); > + } > + > + /* > + * Print the saved stack trace now that we've verified there's > + * something to recover. > + */ > + if (pe->trace_entries) { > + void **ptrs = (void **) pe->stack_trace; > + int i; > + > + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", > + pe->phb->global_number, pe->addr); > + > + /* FIXME: Use the same format as dump_stack() */ > + pr_err("EEH: Call Trace:\n"); > + for (i = 0; i < pe->trace_entries; i++) > + pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]); > + > + pe->trace_entries = 0; > + } > + > > eeh_pe_update_time_stamp(pe); > pe->freeze_count++; > diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c > index e36653e5f76b..1d55486adb0f 100644 > --- a/arch/powerpc/kernel/eeh_event.c > +++ b/arch/powerpc/kernel/eeh_event.c > @@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy) > { > unsigned long flags; > struct eeh_event *event; > - struct eeh_pe *pe; > > while (!kthread_should_stop()) { > if (wait_for_completion_interruptible(&eeh_eventlist_event)) > @@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy) > continue; > > /* We might have event without binding PE */ > - pe = event->pe; > - if (pe) { > - if (pe->type & EEH_PE_PHB) > - pr_info("EEH: Detected error on PHB#%x\n", > - pe->phb->global_number); > - else > - pr_info("EEH: Detected PCI bus error on " > - "PHB#%x-PE#%x\n", > - pe->phb->global_number, pe->addr); > - eeh_handle_normal_event(pe); > - } else { > + if (event->pe) > + eeh_handle_normal_event(event->pe); > + else > eeh_handle_special_event(); > - } > > kfree(event); > } > @@ -126,8 +116,16 @@ int __eeh_send_failure_event(struct eeh_pe *pe) > * This prevents the PE from being free()ed by a hotplug driver > * while the PE is sitting in the event queue. > */ > - if (pe) > + if (pe) { > + /* > + * Save the current stack trace so we can dump it from the > + * event handler thread. > + */ > + pe->trace_entries = stack_trace_save(pe->stack_trace, > + ARRAY_SIZE(pe->stack_trace), 0); > + > eeh_pe_state_mark(pe, EEH_PE_RECOVERING); > + } > > /* We may or may not be called in an interrupt context */ > spin_lock_irqsave(&eeh_eventlist_lock, flags); > -- > 2.21.0 >