On 3/5/19 9:14 AM, Oliver wrote: > On Sat, Mar 2, 2019 at 3:04 AM Sergey Miroshnichenko > wrote: >> >> Reading an empty slot returns all ones, which triggers a false >> EEH error event on PowerNV. This patch unfreezes the bus where >> it has happened. >> >> Signed-off-by: Sergey Miroshnichenko >> --- >> arch/powerpc/include/asm/ppc-pci.h | 1 + >> arch/powerpc/kernel/pci_dn.c | 2 +- >> arch/powerpc/platforms/powernv/pci.c | 34 ++++++++++++++++++++++++---- >> 3 files changed, 32 insertions(+), 5 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h >> index f67da277d652..737393c54f58 100644 >> --- a/arch/powerpc/include/asm/ppc-pci.h >> +++ b/arch/powerpc/include/asm/ppc-pci.h >> @@ -40,6 +40,7 @@ void *traverse_pci_dn(struct pci_dn *root, >> void *(*fn)(struct pci_dn *, void *), >> void *data); >> extern void pci_devs_phb_init_dynamic(struct pci_controller *phb); >> +struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus); >> >> /* From rtas_pci.h */ >> extern void init_pci_config_tokens (void); >> diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c >> index ab147a1909c8..341ed71250f1 100644 >> --- a/arch/powerpc/kernel/pci_dn.c >> +++ b/arch/powerpc/kernel/pci_dn.c >> @@ -40,7 +40,7 @@ >> * one of PF's bridge. For other devices, their firmware >> * data is linked to that of their bridge. 
>> */ >> -static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus) >> +struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus) >> { >> struct pci_bus *pbus; >> struct device_node *dn; >> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c >> index 3260250d2029..73c2d0aed996 100644 >> --- a/arch/powerpc/platforms/powernv/pci.c >> +++ b/arch/powerpc/platforms/powernv/pci.c >> @@ -761,6 +761,21 @@ static inline pnv_pci_cfg_check(struct pci_dn *pdn) >> } >> #endif /* CONFIG_EEH */ >> >> +static int get_bus_pe_number(struct pci_bus *bus) >> +{ >> + struct pci_dn *pdn = pci_bus_to_pdn(bus); >> + struct pci_dn *child; >> + >> + if (!pdn) >> + return IODA_INVALID_PE; >> + >> + list_for_each_entry(child, &pdn->child_list, list) >> + if (child->pe_number != IODA_INVALID_PE) >> + return child->pe_number; >> + >> + return IODA_INVALID_PE; >> +} >> + >> static int pnv_pci_read_config(struct pci_bus *bus, >> unsigned int devfn, >> int where, int size, u32 *val) >> @@ -769,12 +784,23 @@ static int pnv_pci_read_config(struct pci_bus *bus, >> struct pci_controller *hose = pci_bus_to_host(bus); >> struct pnv_phb *phb = hose->private_data; >> int ret; >> + u32 empty_val = 0xFFFFFFFF; >> >> - *val = 0xFFFFFFFF; >> + *val = empty_val; >> pdn = pci_get_pdn_by_devfn(bus, devfn); >> - if (!pdn) >> - return pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn, >> - where, size, val); >> + if (!pdn) { >> + int pe_number = get_bus_pe_number(bus); >> + >> + ret = pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn, >> + where, size, val); >> + >> + if (!ret && (*val == empty_val) && phb->unfreeze_pe) > > Does this empty val check work when using 1 or 2 byte cfg accesses? > That was intentional because 0xff and 0xffff are valid values, but 0xffffffff is the only reliable sign of an empty slot. The kernel probes a slot via the pci_bus_generic_read_dev_vendor_id() function, which in turn calls pci_bus_read_config_dword(PCI_VENDOR_ID).
But I haven't actually tried reading 1-2 bytes from an empty slot to test whether that triggers an EEH. If it does, I'll change that to EEH_IO_ERROR_VALUE(size). >> + phb->unfreeze_pe(phb, (pe_number == IODA_INVALID_PE) ? >> + 0xff : pe_number, > > Use phb->ioda.reserved_pe_idx rather than guessing that 0xff is safe > to use. On P9 we have PHBs with 512 PEs and some older P8 firmware > releases used 0 as the reserved PE rather than 0xff. > Thanks for the catch! I'll fix that in v5. Best regards, Serge >> + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >> + >> + return ret; >> + } >> >> if (!pnv_pci_cfg_check(pdn)) >> return PCIBIOS_DEVICE_NOT_FOUND; >> -- >> 2.20.1 >>