From mboxrd@z Thu Jan 1 00:00:00 1970
From: Alexey Kardashevskiy
Subject: Re: [PATCH v7 27/50] powerpc/powernv: Dynamically release PEs
Date: Wed, 18 Nov 2015 13:23:05 +1100
Message-ID: <564BE109.8000809@au1.ibm.com>
References: <1446642770-4681-1-git-send-email-gwshan@linux.vnet.ibm.com>
 <1446642770-4681-28-git-send-email-gwshan@linux.vnet.ibm.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=koi8-r; format=flowed
Content-Transfer-Encoding: 7bit
Return-path:
In-Reply-To: <1446642770-4681-28-git-send-email-gwshan@linux.vnet.ibm.com>
Sender: linux-pci-owner@vger.kernel.org
To: Gavin Shan, linuxppc-dev@lists.ozlabs.org
Cc: linux-pci@vger.kernel.org, devicetree@vger.kernel.org,
 benh@kernel.crashing.org, mpe@ellerman.id.au, bhelgaas@google.com,
 grant.likely@linaro.org, robherring2@gmail.com,
 panto@antoniou-consulting.com, frowand.list@gmail.com
List-Id: devicetree@vger.kernel.org

On 11/05/2015 12:12 AM, Gavin Shan wrote:
> This adds a reference count of PE, representing the number of PCI
> devices associated with the PE. The reference count is increased
> or decreased when PCI devices join or leave the PE. Once it becomes
> zero, the PE together with its used resources (IO, MMIO, DMA, PELTM,
> PELTV) are released to support PCI hot unplug.

The commit log suggests the patch only adds a counter, initializes it, and
replaces the unconditional release of an object (in this case - a PE) with a
conditional one. But it is more than that...

> Signed-off-by: Gavin Shan
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 245 ++++++++++++++++++++++++++----
>  arch/powerpc/platforms/powernv/pci.h      |   1 +
>  2 files changed, 218 insertions(+), 28 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 0bb0056..dcffce5 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -129,6 +129,215 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>  		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>  }
>
> +static void pnv_pci_ioda1_release_dma_pe(struct pnv_ioda_pe *pe)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	struct iommu_table *tbl;
> +	int start, count, i;
> +	int64_t rc;
> +
> +	/* Search for the used DMA32 segments */
> +	start = -1;
> +	count = 0;
> +	for (i = 0; i < phb->ioda.dma32_count; i++) {
> +		if (phb->ioda.dma32_segmap[i] != pe->pe_number)
> +			continue;
> +
> +		count++;
> +		if (start < 0)
> +			start = i;
> +	}
> +
> +	if (!count)
> +		return;

IMHO checking pe->table_group.tables[0] != NULL is shorter than the loop
above.

> +
> +	/* Unlink IOMMU table from group */
> +	tbl = pe->table_group.tables[0];
> +	pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
> +	if (pe->table_group.group) {
> +		iommu_group_put(pe->table_group.group);
> +		WARN_ON(pe->table_group.group);
> +	}
> +
> +	/* Release IOMMU table */
> +	pnv_pci_ioda2_table_free_pages(tbl);

This is an IODA2 helper with multilevel support - does IODA1 support
multilevel TCE tables? If not, it should WARN_ON on levels != 1.

Another thing is that you should first unprogram the TVEs (via
opal_pci_map_pe_dma_window), then invalidate the cache (if required - not
sure whether this is needed on IODA1), and only then free the actual table.
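I.e. inside pnv_pci_ioda1_release_dma_pe() the order would be roughly this
(same statements as in the hunks above, just reordered; untested sketch):

	/* 1) Unprogram the TVEs / DMA32 segments first */
	for (i = start; i < start + count; i++) {
		rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
						i, 0, 0ul, 0ul, 0ul);
		if (rc)
			pe_warn(pe, "Error %ld unmapping DMA32 seg#%d\n",
				rc, i);
		phb->ioda.dma32_segmap[i] = IODA_INVALID_PE;
	}

	/* 2) Invalidate the TCE cache here, if IODA1 needs that at all */

	/* 3) Unlink the IOMMU table from the group */
	tbl = pe->table_group.tables[0];
	pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
	if (pe->table_group.group) {
		iommu_group_put(pe->table_group.group);
		WARN_ON(pe->table_group.group);
	}

	/* 4) Only now free the pages backing the table */
	pnv_pci_ioda2_table_free_pages(tbl);
	iommu_free_table(tbl, of_node_full_name(pci_bus_to_OF_node(pe->pbus)));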
> +	iommu_free_table(tbl, of_node_full_name(pci_bus_to_OF_node(pe->pbus)));
> +
> +	/* Disable TVE */
> +	for (i = start; i < start + count; i++) {
> +		rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
> +						i, 0, 0ul, 0ul, 0ul);
> +		if (rc)
> +			pe_warn(pe, "Error %ld unmapping DMA32 seg#%d\n",
> +				rc, i);
> +
> +		phb->ioda.dma32_segmap[i] = IODA_INVALID_PE;
> +	}

You could implement pnv_pci_ioda1_unset_window/pnv_ioda1_table_free as
callbacks, change pnv_pci_ioda2_release_dma_pe() to use them (and rename it
to reflect that it supports both IODA1 and IODA2); there is a sketch of what
I mean further down.

> +}
> +
> +static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe);
> +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
> +		int num);
> +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> +
> +static void pnv_pci_ioda2_release_dma_pe(struct pnv_ioda_pe *pe)

You moved this function and changed it, please do one thing at a time (which
in this case is "change", not "move").

> +{
> +	struct iommu_table *tbl;
> +	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
> +	int64_t rc;
> +
> +	if (!weight)
> +		return;

Checking for pe->table_group.group is better because if we ever change the
logic of what gets included in an IOMMU group, we will only have to change
the place where devices are added to a group and won't have to touch the
releasing code.

> +
> +	tbl = pe->table_group.tables[0];
> +	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> +	if (rc)
> +		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
> +
> +	pnv_pci_ioda2_set_bypass(pe, false);
> +	if (pe->table_group.group) {
> +		iommu_group_put(pe->table_group.group);
> +		WARN_ON(pe->table_group.group);
> +	}
> +
> +	pnv_pci_ioda2_table_free_pages(tbl);
> +	iommu_free_table(tbl, "pnv");
> +}
> +
> +static void pnv_ioda_release_dma_pe(struct pnv_ioda_pe *pe)

Merge this into pnv_ioda_release_pe() - it is small and called just once.

> +{
> +	struct pnv_phb *phb = pe->phb;
> +
> +	switch (phb->type) {
> +	case PNV_PHB_IODA1:
> +		pnv_pci_ioda1_release_dma_pe(pe);
> +		break;
> +	case PNV_PHB_IODA2:
> +		pnv_pci_ioda2_release_dma_pe(pe);
> +		break;
> +	default:
> +		WARN_ON(1);
> +	}
> +}
> +
> +static void pnv_ioda_release_window(struct pnv_ioda_pe *pe, int win)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	int index, *segmap = NULL;
> +	int64_t rc;
> +
> +	switch (win) {
> +	case OPAL_IO_WINDOW_TYPE:
> +		segmap = phb->ioda.io_segmap;
> +		break;
> +	case OPAL_M32_WINDOW_TYPE:
> +		segmap = phb->ioda.m32_segmap;
> +		break;
> +	case OPAL_M64_WINDOW_TYPE:
> +		if (phb->type != PNV_PHB_IODA1)
> +			return;
> +		segmap = phb->ioda.m64_segmap;
> +		break;
> +	default:
> +		return;

Unnecessary return.
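Coming back to the callbacks idea above, this is the kind of unified release
helper I mean (a rough, untested sketch; the function name and
pnv_pci_ioda1_unset_window() are made up here - the latter would be the IODA1
counterpart of pnv_pci_ioda2_unset_window(); shown as a phb->type check
rather than actual callbacks just to illustrate the shape):

static void pnv_pci_ioda_release_dma_pe(struct pnv_ioda_pe *pe)
{
	struct pnv_phb *phb = pe->phb;
	struct iommu_table *tbl = pe->table_group.tables[0];
	int64_t rc;

	if (!pe->table_group.group)
		return;

	/* Unprogram the hardware DMA window first */
	if (phb->type == PNV_PHB_IODA1)
		rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
	else
		rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
	if (rc)
		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);

	if (phb->type == PNV_PHB_IODA2)
		pnv_pci_ioda2_set_bypass(pe, false);

	iommu_group_put(pe->table_group.group);
	WARN_ON(pe->table_group.group);

	/*
	 * Free the table only after the hardware no longer references it;
	 * an IODA1-specific pnv_ioda1_table_free() could slot in here if
	 * the IODA2 helper turns out to be unsuitable for IODA1.
	 */
	pnv_pci_ioda2_table_free_pages(tbl);
	iommu_free_table(tbl, "pnv");
}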
> +	}
> +
> +	for (index = 0; index < phb->ioda.total_pe_num; index++) {
> +		if (segmap[index] != pe->pe_number)
> +			continue;
> +
> +		if (win == OPAL_M64_WINDOW_TYPE)
> +			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> +					phb->ioda.reserved_pe_idx, win,
> +					index / PNV_IODA1_M64_SEGS,
> +					index % PNV_IODA1_M64_SEGS);
> +		else
> +			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> +					phb->ioda.reserved_pe_idx, win,
> +					0, index);
> +
> +		if (rc != OPAL_SUCCESS)
> +			pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
> +				rc, win, index);
> +
> +		segmap[index] = IODA_INVALID_PE;
> +	}
> +}
> +
> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	int win;
> +
> +	for (win = OPAL_M32_WINDOW_TYPE; win <= OPAL_IO_WINDOW_TYPE; win++) {
> +		if (phb->type == PNV_PHB_IODA2 && win == OPAL_IO_WINDOW_TYPE)
> +			continue;

Move this check to pnv_ioda_release_window(), or move the
"case OPAL_M64_WINDOW_TYPE: if (phb->type != PNV_PHB_IODA1)" check from that
function here.

> +
> +		pnv_ioda_release_window(pe, win);
> +	}
> +}

This is shorter and cleaner:

static void pnv_ioda_release_window(struct pnv_ioda_pe *pe, int win,
		int *segmap)
{
	struct pnv_phb *phb = pe->phb;
	int index;
	int64_t rc;

	for (index = 0; index < phb->ioda.total_pe_num; index++) {
		if (segmap[index] != pe->pe_number)
			continue;

		if (win == OPAL_M64_WINDOW_TYPE)
			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
					phb->ioda.reserved_pe_idx, win,
					index / PNV_IODA1_M64_SEGS,
					index % PNV_IODA1_M64_SEGS);
		else
			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
					phb->ioda.reserved_pe_idx, win,
					0, index);

		if (rc != OPAL_SUCCESS)
			pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
				rc, win, index);

		segmap[index] = IODA_INVALID_PE;
	}
}

static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
{
	struct pnv_phb *phb = pe->phb;

	pnv_ioda_release_window(pe, OPAL_M32_WINDOW_TYPE,
				phb->ioda.m32_segmap);
	if (phb->type != PNV_PHB_IODA2) {
		pnv_ioda_release_window(pe, OPAL_IO_WINDOW_TYPE,
					phb->ioda.io_segmap);
		pnv_ioda_release_window(pe, OPAL_M64_WINDOW_TYPE,
					phb->ioda.m64_segmap);
	}
}

I'd actually merge pnv_ioda_release_pe_seg() into pnv_ioda_release_pe() as
well, as it is also small and called only once.

> +
> +static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb,
> +				   struct pnv_ioda_pe *pe);
> +static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
> +static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
> +{
> +	struct pnv_ioda_pe *tmp, *slave;
> +
> +	/* Release slave PEs in compound PE */
> +	if (pe->flags & PNV_IODA_PE_MASTER) {
> +		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
> +			pnv_ioda_release_pe(slave);
> +	}
> +
> +	/* Remove the PE from the list */
> +	list_del(&pe->list);
> +
> +	/* Release resources */
> +	pnv_ioda_release_dma_pe(pe);
> +	pnv_ioda_release_pe_seg(pe);
> +	pnv_ioda_deconfigure_pe(pe->phb, pe);
> +
> +	pnv_ioda_free_pe(pe);
> +}
> +
> +static inline struct pnv_ioda_pe *pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
> +{
> +	if (!pe)
> +		return NULL;
> +
> +	pe->device_count++;
> +	return pe;
> +}
> +
> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)

Merge this into pnv_pci_release_device() as it is small and called only once.
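I.e. the put helper would simply fold into the release hook, roughly
(untested sketch, reusing the code quoted below):

static void pnv_pci_release_device(struct pci_dev *pdev)
{
	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
	struct pnv_phb *phb = hose->private_data;
	struct pci_dn *pdn = pci_get_pdn(pdev);
	struct pnv_ioda_pe *pe;

	if (pdev->is_virtfn)
		return;

	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
		return;

	pe = &phb->ioda.pe_array[pdn->pe_number];
	WARN_ON(--pe->device_count < 0);
	if (pe->device_count == 0)
		pnv_ioda_release_pe(pe);
}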
> +{
> +	if (!pe)
> +		return;
> +
> +	pe->device_count--;
> +	WARN_ON(pe->device_count < 0);
> +	if (pe->device_count == 0)
> +		pnv_ioda_release_pe(pe);
> +}
> +
> +static void pnv_pci_release_device(struct pci_dev *pdev)
> +{
> +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> +	struct pnv_phb *phb = hose->private_data;
> +	struct pci_dn *pdn = pci_get_pdn(pdev);
> +	struct pnv_ioda_pe *pe;
> +
> +	if (pdev->is_virtfn)
> +		return;
> +
> +	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
> +		return;
> +
> +	pe = &phb->ioda.pe_array[pdn->pe_number];
> +	pnv_ioda_pe_put(pe);
> +}
> +
>  static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
>  {
>  	phb->ioda.pe_array[pe_no].phb = phb;
> @@ -724,7 +933,6 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>  	return 0;
>  }
>
> -#ifdef CONFIG_PCI_IOV
>  static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  {
>  	struct pci_dev *parent;
> @@ -759,9 +967,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  		}
>  		rid_end = pe->rid + (count << 8);
>  	} else {
> +#ifdef CONFIG_PCI_IOV
>  		if (pe->flags & PNV_IODA_PE_VF)
>  			parent = pe->parent_dev;
>  		else
> +#endif
>  			parent = pe->pdev->bus->self;
>  		bcomp = OpalPciBusAll;
>  		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
> @@ -799,11 +1009,12 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>
>  	pe->pbus = NULL;
>  	pe->pdev = NULL;
> +#ifdef CONFIG_PCI_IOV
>  	pe->parent_dev = NULL;
> +#endif

These #ifdef movements seem very much unrelated.

>
>  	return 0;
>  }
> -#endif /* CONFIG_PCI_IOV */
>
>  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>  {
> @@ -985,6 +1196,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>  			continue;
>
>  		pdn->pe_number = pe->pe_number;
> +		pnv_ioda_pe_get(pe);
>  		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>  			pnv_ioda_setup_same_PE(dev->subordinate, pe);
>  	}
> @@ -1047,9 +1259,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>  		bus->busn_res.start, pe->pe_number);
>
>  	if (pnv_ioda_configure_pe(phb, pe)) {
> -		/* XXX What do we do here ? */
> -		pnv_ioda_free_pe(pe);
>  		pe->pbus = NULL;
> +		pnv_ioda_release_pe(pe);

This is an unrelated, unexplained change.

>  		return NULL;
>  	}
>
> @@ -1199,29 +1410,6 @@ m64_failed:
>  	return -EBUSY;
>  }
>
> -static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
> -		int num);
> -static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> -
> -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
> -{
> -	struct iommu_table *tbl;
> -	int64_t rc;
> -
> -	tbl = pe->table_group.tables[0];
> -	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> -	if (rc)
> -		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
> -
> -	pnv_pci_ioda2_set_bypass(pe, false);
> -	if (pe->table_group.group) {
> -		iommu_group_put(pe->table_group.group);
> -		BUG_ON(pe->table_group.group);
> -	}
> -	pnv_pci_ioda2_table_free_pages(tbl);
> -	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> -}
> -
>  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>  {
>  	struct pci_bus *bus;
> @@ -1242,7 +1430,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>  		if (pe->parent_dev != pdev)
>  			continue;
>
> -		pnv_pci_ioda2_release_dma_pe(pdev, pe);
> +		pnv_pci_ioda2_release_dma_pe(pe);

This is an unrelated change.
>
>  		/* Remove from list */
>  		mutex_lock(&phb->ioda.pe_list_mutex);
> @@ -3124,6 +3312,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
>  	.teardown_msi_irqs = pnv_teardown_msi_irqs,
>  #endif
>  	.enable_device_hook = pnv_pci_enable_device_hook,
> +	.release_device = pnv_pci_release_device,
>  	.window_alignment = pnv_pci_window_alignment,
>  	.setup_bridge = pnv_pci_setup_bridge,
>  	.reset_secondary_bus = pnv_pci_reset_secondary_bus,
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index ef5271a..3bb10de 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -30,6 +30,7 @@ struct pnv_phb;
>  struct pnv_ioda_pe {
>  	unsigned long flags;
>  	struct pnv_phb *phb;
> +	int device_count;

Not atomic_t, no kref, no additional mutex, just "int"? Are you sure about
that? If so, put a note in the commit log about what guarantees that there is
no race (a rough sketch of the kref alternative I mean is at the end of this
mail).

>
>  	/* A PE can be associated with a single device or an
>  	 * entire bus (& children). In the former case, pdev
>

-- 
Alexey Kardashevskiy
IBM OzLabs, LTC Team

e-mail: aik@au1.ibm.com
notes: Alexey Kardashevskiy/Australia/IBM
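PS: the kref-based variant mentioned above would look roughly like this
(untested sketch; it assumes the plain "int device_count" is replaced with a
"struct kref kref" field in struct pnv_ioda_pe):

static void pnv_ioda_pe_release_kref(struct kref *kref)
{
	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);

	pnv_ioda_release_pe(pe);
}

/*
 * pnv_ioda_setup_same_PE() would then do kref_get(&pe->kref) instead of
 * pnv_ioda_pe_get(), and pnv_pci_release_device() would do
 * kref_put(&pe->kref, pnv_ioda_pe_release_kref) instead of
 * pnv_ioda_pe_put().
 */

This makes the counting itself atomic; whether the release path needs extra
locking on top of that is a separate question.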