All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH kernel 0/2] powerpc/powernv: Fix crash on PF unbind when VF is passed
@ 2016-04-08  6:36 Alexey Kardashevskiy
  2016-04-08  6:36 ` [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release() Alexey Kardashevskiy
  2016-04-08  6:36 ` [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal Alexey Kardashevskiy
  0 siblings, 2 replies; 14+ messages in thread
From: Alexey Kardashevskiy @ 2016-04-08  6:36 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Daniel Axtens,
	David Gibson, Gavin Shan

These 2 patches are to fix the crash which happens when the user (accidentally)
unbinds the driver of a physical function of a SRIOV device while
a virtual function is passed to a guest.

Please comment. Thanks!

There might be more work required in addition to this as it is not clear
what protects from races between pnv_ioda_free_pe and pnv_ioda_alloc_pe.
Gavin, please help.


Alexey Kardashevskiy (2):
  powerpc/iommu: Get rid of default group_release()
  powerpc/powernv/ioda2: Delay PE disposal

 arch/powerpc/include/asm/iommu.h          | 12 ++++++----
 arch/powerpc/kernel/iommu.c               | 14 ++++--------
 arch/powerpc/platforms/powernv/pci-ioda.c | 38 ++++++++++++++++---------------
 arch/powerpc/platforms/pseries/iommu.c    | 17 +++++++-------
 4 files changed, 40 insertions(+), 41 deletions(-)

-- 
2.5.0.rc3

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release()
  2016-04-08  6:36 [PATCH kernel 0/2] powerpc/powernv: Fix crash on PF unbind when VF is passed Alexey Kardashevskiy
@ 2016-04-08  6:36 ` Alexey Kardashevskiy
  2016-04-08  7:14   ` kbuild test robot
                     ` (2 more replies)
  2016-04-08  6:36 ` [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal Alexey Kardashevskiy
  1 sibling, 3 replies; 14+ messages in thread
From: Alexey Kardashevskiy @ 2016-04-08  6:36 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Daniel Axtens,
	David Gibson, Gavin Shan

IBM PPC IOMMU API users always set IOMMU data and IOMMU release callback
to an IOMMU group. At the moment the callback clears one pointer in
iommu_table_group and that's it.

The platform code calls iommu_group_put() and counts on _put() being
called last so they check for table_group->group being reset which
is conceptually wrong as there may be another user holding a reference.

This removes the default IOMMU group release() callback and adds it
as a parameter to iommu_register_group(). As we are changing the prototype
anyway, this also changes the function name to more distinctive
iommu_register_table_group().

This should cause no behavioral change as it leaves BUG_ON for IODA2
(where it was reported) and removes BUG_ON for pseries/IODA1 as they
do not support IOV anyway and this BUG_ON has never been reported for
these platforms.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h          | 12 +++++++-----
 arch/powerpc/kernel/iommu.c               | 14 ++++----------
 arch/powerpc/platforms/powernv/pci-ioda.c | 15 +++++++++++----
 arch/powerpc/platforms/pseries/iommu.c    | 17 +++++++++--------
 4 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 7b87bab..d7ba3b4 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -201,17 +201,19 @@ struct iommu_table_group {
 
 #ifdef CONFIG_IOMMU_API
 
-extern void iommu_register_group(struct iommu_table_group *table_group,
-				 int pci_domain_number, unsigned long pe_num);
+extern void iommu_register_table_group(struct iommu_table_group *table_group,
+		int pci_domain_number, unsigned long pe_num,
+		void (*release)(void *iommu_data));
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 		unsigned long *hpa, enum dma_data_direction *direction);
 #else
-static inline void iommu_register_group(struct iommu_table_group *table_group,
-					int pci_domain_number,
-					unsigned long pe_num)
+static inline void iommu_register_table_group(
+		struct iommu_table_group *table_group,
+		int pci_domain_number, unsigned long pe_num,
+		void (*release)(void *iommu_data))
 {
 }
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index a8e3490..8eed2fa 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -887,15 +887,9 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
 /*
  * SPAPR TCE API
  */
-static void group_release(void *iommu_data)
-{
-	struct iommu_table_group *table_group = iommu_data;
-
-	table_group->group = NULL;
-}
-
-void iommu_register_group(struct iommu_table_group *table_group,
-		int pci_domain_number, unsigned long pe_num)
+void iommu_register_table_group(struct iommu_table_group *table_group,
+		int pci_domain_number, unsigned long pe_num,
+		void (*release)(void *iommu_data))
 {
 	struct iommu_group *grp;
 	char *name;
@@ -907,7 +901,7 @@ void iommu_register_group(struct iommu_table_group *table_group,
 		return;
 	}
 	table_group->group = grp;
-	iommu_group_set_iommudata(grp, table_group, group_release);
+	iommu_group_set_iommudata(grp, table_group, release);
 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
 			pci_domain_number, pe_num);
 	if (!name)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index c5baaf3..ce9f2bf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1330,6 +1330,13 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
 		int num);
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
 
+static void pnv_pci_ioda2_group_release(void *iommu_data)
+{
+	struct iommu_table_group *table_group = iommu_data;
+
+	table_group->group = NULL;
+}
+
 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
 {
 	struct iommu_table    *tbl;
@@ -1965,8 +1972,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 		return;
 
 	tbl = pnv_pci_table_alloc(phb->hose->node);
-	iommu_register_group(&pe->table_group, phb->hose->global_number,
-			pe->pe_number);
+	iommu_register_table_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number, NULL);
 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	/* Grab a 32-bit TCE table */
@@ -2450,8 +2457,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	/* TVE #1 is selected by PCI address bit 59 */
 	pe->tce_bypass_base = 1ull << 59;
 
-	iommu_register_group(&pe->table_group, phb->hose->global_number,
-			pe->pe_number);
+	iommu_register_table_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number, pnv_pci_ioda2_group_release);
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index bd98ce2..0f6f06c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -112,7 +112,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 	}
 	if (table_group->group) {
 		iommu_group_put(table_group->group);
-		BUG_ON(table_group->group);
+		table_group->group = NULL;
 	}
 #endif
 	iommu_free_table(tbl, node_name);
@@ -692,7 +692,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	iommu_table_setparms(pci->phb, dn, tbl);
 	tbl->it_ops = &iommu_table_pseries_ops;
 	iommu_init_table(tbl, pci->phb->node);
-	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
+	iommu_register_table_group(pci->table_group, pci_domain_nr(bus), 0,
+			NULL);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -743,8 +744,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		iommu_init_table(tbl, ppci->phb->node);
-		iommu_register_group(ppci->table_group,
-				pci_domain_nr(bus), 0);
+		iommu_register_table_group(ppci->table_group,
+				pci_domain_nr(bus), 0, NULL);
 		pr_debug("  created table: %p\n", ppci->table_group);
 	}
 }
@@ -772,8 +773,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		iommu_table_setparms(phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
 		iommu_init_table(tbl, phb->node);
-		iommu_register_group(PCI_DN(dn)->table_group,
-				pci_domain_nr(phb->bus), 0);
+		iommu_register_table_group(PCI_DN(dn)->table_group,
+				pci_domain_nr(phb->bus), 0, NULL);
 		set_iommu_table_base(&dev->dev, tbl);
 		iommu_add_device(&dev->dev);
 		return;
@@ -1197,8 +1198,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		iommu_init_table(tbl, pci->phb->node);
-		iommu_register_group(pci->table_group,
-				pci_domain_nr(pci->phb->bus), 0);
+		iommu_register_table_group(pci->table_group,
+				pci_domain_nr(pci->phb->bus), 0, NULL);
 		pr_debug("  created table: %p\n", pci->table_group);
 	} else {
 		pr_debug("  found DMA window, table: %p\n", pci->table_group);
-- 
2.5.0.rc3

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal
  2016-04-08  6:36 [PATCH kernel 0/2] powerpc/powernv: Fix crash on PF unbind when VF is passed Alexey Kardashevskiy
  2016-04-08  6:36 ` [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release() Alexey Kardashevskiy
@ 2016-04-08  6:36 ` Alexey Kardashevskiy
  2016-04-14  1:40   ` David Gibson
  2016-04-21  0:21   ` Gavin Shan
  1 sibling, 2 replies; 14+ messages in thread
From: Alexey Kardashevskiy @ 2016-04-08  6:36 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Benjamin Herrenschmidt, Daniel Axtens,
	David Gibson, Gavin Shan

When SRIOV is disabled, the existing code presumes there is no
virtual function (VF) in use and destroys all associated PEs.
However it is possible to get into the situation when the user
activated SRIOV disabling while a VF is still in use via VFIO.
For example, unbinding a physical function (PF) while there is a guest
running with a VF passed throuhgh via VFIO will trigger the bug.

This defines an IODA2-specific IOMMU group release() callback.
This moves all the disposal code from pnv_ioda_release_vf_PE() to this
new callback so the cleanup happens when the last user of an IOMMU
group released the reference.

As pnv_pci_ioda2_release_dma_pe() was reduced to just calling
iommu_group_put(), this merges pnv_pci_ioda2_release_dma_pe()
into pnv_ioda_release_vf_PE().

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 33 +++++++++++++------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index ce9f2bf..8108c54 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1333,27 +1333,25 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
 static void pnv_pci_ioda2_group_release(void *iommu_data)
 {
 	struct iommu_table_group *table_group = iommu_data;
+	struct pnv_ioda_pe *pe = container_of(table_group,
+			struct pnv_ioda_pe, table_group);
+	struct pci_controller *hose = pci_bus_to_host(pe->parent_dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct iommu_table *tbl = pe->table_group.tables[0];
+	int64_t rc;
 
-	table_group->group = NULL;
-}
-
-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
-{
-	struct iommu_table    *tbl;
-	int64_t               rc;
-
-	tbl = pe->table_group.tables[0];
 	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
 	pnv_pci_ioda2_set_bypass(pe, false);
-	if (pe->table_group.group) {
-		iommu_group_put(pe->table_group.group);
-		BUG_ON(pe->table_group.group);
-	}
+
+	BUG_ON(!tbl);
 	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+	iommu_free_table(tbl, of_node_full_name(pe->parent_dev->dev.of_node));
+
+	pnv_ioda_deconfigure_pe(phb, pe);
+	pnv_ioda_free_pe(phb, pe->pe_number);
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
@@ -1376,16 +1374,13 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 		if (pe->parent_dev != pdev)
 			continue;
 
-		pnv_pci_ioda2_release_dma_pe(pdev, pe);
-
 		/* Remove from list */
 		mutex_lock(&phb->ioda.pe_list_mutex);
 		list_del(&pe->list);
 		mutex_unlock(&phb->ioda.pe_list_mutex);
 
-		pnv_ioda_deconfigure_pe(phb, pe);
-
-		pnv_ioda_free_pe(phb, pe->pe_number);
+		if (pe->table_group.group)
+			iommu_group_put(pe->table_group.group);
 	}
 }
 
-- 
2.5.0.rc3

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release()
  2016-04-08  6:36 ` [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release() Alexey Kardashevskiy
@ 2016-04-08  7:14   ` kbuild test robot
  2016-04-14  1:35   ` David Gibson
  2016-04-21  0:02   ` Gavin Shan
  2 siblings, 0 replies; 14+ messages in thread
From: kbuild test robot @ 2016-04-08  7:14 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kbuild-all, linuxppc-dev, Alexey Kardashevskiy, David Gibson,
	Gavin Shan, Daniel Axtens

[-- Attachment #1: Type: text/plain, Size: 1807 bytes --]

Hi Alexey,

[auto build test ERROR on powerpc/next]
[also build test ERROR on v4.6-rc2 next-20160408]
[if your patch is applied to the wrong git tree, please drop us a note to help improving the system]

url:    https://github.com/0day-ci/linux/commits/Alexey-Kardashevskiy/powerpc-powernv-Fix-crash-on-PF-unbind-when-VF-is-passed/20160408-144325
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-defconfig (attached as .config)
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   arch/powerpc/platforms/powernv/pci-ioda.c: In function 'pnv_pci_ioda2_setup_dma_pe':
>> arch/powerpc/platforms/powernv/pci-ioda.c:2461:19: error: 'pnv_pci_ioda2_group_release' undeclared (first use in this function)
       pe->pe_number, pnv_pci_ioda2_group_release);
                      ^
   arch/powerpc/platforms/powernv/pci-ioda.c:2461:19: note: each undeclared identifier is reported only once for each function it appears in

vim +/pnv_pci_ioda2_group_release +2461 arch/powerpc/platforms/powernv/pci-ioda.c

  2455			return;
  2456	
  2457		/* TVE #1 is selected by PCI address bit 59 */
  2458		pe->tce_bypass_base = 1ull << 59;
  2459	
  2460		iommu_register_table_group(&pe->table_group, phb->hose->global_number,
> 2461				pe->pe_number, pnv_pci_ioda2_group_release);
  2462	
  2463		/* The PE will reserve all possible 32-bits space */
  2464		pe->tce32_seg = 0;

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 21917 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release()
  2016-04-08  6:36 ` [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release() Alexey Kardashevskiy
  2016-04-08  7:14   ` kbuild test robot
@ 2016-04-14  1:35   ` David Gibson
  2016-04-21  0:02   ` Gavin Shan
  2 siblings, 0 replies; 14+ messages in thread
From: David Gibson @ 2016-04-14  1:35 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens, Gavin Shan

[-- Attachment #1: Type: text/plain, Size: 8294 bytes --]

On Fri, Apr 08, 2016 at 04:36:43PM +1000, Alexey Kardashevskiy wrote:
> IBM PPC IOMMU API users always set IOMMU data and IOMMU release callback
> to an IOMMU group. At the moment the callback clears one pointer in
> iommu_table_group and that's it.
> 
> The platform code calls iommu_group_put() and counts on _put() being
> called last so they check for table_group->group being reset which
> is conceptually wrong as there may be another user holding a reference.
> 
> This removes the default IOMMU group release() callback and adds it
> as a parameter to iommu_register_group(). As we are changing the prototype
> anyway, this also changes the function name to more distinctive
> iommu_register_table_group().
> 
> This should cause no behavioral change as it leaves BUG_ON for IODA2
> (where it was reported) and removes BUG_ON for pseries/IODA1 as they
> do not support IOV anyway and this BUG_ON has never been reported for
> these platforms.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/include/asm/iommu.h          | 12 +++++++-----
>  arch/powerpc/kernel/iommu.c               | 14 ++++----------
>  arch/powerpc/platforms/powernv/pci-ioda.c | 15 +++++++++++----
>  arch/powerpc/platforms/pseries/iommu.c    | 17 +++++++++--------
>  4 files changed, 31 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index 7b87bab..d7ba3b4 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -201,17 +201,19 @@ struct iommu_table_group {
>  
>  #ifdef CONFIG_IOMMU_API
>  
> -extern void iommu_register_group(struct iommu_table_group *table_group,
> -				 int pci_domain_number, unsigned long pe_num);
> +extern void iommu_register_table_group(struct iommu_table_group *table_group,
> +		int pci_domain_number, unsigned long pe_num,
> +		void (*release)(void *iommu_data));
>  extern int iommu_add_device(struct device *dev);
>  extern void iommu_del_device(struct device *dev);
>  extern int __init tce_iommu_bus_notifier_init(void);
>  extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
>  		unsigned long *hpa, enum dma_data_direction *direction);
>  #else
> -static inline void iommu_register_group(struct iommu_table_group *table_group,
> -					int pci_domain_number,
> -					unsigned long pe_num)
> +static inline void iommu_register_table_group(
> +		struct iommu_table_group *table_group,
> +		int pci_domain_number, unsigned long pe_num,
> +		void (*release)(void *iommu_data))
>  {
>  }
>  
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index a8e3490..8eed2fa 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -887,15 +887,9 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
>  /*
>   * SPAPR TCE API
>   */
> -static void group_release(void *iommu_data)
> -{
> -	struct iommu_table_group *table_group = iommu_data;
> -
> -	table_group->group = NULL;
> -}
> -
> -void iommu_register_group(struct iommu_table_group *table_group,
> -		int pci_domain_number, unsigned long pe_num)
> +void iommu_register_table_group(struct iommu_table_group *table_group,
> +		int pci_domain_number, unsigned long pe_num,
> +		void (*release)(void *iommu_data))
>  {
>  	struct iommu_group *grp;
>  	char *name;
> @@ -907,7 +901,7 @@ void iommu_register_group(struct iommu_table_group *table_group,
>  		return;
>  	}
>  	table_group->group = grp;
> -	iommu_group_set_iommudata(grp, table_group, group_release);
> +	iommu_group_set_iommudata(grp, table_group, release);
>  	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
>  			pci_domain_number, pe_num);
>  	if (!name)
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index c5baaf3..ce9f2bf 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1330,6 +1330,13 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
>  		int num);
>  static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>  
> +static void pnv_pci_ioda2_group_release(void *iommu_data)
> +{
> +	struct iommu_table_group *table_group = iommu_data;
> +
> +	table_group->group = NULL;
> +}
> +
>  static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
>  {
>  	struct iommu_table    *tbl;
> @@ -1965,8 +1972,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>  		return;
>  
>  	tbl = pnv_pci_table_alloc(phb->hose->node);
> -	iommu_register_group(&pe->table_group, phb->hose->global_number,
> -			pe->pe_number);
> +	iommu_register_table_group(&pe->table_group, phb->hose->global_number,
> +			pe->pe_number, NULL);
>  	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>  
>  	/* Grab a 32-bit TCE table */
> @@ -2450,8 +2457,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  	/* TVE #1 is selected by PCI address bit 59 */
>  	pe->tce_bypass_base = 1ull << 59;
>  
> -	iommu_register_group(&pe->table_group, phb->hose->global_number,
> -			pe->pe_number);
> +	iommu_register_table_group(&pe->table_group, phb->hose->global_number,
> +			pe->pe_number, pnv_pci_ioda2_group_release);
>  
>  	/* The PE will reserve all possible 32-bits space */
>  	pe->tce32_seg = 0;
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index bd98ce2..0f6f06c 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -112,7 +112,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
>  	}
>  	if (table_group->group) {
>  		iommu_group_put(table_group->group);
> -		BUG_ON(table_group->group);
> +		table_group->group = NULL;
>  	}
>  #endif
>  	iommu_free_table(tbl, node_name);
> @@ -692,7 +692,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
>  	iommu_table_setparms(pci->phb, dn, tbl);
>  	tbl->it_ops = &iommu_table_pseries_ops;
>  	iommu_init_table(tbl, pci->phb->node);
> -	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
> +	iommu_register_table_group(pci->table_group, pci_domain_nr(bus), 0,
> +			NULL);
>  
>  	/* Divide the rest (1.75GB) among the children */
>  	pci->phb->dma_window_size = 0x80000000ul;
> @@ -743,8 +744,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>  		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
>  		tbl->it_ops = &iommu_table_lpar_multi_ops;
>  		iommu_init_table(tbl, ppci->phb->node);
> -		iommu_register_group(ppci->table_group,
> -				pci_domain_nr(bus), 0);
> +		iommu_register_table_group(ppci->table_group,
> +				pci_domain_nr(bus), 0, NULL);
>  		pr_debug("  created table: %p\n", ppci->table_group);
>  	}
>  }
> @@ -772,8 +773,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
>  		iommu_table_setparms(phb, dn, tbl);
>  		tbl->it_ops = &iommu_table_pseries_ops;
>  		iommu_init_table(tbl, phb->node);
> -		iommu_register_group(PCI_DN(dn)->table_group,
> -				pci_domain_nr(phb->bus), 0);
> +		iommu_register_table_group(PCI_DN(dn)->table_group,
> +				pci_domain_nr(phb->bus), 0, NULL);
>  		set_iommu_table_base(&dev->dev, tbl);
>  		iommu_add_device(&dev->dev);
>  		return;
> @@ -1197,8 +1198,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>  		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
>  		tbl->it_ops = &iommu_table_lpar_multi_ops;
>  		iommu_init_table(tbl, pci->phb->node);
> -		iommu_register_group(pci->table_group,
> -				pci_domain_nr(pci->phb->bus), 0);
> +		iommu_register_table_group(pci->table_group,
> +				pci_domain_nr(pci->phb->bus), 0, NULL);
>  		pr_debug("  created table: %p\n", pci->table_group);
>  	} else {
>  		pr_debug("  found DMA window, table: %p\n", pci->table_group);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal
  2016-04-08  6:36 ` [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal Alexey Kardashevskiy
@ 2016-04-14  1:40   ` David Gibson
  2016-04-15  1:29     ` Alexey Kardashevskiy
  2016-04-21  0:21   ` Gavin Shan
  1 sibling, 1 reply; 14+ messages in thread
From: David Gibson @ 2016-04-14  1:40 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens, Gavin Shan

[-- Attachment #1: Type: text/plain, Size: 3953 bytes --]

On Fri, Apr 08, 2016 at 04:36:44PM +1000, Alexey Kardashevskiy wrote:
> When SRIOV is disabled, the existing code presumes there is no
> virtual function (VF) in use and destroys all associated PEs.
> However it is possible to get into the situation when the user
> activated SRIOV disabling while a VF is still in use via VFIO.
> For example, unbinding a physical function (PF) while there is a guest
> running with a VF passed throuhgh via VFIO will trigger the bug.
> 
> This defines an IODA2-specific IOMMU group release() callback.
> This moves all the disposal code from pnv_ioda_release_vf_PE() to this
> new callback so the cleanup happens when the last user of an IOMMU
> group released the reference.
> 
> As pnv_pci_ioda2_release_dma_pe() was reduced to just calling
> iommu_group_put(), this merges pnv_pci_ioda2_release_dma_pe()
> into pnv_ioda_release_vf_PE().
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 33 +++++++++++++------------------
>  1 file changed, 14 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index ce9f2bf..8108c54 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1333,27 +1333,25 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>  static void pnv_pci_ioda2_group_release(void *iommu_data)
>  {
>  	struct iommu_table_group *table_group = iommu_data;
> +	struct pnv_ioda_pe *pe = container_of(table_group,
> +			struct pnv_ioda_pe, table_group);
> +	struct pci_controller *hose = pci_bus_to_host(pe->parent_dev->bus);
> +	struct pnv_phb *phb = hose->private_data;
> +	struct iommu_table *tbl = pe->table_group.tables[0];
> +	int64_t rc;
>  
> -	table_group->group = NULL;
> -}
> -
> -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
> -{
> -	struct iommu_table    *tbl;
> -	int64_t               rc;
> -
> -	tbl = pe->table_group.tables[0];
>  	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);

Is it safe to go manipulating the PE windows, etc. after SR-IOV is
disabled?

When SR-IOV is disabled, you need to immediately disable the VF (I'm
guessing that happens somewhere) and stop all access to the VF
"hardware".  Only the iommu group structure *has* to stick around
until the reference count drops to zero.  I think other structures and
hardware reconfiguration can be deferred or done immediately,
whichever is more convenient.

>  	if (rc)
>  		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>  
>  	pnv_pci_ioda2_set_bypass(pe, false);
> -	if (pe->table_group.group) {
> -		iommu_group_put(pe->table_group.group);
> -		BUG_ON(pe->table_group.group);
> -	}
> +
> +	BUG_ON(!tbl);
>  	pnv_pci_ioda2_table_free_pages(tbl);
> -	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> +	iommu_free_table(tbl, of_node_full_name(pe->parent_dev->dev.of_node));
> +
> +	pnv_ioda_deconfigure_pe(phb, pe);
> +	pnv_ioda_free_pe(phb, pe->pe_number);
>  }
>  
>  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> @@ -1376,16 +1374,13 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>  		if (pe->parent_dev != pdev)
>  			continue;
>  
> -		pnv_pci_ioda2_release_dma_pe(pdev, pe);
> -
>  		/* Remove from list */
>  		mutex_lock(&phb->ioda.pe_list_mutex);
>  		list_del(&pe->list);
>  		mutex_unlock(&phb->ioda.pe_list_mutex);
>  
> -		pnv_ioda_deconfigure_pe(phb, pe);
> -
> -		pnv_ioda_free_pe(phb, pe->pe_number);
> +		if (pe->table_group.group)
> +			iommu_group_put(pe->table_group.group);
>  	}
>  }
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal
  2016-04-14  1:40   ` David Gibson
@ 2016-04-15  1:29     ` Alexey Kardashevskiy
  2016-04-15  2:26       ` David Gibson
  0 siblings, 1 reply; 14+ messages in thread
From: Alexey Kardashevskiy @ 2016-04-15  1:29 UTC (permalink / raw)
  To: David Gibson
  Cc: linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens, Gavin Shan

On 04/14/2016 11:40 AM, David Gibson wrote:
> On Fri, Apr 08, 2016 at 04:36:44PM +1000, Alexey Kardashevskiy wrote:
>> When SRIOV is disabled, the existing code presumes there is no
>> virtual function (VF) in use and destroys all associated PEs.
>> However it is possible to get into the situation when the user
>> activated SRIOV disabling while a VF is still in use via VFIO.
>> For example, unbinding a physical function (PF) while there is a guest
>> running with a VF passed throuhgh via VFIO will trigger the bug.
>>
>> This defines an IODA2-specific IOMMU group release() callback.
>> This moves all the disposal code from pnv_ioda_release_vf_PE() to this
>> new callback so the cleanup happens when the last user of an IOMMU
>> group released the reference.
>>
>> As pnv_pci_ioda2_release_dma_pe() was reduced to just calling
>> iommu_group_put(), this merges pnv_pci_ioda2_release_dma_pe()
>> into pnv_ioda_release_vf_PE().
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/platforms/powernv/pci-ioda.c | 33 +++++++++++++------------------
>>   1 file changed, 14 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index ce9f2bf..8108c54 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1333,27 +1333,25 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>>   static void pnv_pci_ioda2_group_release(void *iommu_data)
>>   {
>>   	struct iommu_table_group *table_group = iommu_data;
>> +	struct pnv_ioda_pe *pe = container_of(table_group,
>> +			struct pnv_ioda_pe, table_group);
>> +	struct pci_controller *hose = pci_bus_to_host(pe->parent_dev->bus);
>> +	struct pnv_phb *phb = hose->private_data;
>> +	struct iommu_table *tbl = pe->table_group.tables[0];
>> +	int64_t rc;
>>
>> -	table_group->group = NULL;
>> -}
>> -
>> -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
>> -{
>> -	struct iommu_table    *tbl;
>> -	int64_t               rc;
>> -
>> -	tbl = pe->table_group.tables[0];
>>   	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>
> Is it safe to go manipulating the PE windows, etc. after SR-IOV is
> disabled?

Manipulating windows in this case is just updating 8 bytes in the TVT. At 
this point a VF is expected to be destroyed but PE is expected to remain 
not free so pnv_ioda2_pick_m64_pe() (or pnv_ioda2_reserve_m64_pe()?) won't 
use it.

>
> When SR-IOV is disabled, you need to immediately disable the VF (I'm
> guessing that happens somewhere) and stop all access to the VF
> "hardware".

drivers/pci/iov.c
===
static void sriov_disable(struct pci_dev *dev)
{
...
for (i = 0; i < iov->num_VFs; i++)
         pci_iov_remove_virtfn(dev, i, 0);
...
pcibios_sriov_disable(dev);
===

pcibios_sriov_disable() is where pnv_pci_ioda2_release_dma_pe() is called from.

> Only the iommu group structure *has* to stick around
> until the reference count drops to zero.  I think other structures and
> hardware reconfiguration can be deferred or done immediately,
> whichever is more convenient.

I deferred everything because of convenience as iommu_table_group is 
embedded into pnv_ioda struct, not a pointer.



>>   	if (rc)
>>   		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>>
>>   	pnv_pci_ioda2_set_bypass(pe, false);
>> -	if (pe->table_group.group) {
>> -		iommu_group_put(pe->table_group.group);
>> -		BUG_ON(pe->table_group.group);
>> -	}
>> +
>> +	BUG_ON(!tbl);
>>   	pnv_pci_ioda2_table_free_pages(tbl);
>> -	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>> +	iommu_free_table(tbl, of_node_full_name(pe->parent_dev->dev.of_node));
>> +
>> +	pnv_ioda_deconfigure_pe(phb, pe);
>> +	pnv_ioda_free_pe(phb, pe->pe_number);
>>   }
>>
>>   static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>> @@ -1376,16 +1374,13 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>   		if (pe->parent_dev != pdev)
>>   			continue;
>>
>> -		pnv_pci_ioda2_release_dma_pe(pdev, pe);
>> -
>>   		/* Remove from list */
>>   		mutex_lock(&phb->ioda.pe_list_mutex);
>>   		list_del(&pe->list);
>>   		mutex_unlock(&phb->ioda.pe_list_mutex);
>>
>> -		pnv_ioda_deconfigure_pe(phb, pe);
>> -
>> -		pnv_ioda_free_pe(phb, pe->pe_number);
>> +		if (pe->table_group.group)
>> +			iommu_group_put(pe->table_group.group);
>>   	}
>>   }
>>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal
  2016-04-15  1:29     ` Alexey Kardashevskiy
@ 2016-04-15  2:26       ` David Gibson
  0 siblings, 0 replies; 14+ messages in thread
From: David Gibson @ 2016-04-15  2:26 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens, Gavin Shan

[-- Attachment #1: Type: text/plain, Size: 5090 bytes --]

On Fri, Apr 15, 2016 at 11:29:32AM +1000, Alexey Kardashevskiy wrote:
> On 04/14/2016 11:40 AM, David Gibson wrote:
> >On Fri, Apr 08, 2016 at 04:36:44PM +1000, Alexey Kardashevskiy wrote:
> >>When SRIOV is disabled, the existing code presumes there is no
> >>virtual function (VF) in use and destroys all associated PEs.
> >>However it is possible to get into the situation when the user
> >>activated SRIOV disabling while a VF is still in use via VFIO.
> >>For example, unbinding a physical function (PF) while there is a guest
> >>running with a VF passed throuhgh via VFIO will trigger the bug.
> >>
> >>This defines an IODA2-specific IOMMU group release() callback.
> >>This moves all the disposal code from pnv_ioda_release_vf_PE() to this
> >>new callback so the cleanup happens when the last user of an IOMMU
> >>group released the reference.
> >>
> >>As pnv_pci_ioda2_release_dma_pe() was reduced to just calling
> >>iommu_group_put(), this merges pnv_pci_ioda2_release_dma_pe()
> >>into pnv_ioda_release_vf_PE().
> >>
> >>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>---
> >>  arch/powerpc/platforms/powernv/pci-ioda.c | 33 +++++++++++++------------------
> >>  1 file changed, 14 insertions(+), 19 deletions(-)
> >>
> >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> >>index ce9f2bf..8108c54 100644
> >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
> >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> >>@@ -1333,27 +1333,25 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> >>  static void pnv_pci_ioda2_group_release(void *iommu_data)
> >>  {
> >>  	struct iommu_table_group *table_group = iommu_data;
> >>+	struct pnv_ioda_pe *pe = container_of(table_group,
> >>+			struct pnv_ioda_pe, table_group);
> >>+	struct pci_controller *hose = pci_bus_to_host(pe->parent_dev->bus);
> >>+	struct pnv_phb *phb = hose->private_data;
> >>+	struct iommu_table *tbl = pe->table_group.tables[0];
> >>+	int64_t rc;
> >>
> >>-	table_group->group = NULL;
> >>-}
> >>-
> >>-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
> >>-{
> >>-	struct iommu_table    *tbl;
> >>-	int64_t               rc;
> >>-
> >>-	tbl = pe->table_group.tables[0];
> >>  	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> >
> >Is it safe to go manipulating the PE windows, etc. after SR-IOV is
> >disabled?
> 
> Manipulating windows in this case is just updating 8 bytes in the TVT. At
> this point a VF is expected to be destroyed but PE is expected to remain not
> free so pnv_ioda2_pick_m64_pe() (or pnv_ioda2_reserve_m64_pe()?) won't use
> it.

Ok.

> >When SR-IOV is disabled, you need to immediately disable the VF (I'm
> >guessing that happens somewhere) and stop all access to the VF
> >"hardware".
> 
> drivers/pci/iov.c
> ===
> static void sriov_disable(struct pci_dev *dev)
> {
> ...
> for (i = 0; i < iov->num_VFs; i++)
>         pci_iov_remove_virtfn(dev, i, 0);
> ...
> pcibios_sriov_disable(dev);
> ===
> 
> pcibios_sriov_disable() is where pnv_pci_ioda2_release_dma_pe() is called from.
> 
> >Only the iommu group structure *has* to stick around
> >until the reference count drops to zero.  I think other structures and
> >hardware reconfiguration can be deferred or done immediately,
> >whichever is more convenient.
> 
> I deferred everything because of convenience as iommu_table_group is
> embedded into pnv_ioda struct, not a pointer.

Ok.


With those queries answered,

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> >>  	if (rc)
> >>  		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
> >>
> >>  	pnv_pci_ioda2_set_bypass(pe, false);
> >>-	if (pe->table_group.group) {
> >>-		iommu_group_put(pe->table_group.group);
> >>-		BUG_ON(pe->table_group.group);
> >>-	}
> >>+
> >>+	BUG_ON(!tbl);
> >>  	pnv_pci_ioda2_table_free_pages(tbl);
> >>-	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> >>+	iommu_free_table(tbl, of_node_full_name(pe->parent_dev->dev.of_node));
> >>+
> >>+	pnv_ioda_deconfigure_pe(phb, pe);
> >>+	pnv_ioda_free_pe(phb, pe->pe_number);
> >>  }
> >>
> >>  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> >>@@ -1376,16 +1374,13 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> >>  		if (pe->parent_dev != pdev)
> >>  			continue;
> >>
> >>-		pnv_pci_ioda2_release_dma_pe(pdev, pe);
> >>-
> >>  		/* Remove from list */
> >>  		mutex_lock(&phb->ioda.pe_list_mutex);
> >>  		list_del(&pe->list);
> >>  		mutex_unlock(&phb->ioda.pe_list_mutex);
> >>
> >>-		pnv_ioda_deconfigure_pe(phb, pe);
> >>-
> >>-		pnv_ioda_free_pe(phb, pe->pe_number);
> >>+		if (pe->table_group.group)
> >>+			iommu_group_put(pe->table_group.group);
> >>  	}
> >>  }
> >>
> >
> 
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release()
  2016-04-08  6:36 ` [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release() Alexey Kardashevskiy
  2016-04-08  7:14   ` kbuild test robot
  2016-04-14  1:35   ` David Gibson
@ 2016-04-21  0:02   ` Gavin Shan
  2016-04-21  3:17     ` Alexey Kardashevskiy
  2 siblings, 1 reply; 14+ messages in thread
From: Gavin Shan @ 2016-04-21  0:02 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens,
	David Gibson, Gavin Shan

On Fri, Apr 08, 2016 at 04:36:43PM +1000, Alexey Kardashevskiy wrote:
>IBM PPC IOMMU API users always set IOMMU data and IOMMU release callback
>to an IOMMU group. At the moment the callback clears one pointer in
>iommu_table_group and that's it.
>
>The platform code calls iommu_group_put() and counts on _put() being
>called last so they check for table_group->group being reset which
>is conceptually wrong as there may be another user holding a reference.
>
>This removes the default IOMMU group release() callback and adds it
>as a parameter to iommu_register_group(). As we are changing the prototype
>anyway, this also changes the function name to more distinctive
>iommu_register_table_group().
>
>This should cause no behavioral change as it leaves BUG_ON for IODA2
>(where it was reported) and removes BUG_ON for pseries/IODA1 as they
>do not support IOV anyway and this BUG_ON has never been reported for
>these platforms.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>

There is one question at end of the thread...

>---
> arch/powerpc/include/asm/iommu.h          | 12 +++++++-----
> arch/powerpc/kernel/iommu.c               | 14 ++++----------
> arch/powerpc/platforms/powernv/pci-ioda.c | 15 +++++++++++----
> arch/powerpc/platforms/pseries/iommu.c    | 17 +++++++++--------
> 4 files changed, 31 insertions(+), 27 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>index 7b87bab..d7ba3b4 100644
>--- a/arch/powerpc/include/asm/iommu.h
>+++ b/arch/powerpc/include/asm/iommu.h
>@@ -201,17 +201,19 @@ struct iommu_table_group {
>
> #ifdef CONFIG_IOMMU_API
>
>-extern void iommu_register_group(struct iommu_table_group *table_group,
>-				 int pci_domain_number, unsigned long pe_num);
>+extern void iommu_register_table_group(struct iommu_table_group *table_group,
>+		int pci_domain_number, unsigned long pe_num,
>+		void (*release)(void *iommu_data));
> extern int iommu_add_device(struct device *dev);
> extern void iommu_del_device(struct device *dev);
> extern int __init tce_iommu_bus_notifier_init(void);
> extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
> 		unsigned long *hpa, enum dma_data_direction *direction);
> #else
>-static inline void iommu_register_group(struct iommu_table_group *table_group,
>-					int pci_domain_number,
>-					unsigned long pe_num)
>+static inline void iommu_register_table_group(
>+		struct iommu_table_group *table_group,
>+		int pci_domain_number, unsigned long pe_num,
>+		void (*release)(void *iommu_data))
> {
> }
>
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index a8e3490..8eed2fa 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -887,15 +887,9 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
> /*
>  * SPAPR TCE API
>  */
>-static void group_release(void *iommu_data)
>-{
>-	struct iommu_table_group *table_group = iommu_data;
>-
>-	table_group->group = NULL;
>-}
>-
>-void iommu_register_group(struct iommu_table_group *table_group,
>-		int pci_domain_number, unsigned long pe_num)
>+void iommu_register_table_group(struct iommu_table_group *table_group,
>+		int pci_domain_number, unsigned long pe_num,
>+		void (*release)(void *iommu_data))
> {
> 	struct iommu_group *grp;
> 	char *name;
>@@ -907,7 +901,7 @@ void iommu_register_group(struct iommu_table_group *table_group,
> 		return;
> 	}
> 	table_group->group = grp;
>-	iommu_group_set_iommudata(grp, table_group, group_release);
>+	iommu_group_set_iommudata(grp, table_group, release);
> 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
> 			pci_domain_number, pe_num);
> 	if (!name)
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index c5baaf3..ce9f2bf 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1330,6 +1330,13 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
> 		int num);
> static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>
>+static void pnv_pci_ioda2_group_release(void *iommu_data)
>+{
>+	struct iommu_table_group *table_group = iommu_data;
>+
>+	table_group->group = NULL;
>+}
>+
> static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
> {
> 	struct iommu_table    *tbl;
>@@ -1965,8 +1972,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 		return;
>
> 	tbl = pnv_pci_table_alloc(phb->hose->node);
>-	iommu_register_group(&pe->table_group, phb->hose->global_number,
>-			pe->pe_number);
>+	iommu_register_table_group(&pe->table_group, phb->hose->global_number,
>+			pe->pe_number, NULL);
> 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>
> 	/* Grab a 32-bit TCE table */
>@@ -2450,8 +2457,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	/* TVE #1 is selected by PCI address bit 59 */
> 	pe->tce_bypass_base = 1ull << 59;
>
>-	iommu_register_group(&pe->table_group, phb->hose->global_number,
>-			pe->pe_number);
>+	iommu_register_table_group(&pe->table_group, phb->hose->global_number,
>+			pe->pe_number, pnv_pci_ioda2_group_release);
>
> 	/* The PE will reserve all possible 32-bits space */
> 	pe->tce32_seg = 0;
>diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>index bd98ce2..0f6f06c 100644
>--- a/arch/powerpc/platforms/pseries/iommu.c
>+++ b/arch/powerpc/platforms/pseries/iommu.c
>@@ -112,7 +112,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
> 	}
> 	if (table_group->group) {
> 		iommu_group_put(table_group->group);
>-		BUG_ON(table_group->group);
>+		table_group->group = NULL;
> 	}
> #endif
> 	iommu_free_table(tbl, node_name);
>@@ -692,7 +692,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
> 	iommu_table_setparms(pci->phb, dn, tbl);
> 	tbl->it_ops = &iommu_table_pseries_ops;
> 	iommu_init_table(tbl, pci->phb->node);
>-	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
>+	iommu_register_table_group(pci->table_group, pci_domain_nr(bus), 0,
>+			NULL);
>
> 	/* Divide the rest (1.75GB) among the children */
> 	pci->phb->dma_window_size = 0x80000000ul;
>@@ -743,8 +744,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
> 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
> 		tbl->it_ops = &iommu_table_lpar_multi_ops;
> 		iommu_init_table(tbl, ppci->phb->node);
>-		iommu_register_group(ppci->table_group,
>-				pci_domain_nr(bus), 0);
>+		iommu_register_table_group(ppci->table_group,
>+				pci_domain_nr(bus), 0, NULL);
> 		pr_debug("  created table: %p\n", ppci->table_group);
> 	}
> }
>@@ -772,8 +773,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
> 		iommu_table_setparms(phb, dn, tbl);
> 		tbl->it_ops = &iommu_table_pseries_ops;
> 		iommu_init_table(tbl, phb->node);
>-		iommu_register_group(PCI_DN(dn)->table_group,
>-				pci_domain_nr(phb->bus), 0);
>+		iommu_register_table_group(PCI_DN(dn)->table_group,
>+				pci_domain_nr(phb->bus), 0, NULL);
> 		set_iommu_table_base(&dev->dev, tbl);
> 		iommu_add_device(&dev->dev);
> 		return;
>@@ -1197,8 +1198,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
> 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
> 		tbl->it_ops = &iommu_table_lpar_multi_ops;
> 		iommu_init_table(tbl, pci->phb->node);
>-		iommu_register_group(pci->table_group,
>-				pci_domain_nr(pci->phb->bus), 0);
>+		iommu_register_table_group(pci->table_group,
>+				pci_domain_nr(pci->phb->bus), 0, NULL);
> 		pr_debug("  created table: %p\n", pci->table_group);
> 	} else {
> 		pr_debug("  found DMA window, table: %p\n", pci->table_group);

Here seems one issue not introduced by this patch: all PE# passed to
iommu_register_table_group() on pSeries platform are zero. When there
are multiple PEs under one PHB, their IOMMU groups will have same
names. It seems not correct. Maybe we don't have two PEs under one
PHB yet?

>-- 
>2.5.0.rc3
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal
  2016-04-08  6:36 ` [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal Alexey Kardashevskiy
  2016-04-14  1:40   ` David Gibson
@ 2016-04-21  0:21   ` Gavin Shan
  2016-04-21  3:20     ` Alexey Kardashevskiy
  1 sibling, 1 reply; 14+ messages in thread
From: Gavin Shan @ 2016-04-21  0:21 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens,
	David Gibson, Gavin Shan

On Fri, Apr 08, 2016 at 04:36:44PM +1000, Alexey Kardashevskiy wrote:
>When SRIOV is disabled, the existing code presumes there is no
>virtual function (VF) in use and destroys all associated PEs.
>However it is possible to get into the situation when the user
>activated SRIOV disabling while a VF is still in use via VFIO.
>For example, unbinding a physical function (PF) while there is a guest
>running with a VF passed throuhgh via VFIO will trigger the bug.
>
>This defines an IODA2-specific IOMMU group release() callback.
>This moves all the disposal code from pnv_ioda_release_vf_PE() to this
>new callback so the cleanup happens when the last user of an IOMMU
>group released the reference.
>
>As pnv_pci_ioda2_release_dma_pe() was reduced to just calling
>iommu_group_put(), this merges pnv_pci_ioda2_release_dma_pe()
>into pnv_ioda_release_vf_PE().
>

Sorry, I don't understand how it works. When PF's driver disables
IOV capability, the VF cannnot work. The guest is unlikely to know
that and still continue accessing the VF's resources (e.g. config
space and MMIO registers). It would cause EEH errors.

>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>---
> arch/powerpc/platforms/powernv/pci-ioda.c | 33 +++++++++++++------------------
> 1 file changed, 14 insertions(+), 19 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index ce9f2bf..8108c54 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1333,27 +1333,25 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> static void pnv_pci_ioda2_group_release(void *iommu_data)
> {
> 	struct iommu_table_group *table_group = iommu_data;
>+	struct pnv_ioda_pe *pe = container_of(table_group,
>+			struct pnv_ioda_pe, table_group);
>+	struct pci_controller *hose = pci_bus_to_host(pe->parent_dev->bus);

pe->parent_dev would be NULL for non-VF-PEs and it's protected by CONFIG_PCI_IOV
in pci.h.

>+	struct pnv_phb *phb = hose->private_data;
>+	struct iommu_table *tbl = pe->table_group.tables[0];
>+	int64_t rc;
>
>-	table_group->group = NULL;
>-}
>-
>-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
>-{
>-	struct iommu_table    *tbl;
>-	int64_t               rc;
>-
>-	tbl = pe->table_group.tables[0];
> 	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> 	if (rc)
> 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>
> 	pnv_pci_ioda2_set_bypass(pe, false);
>-	if (pe->table_group.group) {
>-		iommu_group_put(pe->table_group.group);
>-		BUG_ON(pe->table_group.group);
>-	}
>+
>+	BUG_ON(!tbl);
> 	pnv_pci_ioda2_table_free_pages(tbl);
>-	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>+	iommu_free_table(tbl, of_node_full_name(pe->parent_dev->dev.of_node));
>+
>+	pnv_ioda_deconfigure_pe(phb, pe);
>+	pnv_ioda_free_pe(phb, pe->pe_number);
> }

It's not correct enough. One PE is comprised of DMA, MMIO, mapping info etc.
This function disposes all of them when DMA finishes its job. I don't figure
out a better way to represent all of them and their relationship. I guess it's
worthy to have something in long term though it's not trival work.

>
> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>@@ -1376,16 +1374,13 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> 		if (pe->parent_dev != pdev)
> 			continue;
>
>-		pnv_pci_ioda2_release_dma_pe(pdev, pe);
>-
> 		/* Remove from list */
> 		mutex_lock(&phb->ioda.pe_list_mutex);
> 		list_del(&pe->list);
> 		mutex_unlock(&phb->ioda.pe_list_mutex);
>
>-		pnv_ioda_deconfigure_pe(phb, pe);
>-
>-		pnv_ioda_free_pe(phb, pe->pe_number);
>+		if (pe->table_group.group)
>+			iommu_group_put(pe->table_group.group);
> 	}
> }
>
>-- 
>2.5.0.rc3
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release()
  2016-04-21  0:02   ` Gavin Shan
@ 2016-04-21  3:17     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 14+ messages in thread
From: Alexey Kardashevskiy @ 2016-04-21  3:17 UTC (permalink / raw)
  To: Gavin Shan; +Cc: David Gibson, linuxppc-dev, Daniel Axtens

On 04/21/2016 10:02 AM, Gavin Shan wrote:
> On Fri, Apr 08, 2016 at 04:36:43PM +1000, Alexey Kardashevskiy wrote:
>> IBM PPC IOMMU API users always set IOMMU data and IOMMU release callback
>> to an IOMMU group. At the moment the callback clears one pointer in
>> iommu_table_group and that's it.
>>
>> The platform code calls iommu_group_put() and counts on _put() being
>> called last so they check for table_group->group being reset which
>> is conceptually wrong as there may be another user holding a reference.
>>
>> This removes the default IOMMU group release() callback and adds it
>> as a parameter to iommu_register_group(). As we are changing the prototype
>> anyway, this also changes the function name to more distinctive
>> iommu_register_table_group().
>>
>> This should cause no behavioral change as it leaves BUG_ON for IODA2
>> (where it was reported) and removes BUG_ON for pseries/IODA1 as they
>> do not support IOV anyway and this BUG_ON has never been reported for
>> these platforms.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
> There is one question at end of the thread...
>
>> ---
>> arch/powerpc/include/asm/iommu.h          | 12 +++++++-----
>> arch/powerpc/kernel/iommu.c               | 14 ++++----------
>> arch/powerpc/platforms/powernv/pci-ioda.c | 15 +++++++++++----
>> arch/powerpc/platforms/pseries/iommu.c    | 17 +++++++++--------
>> 4 files changed, 31 insertions(+), 27 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index 7b87bab..d7ba3b4 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -201,17 +201,19 @@ struct iommu_table_group {
>>
>> #ifdef CONFIG_IOMMU_API
>>
>> -extern void iommu_register_group(struct iommu_table_group *table_group,
>> -				 int pci_domain_number, unsigned long pe_num);
>> +extern void iommu_register_table_group(struct iommu_table_group *table_group,
>> +		int pci_domain_number, unsigned long pe_num,
>> +		void (*release)(void *iommu_data));
>> extern int iommu_add_device(struct device *dev);
>> extern void iommu_del_device(struct device *dev);
>> extern int __init tce_iommu_bus_notifier_init(void);
>> extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
>> 		unsigned long *hpa, enum dma_data_direction *direction);
>> #else
>> -static inline void iommu_register_group(struct iommu_table_group *table_group,
>> -					int pci_domain_number,
>> -					unsigned long pe_num)
>> +static inline void iommu_register_table_group(
>> +		struct iommu_table_group *table_group,
>> +		int pci_domain_number, unsigned long pe_num,
>> +		void (*release)(void *iommu_data))
>> {
>> }
>>
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index a8e3490..8eed2fa 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -887,15 +887,9 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
>> /*
>>   * SPAPR TCE API
>>   */
>> -static void group_release(void *iommu_data)
>> -{
>> -	struct iommu_table_group *table_group = iommu_data;
>> -
>> -	table_group->group = NULL;
>> -}
>> -
>> -void iommu_register_group(struct iommu_table_group *table_group,
>> -		int pci_domain_number, unsigned long pe_num)
>> +void iommu_register_table_group(struct iommu_table_group *table_group,
>> +		int pci_domain_number, unsigned long pe_num,
>> +		void (*release)(void *iommu_data))
>> {
>> 	struct iommu_group *grp;
>> 	char *name;
>> @@ -907,7 +901,7 @@ void iommu_register_group(struct iommu_table_group *table_group,
>> 		return;
>> 	}
>> 	table_group->group = grp;
>> -	iommu_group_set_iommudata(grp, table_group, group_release);
>> +	iommu_group_set_iommudata(grp, table_group, release);
>> 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
>> 			pci_domain_number, pe_num);
>> 	if (!name)
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index c5baaf3..ce9f2bf 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1330,6 +1330,13 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
>> 		int num);
>> static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>>
>> +static void pnv_pci_ioda2_group_release(void *iommu_data)
>> +{
>> +	struct iommu_table_group *table_group = iommu_data;
>> +
>> +	table_group->group = NULL;
>> +}
>> +
>> static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
>> {
>> 	struct iommu_table    *tbl;
>> @@ -1965,8 +1972,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>> 		return;
>>
>> 	tbl = pnv_pci_table_alloc(phb->hose->node);
>> -	iommu_register_group(&pe->table_group, phb->hose->global_number,
>> -			pe->pe_number);
>> +	iommu_register_table_group(&pe->table_group, phb->hose->global_number,
>> +			pe->pe_number, NULL);
>> 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>>
>> 	/* Grab a 32-bit TCE table */
>> @@ -2450,8 +2457,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>> 	/* TVE #1 is selected by PCI address bit 59 */
>> 	pe->tce_bypass_base = 1ull << 59;
>>
>> -	iommu_register_group(&pe->table_group, phb->hose->global_number,
>> -			pe->pe_number);
>> +	iommu_register_table_group(&pe->table_group, phb->hose->global_number,
>> +			pe->pe_number, pnv_pci_ioda2_group_release);
>>
>> 	/* The PE will reserve all possible 32-bits space */
>> 	pe->tce32_seg = 0;
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index bd98ce2..0f6f06c 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -112,7 +112,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
>> 	}
>> 	if (table_group->group) {
>> 		iommu_group_put(table_group->group);
>> -		BUG_ON(table_group->group);
>> +		table_group->group = NULL;
>> 	}
>> #endif
>> 	iommu_free_table(tbl, node_name);
>> @@ -692,7 +692,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
>> 	iommu_table_setparms(pci->phb, dn, tbl);
>> 	tbl->it_ops = &iommu_table_pseries_ops;
>> 	iommu_init_table(tbl, pci->phb->node);
>> -	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
>> +	iommu_register_table_group(pci->table_group, pci_domain_nr(bus), 0,
>> +			NULL);
>>
>> 	/* Divide the rest (1.75GB) among the children */
>> 	pci->phb->dma_window_size = 0x80000000ul;
>> @@ -743,8 +744,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>> 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
>> 		tbl->it_ops = &iommu_table_lpar_multi_ops;
>> 		iommu_init_table(tbl, ppci->phb->node);
>> -		iommu_register_group(ppci->table_group,
>> -				pci_domain_nr(bus), 0);
>> +		iommu_register_table_group(ppci->table_group,
>> +				pci_domain_nr(bus), 0, NULL);
>> 		pr_debug("  created table: %p\n", ppci->table_group);
>> 	}
>> }
>> @@ -772,8 +773,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
>> 		iommu_table_setparms(phb, dn, tbl);
>> 		tbl->it_ops = &iommu_table_pseries_ops;
>> 		iommu_init_table(tbl, phb->node);
>> -		iommu_register_group(PCI_DN(dn)->table_group,
>> -				pci_domain_nr(phb->bus), 0);
>> +		iommu_register_table_group(PCI_DN(dn)->table_group,
>> +				pci_domain_nr(phb->bus), 0, NULL);
>> 		set_iommu_table_base(&dev->dev, tbl);
>> 		iommu_add_device(&dev->dev);
>> 		return;
>> @@ -1197,8 +1198,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>> 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
>> 		tbl->it_ops = &iommu_table_lpar_multi_ops;
>> 		iommu_init_table(tbl, pci->phb->node);
>> -		iommu_register_group(pci->table_group,
>> -				pci_domain_nr(pci->phb->bus), 0);
>> +		iommu_register_table_group(pci->table_group,
>> +				pci_domain_nr(pci->phb->bus), 0, NULL);
>> 		pr_debug("  created table: %p\n", pci->table_group);
>> 	} else {
>> 		pr_debug("  found DMA window, table: %p\n", pci->table_group);
>
> Here seems one issue not introduced by this patch: all PE# passed to
> iommu_register_table_group() on pSeries platform are zero. When there
> are multiple PEs under one PHB, their IOMMU groups will have same
> names. It seems not correct. Maybe we don't have two PEs under one
> PHB yet?

This is a minor issue though. I cannot recall anything depending on the 
name and at the moment VFIO does not work on pseries anyway, when/if I add 
it, I'll fix this too.



-- 
Alexey

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal
  2016-04-21  0:21   ` Gavin Shan
@ 2016-04-21  3:20     ` Alexey Kardashevskiy
  2016-04-26  2:29       ` Alexey Kardashevskiy
  2016-04-27  1:07       ` Gavin Shan
  0 siblings, 2 replies; 14+ messages in thread
From: Alexey Kardashevskiy @ 2016-04-21  3:20 UTC (permalink / raw)
  To: Gavin Shan
  Cc: linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens, David Gibson

On 04/21/2016 10:21 AM, Gavin Shan wrote:
> On Fri, Apr 08, 2016 at 04:36:44PM +1000, Alexey Kardashevskiy wrote:
>> When SRIOV is disabled, the existing code presumes there is no
>> virtual function (VF) in use and destroys all associated PEs.
>> However it is possible to get into the situation when the user
>> activated SRIOV disabling while a VF is still in use via VFIO.
>> For example, unbinding a physical function (PF) while there is a guest
>> running with a VF passed throuhgh via VFIO will trigger the bug.
>>
>> This defines an IODA2-specific IOMMU group release() callback.
>> This moves all the disposal code from pnv_ioda_release_vf_PE() to this
>> new callback so the cleanup happens when the last user of an IOMMU
>> group released the reference.
>>
>> As pnv_pci_ioda2_release_dma_pe() was reduced to just calling
>> iommu_group_put(), this merges pnv_pci_ioda2_release_dma_pe()
>> into pnv_ioda_release_vf_PE().
>>
>
> Sorry, I don't understand how it works. When PF's driver disables
> IOV capability, the VF cannnot work. The guest is unlikely to know
> that and still continue accessing the VF's resources (e.g. config
> space and MMIO registers). It would cause EEH errors.

The host disables IOV which removes VF devices which unbinds vfio_pci 
driver and does all the cleanup, eventually we get to QEMU's 
vfio_req_notifier_handler() and PCI hot unplug is initiated and the device 
disappears from the guest.

If the guest cannot do PCI hotunplug, then EEH will make host stop it anyway.

Here we do not really care what happens to the guest (it can detect EEH or 
hotunplug or simply crash), we need to make sure that the _host_ does not 
crash in any case because the root user did something weird.


>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> arch/powerpc/platforms/powernv/pci-ioda.c | 33 +++++++++++++------------------
>> 1 file changed, 14 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index ce9f2bf..8108c54 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1333,27 +1333,25 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>> static void pnv_pci_ioda2_group_release(void *iommu_data)
>> {
>> 	struct iommu_table_group *table_group = iommu_data;
>> +	struct pnv_ioda_pe *pe = container_of(table_group,
>> +			struct pnv_ioda_pe, table_group);
>> +	struct pci_controller *hose = pci_bus_to_host(pe->parent_dev->bus);
>
> pe->parent_dev would be NULL for non-VF-PEs and it's protected by CONFIG_PCI_IOV
> in pci.h.


Yeah, I'll fix it.

>
>> +	struct pnv_phb *phb = hose->private_data;
>> +	struct iommu_table *tbl = pe->table_group.tables[0];
>> +	int64_t rc;
>>
>> -	table_group->group = NULL;
>> -}
>> -
>> -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
>> -{
>> -	struct iommu_table    *tbl;
>> -	int64_t               rc;
>> -
>> -	tbl = pe->table_group.tables[0];
>> 	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>> 	if (rc)
>> 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>>
>> 	pnv_pci_ioda2_set_bypass(pe, false);
>> -	if (pe->table_group.group) {
>> -		iommu_group_put(pe->table_group.group);
>> -		BUG_ON(pe->table_group.group);
>> -	}
>> +
>> +	BUG_ON(!tbl);
>> 	pnv_pci_ioda2_table_free_pages(tbl);
>> -	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>> +	iommu_free_table(tbl, of_node_full_name(pe->parent_dev->dev.of_node));
>> +
>> +	pnv_ioda_deconfigure_pe(phb, pe);
>> +	pnv_ioda_free_pe(phb, pe->pe_number);
>> }
>
> It's not correct enough. One PE is comprised of DMA, MMIO, mapping info etc.
> This function disposes all of them when DMA finishes its job. I don't figure
> out a better way to represent all of them and their relationship. I guess it's
> worthy to have something in long term though it's not trival work.


Sorry, I am missing your point here. I am not changing the resource 
deallocation here, I am just doing it slightly later and all I wonder at 
the moment is if there are races - like having 2 scripts - one doing unbind 
PF and another doing bind PF - will this crash the host in theory?


>
>>
>> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>> @@ -1376,16 +1374,13 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>> 		if (pe->parent_dev != pdev)
>> 			continue;
>>
>> -		pnv_pci_ioda2_release_dma_pe(pdev, pe);
>> -
>> 		/* Remove from list */
>> 		mutex_lock(&phb->ioda.pe_list_mutex);
>> 		list_del(&pe->list);
>> 		mutex_unlock(&phb->ioda.pe_list_mutex);
>>
>> -		pnv_ioda_deconfigure_pe(phb, pe);
>> -
>> -		pnv_ioda_free_pe(phb, pe->pe_number);
>> +		if (pe->table_group.group)
>> +			iommu_group_put(pe->table_group.group);
>> 	}
>> }
>>
>> --
>> 2.5.0.rc3
>>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal
  2016-04-21  3:20     ` Alexey Kardashevskiy
@ 2016-04-26  2:29       ` Alexey Kardashevskiy
  2016-04-27  1:07       ` Gavin Shan
  1 sibling, 0 replies; 14+ messages in thread
From: Alexey Kardashevskiy @ 2016-04-26  2:29 UTC (permalink / raw)
  To: Gavin Shan
  Cc: linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens, David Gibson

On 04/21/2016 01:20 PM, Alexey Kardashevskiy wrote:
> On 04/21/2016 10:21 AM, Gavin Shan wrote:
>> On Fri, Apr 08, 2016 at 04:36:44PM +1000, Alexey Kardashevskiy wrote:
>>> When SRIOV is disabled, the existing code presumes there is no
>>> virtual function (VF) in use and destroys all associated PEs.
>>> However it is possible to get into the situation when the user
>>> activated SRIOV disabling while a VF is still in use via VFIO.
>>> For example, unbinding a physical function (PF) while there is a guest
>>> running with a VF passed throuhgh via VFIO will trigger the bug.
>>>
>>> This defines an IODA2-specific IOMMU group release() callback.
>>> This moves all the disposal code from pnv_ioda_release_vf_PE() to this
>>> new callback so the cleanup happens when the last user of an IOMMU
>>> group released the reference.
>>>
>>> As pnv_pci_ioda2_release_dma_pe() was reduced to just calling
>>> iommu_group_put(), this merges pnv_pci_ioda2_release_dma_pe()
>>> into pnv_ioda_release_vf_PE().
>>>
>>
>> Sorry, I don't understand how it works. When PF's driver disables
>> IOV capability, the VF cannnot work. The guest is unlikely to know
>> that and still continue accessing the VF's resources (e.g. config
>> space and MMIO registers). It would cause EEH errors.
>
> The host disables IOV which removes VF devices which unbinds vfio_pci
> driver and does all the cleanup, eventually we get to QEMU's
> vfio_req_notifier_handler() and PCI hot unplug is initiated and the device
> disappears from the guest.
>
> If the guest cannot do PCI hotunplug, then EEH will make host stop it anyway.
>
> Here we do not really care what happens to the guest (it can detect EEH or
> hotunplug or simply crash), we need to make sure that the _host_ does not
> crash in any case because the root user did something weird.


Ping?

>
>
>>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>> arch/powerpc/platforms/powernv/pci-ioda.c | 33
>>> +++++++++++++------------------
>>> 1 file changed, 14 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> index ce9f2bf..8108c54 100644
>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> @@ -1333,27 +1333,25 @@ static void pnv_pci_ioda2_set_bypass(struct
>>> pnv_ioda_pe *pe, bool enable);
>>> static void pnv_pci_ioda2_group_release(void *iommu_data)
>>> {
>>>     struct iommu_table_group *table_group = iommu_data;
>>> +    struct pnv_ioda_pe *pe = container_of(table_group,
>>> +            struct pnv_ioda_pe, table_group);
>>> +    struct pci_controller *hose = pci_bus_to_host(pe->parent_dev->bus);
>>
>> pe->parent_dev would be NULL for non-VF-PEs and it's protected by
>> CONFIG_PCI_IOV
>> in pci.h.
>
>
> Yeah, I'll fix it.
>
>>
>>> +    struct pnv_phb *phb = hose->private_data;
>>> +    struct iommu_table *tbl = pe->table_group.tables[0];
>>> +    int64_t rc;
>>>
>>> -    table_group->group = NULL;
>>> -}
>>> -
>>> -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct
>>> pnv_ioda_pe *pe)
>>> -{
>>> -    struct iommu_table    *tbl;
>>> -    int64_t               rc;
>>> -
>>> -    tbl = pe->table_group.tables[0];
>>>     rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>>>     if (rc)
>>>         pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>>>
>>>     pnv_pci_ioda2_set_bypass(pe, false);
>>> -    if (pe->table_group.group) {
>>> -        iommu_group_put(pe->table_group.group);
>>> -        BUG_ON(pe->table_group.group);
>>> -    }
>>> +
>>> +    BUG_ON(!tbl);
>>>     pnv_pci_ioda2_table_free_pages(tbl);
>>> -    iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>> +    iommu_free_table(tbl, of_node_full_name(pe->parent_dev->dev.of_node));
>>> +
>>> +    pnv_ioda_deconfigure_pe(phb, pe);
>>> +    pnv_ioda_free_pe(phb, pe->pe_number);
>>> }
>>
>> It's not correct enough. One PE is comprised of DMA, MMIO, mapping info etc.
>> This function disposes all of them when DMA finishes its job. I don't figure
>> out a better way to represent all of them and their relationship. I guess
>> it's
>> worthy to have something in long term though it's not trival work.
>
>
> Sorry, I am missing your point here. I am not changing the resource
> deallocation here, I am just doing it slightly later and all I wonder at
> the moment is if there are races - like having 2 scripts - one doing unbind
> PF and another doing bind PF - will this crash the host in theory?
>
>
>>
>>>
>>> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>> @@ -1376,16 +1374,13 @@ static void pnv_ioda_release_vf_PE(struct
>>> pci_dev *pdev)
>>>         if (pe->parent_dev != pdev)
>>>             continue;
>>>
>>> -        pnv_pci_ioda2_release_dma_pe(pdev, pe);
>>> -
>>>         /* Remove from list */
>>>         mutex_lock(&phb->ioda.pe_list_mutex);
>>>         list_del(&pe->list);
>>>         mutex_unlock(&phb->ioda.pe_list_mutex);
>>>
>>> -        pnv_ioda_deconfigure_pe(phb, pe);
>>> -
>>> -        pnv_ioda_free_pe(phb, pe->pe_number);
>>> +        if (pe->table_group.group)
>>> +            iommu_group_put(pe->table_group.group);
>>>     }
>>> }
>>>
>>> --
>>> 2.5.0.rc3
>>>
>>
>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal
  2016-04-21  3:20     ` Alexey Kardashevskiy
  2016-04-26  2:29       ` Alexey Kardashevskiy
@ 2016-04-27  1:07       ` Gavin Shan
  1 sibling, 0 replies; 14+ messages in thread
From: Gavin Shan @ 2016-04-27  1:07 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Gavin Shan, linuxppc-dev, Benjamin Herrenschmidt, Daniel Axtens,
	David Gibson

On Thu, Apr 21, 2016 at 01:20:09PM +1000, Alexey Kardashevskiy wrote:
>On 04/21/2016 10:21 AM, Gavin Shan wrote:
>>On Fri, Apr 08, 2016 at 04:36:44PM +1000, Alexey Kardashevskiy wrote:
>>>When SRIOV is disabled, the existing code presumes there is no
>>>virtual function (VF) in use and destroys all associated PEs.
>>>However it is possible to get into the situation when the user
>>>activated SRIOV disabling while a VF is still in use via VFIO.
>>>For example, unbinding a physical function (PF) while there is a guest
>>>running with a VF passed throuhgh via VFIO will trigger the bug.
>>>
>>>This defines an IODA2-specific IOMMU group release() callback.
>>>This moves all the disposal code from pnv_ioda_release_vf_PE() to this
>>>new callback so the cleanup happens when the last user of an IOMMU
>>>group released the reference.
>>>
>>>As pnv_pci_ioda2_release_dma_pe() was reduced to just calling
>>>iommu_group_put(), this merges pnv_pci_ioda2_release_dma_pe()
>>>into pnv_ioda_release_vf_PE().
>>>
>>
>>Sorry, I don't understand how it works. When PF's driver disables
>>IOV capability, the VF cannnot work. The guest is unlikely to know
>>that and still continue accessing the VF's resources (e.g. config
>>space and MMIO registers). It would cause EEH errors.
>
>The host disables IOV which removes VF devices which unbinds vfio_pci driver
>and does all the cleanup, eventually we get to QEMU's
>vfio_req_notifier_handler() and PCI hot unplug is initiated and the device
>disappears from the guest.
>
>If the guest cannot do PCI hotunplug, then EEH will make host stop it anyway.
>
>Here we do not really care what happens to the guest (it can detect EEH or
>hotunplug or simply crash), we need to make sure that the _host_ does not
>crash in any case because the root user did something weird.
>

Thanks for the detailed explanation. My concern is there should have some
race: the guest doesn't receive the signal and unplug it in time. The guest
could possibly produces PCI traffic that cannnot be handled by VF. My first
impression would be raised EEH as the result. If it's a fenced PHB and the
host is going to be affected. Would it be a problem? I think it would be nice
if the guest can work without issue under this situation.

As you claimed, it's a _weird_ behaviour for user to unbind PF's driver that
in turn disables IOV capability when VFs are owned by guest. Why not stop user
from doing it?

>
>>
>>>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>---
>>>arch/powerpc/platforms/powernv/pci-ioda.c | 33 +++++++++++++------------------
>>>1 file changed, 14 insertions(+), 19 deletions(-)
>>>
>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index ce9f2bf..8108c54 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -1333,27 +1333,25 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>>>static void pnv_pci_ioda2_group_release(void *iommu_data)
>>>{
>>>	struct iommu_table_group *table_group = iommu_data;
>>>+	struct pnv_ioda_pe *pe = container_of(table_group,
>>>+			struct pnv_ioda_pe, table_group);
>>>+	struct pci_controller *hose = pci_bus_to_host(pe->parent_dev->bus);
>>
>>pe->parent_dev would be NULL for non-VF-PEs and it's protected by CONFIG_PCI_IOV
>>in pci.h.
>
>
>Yeah, I'll fix it.
>
>>
>>>+	struct pnv_phb *phb = hose->private_data;
>>>+	struct iommu_table *tbl = pe->table_group.tables[0];
>>>+	int64_t rc;
>>>
>>>-	table_group->group = NULL;
>>>-}
>>>-
>>>-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
>>>-{
>>>-	struct iommu_table    *tbl;
>>>-	int64_t               rc;
>>>-
>>>-	tbl = pe->table_group.tables[0];
>>>	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>>>	if (rc)
>>>		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>>>
>>>	pnv_pci_ioda2_set_bypass(pe, false);
>>>-	if (pe->table_group.group) {
>>>-		iommu_group_put(pe->table_group.group);
>>>-		BUG_ON(pe->table_group.group);
>>>-	}
>>>+
>>>+	BUG_ON(!tbl);
>>>	pnv_pci_ioda2_table_free_pages(tbl);
>>>-	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>>+	iommu_free_table(tbl, of_node_full_name(pe->parent_dev->dev.of_node));
>>>+
>>>+	pnv_ioda_deconfigure_pe(phb, pe);
>>>+	pnv_ioda_free_pe(phb, pe->pe_number);
>>>}
>>
>>It's not correct enough. One PE is comprised of DMA, MMIO, mapping info etc.
>>This function disposes all of them when DMA finishes its job. I don't figure
>>out a better way to represent all of them and their relationship. I guess it's
>>worthy to have something in long term though it's not trival work.
>
>
>Sorry, I am missing your point here. I am not changing the resource
>deallocation here, I am just doing it slightly later and all I wonder at the
>moment is if there are races - like having 2 scripts - one doing unbind PF
>and another doing bind PF - will this crash the host in theory?
>

One PE is comprised of multiple facets like DMA/MMIO/PELTM/EEH as I
explained in last reply. pnv_pci_ioda2_release_dma_pe() called to
destroy DMA window is going to tear down all (or most) of them.
That means DMA and PE are equivalent, but they're not. Furturemore,
not all PE have associated enabled DMA windows.

>>
>>>
>>>static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>>@@ -1376,16 +1374,13 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>>		if (pe->parent_dev != pdev)
>>>			continue;
>>>
>>>-		pnv_pci_ioda2_release_dma_pe(pdev, pe);
>>>-
>>>		/* Remove from list */
>>>		mutex_lock(&phb->ioda.pe_list_mutex);
>>>		list_del(&pe->list);
>>>		mutex_unlock(&phb->ioda.pe_list_mutex);
>>>
>>>-		pnv_ioda_deconfigure_pe(phb, pe);
>>>-
>>>-		pnv_ioda_free_pe(phb, pe->pe_number);
>>>+		if (pe->table_group.group)
>>>+			iommu_group_put(pe->table_group.group);
>>>	}
>>>}
>>>
>>>--
>>>2.5.0.rc3
>>>
>>
>
>
>-- 
>Alexey
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-04-27  1:09 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-08  6:36 [PATCH kernel 0/2] powerpc/powernv: Fix crash on PF unbind when VF is passed Alexey Kardashevskiy
2016-04-08  6:36 ` [PATCH kernel 1/2] powerpc/iommu: Get rid of default group_release() Alexey Kardashevskiy
2016-04-08  7:14   ` kbuild test robot
2016-04-14  1:35   ` David Gibson
2016-04-21  0:02   ` Gavin Shan
2016-04-21  3:17     ` Alexey Kardashevskiy
2016-04-08  6:36 ` [PATCH kernel 2/2] powerpc/powernv/ioda2: Delay PE disposal Alexey Kardashevskiy
2016-04-14  1:40   ` David Gibson
2016-04-15  1:29     ` Alexey Kardashevskiy
2016-04-15  2:26       ` David Gibson
2016-04-21  0:21   ` Gavin Shan
2016-04-21  3:20     ` Alexey Kardashevskiy
2016-04-26  2:29       ` Alexey Kardashevskiy
2016-04-27  1:07       ` Gavin Shan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.