All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass
@ 2018-06-01  8:10 Alexey Kardashevskiy
  2018-06-01  8:10 ` [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for " Alexey Kardashevskiy
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Alexey Kardashevskiy @ 2018-06-01  8:10 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Alistair Popple, Benjamin Herrenschmidt,
	Russell Currey

I came across this adhoc implementation and thought it could use some
polishing. This fixes memory leaks and add P9 support. Based on the current
upstream.


Please comment. Thanks.



Alexey Kardashevskiy (2):
  powerpc/powernv: Reuse existing TCE code for sketchy bypass
  powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9

 arch/powerpc/platforms/powernv/pci.h      |  1 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 76 +++++++++++++++++--------------
 2 files changed, 44 insertions(+), 33 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for sketchy bypass
  2018-06-01  8:10 [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass Alexey Kardashevskiy
@ 2018-06-01  8:10 ` Alexey Kardashevskiy
  2018-06-16  1:04   ` Benjamin Herrenschmidt
  2018-06-01  8:10 ` [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9 Alexey Kardashevskiy
  2018-06-15  9:01 ` [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass Alexey Kardashevskiy
  2 siblings, 1 reply; 10+ messages in thread
From: Alexey Kardashevskiy @ 2018-06-01  8:10 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Alistair Popple, Benjamin Herrenschmidt,
	Russell Currey

The existing sketchy bypass ignores the existing default 32bit TCE table
(created by default for every PE at boot time or after being used by
VFIO) and it allocates another table instead without updating PE DMA
config (pe->table_group). So if we decide to use such device for VFIO
later, this new table will also leak memory.

This replaces adhoc table allocation and programming with the existing
API which handles memory leaks.

This programs the default 32bit table back to TVE#0 if configuring
the new table failed for some reason.

While we are at it, switch from the hardcoded 256MB TCEs to the biggest
size supported by the hardware and reported by the firmware. This allows
the sketchy bypass (originally made for POWER8 only) to work on POWER9
too assuming that PHB4 type is defined and pnv_pci_ioda_dma_64bit_bypass()
is called (coming next).

This does not call iommu_init_table() for the new table as the caller
will use &dma_nommu_ops and therefore ::it_map is not needed.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

Tested with:
 	if (pe->tce_bypass_enabled) {
 		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
-		bypass = (dma_mask >= top);
+		bypass = false;//(dma_mask >= top);
 	}
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 71 +++++++++++++++++--------------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index ceb7e64..9239142 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1791,54 +1791,61 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
  *
  * Currently this will only work on PHB3 (POWER8).
  */
+static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl);
+
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+		int num, struct iommu_table *tbl);
+
+static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
+
 static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
 {
-	u64 window_size, table_size, tce_count, addr;
-	struct page *table_pages;
-	u64 tce_order = 28; /* 256MB TCEs */
-	__be64 *tces;
+	u64 window_size;
 	s64 rc;
+	struct iommu_table *tbl, *oldtbl = NULL;
+	unsigned long shift, offset;
 
 	/*
 	 * Window size needs to be a power of two, but needs to account for
 	 * shifting memory by the 4GB offset required to skip 32bit space.
 	 */
-	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
-	tce_count = window_size >> tce_order;
-	table_size = tce_count << 3;
-
-	if (table_size < PAGE_SIZE)
-		table_size = PAGE_SIZE;
+	window_size = roundup_pow_of_two(memory_hotplug_max() + SZ_4G);
+	shift = ilog2(pnv_ioda_parse_tce_sizes(pe->phb));
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, shift, window_size,
+			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
+	if (rc) {
+		pe_err(pe, "Failed to create 64-bypass TCE table, err %ld", rc);
+		return rc;
+	}
 
-	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
-				       get_order(table_size));
-	if (!table_pages)
+	offset = SZ_4G >> shift;
+	rc = tbl->it_ops->set(tbl, offset, tbl->it_size - offset,
+			0 /* uaddr */, DMA_BIDIRECTIONAL, 0 /* attrs */);
+	if (rc)
 		goto err;
 
-	tces = page_address(table_pages);
-	if (!tces)
+	if (pe->table_group.tables[0]) {
+		oldtbl = pe->table_group.tables[0];
+		pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+	}
+
+	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	if (rc != OPAL_SUCCESS) {
+		rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, oldtbl);
 		goto err;
+	}
 
-	memset(tces, 0, table_size);
+	if (oldtbl)
+		iommu_tce_table_put(oldtbl);
 
-	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
-		tces[(addr + (1ULL << 32)) >> tce_order] =
-			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
-	}
+	pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+	return 0;
 
-	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
-					pe->pe_number,
-					/* reconfigure window 0 */
-					(pe->pe_number << 1) + 0,
-					1,
-					__pa(tces),
-					table_size,
-					1 << tce_order);
-	if (rc == OPAL_SUCCESS) {
-		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
-		return 0;
-	}
 err:
+	iommu_tce_table_put(tbl);
+
 	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
 	return -EIO;
 }
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9
  2018-06-01  8:10 [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass Alexey Kardashevskiy
  2018-06-01  8:10 ` [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for " Alexey Kardashevskiy
@ 2018-06-01  8:10 ` Alexey Kardashevskiy
  2018-06-16  1:05   ` Benjamin Herrenschmidt
  2018-06-15  9:01 ` [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass Alexey Kardashevskiy
  2 siblings, 1 reply; 10+ messages in thread
From: Alexey Kardashevskiy @ 2018-06-01  8:10 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Alistair Popple, Benjamin Herrenschmidt,
	Russell Currey

These are found in POWER9 chips. Right now these PHBs have unknown type
so changing it to PHB4 won't make much of a difference except enabling
sketchy bypass for POWER9 as this does below.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/platforms/powernv/pci.h      | 1 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index eada4b6..1408247 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -23,6 +23,7 @@ enum pnv_phb_model {
 	PNV_PHB_MODEL_UNKNOWN,
 	PNV_PHB_MODEL_P7IOC,
 	PNV_PHB_MODEL_PHB3,
+	PNV_PHB_MODEL_PHB4,
 	PNV_PHB_MODEL_NPU,
 	PNV_PHB_MODEL_NPU2,
 };
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9239142..66c2804 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1882,7 +1882,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 		if (dma_mask >> 32 &&
 		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
 		    pnv_pci_ioda_pe_single_vendor(pe) &&
-		    phb->model == PNV_PHB_MODEL_PHB3) {
+		    (phb->model == PNV_PHB_MODEL_PHB3 ||
+		     phb->model == PNV_PHB_MODEL_PHB4)) {
 			/* Configure the bypass mode */
 			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
 			if (rc)
@@ -3930,6 +3931,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 		phb->model = PNV_PHB_MODEL_P7IOC;
 	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
 		phb->model = PNV_PHB_MODEL_PHB3;
+	else if (of_device_is_compatible(np, "ibm,power9-pciex"))
+		phb->model = PNV_PHB_MODEL_PHB4;
 	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
 		phb->model = PNV_PHB_MODEL_NPU;
 	else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass
  2018-06-01  8:10 [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass Alexey Kardashevskiy
  2018-06-01  8:10 ` [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for " Alexey Kardashevskiy
  2018-06-01  8:10 ` [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9 Alexey Kardashevskiy
@ 2018-06-15  9:01 ` Alexey Kardashevskiy
  2 siblings, 0 replies; 10+ messages in thread
From: Alexey Kardashevskiy @ 2018-06-15  9:01 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alistair Popple, Benjamin Herrenschmidt, Russell Currey

On Fri,  1 Jun 2018 18:10:26 +1000
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> I came across this adhoc implementation and thought it could use some
> polishing. This fixes memory leaks and add P9 support. Based on the current
> upstream.
> 
> 
> Please comment. Thanks.


Ping?


> 
> 
> 
> Alexey Kardashevskiy (2):
>   powerpc/powernv: Reuse existing TCE code for sketchy bypass
>   powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9
> 
>  arch/powerpc/platforms/powernv/pci.h      |  1 +
>  arch/powerpc/platforms/powernv/pci-ioda.c | 76 +++++++++++++++++--------------
>  2 files changed, 44 insertions(+), 33 deletions(-)
> 



--
Alexey

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for sketchy bypass
  2018-06-01  8:10 ` [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for " Alexey Kardashevskiy
@ 2018-06-16  1:04   ` Benjamin Herrenschmidt
  2018-07-02  8:50     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 10+ messages in thread
From: Benjamin Herrenschmidt @ 2018-06-16  1:04 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev; +Cc: Alistair Popple, Russell Currey

On Fri, 2018-06-01 at 18:10 +1000, Alexey Kardashevskiy wrote:
> The existing sketchy bypass ignores the existing default 32bit TCE table
> (created by default for every PE at boot time or after being used by
> VFIO) and it allocates another table instead without updating PE DMA
> config (pe->table_group). So if we decide to use such device for VFIO
> later, this new table will also leak memory.
> 
> This replaces adhoc table allocation and programming with the existing
> API which handles memory leaks.
> 
> This programs the default 32bit table back to TVE#0 if configuring
> the new table failed for some reason.
> 
> While we are at it, switch from the hardcoded 256MB TCEs to the biggest
> size supported by the hardware and reported by the firmware. This allows
> the sketchy bypass (originally made for POWER8 only) to work on POWER9
> too assuming that PHB4 type is defined and pnv_pci_ioda_dma_64bit_bypass()
> is called (coming next).

It won't work on POWER9 for other reasons. Mostly memory isn't
contiguous there.

What we shuld look into rather than removing the 32-bit table is
exploit DD2 P9 feature allowing to overlay TVE0 and TVE1 so that 0..4G
is in control of TVE0 and the rest goes to TVE1.
 
> This does not call iommu_init_table() for the new table as the caller
> will use &dma_nommu_ops and therefore ::it_map is not needed.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> 
> Tested with:
>  	if (pe->tce_bypass_enabled) {
>  		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
> -		bypass = (dma_mask >= top);
> +		bypass = false;//(dma_mask >= top);
>  	}
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 71 +++++++++++++++++--------------
>  1 file changed, 39 insertions(+), 32 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index ceb7e64..9239142 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1791,54 +1791,61 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
>   *
>   * Currently this will only work on PHB3 (POWER8).
>   */
> +static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> +		struct iommu_table **ptbl);
> +
> +static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
> +		int num, struct iommu_table *tbl);
> +
> +static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
> +
>  static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
>  {
> -	u64 window_size, table_size, tce_count, addr;
> -	struct page *table_pages;
> -	u64 tce_order = 28; /* 256MB TCEs */
> -	__be64 *tces;
> +	u64 window_size;
>  	s64 rc;
> +	struct iommu_table *tbl, *oldtbl = NULL;
> +	unsigned long shift, offset;
>  
>  	/*
>  	 * Window size needs to be a power of two, but needs to account for
>  	 * shifting memory by the 4GB offset required to skip 32bit space.
>  	 */
> -	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
> -	tce_count = window_size >> tce_order;
> -	table_size = tce_count << 3;
> -
> -	if (table_size < PAGE_SIZE)
> -		table_size = PAGE_SIZE;
> +	window_size = roundup_pow_of_two(memory_hotplug_max() + SZ_4G);
> +	shift = ilog2(pnv_ioda_parse_tce_sizes(pe->phb));
> +	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, shift, window_size,
> +			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
> +	if (rc) {
> +		pe_err(pe, "Failed to create 64-bypass TCE table, err %ld", rc);
> +		return rc;
> +	}
>  
> -	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
> -				       get_order(table_size));
> -	if (!table_pages)
> +	offset = SZ_4G >> shift;
> +	rc = tbl->it_ops->set(tbl, offset, tbl->it_size - offset,
> +			0 /* uaddr */, DMA_BIDIRECTIONAL, 0 /* attrs */);
> +	if (rc)
>  		goto err;
>  
> -	tces = page_address(table_pages);
> -	if (!tces)
> +	if (pe->table_group.tables[0]) {
> +		oldtbl = pe->table_group.tables[0];
> +		pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> +	}
> +
> +	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
> +	if (rc != OPAL_SUCCESS) {
> +		rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, oldtbl);
>  		goto err;
> +	}
>  
> -	memset(tces, 0, table_size);
> +	if (oldtbl)
> +		iommu_tce_table_put(oldtbl);
>  
> -	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
> -		tces[(addr + (1ULL << 32)) >> tce_order] =
> -			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
> -	}
> +	pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
> +	return 0;
>  
> -	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
> -					pe->pe_number,
> -					/* reconfigure window 0 */
> -					(pe->pe_number << 1) + 0,
> -					1,
> -					__pa(tces),
> -					table_size,
> -					1 << tce_order);
> -	if (rc == OPAL_SUCCESS) {
> -		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
> -		return 0;
> -	}
>  err:
> +	iommu_tce_table_put(tbl);
> +
>  	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
>  	return -EIO;
>  }

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9
  2018-06-01  8:10 ` [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9 Alexey Kardashevskiy
@ 2018-06-16  1:05   ` Benjamin Herrenschmidt
  2018-06-18  2:13     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 10+ messages in thread
From: Benjamin Herrenschmidt @ 2018-06-16  1:05 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev; +Cc: Alistair Popple, Russell Currey

On Fri, 2018-06-01 at 18:10 +1000, Alexey Kardashevskiy wrote:
> These are found in POWER9 chips. Right now these PHBs have unknown type
> so changing it to PHB4 won't make much of a difference except enabling
> sketchy bypass for POWER9 as this does below.

And that will break on multi-chip systems since P9 doesn't have the
memory contiguous (it has the chip ID in the top bits).

Russell is working on a different implementation that should be much
more imune to the system physical memory layout.

> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/platforms/powernv/pci.h      | 1 +
>  arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++++-
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index eada4b6..1408247 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -23,6 +23,7 @@ enum pnv_phb_model {
>  	PNV_PHB_MODEL_UNKNOWN,
>  	PNV_PHB_MODEL_P7IOC,
>  	PNV_PHB_MODEL_PHB3,
> +	PNV_PHB_MODEL_PHB4,
>  	PNV_PHB_MODEL_NPU,
>  	PNV_PHB_MODEL_NPU2,
>  };
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 9239142..66c2804 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1882,7 +1882,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
>  		if (dma_mask >> 32 &&
>  		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
>  		    pnv_pci_ioda_pe_single_vendor(pe) &&
> -		    phb->model == PNV_PHB_MODEL_PHB3) {
> +		    (phb->model == PNV_PHB_MODEL_PHB3 ||
> +		     phb->model == PNV_PHB_MODEL_PHB4)) {
>  			/* Configure the bypass mode */
>  			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
>  			if (rc)
> @@ -3930,6 +3931,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>  		phb->model = PNV_PHB_MODEL_P7IOC;
>  	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
>  		phb->model = PNV_PHB_MODEL_PHB3;
> +	else if (of_device_is_compatible(np, "ibm,power9-pciex"))
> +		phb->model = PNV_PHB_MODEL_PHB4;
>  	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
>  		phb->model = PNV_PHB_MODEL_NPU;
>  	else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9
  2018-06-16  1:05   ` Benjamin Herrenschmidt
@ 2018-06-18  2:13     ` Alexey Kardashevskiy
  2018-06-18  4:44       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 10+ messages in thread
From: Alexey Kardashevskiy @ 2018-06-18  2:13 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Alistair Popple, Russell Currey

On Sat, 16 Jun 2018 11:05:19 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Fri, 2018-06-01 at 18:10 +1000, Alexey Kardashevskiy wrote:
> > These are found in POWER9 chips. Right now these PHBs have unknown type
> > so changing it to PHB4 won't make much of a difference except enabling
> > sketchy bypass for POWER9 as this does below.  
> 
> And that will break on multi-chip systems since P9 doesn't have the
> memory contiguous (it has the chip ID in the top bits).


This did not break mine and it is hard to see why it would break at all
if we use 1G pages and the maximum we need to cover is 48 bits (this
is what we are trying to support here - all these gpus, right?), or is
it more now? If so, I have posted v2 of tce multilevel dynamic
allocation which helps with enormous tce tables.



> 
> Russell is working on a different implementation that should be much
> more imune to the system physical memory layout.
> 
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> >  arch/powerpc/platforms/powernv/pci.h      | 1 +
> >  arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++++-
> >  2 files changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> > index eada4b6..1408247 100644
> > --- a/arch/powerpc/platforms/powernv/pci.h
> > +++ b/arch/powerpc/platforms/powernv/pci.h
> > @@ -23,6 +23,7 @@ enum pnv_phb_model {
> >  	PNV_PHB_MODEL_UNKNOWN,
> >  	PNV_PHB_MODEL_P7IOC,
> >  	PNV_PHB_MODEL_PHB3,
> > +	PNV_PHB_MODEL_PHB4,
> >  	PNV_PHB_MODEL_NPU,
> >  	PNV_PHB_MODEL_NPU2,
> >  };
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index 9239142..66c2804 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -1882,7 +1882,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
> >  		if (dma_mask >> 32 &&
> >  		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
> >  		    pnv_pci_ioda_pe_single_vendor(pe) &&
> > -		    phb->model == PNV_PHB_MODEL_PHB3) {
> > +		    (phb->model == PNV_PHB_MODEL_PHB3 ||
> > +		     phb->model == PNV_PHB_MODEL_PHB4)) {
> >  			/* Configure the bypass mode */
> >  			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
> >  			if (rc)
> > @@ -3930,6 +3931,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
> >  		phb->model = PNV_PHB_MODEL_P7IOC;
> >  	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
> >  		phb->model = PNV_PHB_MODEL_PHB3;
> > +	else if (of_device_is_compatible(np, "ibm,power9-pciex"))
> > +		phb->model = PNV_PHB_MODEL_PHB4;
> >  	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
> >  		phb->model = PNV_PHB_MODEL_NPU;
> >  	else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))  



--
Alexey

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9
  2018-06-18  2:13     ` Alexey Kardashevskiy
@ 2018-06-18  4:44       ` Benjamin Herrenschmidt
  2018-06-18  7:20         ` Alexey Kardashevskiy
  0 siblings, 1 reply; 10+ messages in thread
From: Benjamin Herrenschmidt @ 2018-06-18  4:44 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, Alistair Popple, Russell Currey

On Mon, 2018-06-18 at 12:13 +1000, Alexey Kardashevskiy wrote:
> On Sat, 16 Jun 2018 11:05:19 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Fri, 2018-06-01 at 18:10 +1000, Alexey Kardashevskiy wrote:
> > > These are found in POWER9 chips. Right now these PHBs have unknown type
> > > so changing it to PHB4 won't make much of a difference except enabling
> > > sketchy bypass for POWER9 as this does below.  
> > 
> > And that will break on multi-chip systems since P9 doesn't have the
> > memory contiguous (it has the chip ID in the top bits).
> 
> 
> This did not break mine and it is hard to see why it would break at all
> if we use 1G pages and the maximum we need to cover is 48 bits (this
> is what we are trying to support here - all these gpus, right?), or is
> it more now? If so, I have posted v2 of tce multilevel dynamic
> allocation which helps with enormous tce tables.

The whole point of sketchy bypass is to deal with devices with small
amount of DMA bits... most Radeon's have 40 for example. So that won't
work terribly well.

Cheers,
Ben.

> 
> 
> > 
> > Russell is working on a different implementation that should be much
> > more imune to the system physical memory layout.
> > 
> > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > ---
> > >  arch/powerpc/platforms/powernv/pci.h      | 1 +
> > >  arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++++-
> > >  2 files changed, 5 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> > > index eada4b6..1408247 100644
> > > --- a/arch/powerpc/platforms/powernv/pci.h
> > > +++ b/arch/powerpc/platforms/powernv/pci.h
> > > @@ -23,6 +23,7 @@ enum pnv_phb_model {
> > >  	PNV_PHB_MODEL_UNKNOWN,
> > >  	PNV_PHB_MODEL_P7IOC,
> > >  	PNV_PHB_MODEL_PHB3,
> > > +	PNV_PHB_MODEL_PHB4,
> > >  	PNV_PHB_MODEL_NPU,
> > >  	PNV_PHB_MODEL_NPU2,
> > >  };
> > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> > > index 9239142..66c2804 100644
> > > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > > @@ -1882,7 +1882,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
> > >  		if (dma_mask >> 32 &&
> > >  		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
> > >  		    pnv_pci_ioda_pe_single_vendor(pe) &&
> > > -		    phb->model == PNV_PHB_MODEL_PHB3) {
> > > +		    (phb->model == PNV_PHB_MODEL_PHB3 ||
> > > +		     phb->model == PNV_PHB_MODEL_PHB4)) {
> > >  			/* Configure the bypass mode */
> > >  			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
> > >  			if (rc)
> > > @@ -3930,6 +3931,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
> > >  		phb->model = PNV_PHB_MODEL_P7IOC;
> > >  	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
> > >  		phb->model = PNV_PHB_MODEL_PHB3;
> > > +	else if (of_device_is_compatible(np, "ibm,power9-pciex"))
> > > +		phb->model = PNV_PHB_MODEL_PHB4;
> > >  	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
> > >  		phb->model = PNV_PHB_MODEL_NPU;
> > >  	else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))  
> 
> 
> 
> --
> Alexey

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9
  2018-06-18  4:44       ` Benjamin Herrenschmidt
@ 2018-06-18  7:20         ` Alexey Kardashevskiy
  0 siblings, 0 replies; 10+ messages in thread
From: Alexey Kardashevskiy @ 2018-06-18  7:20 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Alistair Popple, Russell Currey

On Mon, 18 Jun 2018 14:44:56 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Mon, 2018-06-18 at 12:13 +1000, Alexey Kardashevskiy wrote:
> > On Sat, 16 Jun 2018 11:05:19 +1000
> > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> >   
> > > On Fri, 2018-06-01 at 18:10 +1000, Alexey Kardashevskiy wrote:  
> > > > These are found in POWER9 chips. Right now these PHBs have unknown type
> > > > so changing it to PHB4 won't make much of a difference except enabling
> > > > sketchy bypass for POWER9 as this does below.    
> > > 
> > > And that will break on multi-chip systems since P9 doesn't have the
> > > memory contiguous (it has the chip ID in the top bits).  
> > 
> > 
> > This did not break mine and it is hard to see why it would break at all
> > if we use 1G pages and the maximum we need to cover is 48 bits (this
> > is what we are trying to support here - all these gpus, right?), or is
> > it more now? If so, I have posted v2 of tce multilevel dynamic
> > allocation which helps with enormous tce tables.  
> 
> The whole point of sketchy bypass is to deal with devices with small
> amount of DMA bits... most Radeon's have 40 for example. So that won't
> work terribly well.


40 with 1GB pages needs 40-30=10 bits i.e. (1<<10)*8 = 8KB for the
entire TCE table. I am definitely missing something here, what is it?



> 
> Cheers,
> Ben.
> 
> > 
> >   
> > > 
> > > Russell is working on a different implementation that should be much
> > > more imune to the system physical memory layout.
> > >   
> > > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > > ---
> > > >  arch/powerpc/platforms/powernv/pci.h      | 1 +
> > > >  arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++++-
> > > >  2 files changed, 5 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> > > > index eada4b6..1408247 100644
> > > > --- a/arch/powerpc/platforms/powernv/pci.h
> > > > +++ b/arch/powerpc/platforms/powernv/pci.h
> > > > @@ -23,6 +23,7 @@ enum pnv_phb_model {
> > > >  	PNV_PHB_MODEL_UNKNOWN,
> > > >  	PNV_PHB_MODEL_P7IOC,
> > > >  	PNV_PHB_MODEL_PHB3,
> > > > +	PNV_PHB_MODEL_PHB4,
> > > >  	PNV_PHB_MODEL_NPU,
> > > >  	PNV_PHB_MODEL_NPU2,
> > > >  };
> > > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> > > > index 9239142..66c2804 100644
> > > > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > > > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > > > @@ -1882,7 +1882,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
> > > >  		if (dma_mask >> 32 &&
> > > >  		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
> > > >  		    pnv_pci_ioda_pe_single_vendor(pe) &&
> > > > -		    phb->model == PNV_PHB_MODEL_PHB3) {
> > > > +		    (phb->model == PNV_PHB_MODEL_PHB3 ||
> > > > +		     phb->model == PNV_PHB_MODEL_PHB4)) {
> > > >  			/* Configure the bypass mode */
> > > >  			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
> > > >  			if (rc)
> > > > @@ -3930,6 +3931,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
> > > >  		phb->model = PNV_PHB_MODEL_P7IOC;
> > > >  	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
> > > >  		phb->model = PNV_PHB_MODEL_PHB3;
> > > > +	else if (of_device_is_compatible(np, "ibm,power9-pciex"))
> > > > +		phb->model = PNV_PHB_MODEL_PHB4;
> > > >  	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
> > > >  		phb->model = PNV_PHB_MODEL_NPU;
> > > >  	else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))    
> > 
> > 
> > 
> > --
> > Alexey  



--
Alexey

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for sketchy bypass
  2018-06-16  1:04   ` Benjamin Herrenschmidt
@ 2018-07-02  8:50     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 10+ messages in thread
From: Alexey Kardashevskiy @ 2018-07-02  8:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Alistair Popple, Russell Currey

On Sat, 16 Jun 2018 11:04:32 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Fri, 2018-06-01 at 18:10 +1000, Alexey Kardashevskiy wrote:
> > The existing sketchy bypass ignores the existing default 32bit TCE table
> > (created by default for every PE at boot time or after being used by
> > VFIO) and it allocates another table instead without updating PE DMA
> > config (pe->table_group). So if we decide to use such device for VFIO
> > later, this new table will also leak memory.
> > 
> > This replaces adhoc table allocation and programming with the existing
> > API which handles memory leaks.
> > 
> > This programs the default 32bit table back to TVE#0 if configuring
> > the new table failed for some reason.
> > 
> > While we are at it, switch from the hardcoded 256MB TCEs to the biggest
> > size supported by the hardware and reported by the firmware. This allows
> > the sketchy bypass (originally made for POWER8 only) to work on POWER9
> > too assuming that PHB4 type is defined and pnv_pci_ioda_dma_64bit_bypass()
> > is called (coming next).  
> 
> It won't work on POWER9 for other reasons. Mostly memory isn't
> contiguous there.


This simply allocates a table using the existing API which makes
things more accurate and manageable. "[PATCH 0/3] PCI DMA pseudo-bypass
for powernv" still allocates a table and programs it via OPAL, why does
it have to replicate all the helpers I put there? Do they all suck and
the new ones are better? If so, how exactly are they better? And let's
fix them then.



> 
> What we shuld look into rather than removing the 32-bit table is
> exploit DD2 P9 feature allowing to overlay TVE0 and TVE1 so that 0..4G
> is in control of TVE0 and the rest goes to TVE1.
>  
> > This does not call iommu_init_table() for the new table as the caller
> > will use &dma_nommu_ops and therefore ::it_map is not needed.
> > 
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> > 
> > Tested with:
> >  	if (pe->tce_bypass_enabled) {
> >  		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
> > -		bypass = (dma_mask >= top);
> > +		bypass = false;//(dma_mask >= top);
> >  	}
> > ---
> >  arch/powerpc/platforms/powernv/pci-ioda.c | 71 +++++++++++++++++--------------
> >  1 file changed, 39 insertions(+), 32 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index ceb7e64..9239142 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -1791,54 +1791,61 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
> >   *
> >   * Currently this will only work on PHB3 (POWER8).
> >   */
> > +static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
> > +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> > +		struct iommu_table **ptbl);
> > +
> > +static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
> > +		int num, struct iommu_table *tbl);
> > +
> > +static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
> > +
> >  static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
> >  {
> > -	u64 window_size, table_size, tce_count, addr;
> > -	struct page *table_pages;
> > -	u64 tce_order = 28; /* 256MB TCEs */
> > -	__be64 *tces;
> > +	u64 window_size;
> >  	s64 rc;
> > +	struct iommu_table *tbl, *oldtbl = NULL;
> > +	unsigned long shift, offset;
> >  
> >  	/*
> >  	 * Window size needs to be a power of two, but needs to account for
> >  	 * shifting memory by the 4GB offset required to skip 32bit space.
> >  	 */
> > -	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
> > -	tce_count = window_size >> tce_order;
> > -	table_size = tce_count << 3;
> > -
> > -	if (table_size < PAGE_SIZE)
> > -		table_size = PAGE_SIZE;
> > +	window_size = roundup_pow_of_two(memory_hotplug_max() + SZ_4G);
> > +	shift = ilog2(pnv_ioda_parse_tce_sizes(pe->phb));
> > +	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, shift, window_size,
> > +			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
> > +	if (rc) {
> > +		pe_err(pe, "Failed to create 64-bypass TCE table, err %ld", rc);
> > +		return rc;
> > +	}
> >  
> > -	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
> > -				       get_order(table_size));
> > -	if (!table_pages)
> > +	offset = SZ_4G >> shift;
> > +	rc = tbl->it_ops->set(tbl, offset, tbl->it_size - offset,
> > +			0 /* uaddr */, DMA_BIDIRECTIONAL, 0 /* attrs */);
> > +	if (rc)
> >  		goto err;
> >  
> > -	tces = page_address(table_pages);
> > -	if (!tces)
> > +	if (pe->table_group.tables[0]) {
> > +		oldtbl = pe->table_group.tables[0];
> > +		pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> > +	}
> > +
> > +	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
> > +	if (rc != OPAL_SUCCESS) {
> > +		rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, oldtbl);
> >  		goto err;
> > +	}
> >  
> > -	memset(tces, 0, table_size);
> > +	if (oldtbl)
> > +		iommu_tce_table_put(oldtbl);
> >  
> > -	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
> > -		tces[(addr + (1ULL << 32)) >> tce_order] =
> > -			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
> > -	}
> > +	pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
> > +	return 0;
> >  
> > -	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
> > -					pe->pe_number,
> > -					/* reconfigure window 0 */
> > -					(pe->pe_number << 1) + 0,
> > -					1,
> > -					__pa(tces),
> > -					table_size,
> > -					1 << tce_order);
> > -	if (rc == OPAL_SUCCESS) {
> > -		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
> > -		return 0;
> > -	}
> >  err:
> > +	iommu_tce_table_put(tbl);
> > +
> >  	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
> >  	return -EIO;
> >  }  



--
Alexey

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2018-07-02  8:50 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-06-01  8:10 [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass Alexey Kardashevskiy
2018-06-01  8:10 ` [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for " Alexey Kardashevskiy
2018-06-16  1:04   ` Benjamin Herrenschmidt
2018-07-02  8:50     ` Alexey Kardashevskiy
2018-06-01  8:10 ` [PATCH kernel 2/2] powerpc/powernv: Define PHB4 type and enable sketchy bypass on POWER9 Alexey Kardashevskiy
2018-06-16  1:05   ` Benjamin Herrenschmidt
2018-06-18  2:13     ` Alexey Kardashevskiy
2018-06-18  4:44       ` Benjamin Herrenschmidt
2018-06-18  7:20         ` Alexey Kardashevskiy
2018-06-15  9:01 ` [PATCH kernel 0/2] powerpc/powernv: Rework sketchy bypass Alexey Kardashevskiy

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.