linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/4] powerpc: Dynamic DMA zone limits
@ 2014-08-08 23:40 Scott Wood
  2014-08-08 23:40 ` [PATCH 2/4] powerpc/64: Honor swiotlb limit in coherent allocations Scott Wood
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Scott Wood @ 2014-08-08 23:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, linuxppc-dev, Shaohui Xie

Platform code can call limit_zone_pfn() to set appropriate limits
for ZONE_DMA and ZONE_DMA32, and dma_direct_alloc_coherent() will
select a suitable zone based on a device's mask and the pfn limits that
platform code has configured.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Cc: Shaohui Xie <Shaohui.Xie@freescale.com>
---
 arch/powerpc/Kconfig               |  4 +++
 arch/powerpc/include/asm/pgtable.h |  3 ++
 arch/powerpc/kernel/dma.c          | 20 +++++++++++++
 arch/powerpc/mm/mem.c              | 61 ++++++++++++++++++++++++++++++++++----
 4 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 80b94b0..56dc47a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -286,6 +286,10 @@ config PPC_EMULATE_SSTEP
 	bool
 	default y if KPROBES || UPROBES || XMON || HAVE_HW_BREAKPOINT
 
+config ZONE_DMA32
+	bool
+	default y if PPC64
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index d98c1ec..6d74167 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -4,6 +4,7 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/mmdebug.h>
+#include <linux/mmzone.h>
 #include <asm/processor.h>		/* For TASK_SIZE */
 #include <asm/mmu.h>
 #include <asm/page.h>
@@ -281,6 +282,8 @@ extern unsigned long empty_zero_page[];
 
 extern pgd_t swapper_pg_dir[];
 
+void limit_zone_pfn(enum zone_type zone, unsigned long max_pfn);
+int dma_pfn_limit_to_zone(u64 pfn_limit);
 extern void paging_init(void);
 
 /*
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index ee78f6e..dfd99ef 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -40,6 +40,26 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
 #else
 	struct page *page;
 	int node = dev_to_node(dev);
+	u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
+	int zone;
+
+	zone = dma_pfn_limit_to_zone(pfn);
+	if (zone < 0) {
+		dev_err(dev, "%s: No suitable zone for pfn %#llx\n",
+			__func__, pfn);
+		return NULL;
+	}
+
+	switch (zone) {
+	case ZONE_DMA:
+		flag |= GFP_DMA;
+		break;
+#ifdef CONFIG_ZONE_DMA32
+	case ZONE_DMA32:
+		flag |= GFP_DMA32;
+		break;
+#endif
+	};
 
 	/* ignore region specifiers */
 	flag  &= ~(__GFP_HIGHMEM);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index e0f7a18..3b23e17 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -261,6 +261,54 @@ static int __init mark_nonram_nosave(void)
 	return 0;
 }
 
+static bool zone_limits_final;
+
+static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
+	[0 ... MAX_NR_ZONES - 1] = ~0UL
+};
+
+/*
+ * Restrict the specified zone and all more restrictive zones
+ * to be below the specified pfn.  May not be called after
+ * paging_init().
+ */
+void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit)
+{
+	int i;
+
+	if (WARN_ON(zone_limits_final))
+		return;
+
+	for (i = zone; i >= 0; i--) {
+		if (max_zone_pfns[i] > pfn_limit)
+			max_zone_pfns[i] = pfn_limit;
+	}
+}
+
+/*
+ * Find the least restrictive zone that is entirely below the
+ * specified pfn limit.  Returns < 0 if no suitable zone is found.
+ *
+ * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit
+ * systems -- the DMA limit can be higher than any possible real pfn.
+ */
+int dma_pfn_limit_to_zone(u64 pfn_limit)
+{
+	enum zone_type top_zone = ZONE_NORMAL;
+	int i;
+
+#ifdef CONFIG_HIGHMEM
+	top_zone = ZONE_HIGHMEM;
+#endif
+
+	for (i = top_zone; i >= 0; i--) {
+		if (max_zone_pfns[i] <= pfn_limit)
+			return i;
+	}
+
+	return -EPERM;
+}
+
 /*
  * paging_init() sets up the page tables - in fact we've already done this.
  */
@@ -268,7 +316,7 @@ void __init paging_init(void)
 {
 	unsigned long long total_ram = memblock_phys_mem_size();
 	phys_addr_t top_of_ram = memblock_end_of_DRAM();
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
+	enum zone_type top_zone;
 
 #ifdef CONFIG_PPC32
 	unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
@@ -290,13 +338,16 @@ void __init paging_init(void)
 	       (unsigned long long)top_of_ram, total_ram);
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (long int)((top_of_ram - total_ram) >> 20));
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
 #ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
-	max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
+	top_zone = ZONE_HIGHMEM;
+	limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
 #else
-	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
+	top_zone = ZONE_NORMAL;
 #endif
+
+	limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
+	zone_limits_final = true;
 	free_area_init_nodes(max_zone_pfns);
 
 	mark_nonram_nosave();
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/4] powerpc/64: Honor swiotlb limit in coherent allocations
  2014-08-08 23:40 [PATCH 1/4] powerpc: Dynamic DMA zone limits Scott Wood
@ 2014-08-08 23:40 ` Scott Wood
  2014-08-08 23:40 ` [PATCH 3/4] powerpc/64: Limit ZONE_DMA32 to 4GiB in swiotlb_detect_4g() Scott Wood
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Scott Wood @ 2014-08-08 23:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, linuxppc-dev, Shaohui Xie

FSL PCI cannot directly address the whole lower 4 GiB due to
conflicts with PCICSRBAR and outbound windows, and thus
max_direct_dma_addr is less than 4GiB.  Honor that limit in
dma_direct_alloc_coherent().

Note that setting the DMA mask to 31 bits is not an option, since many
PCI drivers would fail if we reject 32-bit DMA in dma_supported(), and
we have no control over the setting of coherent_dma_mask if
dma_supported() returns true.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Cc: Shaohui Xie <Shaohui.Xie@freescale.com>
---
 arch/powerpc/kernel/dma.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index dfd99ef..a7b0156 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -15,6 +15,7 @@
 #include <asm/vio.h>
 #include <asm/bug.h>
 #include <asm/machdep.h>
+#include <asm/swiotlb.h>
 
 /*
  * Generic direct DMA implementation
@@ -25,6 +26,18 @@
  * default the offset is PCI_DRAM_OFFSET.
  */
 
+static u64 __maybe_unused get_pfn_limit(struct device *dev)
+{
+	u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
+	struct dev_archdata __maybe_unused *sd = &dev->archdata;
+
+#ifdef CONFIG_SWIOTLB
+	if (sd->max_direct_dma_addr && sd->dma_ops == &swiotlb_dma_ops)
+		pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT);
+#endif
+
+	return pfn;
+}
 
 void *dma_direct_alloc_coherent(struct device *dev, size_t size,
 				dma_addr_t *dma_handle, gfp_t flag,
@@ -40,7 +53,7 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
 #else
 	struct page *page;
 	int node = dev_to_node(dev);
-	u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
+	u64 pfn = get_pfn_limit(dev);
 	int zone;
 
 	zone = dma_pfn_limit_to_zone(pfn);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/4] powerpc/64: Limit ZONE_DMA32 to 4GiB in swiotlb_detect_4g()
  2014-08-08 23:40 [PATCH 1/4] powerpc: Dynamic DMA zone limits Scott Wood
  2014-08-08 23:40 ` [PATCH 2/4] powerpc/64: Honor swiotlb limit in coherent allocations Scott Wood
@ 2014-08-08 23:40 ` Scott Wood
  2014-08-08 23:40 ` [PATCH 4/4] powerpc/fsl-pci: Limit ZONE_DMA32 to 2GiB on 64-bit platforms Scott Wood
  2014-10-13  7:14 ` [PATCH 1/4] powerpc: Dynamic DMA zone limits Anton Blanchard
  3 siblings, 0 replies; 10+ messages in thread
From: Scott Wood @ 2014-08-08 23:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, linuxppc-dev, Shaohui Xie

A DMA zone is still needed with swiotlb, for coherent allocations.
This doesn't affect platforms that don't use swiotlb or that don't call
swiotlb_detect_4g().

Signed-off-by: Scott Wood <scottwood@freescale.com>
Cc: Shaohui Xie <Shaohui.Xie@freescale.com>
---
 arch/powerpc/kernel/dma-swiotlb.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index bd1a2ab..7359797 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -106,10 +106,14 @@ int __init swiotlb_setup_bus_notifier(void)
 	return 0;
 }
 
-void swiotlb_detect_4g(void)
+void __init swiotlb_detect_4g(void)
 {
-	if ((memblock_end_of_DRAM() - 1) > 0xffffffff)
+	if ((memblock_end_of_DRAM() - 1) > 0xffffffff) {
 		ppc_swiotlb_enable = 1;
+#ifdef CONFIG_ZONE_DMA32
+		limit_zone_pfn(ZONE_DMA32, (1ULL << 32) >> PAGE_SHIFT);
+#endif
+	}
 }
 
 static int __init swiotlb_late_init(void)
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 4/4] powerpc/fsl-pci: Limit ZONE_DMA32 to 2GiB on 64-bit platforms
  2014-08-08 23:40 [PATCH 1/4] powerpc: Dynamic DMA zone limits Scott Wood
  2014-08-08 23:40 ` [PATCH 2/4] powerpc/64: Honor swiotlb limit in coherent allocations Scott Wood
  2014-08-08 23:40 ` [PATCH 3/4] powerpc/64: Limit ZONE_DMA32 to 4GiB in swiotlb_detect_4g() Scott Wood
@ 2014-08-08 23:40 ` Scott Wood
  2014-10-13  7:14 ` [PATCH 1/4] powerpc: Dynamic DMA zone limits Anton Blanchard
  3 siblings, 0 replies; 10+ messages in thread
From: Scott Wood @ 2014-08-08 23:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, linuxppc-dev, Shaohui Xie

FSL PCI cannot directly address the whole lower 4 GiB due to
conflicts with PCICSRBAR and outbound windows.  By the time
max_direct_dma_addr is set to the precise limit, it will be too late to
alter the zone limits, but we should always have at least 2 GiB mapped
(unless RAM is smaller than that).

Signed-off-by: Scott Wood <scottwood@freescale.com>
Cc: Shaohui Xie <Shaohui.Xie@freescale.com>
---
 arch/powerpc/platforms/85xx/corenet_generic.c | 11 +++++++++++
 arch/powerpc/platforms/85xx/qemu_e500.c       | 10 ++++++++++
 2 files changed, 21 insertions(+)

diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index d22dd85..c2adb00 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -20,6 +20,7 @@
 #include <asm/time.h>
 #include <asm/machdep.h>
 #include <asm/pci-bridge.h>
+#include <asm/pgtable.h>
 #include <asm/ppc-pci.h>
 #include <mm/mmu_decl.h>
 #include <asm/prom.h>
@@ -67,6 +68,16 @@ void __init corenet_gen_setup_arch(void)
 
 	swiotlb_detect_4g();
 
+#if defined(CONFIG_FSL_PCI) && defined(CONFIG_ZONE_DMA32)
+	/*
+	 * Inbound windows don't cover the full lower 4 GiB
+	 * due to conflicts with PCICSRBAR and outbound windows,
+	 * so limit the DMA32 zone to 2 GiB, to allow consistent
+	 * allocations to succeed.
+	 */
+	limit_zone_pfn(ZONE_DMA32, 1UL << (31 - PAGE_SHIFT));
+#endif
+
 	pr_info("%s board\n", ppc_md.name);
 
 	mpc85xx_qe_init();
diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c b/arch/powerpc/platforms/85xx/qemu_e500.c
index 7f26732..8ad2fe6 100644
--- a/arch/powerpc/platforms/85xx/qemu_e500.c
+++ b/arch/powerpc/platforms/85xx/qemu_e500.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/of_fdt.h>
 #include <asm/machdep.h>
+#include <asm/pgtable.h>
 #include <asm/time.h>
 #include <asm/udbg.h>
 #include <asm/mpic.h>
@@ -44,6 +45,15 @@ static void __init qemu_e500_setup_arch(void)
 
 	fsl_pci_assign_primary();
 	swiotlb_detect_4g();
+#if defined(CONFIG_FSL_PCI) && defined(CONFIG_ZONE_DMA32)
+	/*
+	 * Inbound windows don't cover the full lower 4 GiB
+	 * due to conflicts with PCICSRBAR and outbound windows,
+	 * so limit the DMA32 zone to 2 GiB, to allow consistent
+	 * allocations to succeed.
+	 */
+	limit_zone_pfn(ZONE_DMA32, 1UL << (31 - PAGE_SHIFT));
+#endif
 	mpc85xx_smp_init();
 }
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] powerpc: Dynamic DMA zone limits
  2014-08-08 23:40 [PATCH 1/4] powerpc: Dynamic DMA zone limits Scott Wood
                   ` (2 preceding siblings ...)
  2014-08-08 23:40 ` [PATCH 4/4] powerpc/fsl-pci: Limit ZONE_DMA32 to 2GiB on 64-bit platforms Scott Wood
@ 2014-10-13  7:14 ` Anton Blanchard
  2014-10-13  7:30   ` Benjamin Herrenschmidt
  2014-10-13  9:00   ` Michael Ellerman
  3 siblings, 2 replies; 10+ messages in thread
From: Anton Blanchard @ 2014-10-13  7:14 UTC (permalink / raw)
  To: Scott Wood, Benjamin Herrenschmidt, Shaohui Xie, Michael Ellerman
  Cc: linuxppc-dev

Hi Scott,

> Platform code can call limit_zone_pfn() to set appropriate limits
> for ZONE_DMA and ZONE_DMA32, and dma_direct_alloc_coherent() will
> select a suitable zone based on a device's mask and the pfn limits
> that platform code has configured.

This patch breaks my POWER8 box:

ipr 0001:08:00.0: Using 64-bit DMA iommu bypass
ipr 0001:08:00.0: dma_direct_alloc_coherent: No suitable zone for pfn 0x10000
ipr 0001:08:00.0: Couldn't allocate enough memory for device driver! 
ipr: probe of 0001:08:00.0 failed with error -12

ipr isn't setting a coherent mask, but we shouldn't care on these boxes.
Could we ignore the coherent mask or copy the dma mask to it?

Anton
--

> Signed-off-by: Scott Wood <scottwood@freescale.com>
> Cc: Shaohui Xie <Shaohui.Xie@freescale.com>
> ---
>  arch/powerpc/Kconfig               |  4 +++
>  arch/powerpc/include/asm/pgtable.h |  3 ++
>  arch/powerpc/kernel/dma.c          | 20 +++++++++++++
>  arch/powerpc/mm/mem.c              | 61
> ++++++++++++++++++++++++++++++++++---- 4 files changed, 83
> insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 80b94b0..56dc47a 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -286,6 +286,10 @@ config PPC_EMULATE_SSTEP
>  	bool
>  	default y if KPROBES || UPROBES || XMON || HAVE_HW_BREAKPOINT
>  
> +config ZONE_DMA32
> +	bool
> +	default y if PPC64
> +
>  source "init/Kconfig"
>  
>  source "kernel/Kconfig.freezer"
> diff --git a/arch/powerpc/include/asm/pgtable.h
> b/arch/powerpc/include/asm/pgtable.h index d98c1ec..6d74167 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -4,6 +4,7 @@
>  
>  #ifndef __ASSEMBLY__
>  #include <linux/mmdebug.h>
> +#include <linux/mmzone.h>
>  #include <asm/processor.h>		/* For TASK_SIZE */
>  #include <asm/mmu.h>
>  #include <asm/page.h>
> @@ -281,6 +282,8 @@ extern unsigned long empty_zero_page[];
>  
>  extern pgd_t swapper_pg_dir[];
>  
> +void limit_zone_pfn(enum zone_type zone, unsigned long max_pfn);
> +int dma_pfn_limit_to_zone(u64 pfn_limit);
>  extern void paging_init(void);
>  
>  /*
> diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
> index ee78f6e..dfd99ef 100644
> --- a/arch/powerpc/kernel/dma.c
> +++ b/arch/powerpc/kernel/dma.c
> @@ -40,6 +40,26 @@ void *dma_direct_alloc_coherent(struct device
> *dev, size_t size, #else
>  	struct page *page;
>  	int node = dev_to_node(dev);
> +	u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
> +	int zone;
> +
> +	zone = dma_pfn_limit_to_zone(pfn);
> +	if (zone < 0) {
> +		dev_err(dev, "%s: No suitable zone for pfn %#llx\n",
> +			__func__, pfn);
> +		return NULL;
> +	}
> +
> +	switch (zone) {
> +	case ZONE_DMA:
> +		flag |= GFP_DMA;
> +		break;
> +#ifdef CONFIG_ZONE_DMA32
> +	case ZONE_DMA32:
> +		flag |= GFP_DMA32;
> +		break;
> +#endif
> +	};
>  
>  	/* ignore region specifiers */
>  	flag  &= ~(__GFP_HIGHMEM);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index e0f7a18..3b23e17 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -261,6 +261,54 @@ static int __init mark_nonram_nosave(void)
>  	return 0;
>  }
>  
> +static bool zone_limits_final;
> +
> +static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
> +	[0 ... MAX_NR_ZONES - 1] = ~0UL
> +};
> +
> +/*
> + * Restrict the specified zone and all more restrictive zones
> + * to be below the specified pfn.  May not be called after
> + * paging_init().
> + */
> +void __init limit_zone_pfn(enum zone_type zone, unsigned long
> pfn_limit) +{
> +	int i;
> +
> +	if (WARN_ON(zone_limits_final))
> +		return;
> +
> +	for (i = zone; i >= 0; i--) {
> +		if (max_zone_pfns[i] > pfn_limit)
> +			max_zone_pfns[i] = pfn_limit;
> +	}
> +}
> +
> +/*
> + * Find the least restrictive zone that is entirely below the
> + * specified pfn limit.  Returns < 0 if no suitable zone is found.
> + *
> + * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit
> + * systems -- the DMA limit can be higher than any possible real pfn.
> + */
> +int dma_pfn_limit_to_zone(u64 pfn_limit)
> +{
> +	enum zone_type top_zone = ZONE_NORMAL;
> +	int i;
> +
> +#ifdef CONFIG_HIGHMEM
> +	top_zone = ZONE_HIGHMEM;
> +#endif
> +
> +	for (i = top_zone; i >= 0; i--) {
> +		if (max_zone_pfns[i] <= pfn_limit)
> +			return i;
> +	}
> +
> +	return -EPERM;
> +}
> +
>  /*
>   * paging_init() sets up the page tables - in fact we've already
> done this. */
> @@ -268,7 +316,7 @@ void __init paging_init(void)
>  {
>  	unsigned long long total_ram = memblock_phys_mem_size();
>  	phys_addr_t top_of_ram = memblock_end_of_DRAM();
> -	unsigned long max_zone_pfns[MAX_NR_ZONES];
> +	enum zone_type top_zone;
>  
>  #ifdef CONFIG_PPC32
>  	unsigned long v = __fix_to_virt(__end_of_fixed_addresses -
> 1); @@ -290,13 +338,16 @@ void __init paging_init(void)
>  	       (unsigned long long)top_of_ram, total_ram);
>  	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
>  	       (long int)((top_of_ram - total_ram) >> 20));
> -	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
> +
>  #ifdef CONFIG_HIGHMEM
> -	max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
> -	max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
> +	top_zone = ZONE_HIGHMEM;
> +	limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
>  #else
> -	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
> +	top_zone = ZONE_NORMAL;
>  #endif
> +
> +	limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
> +	zone_limits_final = true;
>  	free_area_init_nodes(max_zone_pfns);
>  
>  	mark_nonram_nosave();

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] powerpc: Dynamic DMA zone limits
  2014-10-13  7:14 ` [PATCH 1/4] powerpc: Dynamic DMA zone limits Anton Blanchard
@ 2014-10-13  7:30   ` Benjamin Herrenschmidt
  2014-10-13  9:00   ` Michael Ellerman
  1 sibling, 0 replies; 10+ messages in thread
From: Benjamin Herrenschmidt @ 2014-10-13  7:30 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: Scott Wood, linuxppc-dev, Shaohui Xie

On Mon, 2014-10-13 at 18:14 +1100, Anton Blanchard wrote:
> Hi Scott,
> 
> > Platform code can call limit_zone_pfn() to set appropriate limits
> > for ZONE_DMA and ZONE_DMA32, and dma_direct_alloc_coherent() will
> > select a suitable zone based on a device's mask and the pfn limits
> > that platform code has configured.
> 
> This patch breaks my POWER8 box:
> 
> ipr 0001:08:00.0: Using 64-bit DMA iommu bypass
> ipr 0001:08:00.0: dma_direct_alloc_coherent: No suitable zone for pfn 0x10000
> ipr 0001:08:00.0: Couldn't allocate enough memory for device driver! 
> ipr: probe of 0001:08:00.0 failed with error -12
> 
> ipr isn't setting a coherent mask, but we shouldn't care on these boxes.
> Could we ignore the coherent mask or copy the dma mask to it?

So this depends what the coherent_mask actually means vs. the dma_mask.
I've always been extremely confused by the distinction. Since the
coherent_mask is set by the driver, I assume it represents a driver
limitation on coherent memory which might be *different* from the
restriction on streaming mappings, in which case we might have to honor
it...

The problem is that our whole mechanism for switching dma_ops is based
on having one mask.

So even if we somewhat "fix" IPR, we still have an issue in that we
don't honor the coherent mask properly in case a driver really wants a
different mask.

If we new have two, I think we need to (in the long run that is, for
3.18 we can probably find an ifdef based band-aid):

 - Either have a ppc_md hook for set_coherent_mask along with
dma_set_mask and make the decision to flip based on the AND of both
masks (gross)

 - Or, since that's basically what some of our HW can do, basically make
the decision on a per-hook basis. That is, something like powernv would
no longer need to hook dma_set_mask to switch the ops. Instead, it could
permanently set a set of pnv_dma_ops that for each hook chose the
"right" mask and route the mapping toward either the iommu or the bypass
accordingly.

Both seem like quite a bit of refactoring and the latter would be tricky
for some pseries cases where we actually *remove* the 32-bit window to
establish the 64-bit one (DDW cases).

Any better idea ? Are there any drivers that don't actually have the
same mask for both that we care about ?

Ben.



> Anton
> --
> 
> > Signed-off-by: Scott Wood <scottwood@freescale.com>
> > Cc: Shaohui Xie <Shaohui.Xie@freescale.com>
> > ---
> >  arch/powerpc/Kconfig               |  4 +++
> >  arch/powerpc/include/asm/pgtable.h |  3 ++
> >  arch/powerpc/kernel/dma.c          | 20 +++++++++++++
> >  arch/powerpc/mm/mem.c              | 61
> > ++++++++++++++++++++++++++++++++++---- 4 files changed, 83
> > insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> > index 80b94b0..56dc47a 100644
> > --- a/arch/powerpc/Kconfig
> > +++ b/arch/powerpc/Kconfig
> > @@ -286,6 +286,10 @@ config PPC_EMULATE_SSTEP
> >  	bool
> >  	default y if KPROBES || UPROBES || XMON || HAVE_HW_BREAKPOINT
> >  
> > +config ZONE_DMA32
> > +	bool
> > +	default y if PPC64
> > +
> >  source "init/Kconfig"
> >  
> >  source "kernel/Kconfig.freezer"
> > diff --git a/arch/powerpc/include/asm/pgtable.h
> > b/arch/powerpc/include/asm/pgtable.h index d98c1ec..6d74167 100644
> > --- a/arch/powerpc/include/asm/pgtable.h
> > +++ b/arch/powerpc/include/asm/pgtable.h
> > @@ -4,6 +4,7 @@
> >  
> >  #ifndef __ASSEMBLY__
> >  #include <linux/mmdebug.h>
> > +#include <linux/mmzone.h>
> >  #include <asm/processor.h>		/* For TASK_SIZE */
> >  #include <asm/mmu.h>
> >  #include <asm/page.h>
> > @@ -281,6 +282,8 @@ extern unsigned long empty_zero_page[];
> >  
> >  extern pgd_t swapper_pg_dir[];
> >  
> > +void limit_zone_pfn(enum zone_type zone, unsigned long max_pfn);
> > +int dma_pfn_limit_to_zone(u64 pfn_limit);
> >  extern void paging_init(void);
> >  
> >  /*
> > diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
> > index ee78f6e..dfd99ef 100644
> > --- a/arch/powerpc/kernel/dma.c
> > +++ b/arch/powerpc/kernel/dma.c
> > @@ -40,6 +40,26 @@ void *dma_direct_alloc_coherent(struct device
> > *dev, size_t size, #else
> >  	struct page *page;
> >  	int node = dev_to_node(dev);
> > +	u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
> > +	int zone;
> > +
> > +	zone = dma_pfn_limit_to_zone(pfn);
> > +	if (zone < 0) {
> > +		dev_err(dev, "%s: No suitable zone for pfn %#llx\n",
> > +			__func__, pfn);
> > +		return NULL;
> > +	}
> > +
> > +	switch (zone) {
> > +	case ZONE_DMA:
> > +		flag |= GFP_DMA;
> > +		break;
> > +#ifdef CONFIG_ZONE_DMA32
> > +	case ZONE_DMA32:
> > +		flag |= GFP_DMA32;
> > +		break;
> > +#endif
> > +	};
> >  
> >  	/* ignore region specifiers */
> >  	flag  &= ~(__GFP_HIGHMEM);
> > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> > index e0f7a18..3b23e17 100644
> > --- a/arch/powerpc/mm/mem.c
> > +++ b/arch/powerpc/mm/mem.c
> > @@ -261,6 +261,54 @@ static int __init mark_nonram_nosave(void)
> >  	return 0;
> >  }
> >  
> > +static bool zone_limits_final;
> > +
> > +static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
> > +	[0 ... MAX_NR_ZONES - 1] = ~0UL
> > +};
> > +
> > +/*
> > + * Restrict the specified zone and all more restrictive zones
> > + * to be below the specified pfn.  May not be called after
> > + * paging_init().
> > + */
> > +void __init limit_zone_pfn(enum zone_type zone, unsigned long
> > pfn_limit) +{
> > +	int i;
> > +
> > +	if (WARN_ON(zone_limits_final))
> > +		return;
> > +
> > +	for (i = zone; i >= 0; i--) {
> > +		if (max_zone_pfns[i] > pfn_limit)
> > +			max_zone_pfns[i] = pfn_limit;
> > +	}
> > +}
> > +
> > +/*
> > + * Find the least restrictive zone that is entirely below the
> > + * specified pfn limit.  Returns < 0 if no suitable zone is found.
> > + *
> > + * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit
> > + * systems -- the DMA limit can be higher than any possible real pfn.
> > + */
> > +int dma_pfn_limit_to_zone(u64 pfn_limit)
> > +{
> > +	enum zone_type top_zone = ZONE_NORMAL;
> > +	int i;
> > +
> > +#ifdef CONFIG_HIGHMEM
> > +	top_zone = ZONE_HIGHMEM;
> > +#endif
> > +
> > +	for (i = top_zone; i >= 0; i--) {
> > +		if (max_zone_pfns[i] <= pfn_limit)
> > +			return i;
> > +	}
> > +
> > +	return -EPERM;
> > +}
> > +
> >  /*
> >   * paging_init() sets up the page tables - in fact we've already
> > done this. */
> > @@ -268,7 +316,7 @@ void __init paging_init(void)
> >  {
> >  	unsigned long long total_ram = memblock_phys_mem_size();
> >  	phys_addr_t top_of_ram = memblock_end_of_DRAM();
> > -	unsigned long max_zone_pfns[MAX_NR_ZONES];
> > +	enum zone_type top_zone;
> >  
> >  #ifdef CONFIG_PPC32
> >  	unsigned long v = __fix_to_virt(__end_of_fixed_addresses -
> > 1); @@ -290,13 +338,16 @@ void __init paging_init(void)
> >  	       (unsigned long long)top_of_ram, total_ram);
> >  	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
> >  	       (long int)((top_of_ram - total_ram) >> 20));
> > -	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
> > +
> >  #ifdef CONFIG_HIGHMEM
> > -	max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
> > -	max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
> > +	top_zone = ZONE_HIGHMEM;
> > +	limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
> >  #else
> > -	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
> > +	top_zone = ZONE_NORMAL;
> >  #endif
> > +
> > +	limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
> > +	zone_limits_final = true;
> >  	free_area_init_nodes(max_zone_pfns);
> >  
> >  	mark_nonram_nosave();

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] powerpc: Dynamic DMA zone limits
  2014-10-13  7:14 ` [PATCH 1/4] powerpc: Dynamic DMA zone limits Anton Blanchard
  2014-10-13  7:30   ` Benjamin Herrenschmidt
@ 2014-10-13  9:00   ` Michael Ellerman
  2014-10-14  7:39     ` Scott Wood
  1 sibling, 1 reply; 10+ messages in thread
From: Michael Ellerman @ 2014-10-13  9:00 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: Scott Wood, linuxppc-dev, Shaohui Xie

On Mon, 2014-10-13 at 18:14 +1100, Anton Blanchard wrote:
> Hi Scott,
> 
> > Platform code can call limit_zone_pfn() to set appropriate limits
> > for ZONE_DMA and ZONE_DMA32, and dma_direct_alloc_coherent() will
> > select a suitable zone based on a device's mask and the pfn limits
> > that platform code has configured.
> 
> This patch breaks my POWER8 box:
> 
> ipr 0001:08:00.0: Using 64-bit DMA iommu bypass
> ipr 0001:08:00.0: dma_direct_alloc_coherent: No suitable zone for pfn 0x10000
> ipr 0001:08:00.0: Couldn't allocate enough memory for device driver! 
> ipr: probe of 0001:08:00.0 failed with error -12
> 
> ipr isn't setting a coherent mask, but we shouldn't care on these boxes.
> Could we ignore the coherent mask or copy the dma mask to it?

Talking to Ben the answer seems to be "it's complicated".

We shouldn't be ignoring the coherent mask, but we have been, and have been
getting away with it.

The PCI code sets a default 32-bit mask, so we can't even detect when a device
hasn't set it. Though maybe the powernv PCI code could be initialising it to
64-bit ?

For this cycle I'm thinking of the below patch.

Scott & Anton can you test please?

Also the depends on FSL_PCI was totally a guess, so please correct that if it's
wrong Scott.

cheers


[PATCH] powerpc: Only do dynamic DMA zone limits on platforms that need it

Scott's patch 1c98025c6c95 "Dynamic DMA zone limits" changed
dma_direct_alloc_coherent() to start using dev->coherent_dma_mask.

That seems fair enough, but it exposes the fact that some of the drivers
we care about on IBM platforms aren't setting the coherent mask.

The proper fix is to have drivers set the coherent mask and also have
the platform code honor it.

For now, just restrict the dynamic DMA zone limits to the platforms that
need it, which is those using FSL_PCI.

Fixes: 1c98025c6c95 ("powerpc: Dynamic DMA zone limits")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig      | 2 +-
 arch/powerpc/kernel/dma.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 88eace4e28c3..9b9044aec217 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -290,7 +290,7 @@ config PPC_EMULATE_SSTEP
 
 config ZONE_DMA32
 	bool
-	default y if PPC64
+	default y if PPC64 && FSL_PCI
 
 source "init/Kconfig"
 
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index adac9dc54aee..bd443a2e4426 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -53,6 +53,7 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
 #else
 	struct page *page;
 	int node = dev_to_node(dev);
+#ifdef CONFIG_ZONE_DMA32
 	u64 pfn = get_pfn_limit(dev);
 	int zone;
 
@@ -67,12 +68,11 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
 	case ZONE_DMA:
 		flag |= GFP_DMA;
 		break;
-#ifdef CONFIG_ZONE_DMA32
 	case ZONE_DMA32:
 		flag |= GFP_DMA32;
 		break;
-#endif
 	};
+#endif /* CONFIG_ZONE_DMA32 */
 
 	/* ignore region specifiers */
 	flag  &= ~(__GFP_HIGHMEM);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] powerpc: Dynamic DMA zone limits
  2014-10-13  9:00   ` Michael Ellerman
@ 2014-10-14  7:39     ` Scott Wood
  2014-10-14  7:57       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 10+ messages in thread
From: Scott Wood @ 2014-10-14  7:39 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Anton Blanchard, Shaohui Xie

On Mon, 2014-10-13 at 20:00 +1100, Michael Ellerman wrote:
> On Mon, 2014-10-13 at 18:14 +1100, Anton Blanchard wrote:
> > Hi Scott,
> > 
> > > Platform code can call limit_zone_pfn() to set appropriate limits
> > > for ZONE_DMA and ZONE_DMA32, and dma_direct_alloc_coherent() will
> > > select a suitable zone based on a device's mask and the pfn limits
> > > that platform code has configured.
> > 
> > This patch breaks my POWER8 box:
> > 
> > ipr 0001:08:00.0: Using 64-bit DMA iommu bypass
> > ipr 0001:08:00.0: dma_direct_alloc_coherent: No suitable zone for pfn 0x10000
> > ipr 0001:08:00.0: Couldn't allocate enough memory for device driver! 
> > ipr: probe of 0001:08:00.0 failed with error -12
> > 
> > ipr isn't setting a coherent mask, but we shouldn't care on these boxes.
> > Could we ignore the coherent mask or copy the dma mask to it?
> 
> Talking to Ben the answer seems to be "it's complicated".
> 
> We shouldn't be ignoring the coherent mask, but we have been, and have been
> getting away with it.
> 
> The PCI code sets a default 32-bit mask, so we can't even detect when a device
> hasn't set it. Though maybe the powernv PCI code could be initialising it to
> 64-bit ?
> 
> For this cycle I'm thinking of the below patch.
> 
> Scott & Anton can you test please?
> 
> Also the depends on FSL_PCI was totally a guess, so please correct that if it's
> wrong Scott.

We need the DMA zone for non-PCI devices, so FSL_SOC would be a better
choice.

> diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
> index adac9dc54aee..bd443a2e4426 100644
> --- a/arch/powerpc/kernel/dma.c
> +++ b/arch/powerpc/kernel/dma.c
> @@ -53,6 +53,7 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
>  #else
>  	struct page *page;
>  	int node = dev_to_node(dev);
> +#ifdef CONFIG_ZONE_DMA32
>  	u64 pfn = get_pfn_limit(dev);
>  	int zone;
>  
> @@ -67,12 +68,11 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
>  	case ZONE_DMA:
>  		flag |= GFP_DMA;
>  		break;
> -#ifdef CONFIG_ZONE_DMA32
>  	case ZONE_DMA32:
>  		flag |= GFP_DMA32;
>  		break;
> -#endif
>  	};
> +#endif /* CONFIG_ZONE_DMA32 */

For a short-term workaround, I'd rather leave CONFIG_ZONE_DMA32 where it
is and put #ifdef CONFIG_FSL_SOC (with a comment) around the whole
thing.

-Scott

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] powerpc: Dynamic DMA zone limits
  2014-10-14  7:39     ` Scott Wood
@ 2014-10-14  7:57       ` Benjamin Herrenschmidt
  2014-10-14  9:44         ` Scott Wood
  0 siblings, 1 reply; 10+ messages in thread
From: Benjamin Herrenschmidt @ 2014-10-14  7:57 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, Anton Blanchard, Shaohui Xie

On Tue, 2014-10-14 at 09:39 +0200, Scott Wood wrote:
> For a short-term workaround, I'd rather leave CONFIG_ZONE_DMA32 where
> it
> is and put #ifdef CONFIG_FSL_SOC (with a comment) around the whole
> thing.

I'd like to not enable CONFIG_ZONE_DMA32 when we don't need it, ie,
on !BOOKE 64-bit

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] powerpc: Dynamic DMA zone limits
  2014-10-14  7:57       ` Benjamin Herrenschmidt
@ 2014-10-14  9:44         ` Scott Wood
  0 siblings, 0 replies; 10+ messages in thread
From: Scott Wood @ 2014-10-14  9:44 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Anton Blanchard, Shaohui Xie

On Tue, 2014-10-14 at 18:57 +1100, Benjamin Herrenschmidt wrote:
> On Tue, 2014-10-14 at 09:39 +0200, Scott Wood wrote:
> > For a short-term workaround, I'd rather leave CONFIG_ZONE_DMA32 where
> > it
> > is and put #ifdef CONFIG_FSL_SOC (with a comment) around the whole
> > thing.
> 
> I'd like to not enable CONFIG_ZONE_DMA32 when we don't need it, ie,
> on !BOOKE 64-bit

How does 64-bit powermac deal with it, with random PCI cards?  Does it
use an IOMMU for coherent allocations?

In any case, CONFIG_ZONE_DMA32 should be harmless if the zone isn't
dynamically created, but we can limit it to booke if you want.  The
current breakage has nothing to do with ZONE_DMA32, but rather to the
fact that we now check zones at all.  This is why I'd rather have the
short term workaround be a new ifdef, rather than moving ifdef
CONFIG_ZONE_DMA32 (moving it also obscures the way the code should look
if and when the workaround can be removed).

-Scott

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2014-10-14  9:44 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-08-08 23:40 [PATCH 1/4] powerpc: Dynamic DMA zone limits Scott Wood
2014-08-08 23:40 ` [PATCH 2/4] powerpc/64: Honor swiotlb limit in coherent allocations Scott Wood
2014-08-08 23:40 ` [PATCH 3/4] powerpc/64: Limit ZONE_DMA32 to 4GiB in swiotlb_detect_4g() Scott Wood
2014-08-08 23:40 ` [PATCH 4/4] powerpc/fsl-pci: Limit ZONE_DMA32 to 2GiB on 64-bit platforms Scott Wood
2014-10-13  7:14 ` [PATCH 1/4] powerpc: Dynamic DMA zone limits Anton Blanchard
2014-10-13  7:30   ` Benjamin Herrenschmidt
2014-10-13  9:00   ` Michael Ellerman
2014-10-14  7:39     ` Scott Wood
2014-10-14  7:57       ` Benjamin Herrenschmidt
2014-10-14  9:44         ` Scott Wood

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).