linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
@ 2009-10-23  1:21 Chris Wright
  2009-10-23  1:21 ` [RFC PATCH 1/3] [RFC PATCH] bootmem: refactor free_all_bootmem_core Chris Wright
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Chris Wright @ 2009-10-23  1:21 UTC (permalink / raw)
  To: David Woodhouse; +Cc: iommu, linux-kernel

This short series gives us the ability to allocate the swiotlb and then
conditionally free it if we discover it isn't needed.  This allows us to
put swiotlb to use when the hw iommu fails to initialize properly.

This needs some changes to the bootmem allocator to give the ability to
free reserved bootmem directly to the page allocator after bootmem is
torn down.

 arch/x86/include/asm/swiotlb.h |    4 ++
 arch/x86/kernel/pci-dma.c      |    4 +-
 arch/x86/kernel/pci-swiotlb.c  |   27 +++++++++---
 include/linux/bootmem.h        |    1 +
 include/linux/swiotlb.h        |    3 +
 lib/swiotlb.c                  |   10 ++++
 mm/bootmem.c                   |   98 +++++++++++++++++++++++++++++++---------
 7 files changed, 118 insertions(+), 29 deletions(-)

thanks,
-chris


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC PATCH 1/3] [RFC PATCH] bootmem: refactor free_all_bootmem_core
  2009-10-23  1:21 [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures Chris Wright
@ 2009-10-23  1:21 ` Chris Wright
  2009-10-23  1:22 ` [RFC PATCH 2/3] [RFC PATCH] bootmem: add free_bootmem_late Chris Wright
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Chris Wright @ 2009-10-23  1:21 UTC (permalink / raw)
  To: David Woodhouse; +Cc: iommu, linux-kernel

[-- Attachment #1: bootmem-break-out-free_pages_bootmem-loop.patch --]
[-- Type: text/plain, Size: 3307 bytes --]

Move the loop that frees all bootmem pages back to page allocator into
its own function.  This should have no functional effect and allows the
function to be reused later.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 mm/bootmem.c |   61 +++++++++++++++++++++++++++++++++++++++-------------------
 1 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2..94ef2e7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,17 +143,22 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 	return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
 
-static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+/**
+ * free_bootmem_pages - frees bootmem pages to page allocator
+ * @start: start pfn
+ * @end: end pfn
+ * @map: bootmem bitmap of reserved pages
+ *
+ * This will free the pages in the range @start to @end, making them
+ * available to the page allocator.  The @map will be used to skip
+ * reserved pages.  Returns the count of pages freed.
+ */
+static unsigned long __init free_bootmem_pages(unsigned long start,
+					       unsigned long end,
+					       unsigned long *map)
 {
+	unsigned long cursor, count = 0;
 	int aligned;
-	struct page *page;
-	unsigned long start, end, pages, count = 0;
-
-	if (!bdata->node_bootmem_map)
-		return 0;
-
-	start = bdata->node_min_pfn;
-	end = bdata->node_low_pfn;
 
 	/*
 	 * If the start is aligned to the machines wordsize, we might
@@ -161,27 +166,25 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	 */
 	aligned = !(start & (BITS_PER_LONG - 1));
 
-	bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
-		bdata - bootmem_node_data, start, end, aligned);
+	for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
+		unsigned long idx, vec;
 
-	while (start < end) {
-		unsigned long *map, idx, vec;
-
-		map = bdata->node_bootmem_map;
-		idx = start - bdata->node_min_pfn;
+		idx = cursor - start;
 		vec = ~map[idx / BITS_PER_LONG];
 
-		if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+		if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) {
 			int order = ilog2(BITS_PER_LONG);
 
-			__free_pages_bootmem(pfn_to_page(start), order);
+			__free_pages_bootmem(pfn_to_page(cursor), order);
 			count += BITS_PER_LONG;
 		} else {
 			unsigned long off = 0;
 
 			while (vec && off < BITS_PER_LONG) {
 				if (vec & 1) {
-					page = pfn_to_page(start + off);
+					struct page *page;
+
+					page = pfn_to_page(cursor + off);
 					__free_pages_bootmem(page, 0);
 					count++;
 				}
@@ -189,8 +192,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 				off++;
 			}
 		}
-		start += BITS_PER_LONG;
 	}
+	return count;
+}
+
+static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+{
+	struct page *page;
+	unsigned long start, end, *map, pages, count = 0;
+
+	if (!bdata->node_bootmem_map)
+		return 0;
+
+	start = bdata->node_min_pfn;
+	end = bdata->node_low_pfn;
+	map = bdata->node_bootmem_map;
+
+	bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
+		start, end);
+
+	count = free_bootmem_pages(start, end, map);
 
 	page = virt_to_page(bdata->node_bootmem_map);
 	pages = bdata->node_low_pfn - bdata->node_min_pfn;


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [RFC PATCH 2/3] [RFC PATCH] bootmem: add free_bootmem_late
  2009-10-23  1:21 [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures Chris Wright
  2009-10-23  1:21 ` [RFC PATCH 1/3] [RFC PATCH] bootmem: refactor free_all_bootmem_core Chris Wright
@ 2009-10-23  1:22 ` Chris Wright
  2009-10-23  1:22 ` [RFC PATCH 3/3] [RFC PATCH] iommu: allow fallback to swiotlb upon hw iommu initialization failure Chris Wright
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Chris Wright @ 2009-10-23  1:22 UTC (permalink / raw)
  To: David Woodhouse; +Cc: iommu, linux-kernel

[-- Attachment #1: bootmem-make-free_pages_bootmem-generic.patch --]
[-- Type: text/plain, Size: 3083 bytes --]

Add a new function for freeing bootmem after the bootmem allocator has
been released and the unreserved pages given to the page allocator.
This allows us to reserve bootmem and then release it if we later
discover it was not needed.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/linux/bootmem.h |    1 +
 mm/bootmem.c            |   43 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 39 insertions(+), 5 deletions(-)

--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t 
 			      unsigned long addr,
 			      unsigned long size);
 extern void free_bootmem(unsigned long addr, unsigned long size);
+extern void free_bootmem_late(unsigned long addr, unsigned long size);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -151,7 +151,9 @@ unsigned long __init init_bootmem(unsign
  *
  * This will free the pages in the range @start to @end, making them
  * available to the page allocator.  The @map will be used to skip
- * reserved pages.  Returns the count of pages freed.
+ * reserved pages.  In the case that @map is NULL, the bootmem allocator
+ * is already free and the range is contiguous.  Returns the count of
+ * pages freed.
  */
 static unsigned long __init free_bootmem_pages(unsigned long start,
 					       unsigned long end,
@@ -164,13 +166,23 @@ static unsigned long __init free_bootmem
 	 * If the start is aligned to the machines wordsize, we might
 	 * be able to free pages in bulks of that order.
 	 */
-	aligned = !(start & (BITS_PER_LONG - 1));
+	if (map)
+		aligned = !(start & (BITS_PER_LONG - 1));
+	else
+		aligned = 1;
 
 	for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
-		unsigned long idx, vec;
+		unsigned long vec;
 
-		idx = cursor - start;
-		vec = ~map[idx / BITS_PER_LONG];
+		if (map) {
+			unsigned long idx = cursor - start;
+			vec = ~map[idx / BITS_PER_LONG];
+		} else {
+			if (end - cursor >= BITS_PER_LONG)
+				vec = ~0UL;
+			else
+				vec = (1UL << (end - cursor)) - 1;
+		}
 
 		if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) {
 			int order = ilog2(BITS_PER_LONG);
@@ -387,6 +399,27 @@ void __init free_bootmem(unsigned long a
 }
 
 /**
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system.  Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+	unsigned long start, end;
+
+	kmemleak_free_part(__va(addr), size);
+
+	start = PFN_UP(addr);
+	end = PFN_DOWN(addr + size);
+
+	totalram_pages += free_bootmem_pages(start, end, NULL);
+}
+
+/**
  * reserve_bootmem_node - mark a page range as reserved
  * @pgdat: node the range resides on
  * @physaddr: starting address of the range


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC PATCH 3/3] [RFC PATCH] iommu: allow fallback to swiotlb upon hw iommu initialization failure
  2009-10-23  1:21 [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures Chris Wright
  2009-10-23  1:21 ` [RFC PATCH 1/3] [RFC PATCH] bootmem: refactor free_all_bootmem_core Chris Wright
  2009-10-23  1:22 ` [RFC PATCH 2/3] [RFC PATCH] bootmem: add free_bootmem_late Chris Wright
@ 2009-10-23  1:22 ` Chris Wright
  2009-10-23  5:51 ` [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures FUJITA Tomonori
  2009-10-26  7:10 ` Andi Kleen
  4 siblings, 0 replies; 12+ messages in thread
From: Chris Wright @ 2009-10-23  1:22 UTC (permalink / raw)
  To: David Woodhouse; +Cc: iommu, linux-kernel, Joerg Roedel

[-- Attachment #1: vtd-swiotlb-fallback-v3.patch --]
[-- Type: text/plain, Size: 3923 bytes --]

When a hw iommu is detected during pci_iommu_alloc() it will disable
the swiotlb setup.  If the subsequent hw iommu initialization in
pci_iommu_init() fails, the box may be left in an unusable state.

The swiotlb is normally allocated early from bootmem to ensure a large
(64M) contiguous allocation.  This patch adds some logic to go ahead
and allocate the swiotlb despite the presence of a hw iommu, and later
free the swiotlb if it is not needed or enable it if it is.  Because
pci_iommu_init() is called after bootmem has been released to the page
allocator, we use free_bootmem_late, a new mechanism for freeing pages
directly back to the allocator.

This patch relies on (iommu_detected && !dma_ops) being true as a way
to see the failed hw iommu initialization.  This will not work w/ AMD
IOMMU in passthrough mode.

https://bugzilla.redhat.com/show_bug.cgi?id=524808

Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/x86/include/asm/swiotlb.h |    4 ++++
 arch/x86/kernel/pci-dma.c      |    4 +++-
 arch/x86/kernel/pci-swiotlb.c  |   27 +++++++++++++++++++++------
 include/linux/swiotlb.h        |    3 +++
 lib/swiotlb.c                  |   10 ++++++++++
 5 files changed, 41 insertions(+), 7 deletions(-)

--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -9,9 +9,13 @@ extern int swiotlb_force;
 
 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
+extern void pci_swiotlb_alloc(void);
 extern void pci_swiotlb_init(void);
 #else
 #define swiotlb 0
+static inline void pci_swiotlb_alloc(void)
+{
+}
 static inline void pci_swiotlb_init(void)
 {
 }
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -141,7 +141,7 @@ void __init pci_iommu_alloc(void)
 
 	amd_iommu_detect();
 
-	pci_swiotlb_init();
+	pci_swiotlb_alloc();
 }
 
 void *dma_generic_alloc_coherent(struct device *dev, size_t size,
@@ -300,6 +300,8 @@ static int __init pci_iommu_init(void)
 
 	gart_iommu_init();
 
+	pci_swiotlb_init();
+
 	no_iommu_init();
 	return 0;
 }
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,33 @@ static struct dma_map_ops swiotlb_dma_op
 	.dma_supported = NULL,
 };
 
-void __init pci_swiotlb_init(void)
+static int swiotlb_try_init;
+
+void __init pci_swiotlb_alloc(void)
 {
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
 #ifdef CONFIG_X86_64
-	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
-		swiotlb = 1;
+	if (!no_iommu && max_pfn > MAX_DMA32_PFN) {
+		if (!iommu_detected)
+			swiotlb = 1;
+		else
+			swiotlb_try_init = 1;
+	}
 #endif
 	if (swiotlb_force)
 		swiotlb = 1;
-	if (swiotlb) {
-		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+	if (swiotlb || swiotlb_try_init)
 		swiotlb_init();
+}
+
+void __init pci_swiotlb_init(void)
+{
+	if (!swiotlb && !swiotlb_try_init)
+		return;
+
+	if (iommu_detected && !dma_ops) {
+		pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
 		dma_ops = &swiotlb_dma_ops;
-	}
+	} else
+		swiotlb_free();
 }
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -24,6 +24,9 @@ extern void
 swiotlb_init(void);
 
 extern void
+swiotlb_free(void);
+
+extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			dma_addr_t *dma_handle, gfp_t flags);
 
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -181,6 +181,16 @@ swiotlb_init_with_default_size(size_t de
 }
 
 void __init
+swiotlb_free(void)
+{
+
+	free_bootmem_late(__pa(io_tlb_overflow_buffer), io_tlb_overflow);
+	free_bootmem_late(__pa(io_tlb_orig_addr), io_tlb_nslabs * sizeof(phys_addr_t));
+	free_bootmem_late(__pa(io_tlb_list), io_tlb_nslabs * sizeof(int));
+	free_bootmem_late(__pa(io_tlb_start) , io_tlb_nslabs << IO_TLB_SHIFT);
+}
+
+void __init
 swiotlb_init(void)
 {
 	swiotlb_init_with_default_size(64 * (1<<20));	/* default to 64MB */


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
  2009-10-23  1:21 [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures Chris Wright
                   ` (2 preceding siblings ...)
  2009-10-23  1:22 ` [RFC PATCH 3/3] [RFC PATCH] iommu: allow fallback to swiotlb upon hw iommu initialization failure Chris Wright
@ 2009-10-23  5:51 ` FUJITA Tomonori
  2009-10-23 16:39   ` Chris Wright
  2009-10-26  7:10 ` Andi Kleen
  4 siblings, 1 reply; 12+ messages in thread
From: FUJITA Tomonori @ 2009-10-23  5:51 UTC (permalink / raw)
  To: chrisw; +Cc: dwmw2, iommu, linux-kernel

On Thu, 22 Oct 2009 18:21:58 -0700
Chris Wright <chrisw@sous-sol.org> wrote:

> This short series gives us the ability to allocate the swiotlb and then
> conditionally free it if we discover it isn't needed.  This allows us to
> put swiotlb to use when the hw iommu fails to initialize properly.
> 
> This needs some changes to the bootmem allocator to give the ability to
> free reserved bootmem directly to the page allocator after bootmem is
> torn down.

The concept sounds fine but the third patch doesn't look correct.

Seems that the third patch doesn't take into account enabling both hw
iommu and swiotlb (Calgary does and I guess VT-d and AMD need that
too). Also (iommu_detected && !dma_ops) trick doesn't work for
Calgary, IIRC. The third patch also makes the dma startup code more
complicated.

I have half-baked patches to clean up the dma startup code supporting
the concept. I can work on the top of the first and second
patches. They need to be CC'ed to the memory people and ACKs, don't
they?


Thanks,

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
  2009-10-23  5:51 ` [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures FUJITA Tomonori
@ 2009-10-23 16:39   ` Chris Wright
  2009-10-24  3:06     ` FUJITA Tomonori
  0 siblings, 1 reply; 12+ messages in thread
From: Chris Wright @ 2009-10-23 16:39 UTC (permalink / raw)
  To: FUJITA Tomonori; +Cc: chrisw, dwmw2, iommu, linux-kernel

* FUJITA Tomonori (fujita.tomonori@lab.ntt.co.jp) wrote:
> On Thu, 22 Oct 2009 18:21:58 -0700
> Chris Wright <chrisw@sous-sol.org> wrote:
> 
> > This short series gives us the ability to allocate the swiotlb and then
> > conditionally free it if we discover it isn't needed.  This allows us to
> > put swiotlb to use when the hw iommu fails to initialize properly.
> > 
> > This needs some changes to the bootmem allocator to give the ability to
> > free reserved bootmem directly to the page allocator after bootmem is
> > torn down.
> 
> The concept sounds fine but the third patch doesn't look correct.
> 
> Seems that the third patch doesn't take into account enabling both hw
> iommu and swiotlb (Calgary does and I guess VT-d and AMD need that
> too).

VT-d isn't using swiotlb.  Nor is AMD, although I think it will pick up
no_iommu on passthrough (seems like it would benefit from swiotlb in
that case).

>  Also (iommu_detected && !dma_ops) trick doesn't work for
> Calgary, IIRC.

Yes, I think you are right.  I had stared at the calgary code and thought
it would DTRT due to calgary's use of no_iommu as fallback, but instead
it will never pick up swiotlb_dma_ops.  The calgary shouldn't even need
to be manually setting up nommu_dma_ops.

>  The third patch also makes the dma startup code more
> complicated.

I completely agree.  The whole dma/iommu startup is already complex
and fragile.  Issues like above made getting the right combination
like whack-a-mole.

> I have half-baked patches to clean up the dma startup code supporting
> the concept. I can work on the top of the first and second
> patches. They need to be CC'ed to the memory people and ACKs, don't
> they?

Great.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
  2009-10-23 16:39   ` Chris Wright
@ 2009-10-24  3:06     ` FUJITA Tomonori
  2009-10-24  6:57       ` Chris Wright
  0 siblings, 1 reply; 12+ messages in thread
From: FUJITA Tomonori @ 2009-10-24  3:06 UTC (permalink / raw)
  To: chrisw; +Cc: fujita.tomonori, dwmw2, iommu, linux-kernel

On Fri, 23 Oct 2009 09:39:23 -0700
Chris Wright <chrisw@sous-sol.org> wrote:

> > The concept sounds fine but the third patch doesn't look correct.
> > 
> > Seems that the third patch doesn't take into account enabling both hw
> > iommu and swiotlb (Calgary does and I guess VT-d and AMD need that
> > too).
> 
> VT-d isn't using swiotlb.  Nor is AMD, although I think it will pick up
> no_iommu on passthrough (seems like it would benefit from swiotlb in
> that case).

I think that they need swiotlb for the same reason why Calgary needs
it. IIRC, someone in VT-d camp said so.


> >  Also (iommu_detected && !dma_ops) trick doesn't work for
> > Calgary, IIRC.
> 
> Yes, I think you are right.  I had stared at the calgary code and thought
> it would DTRT due to calgary's use of no_iommu as fallback, but instead
> it will never pick up swiotlb_dma_ops.

Note that Calgary comment 'falling back to no_iommu' is misleading. It
actually falls back to swiotlb or nommu properly.

Calgary doesn't set to dma_ops to calgary_dma_ops so it doesn't need
to pick up swiotlb_dma_ops.


> The calgary shouldn't even need to be manually setting up
> nommu_dma_ops.

Yeah, but it needs because of how the dma startup code works.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
  2009-10-24  3:06     ` FUJITA Tomonori
@ 2009-10-24  6:57       ` Chris Wright
  2009-10-28  6:53         ` FUJITA Tomonori
  0 siblings, 1 reply; 12+ messages in thread
From: Chris Wright @ 2009-10-24  6:57 UTC (permalink / raw)
  To: FUJITA Tomonori; +Cc: chrisw, dwmw2, iommu, linux-kernel

* FUJITA Tomonori (fujita.tomonori@lab.ntt.co.jp) wrote:
> On Fri, 23 Oct 2009 09:39:23 -0700
> Chris Wright <chrisw@sous-sol.org> wrote:
> 
> > > The concept sounds fine but the third patch doesn't look correct.
> > > 
> > > Seems that the third patch doesn't take into account enabling both hw
> > > iommu and swiotlb (Calgary does and I guess VT-d and AMD need that
> > > too).
> > 
> > VT-d isn't using swiotlb.  Nor is AMD, although I think it will pick up
> > no_iommu on passthrough (seems like it would benefit from swiotlb in
> > that case).
> 
> I think that they need swiotlb for the same reason why Calgray needs
> it. IIRC, someone in VT-d camp said so.

Right, it was used as fallback for a bit when pass through mode was
first enabled to handle case where device has dma mask smaller than
physical memory.  That was removed, it was just recently w/ Alex's
comments that the idea of putting it back came up.

> > >  Also (iommu_detected && !dma_ops) trick doesn't work for
> > > Calgary, IIRC.
> > 
> > Yes, I think you are right.  I had stared at the calgary code and thought
> > it would DTRT due to calgary's use of no_iommu as fallback, but instead
> > it will never pick up swiotlb_dma_ops.
> 
> Note that Calgary comment 'falling back to no_iommu' is misleading. It
> actually falls back to swiotlb or nommu properly.
> 
> Calgary doesn't set to dma_ops to calgary_dma_ops so it doesn't need
> to pick up swiotlb_dma_ops.

It does need swiotlb_dma_ops even when calgary init succeeds for the devices
that aren't behind Calgary to deal w/ the case of those devices having
dma mask smaller than physical memory (i.e those that don't get device
specific dma_ops set to calgary_dma_ops).  So, the problem with this
patch is that it would fall back to nommu_dma_ops in cases where it was
expecting to fall back to swiotlb.

> > The calgary shouldn't even need to be manually setting up
> > nommu_dma_ops.
> 
> Yeah, but it needs because of how the dma startup code works.

Be much better to have the core handle all of this. Basically register
ops and then put the top one on the stack to actual use.  So the
fallback would be automatic, just pick the top of the stack and go.
Were you thinking of something along those lines?

thanks,
-chris

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
  2009-10-23  1:21 [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures Chris Wright
                   ` (3 preceding siblings ...)
  2009-10-23  5:51 ` [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures FUJITA Tomonori
@ 2009-10-26  7:10 ` Andi Kleen
  2009-10-26 16:26   ` Chris Wright
  4 siblings, 1 reply; 12+ messages in thread
From: Andi Kleen @ 2009-10-26  7:10 UTC (permalink / raw)
  To: Chris Wright; +Cc: David Woodhouse, iommu, linux-kernel

Chris Wright <chrisw@sous-sol.org> writes:

> This short series gives us the ability to allocate the swiotlb and then
> conditionally free it if we discover it isn't needed.  This allows us to
> put swiotlb to use when the hw iommu fails to initialize properly.
>
> This needs some changes to the bootmem allocator to give the ability to
> free reserved bootmem directly to the page allocator after bootmem is
> torn down.

You forgot to state what motivated you to that change?

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
  2009-10-26  7:10 ` Andi Kleen
@ 2009-10-26 16:26   ` Chris Wright
  2009-10-26 21:56     ` Andi Kleen
  0 siblings, 1 reply; 12+ messages in thread
From: Chris Wright @ 2009-10-26 16:26 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chris Wright, David Woodhouse, iommu, linux-kernel

* Andi Kleen (andi@firstfloor.org) wrote:
> Chris Wright <chrisw@sous-sol.org> writes:
> 
> > This short series gives us the ability to allocate the swiotlb and then
> > conditionally free it if we discover it isn't needed.  This allows us to
> > put swiotlb to use when the hw iommu fails to initialize properly.
> >
> > This needs some changes to the bootmem allocator to give the ability to
> > free reserved bootmem directly to the page allocator after bootmem is
> > torn down.
> 
> You forgot to state what motivated you to that change?

I thought I did ;-)  Here's another more verbose attempt:

The HW IOMMU, for example Intel VT-d, may fail initialization (typically
due to BIOS bugs).  In that case the existing fallback is nommu, which is
clearly insufficient for many boxen which need some bounce buffering if
there is no HW IOMMU.  The problem is that at the point of this failure
the decision to allocate and initialize the swiotlb has long since past.

There are 4 ways to handle this:

1) Give up and panic the box.  This is not a user friendly option since
the box will boot and function fine (minus any isolation gained from the
HW IOMMU) if there is either not much phys mem or an swiotlb.

2) Do the discovery that causes the initialization failure earlier so
that HW IOMMU detection fails.  Complicated by the HW IOMMU's use of the
run time env that includes a real allocator and PCI enumeration, etc.

3) Allow the swiotlb to be allocated later in pci_iommu_init() instead
of early in pci_iommu_alloc(), IOW don't use bootmem for the swiotlb.
This is possible, although it will hit 2 limitations.  The first is any
possible fragmentation that limits the availability of a 64M region
by the time this runs.  The second is that x86 has MAX_ORDER of 11,
so at most we can allocate a 4M region from the page allocator which is
inusufficient for swiotlb.

4) Allow the swiotlb to be allocated in pci_iommu_alloc(), but not
initialized until pci_iommu_init().  This allows using bootmem allocator
to reserve a nice large contiguous chunk of memory, but requires some
way to give that memory back in the case that a HW IOMMU is properly
both detected and initialized (else it'd be a 64M memory leak for
effectively all HW IOMMU enabled boxen).

This series implements the fourth option.

thanks,
-chris

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
  2009-10-26 16:26   ` Chris Wright
@ 2009-10-26 21:56     ` Andi Kleen
  0 siblings, 0 replies; 12+ messages in thread
From: Andi Kleen @ 2009-10-26 21:56 UTC (permalink / raw)
  To: Chris Wright; +Cc: Andi Kleen, David Woodhouse, iommu, linux-kernel

[... 4 options described ...]
> 
> This series implements the fourth option.

Ok makes sense.

This should be in the git log somewhere.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures
  2009-10-24  6:57       ` Chris Wright
@ 2009-10-28  6:53         ` FUJITA Tomonori
  0 siblings, 0 replies; 12+ messages in thread
From: FUJITA Tomonori @ 2009-10-28  6:53 UTC (permalink / raw)
  To: chrisw; +Cc: fujita.tomonori, dwmw2, iommu, linux-kernel

On Fri, 23 Oct 2009 23:57:12 -0700
Chris Wright <chrisw@sous-sol.org> wrote:

> > Note that Calgary comment 'falling back to no_iommu' is misleading. It
> > actually falls back to swiotlb or nommu properly.
> > 
> > Calgary doesn't set to dma_ops to calgary_dma_ops so it doesn't need
> > to pick up swiotlb_dma_ops.
> 
> It does need swiotlb_dma_ops even when calgary init succeeds for the devices
> that aren't behind Calgary to deal w/ the case of those devices having
> dma mask smaller than physical memory (i.e those that don't get device
> specific dma_ops set to calgary_dma_ops).

I know since I wrote that code.


> > > The calgary shouldn't even need to be manually setting up
> > > nommu_dma_ops.
> > 
> > Yeah, but it needs because of how the dma startup code works.
> 
> Be much better to have the core handle all of this. Basically register
> ops and then put the top one on the stack to actual use.  So the
> fallback would be automatic, just pick the top of the stack and go.
> Were you thinking of something along those lines?

I don't know what 'register ops and then put the top one on the stack'
means but it sounds overdoing.

I think that we can handle this issue more simply. I've just posted
the patchset.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2009-10-28  6:54 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-10-23  1:21 [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures Chris Wright
2009-10-23  1:21 ` [RFC PATCH 1/3] [RFC PATCH] bootmem: refactor free_all_bootmem_core Chris Wright
2009-10-23  1:22 ` [RFC PATCH 2/3] [RFC PATCH] bootmem: add free_bootmem_late Chris Wright
2009-10-23  1:22 ` [RFC PATCH 3/3] [RFC PATCH] iommu: allow fallback to swiotlb upon hw iommu initialization failure Chris Wright
2009-10-23  5:51 ` [RFC PATCH 0/3] allow fallback to swiotlb on hw iommu init failures FUJITA Tomonori
2009-10-23 16:39   ` Chris Wright
2009-10-24  3:06     ` FUJITA Tomonori
2009-10-24  6:57       ` Chris Wright
2009-10-28  6:53         ` FUJITA Tomonori
2009-10-26  7:10 ` Andi Kleen
2009-10-26 16:26   ` Chris Wright
2009-10-26 21:56     ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).