linux-kernel.vger.kernel.org archive mirror
* alpha iommu fixes
@ 2001-05-18 17:46 Ivan Kokshaysky
  2001-05-19  2:34 ` Tom Vier
                   ` (2 more replies)
  0 siblings, 3 replies; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-18 17:46 UTC (permalink / raw)
  To: Richard Henderson; +Cc: linux-kernel

The most interesting thing here is the pyxis "tbia" fix.
Whee! I can now copy files from SCSI to bus-master IDE, or
between two IDE drives on separate channels, or do other nice
things without hanging lx/sx164. :-)
The pyxis "tbia" turned out to be broken in a more nastier way
than one could expect - tech details are commented in the patch.

Another problem, I think, is that we need extra locking in
pci_unmap_xx(). It seems to be possible that after the scatter-gather
table "wraps" and some SG ptes get free, these ptes might be
immediately allocated and next_entry pointer advanced by pci_map_xx()
from interrupt or another CPU *before* the test for mv_pci_tbi().
In this case we'd have stale TLB entries.

Also small compile fix for 2.4.5-pre3.

Ivan.

--- 2.4.5p3/arch/alpha/kernel/core_cia.c	Fri Mar 30 19:02:39 2001
+++ linux/arch/alpha/kernel/core_cia.c	Fri May 18 13:50:51 2001
@@ -297,106 +297,82 @@ struct pci_ops cia_pci_ops = 
  * It cannot be invalidated.  Rather than hard code the pass numbers,
  * actually try the tbia to see if it works.
  */
-
-void
-cia_pci_tbi(struct pci_controller *hose, dma_addr_t start, dma_addr_t end)
-{
-	wmb();
-	*(vip)CIA_IOC_PCI_TBIA = 3;	/* Flush all locked and unlocked.  */
-	mb();
-	*(vip)CIA_IOC_PCI_TBIA;
-}
-
-/*
- * Fixup attempt number 1.
- *
- * Write zeros directly into the tag registers.
+/* Even if the tbia works, we cannot use it. It effectively locks the
+ * chip (as well as direct write to the tag registers) if there is a
+ * SG DMA operation in progress. This is true at least for PYXIS rev. 1.
  */
-
-static void
-cia_pci_tbi_try1(struct pci_controller *hose,
-		 dma_addr_t start, dma_addr_t end)
-{
-	wmb();
-	*(vip)CIA_IOC_TB_TAGn(0) = 0;
-	*(vip)CIA_IOC_TB_TAGn(1) = 0;
-	*(vip)CIA_IOC_TB_TAGn(2) = 0;
-	*(vip)CIA_IOC_TB_TAGn(3) = 0;
-	*(vip)CIA_IOC_TB_TAGn(4) = 0;
-	*(vip)CIA_IOC_TB_TAGn(5) = 0;
-	*(vip)CIA_IOC_TB_TAGn(6) = 0;
-	*(vip)CIA_IOC_TB_TAGn(7) = 0;
-	mb();
-	*(vip)CIA_IOC_TB_TAGn(0);
-}
-
-#if 0
 /*
- * Fixup attempt number 2.  This is the method NT and NetBSD use.
+ * This is the method NT and NetBSD use.
  *
  * Allocate mappings, and put the chip into DMA loopback mode to read a
  * garbage page.  This works by causing TLB misses, causing old entries to
  * be purged to make room for the new entries coming in for the garbage page.
  */
 
-#define CIA_BROKEN_TBI_TRY2_BASE	0xE0000000
+#define CIA_BROKEN_TBIA_BASE	0xE0000000
+#define CIA_BROKEN_TBIA_SIZE	1024
 
-static void __init
-cia_enable_broken_tbi_try2(void)
+static inline void
+cia_enable_broken_tbia(void)
 {
 	unsigned long *ppte, pte;
 	long i;
 
-	ppte = __alloc_bootmem(PAGE_SIZE, 32768, 0);
+	/* Use minimal 1K map. */
+	ppte = __alloc_bootmem(CIA_BROKEN_TBIA_SIZE, 32768, 0);
 	pte = (virt_to_phys(ppte) >> (PAGE_SHIFT - 1)) | 1;
 
-	for (i = 0; i < PAGE_SIZE / sizeof(unsigned long); ++i)
+	for (i = 0; i < CIA_BROKEN_TBIA_SIZE / sizeof(unsigned long); ++i)
 		ppte[i] = pte;
 
-	*(vip)CIA_IOC_PCI_W3_BASE = CIA_BROKEN_TBI_TRY2_BASE | 3;
-	*(vip)CIA_IOC_PCI_W3_MASK = (PAGE_SIZE - 1) & 0xfff00000;
+	*(vip)CIA_IOC_PCI_W3_BASE = CIA_BROKEN_TBIA_BASE | 3;
+	*(vip)CIA_IOC_PCI_W3_MASK = (CIA_BROKEN_TBIA_SIZE*1024 - 1)
+				    & 0xfff00000;
 	*(vip)CIA_IOC_PCI_T3_BASE = virt_to_phys(ppte) >> 2;
 }
 
-static void
-cia_pci_tbi_try2(struct pci_controller *hose,
+void
+cia_pci_tbi(struct pci_controller *hose,
 		 dma_addr_t start, dma_addr_t end)
 {
-	unsigned long flags;
 	unsigned long bus_addr;
 	int ctrl;
-	long i;
 
-	__save_and_cli(flags);
+/*	__save_and_cli(flags);	Don't need this -- we're called from
+				pci_unmap_xx() or iommu_arena_alloc()
+				with IPL_MAX after spin_lock_irqsave() */
 
 	/* Put the chip into PCI loopback mode.  */
 	mb();
 	ctrl = *(vip)CIA_IOC_CIA_CTRL;
 	*(vip)CIA_IOC_CIA_CTRL = ctrl | CIA_CTRL_PCI_LOOP_EN;
 	mb();
-	*(vip)CIA_IOC_CIA_CTRL;
-	mb();
 
 	/* Read from PCI dense memory space at TBI_ADDR, skipping 32k on
 	   each read.  This forces SG TLB misses.  NetBSD claims that the
 	   TLB entries are not quite LRU, meaning that we need to read more
 	   times than there are actual tags.  The 2117x docs claim strict
 	   round-robin.  Oh well, we've come this far...  */
-
-	bus_addr = cia_ioremap(CIA_BROKEN_TBI_TRY2_BASE);
-	for (i = 0; i < 12; ++i, bus_addr += 32768)
-		cia_readl(bus_addr);
+	/* Even better - as seen on the PYXIS rev 1 the TLB tags 0-3 can
+	   be filled by the TLB misses *only once* after being invalidated
+	   (by tbia or direct write). Next misses won't update them even
+	   though the lock bits are cleared. Tags 4-7 are "quite LRU" though,
+	   so use them and read at window 3 base exactly 4 times. -ink */
+
+	bus_addr = cia_ioremap(CIA_BROKEN_TBIA_BASE);
+
+	cia_readl(bus_addr + 0x00000);
+	cia_readl(bus_addr + 0x08000);
+	cia_readl(bus_addr + 0x10000);
+	cia_readl(bus_addr + 0x18000);
 
 	/* Restore normal PCI operation.  */
 	mb();
 	*(vip)CIA_IOC_CIA_CTRL = ctrl;
 	mb();
-	*(vip)CIA_IOC_CIA_CTRL;
-	mb();
 
-	__restore_flags(flags);
+/*	__restore_flags(flags);	*/
 }
-#endif
 
 static void __init
 verify_tb_operation(void)
@@ -440,10 +416,6 @@ verify_tb_operation(void)
 	/* First, verify we can read back what we've written.  If
 	   this fails, we can't be sure of any of the other testing
 	   we're going to do, so bail.  */
-	/* ??? Actually, we could do the work with machine checks.
-	   By passing this register update test, we pretty much
-	   guarantee that cia_pci_tbi_try1 works.  If this test
-	   fails, cia_pci_tbi_try2 might still work.  */
 
 	temp = *(vip)CIA_IOC_TB_TAGn(0);
 	if (temp != tag0) {
@@ -487,27 +459,7 @@ verify_tb_operation(void)
 	}
 	printk("pci: passed sg loopback i/o read test\n");
 
-	/* Third, try to invalidate the TLB.  */
-
-	cia_pci_tbi(arena->hose, 0, -1);
-	temp = *(vip)CIA_IOC_TB_TAGn(0);
-	if (temp & 1) {
-		cia_pci_tbi_try1(arena->hose, 0, -1);
-	
-		temp = *(vip)CIA_IOC_TB_TAGn(0);
-		if (temp & 1) {
-			printk("pci: failed tbia test; "
-			       "no usable workaround\n");
-			goto failed;
-		}
-
-		alpha_mv.mv_pci_tbi = cia_pci_tbi_try1;
-		printk("pci: failed tbia test; workaround 1 succeeded\n");
-	} else {
-		printk("pci: passed tbia test\n");
-	}
-
-	/* Fourth, verify the TLB snoops the EV5's caches when
+	/* Third, verify the TLB snoops the EV5's caches when
 	   doing a tlb fill.  */
 
 	data0 = 0x5adda15e;
@@ -531,7 +483,7 @@ verify_tb_operation(void)
 	}
 	printk("pci: passed pte write cache snoop test\n");
 
-	/* Fifth, verify that a previously invalid PTE entry gets
+	/* Fourth, verify that a previously invalid PTE entry gets
 	   filled from the page table.  */
 
 	data0 = 0xabcdef12;
@@ -558,7 +510,7 @@ verify_tb_operation(void)
 		printk("pci: passed valid tag invalid pte reload test\n");
 	}
 
-	/* Sixth, verify machine checks are working.  Test invalid
+	/* Fifth, verify machine checks are working.  Test invalid
 	   pte under the same valid tag as we used above.  */
 
 	mcheck_expected(0) = 1;
@@ -571,9 +523,19 @@ verify_tb_operation(void)
 	printk("pci: %s pci machine check test\n",
 	       mcheck_taken(0) ? "passed" : "failed");
 
+	printk("pci: broken tbia workaround enabled\n");
+
 	/* Clean up after the tests.  */
 	arena->ptes[4] = 0;
 	arena->ptes[5] = 0;
+
+	/* Disable tags 0-3 with the lock bit */
+	wmb();
+	*(vip)CIA_IOC_TB_TAGn(0) = 2;
+	*(vip)CIA_IOC_TB_TAGn(1) = 2;
+	*(vip)CIA_IOC_TB_TAGn(2) = 2;
+	*(vip)CIA_IOC_TB_TAGn(3) = 2;
+
 	alpha_mv.mv_pci_tbi(arena->hose, 0, -1);
 
 exit:
@@ -706,7 +668,7 @@ do_init_arch(int is_pyxis)
 	*(vip)CIA_IOC_PCI_W2_MASK = (0x40000000 - 1) & 0xfff00000;
 	*(vip)CIA_IOC_PCI_T2_BASE = 0x40000000 >> 2;
 
-	*(vip)CIA_IOC_PCI_W3_BASE = 0;
+	cia_enable_broken_tbia();
 }
 
 void __init
--- 2.4.5p3/arch/alpha/kernel/pci_iommu.c	Fri Mar 30 19:02:39 2001
+++ linux/arch/alpha/kernel/pci_iommu.c	Fri May 18 14:05:19 2001
@@ -175,7 +175,7 @@ pci_map_single(struct pci_dev *pdev, voi
 	/* If the machine doesn't define a pci_tbi routine, we have to
 	   assume it doesn't support sg mapping.  */
 	if (! alpha_mv.mv_pci_tbi) {
-		printk(KERN_INFO "pci_map_single failed: no hw sg\n");
+		printk(KERN_WARNING "pci_map_single failed: no hw sg\n");
 		return 0;
 	}
 		
@@ -186,7 +186,7 @@ pci_map_single(struct pci_dev *pdev, voi
 	npages = calc_npages((paddr & ~PAGE_MASK) + size);
 	dma_ofs = iommu_arena_alloc(arena, npages);
 	if (dma_ofs < 0) {
-		printk(KERN_INFO "pci_map_single failed: "
+		printk(KERN_WARNING "pci_map_single failed: "
 		       "could not allocate dma page tables\n");
 		return 0;
 	}
@@ -215,6 +215,7 @@ void
 pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, long size,
 		 int direction)
 {
+	unsigned long flags;
 	struct pci_controller *hose = pdev ? pdev->sysdata : pci_isa_hose;
 	struct pci_iommu_arena *arena;
 	long dma_ofs, npages;
@@ -248,6 +249,9 @@ pci_unmap_single(struct pci_dev *pdev, d
 	}
 
 	npages = calc_npages((dma_addr & ~PAGE_MASK) + size);
+
+	spin_lock_irqsave(&arena->lock, flags);
+
 	iommu_arena_free(arena, dma_ofs, npages);
 
 
@@ -259,6 +263,8 @@ pci_unmap_single(struct pci_dev *pdev, d
 	if (dma_ofs >= arena->next_entry)
 		alpha_mv.mv_pci_tbi(hose, dma_addr, dma_addr + size - 1);
 
+	spin_unlock_irqrestore(&arena->lock, flags);
+
 	DBGA("pci_unmap_single: sg [%x,%lx] np %ld from %p\n",
 	     dma_addr, size, npages, __builtin_return_address(0));
 }
@@ -503,13 +509,13 @@ pci_map_sg(struct pci_dev *pdev, struct 
 		out->dma_length = 0;
 
 	if (out - start == 0)
-		printk(KERN_INFO "pci_map_sg failed: no entries?\n");
+		printk(KERN_WARNING "pci_map_sg failed: no entries?\n");
 	DBGA("pci_map_sg: %ld entries\n", out - start);
 
 	return out - start;
 
 error:
-	printk(KERN_INFO "pci_map_sg failed: "
+	printk(KERN_WARNING "pci_map_sg failed: "
 	       "could not allocate dma page tables\n");
 
 	/* Some allocation failed while mapping the scatterlist
@@ -528,6 +534,7 @@ void
 pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents,
 	     int direction)
 {
+	unsigned long flags;
 	struct pci_controller *hose;
 	struct pci_iommu_arena *arena;
 	struct scatterlist *end;
@@ -547,6 +554,9 @@ pci_unmap_sg(struct pci_dev *pdev, struc
 		arena = hose->sg_isa;
 
 	fbeg = -1, fend = 0;
+
+	spin_lock_irqsave(&arena->lock, flags);
+
 	for (end = sg + nents; sg < end; ++sg) {
 		unsigned long addr, size;
 		long npages, ofs;
@@ -586,6 +596,8 @@ pci_unmap_sg(struct pci_dev *pdev, struc
 	*/
 	if ((fend - arena->dma_base) >> PAGE_SHIFT >= arena->next_entry)
 		alpha_mv.mv_pci_tbi(hose, fbeg, fend);
+
+	spin_unlock_irqrestore(&arena->lock, flags);
 
 	DBGA("pci_unmap_sg: %d entries\n", nents - (end - sg));
 }
--- 2.4.5p3/arch/alpha/kernel/setup.c	Fri Mar  2 22:12:07 2001
+++ linux/arch/alpha/kernel/setup.c	Fri May 18 19:43:15 2001
@@ -29,6 +29,7 @@
 #include <linux/string.h>
 #include <linux/ioport.h>
 #include <linux/bootmem.h>
+#include <linux/pci.h>
 
 #ifdef CONFIG_BLK_DEV_INITRD
 #include <linux/blk.h>
@@ -49,7 +50,6 @@ static struct notifier_block alpha_panic
 #include <asm/hwrpb.h>
 #include <asm/dma.h>
 #include <asm/io.h>
-#include <asm/pci.h>
 #include <asm/mmu_context.h>
 #include <asm/console.h>
 

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-18 17:46 alpha iommu fixes Ivan Kokshaysky
@ 2001-05-19  2:34 ` Tom Vier
  2001-05-19 10:48   ` Ivan Kokshaysky
  2001-05-19 13:55 ` Andrea Arcangeli
  2001-05-20  1:11 ` Richard Henderson
  2 siblings, 1 reply; 97+ messages in thread
From: Tom Vier @ 2001-05-19  2:34 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: Richard Henderson, linux-kernel

maybe this third window stuff in cia_enable_broken_tbia() is why i can't
seem to get the third window to open up. from my reading of the 21174 docs,
my code should work. since T2_BASE is at 0x40000000 for 1gig, i'd think
T3_BASE should be at 0x80000000. am i missing something?

	hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x08000000, 32768);
	*(vip)CIA_IOC_PCI_W3_BASE = 0xc0000000 | 1;
	*(vip)CIA_IOC_PCI_W3_MASK = (0x08000000 - 1) & 0xfff00000;
	*(vip)CIA_IOC_PCI_T3_BASE = 0x80000000 >> 2;


On Fri, May 18, 2001 at 09:46:17PM +0400, Ivan Kokshaysky wrote:
> -	*(vip)CIA_IOC_PCI_W3_BASE = CIA_BROKEN_TBI_TRY2_BASE | 3;
> -	*(vip)CIA_IOC_PCI_W3_MASK = (PAGE_SIZE - 1) & 0xfff00000;
> +	*(vip)CIA_IOC_PCI_W3_BASE = CIA_BROKEN_TBIA_BASE | 3;
> +	*(vip)CIA_IOC_PCI_W3_MASK = (CIA_BROKEN_TBIA_SIZE*1024 - 1)
> +				    & 0xfff00000;
>  	*(vip)CIA_IOC_PCI_T3_BASE = virt_to_phys(ppte) >> 2;
>  }


-- 
Tom Vier <tmv5@home.com>
DSA Key id 0x27371A2C

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-19  2:34 ` Tom Vier
@ 2001-05-19 10:48   ` Ivan Kokshaysky
  2001-05-19 20:58     ` Tom Vier
  0 siblings, 1 reply; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-19 10:48 UTC (permalink / raw)
  To: Tom Vier; +Cc: Richard Henderson, linux-kernel

On Fri, May 18, 2001 at 10:34:36PM -0400, Tom Vier wrote:
> 	hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x08000000, 32768);
> 	*(vip)CIA_IOC_PCI_W3_BASE = 0xc0000000 | 1;
> 	*(vip)CIA_IOC_PCI_W3_MASK = (0x08000000 - 1) & 0xfff00000;
> 	*(vip)CIA_IOC_PCI_T3_BASE = 0x80000000 >> 2;

This is incorrect. If you want a directly mapped PCI window then you don't
need the iommu_arena for it. If you want scatter-gather mapping, you
should write the address of the SG page table into the T3_BASE register.

Ivan.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-18 17:46 alpha iommu fixes Ivan Kokshaysky
  2001-05-19  2:34 ` Tom Vier
@ 2001-05-19 13:55 ` Andrea Arcangeli
  2001-05-19 19:11   ` Ivan Kokshaysky
  2001-05-20  1:11 ` Richard Henderson
  2 siblings, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-19 13:55 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: Richard Henderson, linux-kernel

On Fri, May 18, 2001 at 09:46:17PM +0400, Ivan Kokshaysky wrote:
> The most interesting thing here is the pyxis "tbia" fix.
> Whee! I can now copy files from SCSI to bus-master IDE, or
> between two IDE drives on separate channels, or do other nice
> things without hanging lx/sx164. :-)
> The pyxis "tbia" turned out to be broken in a more nastier way
> than one could expect - tech details are commented in the patch.
> 
> Another problem, I think, is that we need extra locking in
> pci_unmap_xx(). It seems to be possible that after the scatter-gather
> table "wraps" and some SG ptes get free, these ptes might be
> immediately allocated and next_entry pointer advanced by pci_map_xx()
> from interrupt or another CPU *before* the test for mv_pci_tbi().
> In this case we'd have stale TLB entries.
> 
> Also small compile fix for 2.4.5-pre3.
> 

I fixed the same race condition in the unmap (pte not flushed after
next_entry was visible) two days ago and it's obviously correct, but it
was not nearly enough here; there was another very nasty race condition
that triggers at least on all ds10 ds20 es40 tsunami/clipper based
boards and that needs to be fixed too to make the machine stable (fixed
yesterday and getting tested today).

Reading the tsunami specs I learnt 1 tlb entry caches 8 pagetables (not 1),
so the tlb flush will be invalidated immediately by any PCI DMA run after
the flush on any of the other 7 mappings cached in the same tlb entry.


This is the fix:

diff -urN alpha-ref/arch/alpha/kernel/pci_iommu.c alpha-works/arch/alpha/kernel/pci_iommu.c
--- alpha-ref/arch/alpha/kernel/pci_iommu.c	Sun Apr  1 01:17:07 2001
+++ alpha-works/arch/alpha/kernel/pci_iommu.c	Fri May 18 18:07:40 2001
@@ -69,7 +69,7 @@
 
 	/* Align allocations to a multiple of a page size.  Not needed
 	   unless there are chip bugs.  */
-	arena->align_entry = 1;
+	arena->align_entry = 8;
 
 	return arena;
 }
@@

However, this is just the production fix; the real fix will only change
that for the tsunami chipset.

Since I didn't want to deal with the optimizations yet I also disabled
the optimizations (I will audit them shortly). Then I fixed
at least the eepro100 driver to check if it runs out of pci map entries (all
drivers out there are broken, they don't check the retval from pci_map*
etc...).
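
As an illustration of that kind of check, here is a minimal sketch with
made-up "foo" names -- this is not the actual eepro100 change, just the
shape of it: drop the packet when pci_map_single() returns 0 instead of
handing a bogus bus address to the chip.

#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct foo_private {                    /* hypothetical driver state */
        struct pci_dev *pdev;
        struct net_device_stats stats;
};

static int foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct foo_private *fp = dev->priv;
        dma_addr_t mapping;

        /* As of 2.4.5-pre, a return value of 0 is the failure indication. */
        mapping = pci_map_single(fp->pdev, skb->data, skb->len,
                                 PCI_DMA_TODEVICE);
        if (mapping == 0) {
                /* Out of IOMMU space: drop rather than DMA to garbage. */
                fp->stats.tx_dropped++;
                dev_kfree_skb(skb);
                return 0;
        }

        /* ... fill the tx descriptor with 'mapping' and kick the chip ... */
        return 0;
}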

then I also enlarged the pci SG space to 1G because running out of entries
right now breaks the whole world:

@@ -358,7 +360,7 @@
 	 * address range.
 	 */
 	hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 0);
-	hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x08000000, 0);
+	hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x40000000, 0);
 	__direct_map_base = 0x40000000;
 	__direct_map_size = 0x80000000;
 
diff

With all this stuff plus the same fix you posted the es40 8g runs rock
solid on top of 2.4.5pre3aa1.

I was going to wait to clean up all those fixes but I'm posting this half
corrupted email here now just so we don't duplicate further efforts ;)

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-19 13:55 ` Andrea Arcangeli
@ 2001-05-19 19:11   ` Ivan Kokshaysky
  2001-05-20  2:40     ` Andrea Arcangeli
  0 siblings, 1 reply; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-19 19:11 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Richard Henderson, linux-kernel

On Sat, May 19, 2001 at 03:55:02PM +0200, Andrea Arcangeli wrote:
> Reading the tsunami specs I learnt 1 tlb entry caches 8 pagetables (not 1),
> so the tlb flush will be invalidated immediately by any PCI DMA run after
> the flush on any of the other 7 mappings cached in the same tlb entry.

I have neither tsunami docs nor the tsunami box to play with :-(
so my guesses might be totally wrong...
But -- assuming that tsunami is similar to cia/pyxis, that is incorrect.
We're invalidating not the cached ptes, but the TLB tags, with all 4 (on
pyxis, and 8 on tsunami, I guess) associated ptes. The reason why we
align new entries at 4*PAGE_SIZE on cia/pyxis is a hardware bug -- if cached
pte is invalid, it doesn't cause TLB miss. I wouldn't be surprised at all if
tsunami has the same bug; in this case your fix is urgently needed, of course.
BTW, look at Richard's code in core_cia.c/verify_tb_operation() for
"valid tag invalid pte reload" test, it could be easily ported to tsunami.

> then I also enlarged the pci SG space to 1G because running out of entries
> right now breaks the whole world:

It would just delay the painful death, I think ;-)
I'm almost sure that all these "pci_map_sg failed" reports are caused
by some buggy driver[s], which calls pci_map_xx() without proper
pci_unmap_xx(). This is harmless on i386, and on alpha if all IO is going
through direct-access windows.
I've got some debugging code checking for this (perhaps it's worth
posting or even porting to i386 ;-)
For now I can confirm that all drivers I'm currently using are fine
wrt pci_map/unmap:
3c59x, tulip, sym53c8xx, IDE.
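
One way such a map/unmap balance check could look -- purely a sketch with
made-up names and an arbitrary warning threshold, not the debugging code
referred to above:

#include <linux/kernel.h>
#include <asm/atomic.h>

static atomic_t foo_outstanding_maps = ATOMIC_INIT(0);

/* bump from pci_map_single()/pci_map_sg() on every successful mapping */
static inline void foo_note_map(void)
{
        atomic_inc(&foo_outstanding_maps);
        if (atomic_read(&foo_outstanding_maps) > 1024)
                printk(KERN_WARNING "pci_map: %d mappings outstanding - "
                       "is some driver missing a pci_unmap?\n",
                       atomic_read(&foo_outstanding_maps));
}

/* and drop from pci_unmap_single()/pci_unmap_sg() */
static inline void foo_note_unmap(void)
{
        atomic_dec(&foo_outstanding_maps);
}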

Ivan.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-19 10:48   ` Ivan Kokshaysky
@ 2001-05-19 20:58     ` Tom Vier
  0 siblings, 0 replies; 97+ messages in thread
From: Tom Vier @ 2001-05-19 20:58 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: Richard Henderson, linux-kernel

On Sat, May 19, 2001 at 02:48:15PM +0400, Ivan Kokshaysky wrote:
> This is incorrect. If you want a directly mapped PCI window then you don't
> need the iommu_arena for it. If you want scatter-gather mapping, you
> should write the address of the SG page table into the T3_BASE register.

i've tried both direct mapped and sg, but i still get pci_map_sg() failures
in sym53c8xx. the sg version, below, won't boot (scsi commands all timeout).
while the added direct map version does boot, it suffers the same problem as
the stock code.

third direct map:
	hose->sg_pci = NULL;
	hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 32768);
	__direct_map_base = 0x40000000;
	__direct_map_size = 0x88000000;

	*(vip)CIA_IOC_PCI_W3_BASE = 0xc0000000 | 1;
	*(vip)CIA_IOC_PCI_W3_MASK = (0x08000000 - 1) & 0xfff00000;
	*(vip)CIA_IOC_PCI_T3_BASE = 0x80000000 >> 2;

sg (doesn't work):

	hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 32768);
	hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x08000000, 32768);
	__direct_map_base = 0x40000000;
	__direct_map_size = 0x80000000;

	*(vip)CIA_IOC_PCI_W3_BASE = hose->sg_pci->dma_base | 3;
	*(vip)CIA_IOC_PCI_W3_MASK = (hose->sg_pci->size - 1) & 0xfff00000;
	*(vip)CIA_IOC_PCI_T3_BASE = virt_to_phys(hose->sg_pci->ptes) >> 2;

-- 
Tom Vier <tmv5@home.com>
DSA Key id 0x27371A2C

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-18 17:46 alpha iommu fixes Ivan Kokshaysky
  2001-05-19  2:34 ` Tom Vier
  2001-05-19 13:55 ` Andrea Arcangeli
@ 2001-05-20  1:11 ` Richard Henderson
  2001-05-20 12:05   ` Ivan Kokshaysky
  2 siblings, 1 reply; 97+ messages in thread
From: Richard Henderson @ 2001-05-20  1:11 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: linux-kernel

On Fri, May 18, 2001 at 09:46:17PM +0400, Ivan Kokshaysky wrote:
> -void
> -cia_pci_tbi(struct pci_controller *hose, dma_addr_t start, dma_addr_t end)
> -{
> -	wmb();
> -	*(vip)CIA_IOC_PCI_TBIA = 3;	/* Flush all locked and unlocked.  */
> -	mb();
> -	*(vip)CIA_IOC_PCI_TBIA;
> -}

I'd rather keep this around.  It should be possible to use on CIA2.

> +/* Even if the tbia works, we cannot use it. It effectively locks the
> + * chip (as well as direct write to the tag registers) if there is a
> + * SG DMA operation in progress. This is true at least for PYXIS rev. 1.

Uggg.  How did you discover this?

> +/*	__save_and_cli(flags);	Don't need this -- we're called from
> +				pci_unmap_xx() or iommu_arena_alloc()
> +				with IPL_MAX after spin_lock_irqsave() */

Just delete it, don't comment it out.  You might mention in the
function header comment that we're called with interrupts disabled.

>  	*(vip)CIA_IOC_CIA_CTRL = ctrl;
>  	mb();
> -	*(vip)CIA_IOC_CIA_CTRL;
> -	mb();

I'm pretty sure you don't want to do this.  You're risking a
subsequent i/o being posted through the PCI bridge before this
takes effect.  An "mb" is only effective inside the CPU.



r~

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-19 19:11   ` Ivan Kokshaysky
@ 2001-05-20  2:40     ` Andrea Arcangeli
  2001-05-20 12:12       ` Ivan Kokshaysky
       [not found]       ` <3B07AF49.5A85205F@uow.edu.au>
  0 siblings, 2 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-20  2:40 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: Richard Henderson, linux-kernel

On Sat, May 19, 2001 at 11:11:31PM +0400, Ivan Kokshaysky wrote:
> On Sat, May 19, 2001 at 03:55:02PM +0200, Andrea Arcangeli wrote:
> > Reading the tsunami specs I learnt 1 tlb entry caches 8 pagetables (not 1),
> > so the tlb flush will be invalidated immediately by any PCI DMA run after
> > the flush on any of the other 7 mappings cached in the same tlb entry.
> 
> I have neither tsunami docs nor the tsunami box to play with :-(
> so my guesses might be totally wrong...
> But -- assuming that tsunami is similar to cia/pyxis, that is incorrect.
> We're invalidating not the cached ptes, but the TLB tags, with all 4 (on
> pyxis, and 8 on tsunami, I guess) associated ptes. The reason why we

exactly.

> align new entries at 4*PAGE_SIZE on cia/pyxis is a hardware bug -- if cached
> pte is invalid, it doesn't cause TLB miss. I wouldn't be surprised at all if
> tsunami has the same bug; in this case your fix is urgently needed, of course.

It only depends on the specs if it has to be called a bug or a feature.

> BTW, look at Richard's code in core_cia.c/verify_tb_operation() for
> "valid tag invalid pte reload" test, it could be easily ported to tsunami.

I didn't check very closely what this code is doing but it seems it's
not triggering any DMA transaction from a DMA bus master so it shouldn't
be able to trigger the race, and as far as I can tell as soon as you do DMA
on an invalid pagetable cached in a tlb the machine will lock hard. So I
expect if you try to probe whether you need the 8 alignment at runtime you
won't be able to finish the probe ;).

> > then I also enlarged the pci SG space to 1G because running out of entries
> > right now breaks the whole world:
> 
> It would just delay the painful death, I think ;-)

I _have_ to completely hide the painful death because as soon as I run
out of entries the machine crashes immediately because lots of drivers
aren't checking for running out of ptes.

Fixing that takes some thought: it may need a partial redesign of the
driver so you can take a fail path where you couldn't previously, in
some places you may need to re-issue a softirq later to try again, in
others you must run_task_queue(&tq_disk) and sched_yield and try again
later (you have the guarantee those entries will become available again,
so doing that is not deadlock prone), in other places you can just
drop the skb.  Each driver has to be fixed in its own way. It seems
for a lot of cases people just replaced the virt_to_bus with the
pci_map_single and they didn't think pci_map_single may even return
0, which _doesn't_ mean bus address 0 ;)

The reason it's not too bad to hide it is that you can usually calculate
a high bound on how many pci mappings a certain given machine may need
at the same time at runtime, so I can give you the guarantee that you
won't be able to reproduce any of those driver bugs on a certain
given machine. This is the only point of the change: just to get this
guarantee on a larger subset of machines until all those bugs are fixed.

> I'm almost sure that all these "pci_map_sg failed" reports are caused
> by some buggy driver[s], which calls pci_map_xx() without proper
> pci_unmap_xx(). This is harmless on i386, and on alpha if all IO is going

I'm not talking about that kind of leak bug. The fact some driver is
leaking ptes is a completely different kind of bug.

I was only talking about when you get the "pci_map_sg failed" because
you have not 3 but 300 scsi disks connected to your system and you are
writing to all of them at the same time allocating zillions of ptes, and one
of your drivers (possibly not even a storage driver) is actually not
checking the retval of the pci_map_* functions. You don't need a pte
memleak to trigger it, even regardless of the fact I grew the dynamic
window to 1G which makes it 8 times harder to trigger than in mainline.

For now, with a couple of disks and a few nics and 1G of dynamic
window size it doesn't trigger, and the 1G thing gives a fairly large
margin for most machines out there. I couldn't care less about the 2M-128k
memory wastage at this point in time, but as I said I wanted at least
to optimize the 2M pte arena allocation away completely if the machine
has less than 2G; that would be a very worthwhile optimization.

> I've got some debugging code checking for this (perhaps it's worth
> posting or even porting to i386 ;-)
> For now I can confirm that all drivers I'm currently using are fine
> wrt pci_map/unmap:
> 3c59x, tulip, sym53c8xx, IDE.

all the drivers I'm using are definitely not leaking pte entries either,
but if you give me not 10 but 1000 scsi disks be sure I will trigger the
missing checks for pci_map_* retval without the need of any driver
leaking ptes.

The bug you are talking about is even worse and I never experienced it
myself, but I agree it's likely some driver has it too because it's not
reproducible on x86, so if you have the x86 debugging code that's
welcome so we will be able to reproduce it on a larger variety of
machines. thanks!

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20  1:11 ` Richard Henderson
@ 2001-05-20 12:05   ` Ivan Kokshaysky
  2001-05-21  0:37     ` Richard Henderson
  0 siblings, 1 reply; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-20 12:05 UTC (permalink / raw)
  To: linux-kernel

On Sat, May 19, 2001 at 06:11:27PM -0700, Richard Henderson wrote:
> I'd rather keep this around.  It should be possible to use on CIA2.

Ok. What do you think about reorg like this:
basically leave the old code as is, and add
        if (is_pyxis)
                alpha_mv.mv_pci_tbi = cia_pci_tbi_try2;
        else
                tbia test
                ...
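
Spelled out a bit (only a sketch of the proposed structure, reusing the
pre-patch cia_pci_tbi/cia_pci_tbi_try1 probing shown above; the exact
placement in verify_tb_operation()/do_init_arch() is left open):

        if (is_pyxis) {
                /* tbia and direct tag writes hang PYXIS while SG DMA is
                   in flight, so always use the loopback-based flush. */
                alpha_mv.mv_pci_tbi = cia_pci_tbi_try2;
        } else {
                /* keep the old probing: try the real tbia first, fall
                   back to cia_pci_tbi_try1 if it doesn't invalidate. */
                cia_pci_tbi(arena->hose, 0, -1);
                if (*(vip)CIA_IOC_TB_TAGn(0) & 1)
                        alpha_mv.mv_pci_tbi = cia_pci_tbi_try1;
        }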

> Uggg.  How did you discover this?

21174 docs confirm that (though in a very low voice ;-) :
 "The 21174 may hang with TBIA=3."
It hangs with TBIA=2 as well. I was able to reproduce it reliably
on sx164 with direct windows disabled just by copying 10-20 Mb via 3c905b
card -- this driver allocates/frees pci buffers at a very high
rate, so "tbia" occurs pretty often.
The fix itself took 2 days of hacking and 50+ reboots...

> Just delete it, don't comment it out.  You might mention in the
> function header comment that we're called with interrupts disabled.

Ok.

> > -	*(vip)CIA_IOC_CIA_CTRL;
> > -	mb();
> 
> I'm pretty sure you don't want to do this.

Right... I noticed these deleted lines only after posting the patch.

Ivan.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20  2:40     ` Andrea Arcangeli
@ 2001-05-20 12:12       ` Ivan Kokshaysky
  2001-05-20 13:40         ` Andrea Arcangeli
  2001-05-20 14:23         ` Gérard Roudier
       [not found]       ` <3B07AF49.5A85205F@uow.edu.au>
  1 sibling, 2 replies; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-20 12:12 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Richard Henderson, linux-kernel

On Sun, May 20, 2001 at 04:40:13AM +0200, Andrea Arcangeli wrote:
> I was only talking about when you get the "pci_map_sg failed" because
> you have not 3 but 300 scsi disks connected to your system and you are
> writing to all of them at the same time allocating zillions of ptes, and one
> of your drivers (possibly not even a storage driver) is actually not
> checking the retval of the pci_map_* functions. You don't need a pte
> memleak to trigger it, even regardless of the fact I grew the dynamic
> window to 1G which makes it 8 times harder to trigger than in mainline.

I think you're too pessimistic. Don't mix "disks" and "controllers" --
SCSI adapter with 10 drives attached is a single DMA agent, not 10 agents.

If you're so concerned about Big Iron, go ahead and implement 64-bit PCI
support, it would be right long-term solution. I'm pretty sure that
high-end servers use mostly this kind of hardware.

Oh, well. This doesn't mean that I disagree with what you said. :-)
Driver writers must realize that pci mappings are limited resources.

Ivan.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 12:12       ` Ivan Kokshaysky
@ 2001-05-20 13:40         ` Andrea Arcangeli
  2001-05-20 14:23         ` Gérard Roudier
  1 sibling, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-20 13:40 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: Richard Henderson, linux-kernel

On Sun, May 20, 2001 at 04:12:34PM +0400, Ivan Kokshaysky wrote:
> On Sun, May 20, 2001 at 04:40:13AM +0200, Andrea Arcangeli wrote:
> > I was only talking about when you get the "pci_map_sg failed" because
> > you have not 3 but 300 scsi disks connected to your system and you are
> > writing to all of them at the same time allocating zillions of ptes, and one
> > of your drivers (possibly not even a storage driver) is actually not
> > checking the retval of the pci_map_* functions. You don't need a pte
> > memleak to trigger it, even regardless of the fact I grew the dynamic
> > window to 1G which makes it 8 times harder to trigger than in mainline.
> 
> I think you're too pessimistic. Don't mix "disks" and "controllers" --

I'm not pessimistic, I'm fairly relaxed also with a 512Mbyte dynamic window
(that's why I did the change in the first place) and I agree that it should
take care of hiding all those bugs on 99% of hardware configurations,
but OTOH I don't want things to work by luck and I'd prefer that the real
bugs get fixed as well eventually.

> SCSI adapter with 10 drives attached is a single DMA agent, not 10 agents.

you can do simultaneous I/O to all the disks, so you will keep those dma
entries for the SG for each disk in-use at the same time.

> If you're so concerned about Big Iron, go ahead and implement 64-bit PCI
> support, it would be right long-term solution. I'm pretty sure that
> high-end servers use mostly this kind of hardware.

Certainly 64bit pci is supported but that doesn't change the fact you
can as well have 32bit devices on those boxes. 

> Oh, well. This doesn't mean that I disagree with what you said. :-)
> Driver writers must realize that pci mappings are limited resources.

Exactly.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
       [not found]       ` <3B07AF49.5A85205F@uow.edu.au>
@ 2001-05-20 13:49         ` Andrea Arcangeli
  2001-05-20 14:05           ` Andrew Morton
                             ` (3 more replies)
  0 siblings, 4 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-20 13:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ivan Kokshaysky, Richard Henderson, linux-kernel

[ cc'ed to l-k ]

> DMA-mapping.txt assumes that it cannot fail.

DMA-mapping.txt is wrong. Both pci_map_sg and pci_map_single failed if
they returned zero. You either have to drop the skb or to try again later
if they returns zero.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 13:49         ` Andrea Arcangeli
@ 2001-05-20 14:05           ` Andrew Morton
  2001-05-20 14:33             ` Andrea Arcangeli
  2001-05-21  1:01             ` David S. Miller
  2001-05-20 16:18           ` Andrea Arcangeli
                             ` (2 subsequent siblings)
  3 siblings, 2 replies; 97+ messages in thread
From: Andrew Morton @ 2001-05-20 14:05 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Ivan Kokshaysky, Richard Henderson, linux-kernel

Andrea Arcangeli wrote:
> 
> [ cc'ed to l-k ]
> 
> > DMA-mapping.txt assumes that it cannot fail.
> 
> DMA-mapping.txt is wrong. Both pci_map_sg and pci_map_single failed if
> they returned zero. You either have to drop the skb or to try again later
> if they returns zero.
> 

Well this is news to me.  No drivers understand this.
How long has this been the case?  What platforms?


For netdevices at least, the pci_map_single() call is always close
to the site of the skb allocation.  So what we can do is to roll
them together and use the existing oom-handling associated with alloc_skb(),
assuming the driver has it...


static inline struct sk_buff *pci_alloc_skb(
                int length, int gfp_mask, int reserve,
                struct pci_dev *pdev, dma_addr_t *dma_addr, int direction, int irq)
{
        struct sk_buff *skb = alloc_skb(length + 16, gfp_mask);
        if (skb) {
                skb_reserve(skb, 16 + reserve);
                *dma_addr = pci_map_single(pdev, skb->tail, length - reserve, direction);
                if (*dma_addr == 0) {   /* mapping failed */
                        if (irq)
                                dev_kfree_skb_irq(skb);
                        else
                                dev_kfree_skb(skb);
                        skb = NULL;
                }
        }
        return skb;
}

Sticky, but a lot of it will be compiled away.
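
For illustration, a receive-ring refill would then collapse to something
like this (the foo_* names, ring size and buffer size are invented, not
taken from a real driver; irq == 0 assumes we are not called from the
interrupt handler):

static void foo_rx_refill(struct foo_private *fp)
{
        /* assumes fp has pdev, rx_skb[] and rx_dma[] fields */
        int i;

        for (i = 0; i < FOO_RX_RING_SIZE; i++) {
                if (fp->rx_skb[i])
                        continue;
                fp->rx_skb[i] = pci_alloc_skb(FOO_PKT_BUF_SZ, GFP_ATOMIC, 2,
                                              fp->pdev, &fp->rx_dma[i],
                                              PCI_DMA_FROMDEVICE, 0);
                if (fp->rx_skb[i] == NULL)
                        break;  /* out of memory or out of IOMMU space */
                /* hand fp->rx_dma[i] to the chip's rx descriptor here */
        }
}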

-

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 12:12       ` Ivan Kokshaysky
  2001-05-20 13:40         ` Andrea Arcangeli
@ 2001-05-20 14:23         ` Gérard Roudier
  1 sibling, 0 replies; 97+ messages in thread
From: Gérard Roudier @ 2001-05-20 14:23 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: Andrea Arcangeli, Richard Henderson, linux-kernel



On Sun, 20 May 2001, Ivan Kokshaysky wrote:

> On Sun, May 20, 2001 at 04:40:13AM +0200, Andrea Arcangeli wrote:
> > I was only talking about when you get the "pci_map_sg failed" because
> > you have not 3 but 300 scsi disks connected to your system and you are
> > writing to all of them at the same time allocating zillions of ptes, and one
> > of your drivers (possibly not even a storage driver) is actually not
> > checking the retval of the pci_map_* functions. You don't need a pte
> > memleak to trigger it, even regardless of the fact I grew the dynamic
> > window to 1G which makes it 8 times harder to trigger than in mainline.
> 
> I think you're too pessimistic. Don't mix "disks" and "controllers" --
> SCSI adapter with 10 drives attached is a single DMA agent, not 10 agents.
> 
> If you're so concerned about Big Iron, go ahead and implement 64-bit PCI
> support, it would be right long-term solution. I'm pretty sure that
> high-end servers use mostly this kind of hardware.
> 
> Oh, well. This doesn't mean that I disagree with what you said. :-)
> Driver writers must realize that pci mappings are limited resources.

The IOMMU code allocation strategy is bound to fail due to
fragmentation, as is everything that performs contiguous allocations of
variable quantities.

I may add a test of the pci_map_* return code in the sym53c8xx driver, but
the driver will panic on failure. It is not acceptable to consider this
kind of failure a normal situation (returning some ?_BUSY status to
the SCSI driver) for the following reasons:

- IOs may be reordered and break upper layers assumptions.
- Spurious errors and even BUS resets may happen.

For now, driver methods that are requested to queue IOs are not allowed to
wait for resources. Anyway, the pci_map_* interface is unable to wait.

There are obviously ways to deal gracefully with such a resource shortage,
but the current SCSI layer doesn't provide for that. For example, a
freeze/unfreeze mechanism as described in CAM could be implemented in order
not to reorder IOs, and some mechanism (callback, resource wait, etc...)
must be added to restart the operation when the resource is likely to be
available.

IMO, the only acceptable fix in the current kernel is to perform IOMMU PTE
allocations of a fixed quantity at a time, for example by limiting each SG
entry to fit in a single PAGE.

  Gérard.

PS: Maybe I should look at how the *BSDs handle IOMMUs.


^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 14:05           ` Andrew Morton
@ 2001-05-20 14:33             ` Andrea Arcangeli
  2001-05-21  1:01             ` David S. Miller
  1 sibling, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-20 14:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 12:05:20AM +1000, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> > 
> > [ cc'ed to l-k ]
> > 
> > > DMA-mapping.txt assumes that it cannot fail.
> > 
> > DMA-mapping.txt is wrong. Both pci_map_sg and pci_map_single failed if
> > they returned zero. You either have to drop the skb or to try again later
> > if they returns zero.
> > 
> 
> Well this is news to me.  No drivers understand this.

Yes, almost all drivers are buggy.

> How long has this been the case?  What platforms?

Always and all platforms.

Just think about this: you have 2^32 bytes of bus address space, and you
theoretically can start I/O on more than 2^32 bytes of phys memory, see?
Whatever platform it is, it will never be able to guarantee all mappings
to succeed.

> For netdevices at least, the pci_map_single() call is always close
> to the site of the skb allocation.  So what we can do is to roll
> them together and use the existing oom-handling associated with alloc_skb(),
> assuming the driver has it...

Fine.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 13:49         ` Andrea Arcangeli
  2001-05-20 14:05           ` Andrew Morton
@ 2001-05-20 16:18           ` Andrea Arcangeli
  2001-05-20 16:21             ` Andrew Morton
  2001-05-20 17:16             ` Jeff Garzik
  2001-05-21  1:00           ` David S. Miller
  2001-05-21  1:03           ` David S. Miller
  3 siblings, 2 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-20 16:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ivan Kokshaysky, Richard Henderson, linux-kernel

On Sun, May 20, 2001 at 03:49:58PM +0200, Andrea Arcangeli wrote:
> they returned zero. You either have to drop the skb or to try again later
> if they returns zero.

BTW, pci_map_single is not a nice interface, it cannot return bus
address 0, so once we start the fixage it is probably better to change
the interface as well to get either the error or the bus address via a
pointer passed to the function.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 16:18           ` Andrea Arcangeli
@ 2001-05-20 16:21             ` Andrew Morton
  2001-05-20 16:44               ` Andrea Arcangeli
  2001-05-20 17:16             ` Jeff Garzik
  1 sibling, 1 reply; 97+ messages in thread
From: Andrew Morton @ 2001-05-20 16:21 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Ivan Kokshaysky, Richard Henderson, linux-kernel

Andrea Arcangeli wrote:
> 
> On Sun, May 20, 2001 at 03:49:58PM +0200, Andrea Arcangeli wrote:
> > they returned zero. You either have to drop the skb or to try again later
> > if they returns zero.
> 
> BTW, pci_map_single is not a nice interface, it cannot return bus
> address 0, so once we start the fixage it is probably better to change
> the interface as well to get either the error or the bus address via a
> pointer passed to the function.
> 

Would it not be sufficient to define a machine-specific
macro which queries it for error?  On x86 it would be:

#define BUS_ADDR_IS_ERR(addr)	((addr) == 0)

I can't find *any* pci_map_single() in the 2.4.4-ac9 tree
which can fail, BTW.

-

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 16:21             ` Andrew Morton
@ 2001-05-20 16:44               ` Andrea Arcangeli
  2001-05-20 16:54                 ` Andrew Morton
  0 siblings, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-20 16:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 02:21:18AM +1000, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> Would it not be sufficient to define a machine-specific
> macro which queries it for error?  On x86 it would be:
> 
> #define BUS_ADDR_IS_ERR(addr)	((addr) == 0)

that would be more flexible at least, however not mixing the error with
a potential bus address still looks cleaner to me.

> I can't find *any* pci_map_single() in the 2.4.4-ac9 tree
> which can fail, BTW.

I assume you mean that no one single caller of pci_map_single is
checking if it failed or not (because all pci_map_single can fail).

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 16:44               ` Andrea Arcangeli
@ 2001-05-20 16:54                 ` Andrew Morton
  2001-05-20 17:12                   ` Andrea Arcangeli
  2001-05-21  1:07                   ` David S. Miller
  0 siblings, 2 replies; 97+ messages in thread
From: Andrew Morton @ 2001-05-20 16:54 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Ivan Kokshaysky, Richard Henderson, linux-kernel

Andrea Arcangeli wrote:
> 
> > I can't find *any* pci_map_single() in the 2.4.4-ac9 tree
> > which can fail, BTW.
> 
> I assume you mean that no one single caller of pci_map_single is
> checking if it failed or not (because all pci_map_single can fail).

No.  Most of the pci_map_single() implementations just
use virt_to_bus()/virt_to_phys().  Even sparc64's fancy
iommu-based pci_map_single() always succeeds.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 16:54                 ` Andrew Morton
@ 2001-05-20 17:12                   ` Andrea Arcangeli
  2001-05-21  1:07                   ` David S. Miller
  1 sibling, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-20 17:12 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 02:54:16AM +1000, Andrew Morton wrote:
> No.  Most of the pci_map_single() implementations just
> use virt_to_bus()/virt_to_phys(). [..]

then you are saying that on the platforms without an iommu the pci_map_*
functions cannot fail, of course; furthermore, even a missing pci_unmap cannot
trigger an iommu address space leak on those platforms. That has nothing
to do with the fact that pci_map_single can fail or not; the device drivers
are not architecture specific.

> [..]  Even sparc64's fancy
> iommu-based pci_map_single() always succeeds.

Whatever sparc64 does to hide the driver bugs you can break it if you
pci_map 4G+1 bytes of physical memory.  Otherwise it means it's sleeping
or looping inside the pci_map functions which would break things in
another manner.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 16:18           ` Andrea Arcangeli
  2001-05-20 16:21             ` Andrew Morton
@ 2001-05-20 17:16             ` Jeff Garzik
  2001-05-20 17:37               ` Andrea Arcangeli
  1 sibling, 1 reply; 97+ messages in thread
From: Jeff Garzik @ 2001-05-20 17:16 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

Andrea Arcangeli wrote:
> 
> On Sun, May 20, 2001 at 03:49:58PM +0200, Andrea Arcangeli wrote:
> > they returned zero. You either have to drop the skb or to try again later
> > if they returns zero.
> 
> BTW, pci_map_single is not a nice interface, it cannot return bus
> address 0, 

who says?

A value of zero for the mapping is certainly an acceptable value, and it
should be handled by drivers.

In fact it's an open bug in a couple of net drivers that they check the
mapping to see if it is non-zero...

-- 
Jeff Garzik      | "Do you have to make light of everything?!"
Building 1024    | "I'm extremely serious about nailing your
MandrakeSoft     |  step-daughter, but other than that, yes."

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 17:16             ` Jeff Garzik
@ 2001-05-20 17:37               ` Andrea Arcangeli
  0 siblings, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-20 17:37 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Sun, May 20, 2001 at 01:16:25PM -0400, Jeff Garzik wrote:
> Andrea Arcangeli wrote:
> > 
> > On Sun, May 20, 2001 at 03:49:58PM +0200, Andrea Arcangeli wrote:
> > > they returned zero. You either have to drop the skb or to try again later
> > > if they returns zero.
> > 
> > BTW, pci_map_single is not a nice interface, it cannot return bus
> > address 0, 
> 
> who says?
> 
> A value of zero for the mapping is certainly an acceptable value, and it
> should be handled by drivers.

this is exactly why I'm saying pci_map_single currently is ugly in
declaring a retval of 0 as an error: as you also explicitly said
above, bus address 0 is a perfectly valid bus address. So my whole point is
that I'd prefer to change the API of pci_map_single to report failure
not by returning 0, like it does right now in 2.4.5pre3 and all previous 2.4
kernels, but via a parameter, so that bus address zero becomes a valid bus
address again (which it isn't right now).
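
One possible shape for such an interface -- purely illustrative, not an
agreed-upon API, and the name is made up:

/* return 0 on success and a negative error otherwise; the bus address
   comes back through *bus_addr, so 0 is a legal bus address again. */
int pci_map_single_err(struct pci_dev *pdev, void *cpu_addr, size_t size,
                       int direction, dma_addr_t *bus_addr);

/* caller side: */
        if (pci_map_single_err(pdev, skb->data, skb->len,
                               PCI_DMA_TODEVICE, &mapping) < 0) {
                dev_kfree_skb(skb);     /* or defer and retry later */
                return 0;
        }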

> In fact it's an open bug in a couple of net drivers that they check the
> mapping to see if it is non-zero...

if a driver is catching the failure of pci_map_single by checking if the
bus address returned is zero, such a driver is one of the few (or the only
one) correct drivers out there.

As it stands right now a bus address of 0 means pci_map_single failed.

For pci_map_sg if it returns zero it means it failed too.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 12:05   ` Ivan Kokshaysky
@ 2001-05-21  0:37     ` Richard Henderson
  0 siblings, 0 replies; 97+ messages in thread
From: Richard Henderson @ 2001-05-21  0:37 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: linux-kernel

On Sun, May 20, 2001 at 04:05:18PM +0400, Ivan Kokshaysky wrote:
> Ok. What do you think about reorg like this:
> basically leave the old code as is, and add
>         if (is_pyxis)
>                 alpha_mv.mv_pci_tbi = cia_pci_tbi_try2;
>         else
>                 tbia test
>                 ...

Seems good.

> 21174 docs confirm that (though in a very low voice ;-) :
>  "The 21174 may hang with TBIA=3."

Mmm.  Tasty.  :-)


r~

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 13:49         ` Andrea Arcangeli
  2001-05-20 14:05           ` Andrew Morton
  2001-05-20 16:18           ` Andrea Arcangeli
@ 2001-05-21  1:00           ` David S. Miller
  2001-05-21  7:47             ` Alan Cox
  2001-05-21  7:53             ` David S. Miller
  2001-05-21  1:03           ` David S. Miller
  3 siblings, 2 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  1:00 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Andrea Arcangeli, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrew Morton writes:
 > Well this is news to me.  No drivers understand this.
 > How long has this been the case?  What platforms?

The DMA interfaces may never fail and I've discussed this over and
over with port maintainers a _long_ time ago.

Don't worry Andrew.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 14:05           ` Andrew Morton
  2001-05-20 14:33             ` Andrea Arcangeli
@ 2001-05-21  1:01             ` David S. Miller
  2001-05-21  1:47               ` Andrea Arcangeli
  2001-05-21  7:05               ` David S. Miller
  1 sibling, 2 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  1:01 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > > Well this is news to me.  No drivers understand this.
 > 
 > Yes, almost all drivers are buggy.

No, the interface says that the DMA routines may not return failure.

If you want to change the DMA api to act some other way, then fine
please propose it, but do not act as if today they have to act this
way because that is simply not true.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 13:49         ` Andrea Arcangeli
                             ` (2 preceding siblings ...)
  2001-05-21  1:00           ` David S. Miller
@ 2001-05-21  1:03           ` David S. Miller
  2001-05-21  1:58             ` Richard Henderson
  3 siblings, 1 reply; 97+ messages in thread
From: David S. Miller @ 2001-05-21  1:03 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > On Sun, May 20, 2001 at 03:49:58PM +0200, Andrea Arcangeli wrote:
 > > they returned zero. You either have to drop the skb or to try again later
 > > if they returns zero.
 > 
 > BTW, pci_map_single is not a nice interface, it cannot return bus
 > address 0, so once we start the fixage it is probably better to change
 > the interface as well to get either the error or the bus address via a
 > pointer passed to the function.

No, pci_map_single is a fine interface.  What is lacking is an
"INVALID_DMA_ADDR" define for each platform, and I've known about
this for some time.

But for the time being, everyone assumes address zero is not valid and
it shouldn't be too painful to reserve the first page of DMA space
until we fix this issue.
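
On alpha, for instance, that could be as small as marking the arena's
first entry as permanently in use when the arena is created, so the very
first page of the SG window is never handed out (a sketch against the
pci_iommu.c arena code discussed above, assuming a zero pte still means
"free" to the allocator; ~0UL as a reserved marker is made up here):

        /* in iommu_arena_new(), after the pte table is allocated */
        arena->ptes[0] = ~0UL;          /* reserved, never a valid SG pte */
        arena->next_entry = 1;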

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-20 16:54                 ` Andrew Morton
  2001-05-20 17:12                   ` Andrea Arcangeli
@ 2001-05-21  1:07                   ` David S. Miller
  2001-05-21  1:37                     ` Andrea Arcangeli
                                       ` (3 more replies)
  1 sibling, 4 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  1:07 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > > [..]  Even sparc64's fancy
 > > iommu-based pci_map_single() always succeeds.
 > 
 > Whatever sparc64 does to hide the driver bugs you can break it if you
 > pci_map 4G+1 bytes of physical memory.

Which is an utterly stupid thing to do.

Please construct a plausible situation where this would occur legally
and not be a driver bug, given the maximum number of PCI busses and
slots found on sparc64 and the maximum _concurrent_ usage of PCI dma
space for any given driver (which isn't doing something stupid).

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:07                   ` David S. Miller
@ 2001-05-21  1:37                     ` Andrea Arcangeli
  2001-05-21  6:53                     ` David S. Miller
                                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21  1:37 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Sun, May 20, 2001 at 06:07:17PM -0700, David S. Miller wrote:
> 
> Andrea Arcangeli writes:
>  > > [..]  Even sparc64's fancy
>  > > iommu-based pci_map_single() always succeeds.
>  > 
>  > Whatever sparc64 does to hide the driver bugs you can break it if you
>  > pci_map 4G+1 bytes of physical memory.
> 
> Which is an utterly stupid thing to do.
> 
> Please construct a plausible situation where this would occur legally
> and not be a driver bug, given the maximum number of PCI busses and
> slots found on sparc64 and the maximum _concurrent_ usage of PCI dma
> space for any given driver (which isn't doing something stupid).

Assume I have a dozen PCI cards that do DMA using SG tables that
can map up to some hundred mbytes of ram each, so I can just program
the cards to start the dma on those hundred mbyte of ram. Most of the
time the I/O is not simultaneous, but very rarely it happens to be
simultaneous and in turn it tries to pci_map_sg more than 4G of physical
ram. After that the sparc64 iommu code will say "bye bye" and the machine
will crash because the nic driver is not checking for pci_map_single
failures.

I don't see why the above scenario should be classified as stupid. The
pci_map_* API and the device drivers have to be generic.

It's as if you told me that there's no need to check for
alloc_pages(GFP_ATOMIC) failures in the device drivers because all
machines you are using have 256G of ram and you never use all the
physical ram with your workloads. I would never buy such an argument.

Furthermore, currently (2.4.5pre3) on alpha you only need to ask the iommu
to map more than 128mbyte of ram to crash. (I increased it to 512mbyte at
least; Jay said my first patch that increased it to 1G is risky because
some device gets confused by bus addresses at around -1M and we keep the
dynamic window above 3G. 512M should still be enough to cover 99% of
hardware configurations, I agree on that, but this is not a good excuse to
leave all device drivers buggy just because those bugs don't trigger in
all the hardware configurations out there.)

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:01             ` David S. Miller
@ 2001-05-21  1:47               ` Andrea Arcangeli
  2001-05-21  7:05               ` David S. Miller
  1 sibling, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21  1:47 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Sun, May 20, 2001 at 06:01:40PM -0700, David S. Miller wrote:
> 
> Andrea Arcangeli writes:
>  > > Well this is news to me.  No drivers understand this.
>  > 
>  > Yes, almost all drivers are buggy.
> 
> No, the interface says that the DMA routines may not return failure.

The alpha has returned failure since day zero of iommu support; the sparc64
has to as well, otherwise it's even more buggy than alpha when the machine runs
out of pci virtual address space.

> If you want to change the DMA api to act some other way, then fine
> please propose it, but do not act as if today they have to act this
> way because that is simply not true.

About the pci_map_single API, I'd prefer that bus address 0 not be the
indication of failure; mainly on platforms without an iommu that's not
nice, and x86 happens to get it right only because physical page zero is
reserved for completely other reasons.  So we either add an error
parameter to pci_map_single, or we define a per-arch bus address to
indicate an error; either way is fine by me.
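Just to make the two alternatives concrete, a rough sketch (the name
PCI_MAP_ERROR, the pci_map_single_err() helper and its err argument are
all made up here, nothing like them exists in 2.4.5-pre3):

	dma_addr_t bus;
	int err;

	/* alternative 1: a per-arch sentinel bus address */
	bus = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);
	if (bus == PCI_MAP_ERROR)
		goto drop;

	/* alternative 2: an explicit error argument */
	bus = pci_map_single_err(pdev, buf, len, PCI_DMA_TODEVICE, &err);
	if (err)
		goto drop;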

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:03           ` David S. Miller
@ 2001-05-21  1:58             ` Richard Henderson
  0 siblings, 0 replies; 97+ messages in thread
From: Richard Henderson @ 2001-05-21  1:58 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky, linux-kernel

On Sun, May 20, 2001 at 06:03:44PM -0700, David S. Miller wrote:
> But for the time being, everyone assumes address zero is not valid and
> it shouldn't be too painful to reserve the first page of DMA space
> until we fix this issue.

Indeed, virtually all PCI systems have legacy PeeCee compatibility crap
hanging out in low i/o memory, and we don't use the first megabyte or so.


r~

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:07                   ` David S. Miller
  2001-05-21  1:37                     ` Andrea Arcangeli
@ 2001-05-21  6:53                     ` David S. Miller
  2001-05-21  7:59                       ` Alan Cox
                                         ` (3 more replies)
  2001-05-22 13:11                     ` Pavel Machek
  2001-05-22 23:02                     ` David S. Miller
  3 siblings, 4 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  6:53 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > Assume I have a dozen PCI cards that do DMA using SG tables that can
 > each map up to some hundred megabytes of RAM, so I can just program
 > the cards to start the DMA on those hundred megabytes of RAM.  Most of
 > the time the I/O is not simultaneous, but very rarely it happens to be
 > simultaneous and in turn it tries to pci_map_sg more than 4G of
 > physical RAM.

What are these "devices", and what drivers "just program the cards to
start the dma on those hundred mbyte of ram"?

Are we designing Linux for hypothetical systems with hypothetical
devices and drivers, or for the real world?

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:01             ` David S. Miller
  2001-05-21  1:47               ` Andrea Arcangeli
@ 2001-05-21  7:05               ` David S. Miller
  2001-05-21  8:59                 ` Andrea Arcangeli
  2001-05-21  9:02                 ` David S. Miller
  1 sibling, 2 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  7:05 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > On Sun, May 20, 2001 at 06:01:40PM -0700, David S. Miller wrote:
 > > No, the interface says that the DMA routines may not return failure.
 > 
 >  > The alpha has returned failure since day zero of iommu support, and the
 >  > sparc64 has to as well, otherwise it's even more buggy than alpha when
 >  > the machine runs out of pci virtual address space.

So what?  I frankly don't care what alpha or any platform happens to
do.  The Documentation/DMA-mapping.txt document says absolutely
nothing about these routines ever failing nor what a failure value
would be.  If you ask David Mosberger and the ia64 people, the PPC
folks, or even HPPA port team, they will all show no surprise when
told that these routines may not fail.  I talked to them, along with
Richard, about this issue at length when the DMA stuff was put
together.  And it was agreed upon that the routines will not allow
failure in 2.4.x and we would work on resolving this in 2.5.x and no
sooner.

THIS DMA-mapping.txt document is the specification of the behavior of
these DMA interfaces, and the driver author may only assume the
behavior described in that document.

If Alpha does something different, that's a feature.

 > About the pci_map_single API, I'd prefer that bus address 0 not be the
 > indication of failure; mainly on platforms without an iommu that's not
 > nice, and x86 happens to get it right only because physical page zero is
 > reserved for completely other reasons.  So we either add an error
 > parameter to pci_map_single, or we define a per-arch bus address to
 > indicate an error; either way is fine by me.

I'm more than happy to add the per-arch define (which would be zero
on all platforms right now, so this exercise is 100% academic :-)))))

But I will NOT add the change that the pci_map_*() interfaces may
fail in 2.4.x; this is an unacceptable API change.  The reasons start
with the scsi layer issues Gerard mentioned, and I knew about this
kind of crap when I designed the API.  We can work on a failure mode
for this stuff, but it is 2.5.x material, no sooner.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:00           ` David S. Miller
@ 2001-05-21  7:47             ` Alan Cox
  2001-05-21  7:53             ` David S. Miller
  1 sibling, 0 replies; 97+ messages in thread
From: Alan Cox @ 2001-05-21  7:47 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Andrea Arcangeli, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

> Andrew Morton writes:
>  > Well this is news to me.  No drivers understand this.
>  > How long has this been the case?  What platforms?
> 
> The DMA interfaces may never fail and I've discussed this over and
> over with port maintainers a _long_ time ago.

And how do you propose to implement cache coherent pci allocations on machines
which lack the ability to have pages coherent between I/O and memory space?

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:00           ` David S. Miller
  2001-05-21  7:47             ` Alan Cox
@ 2001-05-21  7:53             ` David S. Miller
  2001-05-21  8:03               ` Alan Cox
  2001-05-21  8:11               ` David S. Miller
  1 sibling, 2 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  7:53 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrew Morton, Andrea Arcangeli, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Alan Cox writes:
 > And how do you propose to implement cache coherent pci allocations
 > on machines which lack the ability to have pages coherent between
 > I/O and memory space?

Pages, being in memory space, are never in I/O space.

Maybe I don't understand the situation.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  6:53                     ` David S. Miller
@ 2001-05-21  7:59                       ` Alan Cox
  2001-05-21  8:06                       ` Chris Wedgwood
                                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 97+ messages in thread
From: Alan Cox @ 2001-05-21  7:59 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

> What are these "devices", and what drivers "just program the cards to
> start the dma on those hundred mbyte of ram"?
> 
> Are we designing Linux for hypothetical systems with hypothetical
> devices and drivers, or for the real world?

Ok how about a PIV Xeon with 64Gb of memory and 5 AMI Megaraids, which are
limited to the low 2Gb range for pci mapping and otherwise need bounce buffers.
Or how about any consistent alloc on certain HP machines which totally lack
coherency - also I suspect the R10K on an O2 might fall into that - Ralf ?

Look at the history of kernel APIs over time. Everything that can go wrong
eventually does.


^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  7:53             ` David S. Miller
@ 2001-05-21  8:03               ` Alan Cox
  2001-05-21  8:11               ` David S. Miller
  1 sibling, 0 replies; 97+ messages in thread
From: Alan Cox @ 2001-05-21  8:03 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alan Cox, Andrew Morton, Andrea Arcangeli, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

> Alan Cox writes:
>  > And how do you propose to implement cache coherent pci allocations
>  > on machines which lack the ability to have pages coherent between
>  > I/O and memory space?
> 
> Pages, being in memory space, are never in I/O space.

Ok my fault. Let me try that again with clearer Linux terminology.

Pages allocated in main memory and mapped for access by PCI devices.  On some
HP systems there is no way for such a page to stay coherent.  It is quite
possible to sync the view but there is no sane way to allow any
pci_alloc_consistent to succeed.

Alan



^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  6:53                     ` David S. Miller
  2001-05-21  7:59                       ` Alan Cox
@ 2001-05-21  8:06                       ` Chris Wedgwood
  2001-05-21  8:09                       ` David S. Miller
  2001-05-23  0:05                       ` Albert D. Cahalan
  3 siblings, 0 replies; 97+ messages in thread
From: Chris Wedgwood @ 2001-05-21  8:06 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

On Sun, May 20, 2001 at 11:53:42PM -0700, David S. Miller wrote:

    What are these "devices", and what drivers "just program the
    cards to start the dma on those hundred mbyte of ram"?

Some NMR machines... (probably not those used for imaging, but those
used for atomic sampling and bond analysis).



   --cw

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  8:09                       ` David S. Miller
@ 2001-05-21  8:09                         ` Alan Cox
  0 siblings, 0 replies; 97+ messages in thread
From: Alan Cox @ 2001-05-21  8:09 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alan Cox, Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

 >  > Look at the history of kernel APIs over time. Everything that can
 >  > go wrong eventually does.
> 
> I agree, and it will be dealt with in 2.5.x
> 
> The scsi layer in 2.4.x is simply not able to handle failure in these
> code paths, as Gerard Roudier has mentioned.

On that I am unconvinced. It is certainly grungy enough that fighting that war
in 2.5 makes sense however.



^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  6:53                     ` David S. Miller
  2001-05-21  7:59                       ` Alan Cox
  2001-05-21  8:06                       ` Chris Wedgwood
@ 2001-05-21  8:09                       ` David S. Miller
  2001-05-21  8:09                         ` Alan Cox
  2001-05-23  0:05                       ` Albert D. Cahalan
  3 siblings, 1 reply; 97+ messages in thread
From: David S. Miller @ 2001-05-21  8:09 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Alan Cox writes:
 > Ok how about a PIV Xeon with 64Gb of memory and 5 AMI Megaraids, which are
 > limited to the low 2Gb range for pci mapping and otherwise need bounce buffers.
 > Or how about any consistent alloc on certain HP machines which totally lack
 > coherency - also I suspect the R10K on an O2 might fall into that - Ralf ?

If they need bounce buffers because of a device specific DMA range
limitation (which is what I gather this is), then the PCI dma interface
is of no help in this case.

 > Look at the history of kernel API's over time. Everything that can
 > go wrong eventually does.

I agree, and it will be dealt with in 2.5.x

The scsi layer in 2.4.x is simply not able to handle failure in these
code paths, as Gerard Roudier has mentioned.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  7:53             ` David S. Miller
  2001-05-21  8:03               ` Alan Cox
@ 2001-05-21  8:11               ` David S. Miller
  1 sibling, 0 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  8:11 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrew Morton, Andrea Arcangeli, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Alan Cox writes:
 > Pages allocated in main memory and mapped for access by PCI devices.  On some
 > HP systems there is no way for such a page to stay coherent.  It is quite
 > possible to sync the view but there is no sane way to allow any
 > pci_alloc_consistent to succeed.

This is not what the HP folk told me, and in fact they said that
pci_alloc_consistent could be made to work via disabling the cache
attribute in the cpu side mappings or something similar in the PCI
controller IOMMU mappings.

Please someone on the HPPA team provide details :-)

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  7:05               ` David S. Miller
@ 2001-05-21  8:59                 ` Andrea Arcangeli
  2001-05-21  9:50                   ` Gerd Knorr
  2001-05-21  9:02                 ` David S. Miller
  1 sibling, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21  8:59 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 12:05:40AM -0700, David S. Miller wrote:
> together.  And it was agreed upon that the routines will not allow
> failure in 2.4.x and we would work on resolving this in 2.5.x and no
> sooner.

I'm glad you at least considered fixing all those bugs for 2.5, but
that won't change the fact that if somebody runs out of entries now with
sparc64, the only thing I can do in the short term is use HIGHMEM, so
that the serialization limiting the maximum amount of simultaneous pci32
DMA will happen in the code that allocates the bounce buffers.  Tell me
a better way to get rid of those bugs altogether if you can.

Furthermore, some archs may not provide a huge number of entries for
the legacy pci32 cards, and so you could more easily trigger those bugs
without the need of uncommon hardware; those bugs render the iommu
unusable for those archs in 2.4 because it would trigger the device
driver bugs far too easily.

Please tell Andrew to worry about that; if somebody had ever worried
about it we would have all network drivers correct by now and the needed
panics in the lowlevel scsi layer.

This is without considering that bttv and friends are not even trying to
use the pci_map_* yet; I hope you don't watch TV on your sparc64 if you
have enough ram.

I hate those kinds of broken compromises: something that works almost
all the time but breaks when you are using more than just a few
harddisks and a few nics, and that is unfixable in the right way in the
short term after it triggers (bttv is fixable in the short term of
course; I'm only talking about when you run out of pci mappings).

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  7:05               ` David S. Miller
  2001-05-21  8:59                 ` Andrea Arcangeli
@ 2001-05-21  9:02                 ` David S. Miller
  2001-05-21  9:23                   ` Andi Kleen
                                     ` (3 more replies)
  1 sibling, 4 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  9:02 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > Tell me a best way to get rid of those bugs all together if you can.

Please give me a test case that triggers the bug on sparc64
and I will promptly work on a fix, ok?

I mean a test case you _actually_ trigger, not some fantasy case.

In theory it can happen, but nobody is showing me that it actually
does ever happen.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  9:02                 ` David S. Miller
@ 2001-05-21  9:23                   ` Andi Kleen
  2001-05-21  9:30                   ` David S. Miller
                                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 97+ messages in thread
From: Andi Kleen @ 2001-05-21  9:23 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


On the topic of the PCI DMA code: one thing I'm missing
is pci_map_single()/pci_map_sg() variants that take a struct page *
instead of direct pointers.  Currently I don't see how you would
implement IO-MMU IO on a 32bit box with more than 4GB of memory,
because the address won't fit into the pointer.
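Something like the following is what I have in mind; the name
pci_map_page() and the exact signature are made up here, and this only
sketches the trivial no-iommu case (it also assumes dma_addr_t is wide
enough to hold the result, and needs <asm/io.h> for page_to_phys):

	/* hypothetical struct page * based mapping, no-iommu case */
	static inline dma_addr_t
	pci_map_page(struct pci_dev *hwdev, struct page *page,
		     unsigned long offset, size_t size, int direction)
	{
		/* the bus address is computed from the struct page itself,
		   so the caller never has to form a 32bit kernel virtual
		   pointer for a highmem buffer */
		return page_to_phys(page) + offset;
	}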

-Andi

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  9:02                 ` David S. Miller
  2001-05-21  9:23                   ` Andi Kleen
@ 2001-05-21  9:30                   ` David S. Miller
  2001-05-21  9:42                     ` Andi Kleen
  2001-05-21 10:00                     ` David S. Miller
  2001-05-21  9:56                   ` Andrea Arcangeli
  2001-05-21 10:11                   ` David S. Miller
  3 siblings, 2 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21  9:30 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Andi Kleen writes:
 > On the topic of the PCI DMA code: one thing I'm missing
 > is pci_map_single()/pci_map_sg() variants that take a struct page *
 > instead of direct pointers.  Currently I don't see how you would
 > implement IO-MMU IO on a 32bit box with more than 4GB of memory,
 > because the address won't fit into the pointer.

How does the buffer get there in the first place? :-)

Yes, the zerocopy stuff is capable of doing this.  But the block
I/O layer is not, neither is any other subsystem to my knowledge.

Certainly, when this changes, we can make the interfaces adapt to
this.

For example, the sbus IOMMU stuff on sparc32 still uses HIGHMEM exactly
because of this pointer limitation.  In fact, any machine using >4GB of
memory currently cannot be supported without highmem enabled, which is
going to enable bounce buffering in the block I/O layer, etc.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  9:30                   ` David S. Miller
@ 2001-05-21  9:42                     ` Andi Kleen
  2001-05-21 10:02                       ` Andrea Arcangeli
  2001-05-21 10:00                     ` David S. Miller
  1 sibling, 1 reply; 97+ messages in thread
From: Andi Kleen @ 2001-05-21  9:42 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andi Kleen, Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 02:30:09AM -0700, David S. Miller wrote:
> 
> Andi Kleen writes:
>  > On the topic of the PCI DMA code: one thing I'm missing
>  > is pci_map_single()/pci_map_sg() variants that take a struct page *
>  > instead of direct pointers.  Currently I don't see how you would
>  > implement IO-MMU IO on a 32bit box with more than 4GB of memory,
>  > because the address won't fit into the pointer.
> 
> How does the buffer get there in the first place? :-)

I guess you know ;)

e.g. via page table tricks from user space, like the PAE mode on IA32
or via kmap.

> Certainly, when this changes, we can make the interfaces adapt to
> this.

I am just curious why you didn't consider that case when designing the
interfaces. Was that a deliberate decision or just an oversight?
[I guess the first, but why?]

> 
> Because of this, for example, the sbus IOMMU stuff on sparc32 still
> uses HIGHMEM exactly because of this pointer limitation.  In fact,
> any machine using >4GB of memory currently cannot be supported without
> highmem enabled, which is going to enable bounce buffering in the block
> I/O layer, etc.

That's currently the case, but at least on IA32 the block layer must be
fixed soon because it's a serious performance problem in some cases
(and fixing it is not very hard). Now that will probably first use DAC
and not a IO-MMU, and thus not use the pci mapping API, but I would not be 
surprised if people came up with IO-MMU schemes for it too.
[actually most IA32 boxes already have one in form of the AGP GART, it's just
not commonly used for serious things yet]

-Andi

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  8:59                 ` Andrea Arcangeli
@ 2001-05-21  9:50                   ` Gerd Knorr
  0 siblings, 0 replies; 97+ messages in thread
From: Gerd Knorr @ 2001-05-21  9:50 UTC (permalink / raw)
  To: linux-kernel

>  This without considering bttv and friends are not even trying to use the
>  pci_map_* yet, I hope you don't watch TV on your sparc64 if you have
>  enough ram.

The bttv devel versions[1] are fixed already, they should work
out of the box on sparc too.  Just watching TV is harmless (needs
lots of I/O bandwidth, but doesn't need much address space).
Video capture does a better job of eating iommu resources ...

  Gerd

[1] http://bytesex.org/bttv/, 0.8.x versions.

-- 
Gerd Knorr <kraxel@bytesex.org>  --  SuSE Labs, Außenstelle Berlin

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  9:02                 ` David S. Miller
  2001-05-21  9:23                   ` Andi Kleen
  2001-05-21  9:30                   ` David S. Miller
@ 2001-05-21  9:56                   ` Andrea Arcangeli
  2001-05-21 10:19                     ` David S. Miller
  2001-05-21 10:11                   ` David S. Miller
  3 siblings, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21  9:56 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 02:02:21AM -0700, David S. Miller wrote:
> Please give me a test case that triggers the bug on sparc64

I just gave you a test case that triggers on sparc64 in an earlier email.
Chris just gave a real world example of applications where that kind of
design is useful, and there are certainly other kinds of apps where that
kind of hardware design can be useful too.

The name of a high end pci32 card that AFAIK can trigger those bugs is
the Quadrics, which is a very nice piece of hardware btw.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  9:30                   ` David S. Miller
  2001-05-21  9:42                     ` Andi Kleen
@ 2001-05-21 10:00                     ` David S. Miller
  2001-05-21 10:27                       ` Andi Kleen
  2001-05-21 10:34                       ` David S. Miller
  1 sibling, 2 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21 10:00 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Andi Kleen writes:
 > > Certainly, when this changes, we can make the interfaces adapt to
 > > this.
 > 
 > I am just curious why you didn't consider that case when designing the
 > interfaces. Was that a deliberate decision or just an oversight?
 > [I guess the first, but why?]

I wanted the API to do exactly what we needed it to do
and not one bit more.  I tried very hard to keep it as minimal
as possible, and I even fought many additions to the API (a few
of which turned out to be reasonable, see pci_pool threads).

To this end, since HIGHMEM is needed anyways on such machines
(ie. the "sizeof(void *)" issue), I decided to not consider that
case.

Working on pages is useful even _ignoring_ the specific issues we are
talking about.  It really is the one generic way to represent all
pieces of memory inside the kernel (regardless of HIGHMEM and similar
issues).

But I simply did not see anyone who would really make use of it in
the 2.4.x timeframe.  (and I made this estimate in the middle of
2.3.x, so I didn't even see zerocopy coming along so clearly, shrug)

 > That's currently the case, but at least on IA32 the block layer
 > must be fixed soon because it's a serious performance problem in
 > some cases (and fixing it is not very hard).

If such a far reaching change goes into 2.4.x, I would probably
begin looking at enhancing the PCI dma interfaces as needed ;-)

 > Now that will probably first use DAC
 > and not a IO-MMU, and thus not use the pci mapping API, but I would not be 
 > surprised if people came up with IO-MMU schemes for it too.
 > [actually most IA32 boxes already have one in form of the AGP GART, it's just
 > not commonly used for serious things yet]

DAC usage should go through a portable PCI dma API as well,
for the reasons you mention as well as others.  If we do this
from the beginning, there will be no chance for things like
virt_to_bus64() et al. to start sneaking into the PCI drivers :-)

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  9:42                     ` Andi Kleen
@ 2001-05-21 10:02                       ` Andrea Arcangeli
  2001-05-21 10:17                         ` Alan Cox
  0 siblings, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21 10:02 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David S. Miller, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 11:42:16AM +0200, Andi Kleen wrote:
> [actually most IA32 boxes already have one in form of the AGP GART, it's just
> not commonly used for serious things yet]

I may be really wrong on this because I haven't checked anything about
the GART yet, but I suspect you cannot use the GART for this stuff on
ia32 in 2.4: I seem to recall it doesn't provide a huge margin of
mapping entries, and so it would far too easily trigger the bugs in the
device drivers not checking for pci_map_* failures, even in a common
desktop/webserver/fileserver kind of usage of a high end machine.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  9:02                 ` David S. Miller
                                     ` (2 preceding siblings ...)
  2001-05-21  9:56                   ` Andrea Arcangeli
@ 2001-05-21 10:11                   ` David S. Miller
  2001-05-21 10:50                     ` Andrea Arcangeli
                                       ` (2 more replies)
  3 siblings, 3 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21 10:11 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > I just gave you a test case that triggers on sparc64 in an earlier email.

If you are talking about the bttv card:

1) I showed you in a private email that I calculated the
   maximum possible IOMMU space that one could allocate
   to bttv cards in a fully loaded Sunfire sparc64 system
   to be between 300MB and 400MB.  This is assuming that
   every PCI slot contained a bttv card, and it still
   used only ~35% of the available IOMMU resources.

2) It currently doesn't even use the portable APIs yet anyways,
   so effectively it is not supported on sparc64.

The only other examples you showed were theoretical, for cards and
configurations that simply are not supported or cannot happen
on sparc64 with current kernels.

 > Chris just gave a real world example of applications where that kind of
 > design is useful, and there are certainly other kinds of apps where that
 > kind of hardware design can be useful too.
 > 
 > The name of a high end pci32 card that AFAIK can trigger those bugs is
 > the Quadrics, which is a very nice piece of hardware btw.

I think such designs which gobble up a gig or so of DMA mappings on
pci32 are not useful in the slightest.  These cards really ought
to be using dual address cycles, ie. 64-bit PCI addressing.  It is
the same response you would give to someone trying to obtain 3 or more
gigabytes of user address space in a process on x86, right?  You might
respond to that person "What you really need is x86-64." for example
:-)

To me, from this perspective, the Quadrics sounds instead like a very
broken piece of hardware.  And in any event, is there even a Quadrics
driver for sparc64? :-)  (I'm a free software old-fart, so please
excuse my immediate association between "high end" and "proprietary"
:-)

Finally Andrea, have you even begun to consider the possible
starvation cases once we make this a resource allocation which can
fail under "normal" conditions?  Maybe the device eats all the IOMMU
entries, immediately obtains a new mapping when it frees any mapping,
effectively keeping out all other devices.

This may be easily solved, I don't know.

But this, along with the potential scsi layer issues, is basically the
reason I'm trying hard to keep the API as it is right now for 2.4.x.
Changing this in 2.4.x is going to open up Pandora's box, really.

Later,
David S. Miller
davem@redhat.com



^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:02                       ` Andrea Arcangeli
@ 2001-05-21 10:17                         ` Alan Cox
  0 siblings, 0 replies; 97+ messages in thread
From: Alan Cox @ 2001-05-21 10:17 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andi Kleen, David S. Miller, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

> I may be really wrong on this because I haven't checked anything about
> the GART yet, but I suspect you cannot use the GART for this stuff on
> ia32 in 2.4: I seem to recall it doesn't provide a huge margin of
> mapping entries, and so it would far too easily trigger the bugs in the
> device drivers not checking for pci_map_* failures, even in a common
> desktop/webserver/fileserver kind of usage of a high end machine.

Not all chipsets support reading through GART address space from PCI either,
it is meant for AGP to use.


^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  9:56                   ` Andrea Arcangeli
@ 2001-05-21 10:19                     ` David S. Miller
  2001-05-21 11:00                       ` Andrea Arcangeli
                                         ` (3 more replies)
  0 siblings, 4 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21 10:19 UTC (permalink / raw)
  To: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


David S. Miller writes:
 > 
 > 1) I showed you in a private email that I calculated the
 >    maximum possible IOMMU space that one could allocate
 >    to bttv cards in a fully loaded Sunfire sparc64 system
 >    to be between 300MB and 400MB.  This is assuming that
 >    every PCI slot contained a bttv card, and it still
 >    used only ~35% of the available IOMMU resources.

This is totally wrong in two ways.

Let me fix this, the IOMMU on these machines is per PCI bus, so this
figure should be drastically lower.

Electrically (someone correct me, I'm probably wrong) PCI is limited
to 6 physical plug-in slots I believe, let's say it's 8 to choose an
arbitrary larger number to be safe.

Then we have:

max bytes per bttv: max_gbuffers * max_gbufsize
		    64           * 0x208000      == 133.12MB

133.12MB * 8 PCI slots == ~1.06 GB

Which is still only half of the total IOMMU space available per
controller.

Later,
David S. Miller
davem@redhat.com


^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:00                     ` David S. Miller
@ 2001-05-21 10:27                       ` Andi Kleen
  2001-05-21 22:22                         ` Jens Axboe
  2001-05-21 10:34                       ` David S. Miller
  1 sibling, 1 reply; 97+ messages in thread
From: Andi Kleen @ 2001-05-21 10:27 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andi Kleen, Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 03:00:24AM -0700, David S. Miller wrote:
>  > That's currently the case, but at least on IA32 the block layer
>  > must be fixed soon because it's a serious performance problem in
>  > some cases (and fixing it is not very hard).
> 
> If such a far reaching change goes into 2.4.x, I would probably
> begin looking at enhancing the PCI dma interfaces as needed ;-)

Hmm, I don't think it'll be a far reaching change. As far as I can see 
all it needs is a new entry point for block device drivers that uses 
bh->b_page. When that entry point exists skip the create_bounce call 
in __make_request. After that it is purely a problem for selected drivers.

[BTW, the 2.4.4 netstack does not seem to make any attempt to handle the
pagecache > 4GB case on IA32 for sendfile, as the pci_* functions are dummies 
here.  It probably needs bounce buffers there for this case]



-Andi

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:00                     ` David S. Miller
  2001-05-21 10:27                       ` Andi Kleen
@ 2001-05-21 10:34                       ` David S. Miller
  2001-05-21 10:42                         ` Andi Kleen
  2001-05-21 10:55                         ` David S. Miller
  1 sibling, 2 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21 10:34 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Andi Kleen writes:
 > [BTW, the 2.4.4 netstack does not seem to make any attempt to handle the
 > pagecache > 4GB case on IA32 for sendfile, as the pci_* functions are dummies 
 > here.  It probably needs bounce buffers there for this case]

egrep illegal_highdma net/core/dev.c

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:34                       ` David S. Miller
@ 2001-05-21 10:42                         ` Andi Kleen
  2001-05-21 10:55                         ` David S. Miller
  1 sibling, 0 replies; 97+ messages in thread
From: Andi Kleen @ 2001-05-21 10:42 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andi Kleen, Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 03:34:50AM -0700, David S. Miller wrote:
> 
> Andi Kleen writes:
>  > [BTW, the 2.4.4 netstack does not seem to make any attempt to handle the
>  > pagecache > 4GB case on IA32 for sendfile, as the pci_* functions are dummies 
>  > here.  It probably needs bounce buffers there for this case]
> 
> egrep illegal_highdma net/core/dev.c

There is just no portable way for the driver to figure out if it should
set this flag or not. e.g. acenic.c gets it wrong: it is unconditionally
set even on IA32. Currently it requires an architecture ifdef to set properly.

-Andi

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:11                   ` David S. Miller
@ 2001-05-21 10:50                     ` Andrea Arcangeli
  2001-05-21 10:59                     ` David S. Miller
  2001-05-21 13:55                     ` Jonathan Lundell
  2 siblings, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21 10:50 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 03:11:52AM -0700, David S. Miller wrote:
> I think such designs which gobble up a gig or so of DMA mappings on

they map something like 200MB I think.  I've also seen other cards doing
the same kind of stuff, again for distributed computing.

> to be using dual address cycles, ie. 64-bit PCI addressing.  It is
> the same response you would give to someone trying to obtain 3 or more
> gigabytes of user address space in a process on x86, right?  You might

I've never seen those running on 64bit boxes even if they are supposed to
run there too.

Here it's a little different: the 32bit virtual address space limitation
isn't always a showstopper for those kinds of CPU intensive apps (they
don't need huge caches).

> respond to that person "What you really need is x86-64." for example
> :-)

for the 32bit virtual address space issues of course yes ;)

> To me, from this perspective, the Quadrics sounds instead like a very
> broken piece of hardware.  And in any event, is there even a Quadrics

they're not the only ones doing that, I've seen others doing that kind of
stuff; it's just a matter of moving memory around fast across a cluster:
if you delegate that work to a separate engine (btw they run a
32bit sparc cpu, also guess why they aren't pci64) you can spend many
more cpu cycles of the main CPU on the userspace computations.

> driver for sparc64? :-)  (I'm a free software old-fart, so please
> excuse my immediate association between "high end" and "proprietary"
> :-)

:)

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:34                       ` David S. Miller
  2001-05-21 10:42                         ` Andi Kleen
@ 2001-05-21 10:55                         ` David S. Miller
  2001-05-21 11:08                           ` Andi Kleen
  2001-05-21 11:36                           ` David S. Miller
  1 sibling, 2 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-21 10:55 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Andi Kleen writes:
 > On Mon, May 21, 2001 at 03:34:50AM -0700, David S. Miller wrote:
 > > egrep illegal_highdma net/core/dev.c
 > 
 > There is just no portable way for the driver to figure out if it should
 > set this flag or not. e.g. acenic.c gets it wrong: it is unconditionally
 > set even on IA32. Currently it requires an architecture ifdef to set properly.

Well, certainly, this could perhaps be a bug in the Acenic driver.
It should check if DAC cycles can be used on the platform, for example.

But please, let's get back to the original problem though.

The original claim is that the situation was not handled at all.  All
I'm trying to say is simply that the net stack does check via
illegal_highdma() the condition you stated was not being checked at
all.  To me it sounded like you were claiming that HIGHMEM pages went
totally unchecked through device transmit, and that is totally untrue.

If you were trying to point out the problem with what the Acenic
driver is doing, just state that next time ok? :-)

There is no question that what Acenic is doing with ifdefs needs a
clean portable solution.  This will be part of the 64-bit DAC API
interfaces (whenever those become really necessary, I simply don't
see the need right now).

Plainly, I'm going to be highly reluctant to make changes to the PCI
dma API in 2.4.x.  It is already hard enough to get all the PCI drivers
in line and using it.  Suggesting this kind of change is similar to
saying "let's change the arguments to request_irq()".  We would do it
to fix a true "people actually hit this" kind of bug, of course.  Yet
we would avoid it at all possible costs due to the disruption this
would cause.

I'm not trying to be a big bad guy about this.  What I'm trying to do
is make sure at least one person (me :-) is thinking about the
ramifications any such change has on all current drivers which use
these interfaces already.  And also, to port maintainers...

Later,
David S. Miller
davem@redhat.com



^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:11                   ` David S. Miller
  2001-05-21 10:50                     ` Andrea Arcangeli
@ 2001-05-21 10:59                     ` David S. Miller
  2001-05-21 11:19                       ` Andrea Arcangeli
  2001-05-21 13:55                     ` Jonathan Lundell
  2 siblings, 1 reply; 97+ messages in thread
From: David S. Miller @ 2001-05-21 10:59 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > On Mon, May 21, 2001 at 03:11:52AM -0700, David S. Miller wrote:
 > > I think such designs which gobble up a gig or so of DMA mappings on
 > 
 > they map something like 200MB I think.  I've also seen other cards doing
 > the same kind of stuff, again for distributed computing.

Ok, 200MB, let's see what this gives us as an example.

200MB multiplied by 6 PCI slots, which uses up about 1.2GB IOMMU
space.

This still leaves around 800MB IOMMU space free on that sparc64 PCI
controller.

It wouldn't run out of space, and this is assuming that Sun ever made
a sparc64 system with 6 physical PCI card slots (I don't think they
ever did honestly, I think 4 physical card slots was the maximum).

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:19                     ` David S. Miller
@ 2001-05-21 11:00                       ` Andrea Arcangeli
  2001-05-21 11:04                       ` David S. Miller
                                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21 11:00 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 03:19:54AM -0700, David S. Miller wrote:
> max bytes per bttv: max_gbuffers * max_gbufsize
> 		    64           * 0x208000      == 133.12MB
> 
> 133.12MB * 8 PCI slots == ~1.06 GB
> 
> Which is still only half of the total IOMMU space available per
> controller.

and that is double the iommu space that I am going to reserve for
pci dynamic mappings on the tsunami (right now it is 128MB... and I'll
change it to 512MB).  Also, bttv is not doing that large a dma, and by
default it only uses 2 buffers in the ring.  bttv is not a good example
of what can really overflow the pci virtual address space in real life
(when I mentioned it, it was only to point out that it still uses
virt_to_bus); filling a pci bus with bttv cards sounds quite silly
anyway ;)

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:19                     ` David S. Miller
  2001-05-21 11:00                       ` Andrea Arcangeli
@ 2001-05-21 11:04                       ` David S. Miller
  2001-05-21 11:27                         ` Andrea Arcangeli
  2001-05-22 11:12                       ` Chris Wedgwood
  2001-05-22 17:51                       ` Jonathan Lundell
  3 siblings, 1 reply; 97+ messages in thread
From: David S. Miller @ 2001-05-21 11:04 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel


Andrea Arcangeli writes:
 > On Mon, May 21, 2001 at 03:19:54AM -0700, David S. Miller wrote:
 > > max bytes per bttv: max_gbuffers * max_gbufsize
 > > 		    64           * 0x208000      == 133.12MB
 > > 
 > > 133.12MB * 8 PCI slots == ~1.06 GB
 > > 
 > > Which is still only half of the total IOMMU space available per
 > > controller.
 > 
 > and that is double the iommu space that I am going to reserve for
 > pci dynamic mappings on the tsunami (right now it is 128MB... and I'll
 > change it to 512MB)

How many physical PCI slots on a Tsunami system?  (I know the
answer, this question is rhetorical :-)

See?  This is why I think all these examples are silly, and we
need to be realistic about this whole situation.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:55                         ` David S. Miller
@ 2001-05-21 11:08                           ` Andi Kleen
  2001-05-21 11:36                           ` David S. Miller
  1 sibling, 0 replies; 97+ messages in thread
From: Andi Kleen @ 2001-05-21 11:08 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andi Kleen, Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 03:55:15AM -0700, David S. Miller wrote:
> The original claim is that the situation was not handled at all.  All
> I'm trying to say is simply that the net stack does check via
> illegal_highdma() the condition you stated was not being checked at
> all.  To me it sounded like you were claiming that HIGHMEM pages went
> totally unchecked through device transmit, and that is totally untrue.

Yes I was wrong on this one, but after looking at the acenic I think
the current solution is not very nice to driver authors.

> 
> If you were trying to point out the problem with what the Acenic
> driver is doind, just state that next time ok? :-)

It's more a generic problem with the PCI DMA API; acenic just happens to
be the clearest victim of it :)

> Plainly, I'm going to be highly reluctant to make changes to the PCI
> dma API in 2.4.x.  It is already hard enough to get all the PCI drivers
> in line and using it.  Suggesting this kind of change is similar to
> saying "let's change the arguments to request_irq()".  We would do it
> to fix a true "people actually hit this" kind of bug, of course.  Yet
> we would avoid it at all possible costs due to the disruption this
> would cause.

How about a new function (pci_nonrepresentable_address() or whatever)
that returns true when the page cache contains pages that are not
representable physically as a void *.  On IA32 it would return true only
if CONFIG_PAE is set and there is memory >4GB.

Then the network driver could do

	if (!pci_nonrepresentable_address()) 
		dev->features |= NETIF_F_HIGHDMA;

[in theory it would be possible to push it up into dev.c to avoid
bounces for highmem memory <4GB, but I don't want to teach such details
to the generic layer]
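A possible IA32 implementation, purely as a sketch (the helper name, and
the use of max_pfn from <linux/bootmem.h> and of CONFIG_X86_PAE for the
test, are assumptions on my side, not existing API):

	static inline int pci_nonrepresentable_address(void)
	{
	#ifdef CONFIG_X86_PAE
		/* true once physical memory extends beyond what a
		   32bit pointer can address */
		return max_pfn > (0xffffffffUL >> PAGE_SHIFT);
	#else
		return 0;
	#endif
	}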

DAC support would be the next step, not necessarily one for 2.4, but perhaps
one that could be backported from 2.5 at some point.

-Andi

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:59                     ` David S. Miller
@ 2001-05-21 11:19                       ` Andrea Arcangeli
  2001-05-21 11:51                         ` Ivan Kokshaysky
  0 siblings, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21 11:19 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 03:59:58AM -0700, David S. Miller wrote:
> This still leaves around 800MB IOMMU space free on that sparc64 PCI
> controller.

if it were 400MB you would be screwed too; the point here is that the
margin is way too small to allow ignoring the issue completely.  Furthermore
there can be fragmentation effects in the pagetables, at least in the way
alpha manages them, which is to find contiguous virtual pci bus addresses
for each sg.  Alpha in mainline is just screwed if a single pci bus tries
to dynamically map more than 128MB; changing it to 512MB is trivial, but
growing it more has performance implications as it needs to reduce the
direct windows, which I don't like to do as it would also increase the
number of machines that will get bitten by drivers that still use
virt_to_bus and also increase the pressure on the iommu ptes too.

Now I'm not asking to break the API for 2.4 to take care of that; you
seem to be convinced about fixing this for 2.5 and I'm ok with that.
I just changed the printk for running out of entries to be KERN_ERR at
least, so we will know; if somebody has real life troubles with 2.4 I
will go HIGHMEM, which is a matter of 2 hours for me to implement.

The only thing I suggest is to change the API before starting to fix the
drivers; I mean, don't start checking for bus address 0 before changing
the API to return failure in another way.  It's true x86 reserves the
zero page anyway because it's a magic bios thing, but for example on
alpha such a 0 bus address that we cannot use wastes 8MB of DMA
virtual bus addresses that we reserve for the ISA cards (of course we
almost never need 16MB of ram all under isa dma, but since it's so
low cost to allow that I think we will, just in case).

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 11:04                       ` David S. Miller
@ 2001-05-21 11:27                         ` Andrea Arcangeli
  2001-05-21 12:16                           ` Peter Rival
  0 siblings, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-21 11:27 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 04:04:28AM -0700, David S. Miller wrote:
> How many physical PCI slots on a Tsunami system?  (I know the

on tsunamis probably not many, but on a Typhoon (the one in the es40
that is the 4-way extension) I don't know, but certainly the box is
large.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:55                         ` David S. Miller
  2001-05-21 11:08                           ` Andi Kleen
@ 2001-05-21 11:36                           ` David S. Miller
  2001-05-21 11:41                             ` Andi Kleen
  1 sibling, 1 reply; 97+ messages in thread
From: David S. Miller @ 2001-05-21 11:36 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Andi Kleen writes:
 > How about a new function (pci_nonrepresentable_address() or whatever) 
 > that returns true when page cache contains pages that are not representable
 > physically as void *. On IA32 it would return true only if CONFIG_PAE is 
 > true and there is memory >4GB. 

No, if we're going to change anything, let's do it right.

Sure, you'll make this one check "portable", but the guts of the
main ifdef stuff for DAC support is still there.

I'd rather live with the hackish stuff temporarily, and get this all
cleaned up in one shot when we have a real DAC support API.

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 11:36                           ` David S. Miller
@ 2001-05-21 11:41                             ` Andi Kleen
  0 siblings, 0 replies; 97+ messages in thread
From: Andi Kleen @ 2001-05-21 11:41 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andi Kleen, Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 04:36:50AM -0700, David S. Miller wrote:
> I'd rather live with the hackish stuff temporarily, and get this all
> cleaned up in one shot when we have a real DAC support API.

The real thing would just be variants of the pci_* functions with the
void * they currently take replaced by struct page *, or do you have
something fancier in mind?

-Andi

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 11:19                       ` Andrea Arcangeli
@ 2001-05-21 11:51                         ` Ivan Kokshaysky
  2001-05-21 17:53                           ` Richard Henderson
  0 siblings, 1 reply; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-21 11:51 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: David S. Miller, Andrew Morton, Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 01:19:59PM +0200, Andrea Arcangeli wrote:
> Alpha in mainline is just screwed if a single pci bus tries to dynamically
> map more than 128MB; changing it to 512MB is trivial, but growing it more

Could you just describe the configuration where increasing the sg window
from 128 to 512MB actually fixes the "out of ptes" problem? I mean which
drivers are involved, what kind of load, etc.
I'm unable to reproduce it with an *8MB* window, so I'm asking.

Ivan.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 11:27                         ` Andrea Arcangeli
@ 2001-05-21 12:16                           ` Peter Rival
  0 siblings, 0 replies; 97+ messages in thread
From: Peter Rival @ 2001-05-21 12:16 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: David S. Miller, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

Andrea Arcangeli wrote:

> On Mon, May 21, 2001 at 04:04:28AM -0700, David S. Miller wrote:
> > How many physical PCI slots on a Tsunami system?  (I know the
>
> on tsunamis probably not many, but on a Typhoon (the one in the es40
> that is the 4-way extension) I don't know, but certainly the box is
> large.
>

ES40 has either 8 or 10 PCI slots across 2 PCI buses.  And then there's
Wildfire - 14 slots per PCI drawer (4 PCI buses) * 2 drawers/QBB * 8 QBBs =
224 PCI slots & 64 PCI buses.  BTW, Titan (aka ES45) has 10 slots as well,
but with 3 buses instead.

 - Pete


^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:11                   ` David S. Miller
  2001-05-21 10:50                     ` Andrea Arcangeli
  2001-05-21 10:59                     ` David S. Miller
@ 2001-05-21 13:55                     ` Jonathan Lundell
  2001-05-21 14:17                       ` Ivan Kokshaysky
  2001-05-21 15:47                       ` Jonathan Lundell
  2 siblings, 2 replies; 97+ messages in thread
From: Jonathan Lundell @ 2001-05-21 13:55 UTC (permalink / raw)
  To: David S. Miller, Andrea Arcangeli, Andrew Morton,
	Ivan Kokshaysky, Richard Henderson, linux-kernel

At 3:19 AM -0700 2001-05-21, David S. Miller wrote:
>This is totally wrong in two ways.
>
>Let me fix this, the IOMMU on these machines is per PCI bus, so this
>figure should be drastically lower.
>
>Electrically (someone correct me, I'm probably wrong) PCI is limited
>to 6 physical plug-in slots I believe, let's say it's 8 to choose an
>arbitrary larger number to be safe.
>
>Then we have:
>
>max bytes per bttv: max_gbuffers * max_gbufsize
>		    64           * 0x208000      == 133.12MB
>
>133.12MB * 8 PCI slots == ~1.06 GB
>
>Which is still only half of the total IOMMU space available per
>controller.

8 slots (and  you're right, 6 is a practical upper limit, fewer for 
66 MHz) *per bus*. Buses can proliferate like crazy, so the slot 
limit becomes largely irrelevant. A typical quad Ethernet card, for 
example (and this is true for many/most multiple-device cards), has a 
bridge, its own internal PCI bus, and four "slots" ("devices" in PCI 
terminology).
-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 13:55                     ` Jonathan Lundell
@ 2001-05-21 14:17                       ` Ivan Kokshaysky
  2001-05-21 15:47                       ` Jonathan Lundell
  1 sibling, 0 replies; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-21 14:17 UTC (permalink / raw)
  To: Jonathan Lundell
  Cc: David S. Miller, Andrea Arcangeli, Andrew Morton,
	Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 06:55:29AM -0700, Jonathan Lundell wrote:
> 8 slots (and  you're right, 6 is a practical upper limit, fewer for 
> 66 MHz) *per bus*. Buses can proliferate like crazy, so the slot 
> limit becomes largely irrelevant.

True, but the bandwidth limit is highly relevant. That's why modern
systems have multiple root buses, not bridged ones.

Ivan.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 13:55                     ` Jonathan Lundell
  2001-05-21 14:17                       ` Ivan Kokshaysky
@ 2001-05-21 15:47                       ` Jonathan Lundell
  1 sibling, 0 replies; 97+ messages in thread
From: Jonathan Lundell @ 2001-05-21 15:47 UTC (permalink / raw)
  To: Ivan Kokshaysky
  Cc: David S. Miller, Andrea Arcangeli, Andrew Morton,
	Richard Henderson, linux-kernel

At 6:17 PM +0400 2001-05-21, Ivan Kokshaysky wrote:
>On Mon, May 21, 2001 at 06:55:29AM -0700, Jonathan Lundell wrote:
>>  8 slots (and  you're right, 6 is a practical upper limit, fewer for
>>  66 MHz) *per bus*. Buses can proliferate like crazy, so the slot
>>  limit becomes largely irrelevant.
>
>True, but the bandwidth limit is highly relevant. That's why modern
>systems have multiple root buses, not bridged ones.

Sure, there are systems with multiple root buses (I'm a bit fuzzy on 
how well Linux handles that), and bandwidth is important, but it's 
simply wrong to assume that a particular root bus will never have 
more than 6 or 8 devices. There are legitimate cases (firewalls 
spring to mind) where port count is driven by considerations other 
than aggregate bandwidth.
-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 11:51                         ` Ivan Kokshaysky
@ 2001-05-21 17:53                           ` Richard Henderson
  2001-05-22  0:56                             ` Andrea Arcangeli
  2001-05-22 13:22                             ` Andrea Arcangeli
  0 siblings, 2 replies; 97+ messages in thread
From: Richard Henderson @ 2001-05-21 17:53 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: Andrea Arcangeli, linux-kernel

On Mon, May 21, 2001 at 03:51:51PM +0400, Ivan Kokshaysky wrote:
> I'm unable reproduce it with *8Mb* window, so I'm asking.

Me either.  But Tom Vier, the guy who started this thread,
was able to use up the 8MB.  Which is completely believable.

The following should alleviate the situation on these smaller
machines where the direct map does cover all physical memory.
Really, we were failing gratuitously before.

On Tsunami and Titan, especially with more than 4G ram, we
should probably just go ahead and allocate the 512M or 1G
scatter-gather arena.

(BTW, Andrea, it's easy enough to work around the Cypress
problem by marking the last 1M of the 1G arena in use.)
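
Something along these lines ought to do it -- untested sketch, assuming
the arena keeps its pte array and size as in pci_iommu.c, and
IOMMU_RESERVED_PTE is a made-up name for whatever "allocated but not
valid" marker we end up using:

	/* Permanently claim the ptes covering the last 1MB of the window,
	   so iommu_arena_alloc() never hands out bus addresses that the
	   Cypress bridge also decodes.  */
	long n = (1UL << 20) >> PAGE_SHIFT;		/* ptes in 1MB */
	long p = (arena->size >> PAGE_SHIFT) - n;	/* last n entries */
	long i;

	for (i = 0; i < n; ++i)
		arena->ptes[p + i] = IOMMU_RESERVED_PTE;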


r~



diff -ruNp linux/arch/alpha/kernel/pci_iommu.c linux-new/arch/alpha/kernel/pci_iommu.c
--- linux/arch/alpha/kernel/pci_iommu.c	Fri Mar  2 11:12:07 2001
+++ linux-new/arch/alpha/kernel/pci_iommu.c	Mon May 21 01:25:25 2001
@@ -402,8 +402,20 @@ sg_fill(struct scatterlist *leader, stru
 	paddr &= ~PAGE_MASK;
 	npages = calc_npages(paddr + size);
 	dma_ofs = iommu_arena_alloc(arena, npages);
-	if (dma_ofs < 0)
-		return -1;
+	if (dma_ofs < 0) {
+		/* If we attempted a direct map above but failed, die.  */
+		if (leader->dma_address == 0)
+			return -1;
+
+		/* Otherwise, break up the remaining virtually contiguous
+		   hunks into individual direct maps.  */
+		for (sg = leader; sg < end; ++sg)
+			if (sg->dma_address == 2 || sg->dma_address == -2)
+				sg->dma_address = 0;
+
+		/* Retry.  */
+		return sg_fill(leader, end, out, arena, max_dma);
+	}
 
 	out->dma_address = arena->dma_base + dma_ofs*PAGE_SIZE + paddr;
 	out->dma_length = size;

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:27                       ` Andi Kleen
@ 2001-05-21 22:22                         ` Jens Axboe
  0 siblings, 0 replies; 97+ messages in thread
From: Jens Axboe @ 2001-05-21 22:22 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David S. Miller, Andrea Arcangeli, Andrew Morton,
	Ivan Kokshaysky, Richard Henderson, linux-kernel

On Mon, May 21 2001, Andi Kleen wrote:
> On Mon, May 21, 2001 at 03:00:24AM -0700, David S. Miller wrote:
> >  > That's currently the case, but at least on IA32 the block layer
> >  > must be fixed soon because it's a serious performance problem in
> >  > some cases (and fixing it is not very hard).
> > 
> > If such a far reaching change goes into 2.4.x, I would probably
> > begin looking at enhancing the PCI dma interfaces as needed ;-)
> 
> Hmm, I don't think it'll be a far reaching change. As far as I can see 
> all it needs is a new entry point for block device drivers that uses 
> bh->b_page. When that entry point exists skip the create_bounce call 
> in __make_request. After that it is purely problem for selected drivers.

I've already done it, though not as a 2.4 solution. The partial and WIP
patches are here:

*.kernel.org/pub/linux/kernel/people/axboe/v2.5/bio-7

Block drivers can indicate the need for bounce buffers above a certain
page.

Of course I can hack up something for 2.4 as well, but is this really a
pressing need?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 17:53                           ` Richard Henderson
@ 2001-05-22  0:56                             ` Andrea Arcangeli
  2001-05-22 14:29                               ` Andrea Arcangeli
  2001-05-22 13:22                             ` Andrea Arcangeli
  1 sibling, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-22  0:56 UTC (permalink / raw)
  To: Ivan Kokshaysky, linux-kernel; +Cc: rth, David S. Miller

On Mon, May 21, 2001 at 10:53:39AM -0700, Richard Henderson wrote:
> should probably just go ahead and allocate the 512M or 1G
> scatter-gather arena.

I just got a bug report in my mailbox about pci_map failures even after
I enlarged the window to 1G, argghh (at first it looked apparently stable
after growing the window), so I'm stuck again. It seems I was right in
not being careless about the pci_map_* bugs today, even if the 1G window
looked to offer a reasonable margin at first.

The pci_map_* failure triggers during a benchmark with a certain driver
that does massive DMA (similar to the examples I gave previously). The
developers of the driver told me the hardware wants to do massive
zerocopy dma to userspace, and after I asked them to check they ruled
out a memleak from a missing pci_unmap_*. Even enabling HIGHMEM would
not be enough, because they do dma to userspace on the network side, so
it won't be taken care of by create_bounce(); I would at least need to
put another bounce buffer layer in the driver to make highmem work.

Other, more efficient ways to go besides highmem plus an additional
bounce buffer layer are:

2) fixing all buggy drivers now (would be a great pain, as it seems I
   would have to do that alone; apparently nobody else cares about
   those bugs for 2.4)
3) letting the "massive DMA" hardware use DAC

Theoretically I could also cheat again and take way 4), which is to try
to enlarge the window beyond 1G and see if the bugs stay hidden during
the benchmark that way too, but I would take this only as a last resort,
as it would again not be a definitive solution and I'd risk getting
stuck again tomorrow just like I am right now.

I think I will prefer to take the dirty way 3), just for those drivers,
to solve this production problem, even if it won't be implemented in a
generic manner at first (I got the idea from the Quadrics folks, who, if
I understood correctly, already do this with their nics).

If I understand correctly, on the tsunami enabling DAC simply means
setting the monster window bit (pchip->pctl |= MWIN) during the boot
stage on both pchips.

Then the device driver of the "massive DMA" hardware should simply
program the registers of the nic to use DAC with bus addresses that
are the physical addresses of the destination/source memory of the DMA,
only with bit 40 set to 1. Those should be all the changes needed to
make pci64 work on tsunami alongside the pci32 direct/dynamic windows;
it would be very efficient, and it sounds like the best way to work
around the broken pci_map_* in 2.4, given that fixing pci_map_* the
right way is a pain.
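
In code that would be something like this (untested sketch; the pctl
accessor and bit name are illustrative, not necessarily the exact ones
from core_tsunami.c):

	/* boot time, once per pchip: turn on the monster window */
	pchip->pctl.csr |= pctl_m_mwin;		/* illustrative names */
	mb();

	/* driver side: the 64-bit DAC bus address handed to the nic is
	   just the physical address with bit 40 set */
	unsigned long bus_addr = virt_to_phys(cpu_addr) | (1UL << 40);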

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:19                     ` David S. Miller
  2001-05-21 11:00                       ` Andrea Arcangeli
  2001-05-21 11:04                       ` David S. Miller
@ 2001-05-22 11:12                       ` Chris Wedgwood
  2001-05-22 17:51                       ` Jonathan Lundell
  3 siblings, 0 replies; 97+ messages in thread
From: Chris Wedgwood @ 2001-05-22 11:12 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

On Mon, May 21, 2001 at 03:19:54AM -0700, David S. Miller wrote:

    Electrically (someone correct me, I'm probably wrong) PCI is
    limited to 6 physical plug-in slots I believe, let's say it's 8
    to choose an arbitrary larger number to be safe.

Minor nit... it can in fact be higher than this, but typically it is
not. CompactPCI implementations may go higher (different electrical
characteristics allow for this).



  --cw

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:07                   ` David S. Miller
  2001-05-21  1:37                     ` Andrea Arcangeli
  2001-05-21  6:53                     ` David S. Miller
@ 2001-05-22 13:11                     ` Pavel Machek
  2001-05-22 23:02                     ` David S. Miller
  3 siblings, 0 replies; 97+ messages in thread
From: Pavel Machek @ 2001-05-22 13:11 UTC (permalink / raw)
  To: David S. Miller, Andrea Arcangeli
  Cc: Andrew Morton, Ivan Kokshaysky, Richard Henderson, linux-kernel

Hi!

>  > > [..]  Even sparc64's fancy
>  > > iommu-based pci_map_single() always succeeds.
>  > 
>  > Whatever sparc64 does to hide the driver bugs you can break it if you
>  > pci_map 4G+1 bytes of phyical memory.
> 
> Which is an utterly stupid thing to do.
> 
> Please construct a plausable situation where this would occur legally
> and not be a driver bug, given the maximum number of PCI busses and
> slots found on sparc64 and the maximum _concurrent_ usage of PCI dma
> space for any given driver (which isn't doing something stupid).

What stops you from plugging PCI-to-PCI bridges in order to create
some large number of slots, like 128?
								Pavel
-- 
I'm pavel@ucw.cz. "In my country we have almost anarchy and I don't care."
Panos Katsaloulis describing me w.r.t. patents at discuss@linmodems.org

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 17:53                           ` Richard Henderson
  2001-05-22  0:56                             ` Andrea Arcangeli
@ 2001-05-22 13:22                             ` Andrea Arcangeli
  1 sibling, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-22 13:22 UTC (permalink / raw)
  To: Ivan Kokshaysky, linux-kernel; +Cc: rth

On Mon, May 21, 2001 at 10:53:39AM -0700, Richard Henderson wrote:
> diff -ruNp linux/arch/alpha/kernel/pci_iommu.c linux-new/arch/alpha/kernel/pci_iommu.c
> --- linux/arch/alpha/kernel/pci_iommu.c	Fri Mar  2 11:12:07 2001
> +++ linux-new/arch/alpha/kernel/pci_iommu.c	Mon May 21 01:25:25 2001
> @@ -402,8 +402,20 @@ sg_fill(struct scatterlist *leader, stru
>  	paddr &= ~PAGE_MASK;
>  	npages = calc_npages(paddr + size);
>  	dma_ofs = iommu_arena_alloc(arena, npages);
> -	if (dma_ofs < 0)
> -		return -1;
> +	if (dma_ofs < 0) {
> +		/* If we attempted a direct map above but failed, die.  */
> +		if (leader->dma_address == 0)
> +			return -1;
> +
> +		/* Otherwise, break up the remaining virtually contiguous
> +		   hunks into individual direct maps.  */
> +		for (sg = leader; sg < end; ++sg)
> +			if (sg->dma_address == 2 || sg->dma_address == -2)
					    ^^^^ should be == 1

> +				sg->dma_address = 0;
> +
> +		/* Retry.  */
> +		return sg_fill(leader, end, out, arena, max_dma);
> +	}
>  
>  	out->dma_address = arena->dma_base + dma_ofs*PAGE_SIZE + paddr;
>  	out->dma_length = size;

I am going to merge this one (however it won't help on the big memory
machines; it will only hide the problem on machines without much memory
above 2G).

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22  0:56                             ` Andrea Arcangeli
@ 2001-05-22 14:29                               ` Andrea Arcangeli
  2001-05-22 14:44                                 ` Ivan Kokshaysky
  0 siblings, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-22 14:29 UTC (permalink / raw)
  To: Ivan Kokshaysky, linux-kernel; +Cc: rth, David S. Miller

While merging all the recent fixes into my tree, and while reserving the
pci32 space above -1M to get a dynamic window of almost 1G without
dropping the direct window, I noticed and fixed a severe bug. So now I
started to wonder whether the real reason for the crash when an invalid
entry is cached in the tlb and we do dma through it (on es40 and,
according to Ivan, other platforms as well) could be just this
software bug:

		for (i = 0; i < n; ++i)
			ptes[p+i] = ~1UL;

we reserve entries by setting all the bits above 31 to 1 as well. The
tsunami specs say that bits 32 to 63 _must_ be zero, so the above is
definitely buggy. Maybe this is related to the fact that the crashes
triggered on >=4G machines.

I will change it to:

		for (i = 0; i < n; ++i)
                        ptes[p+i] = 0x2;

which is still obviously correct for our internal management of the
allocation in the critical sections, and is a definitely necessary fix
according to the specs. Maybe this is the right(tm) fix; then I can drop
the artificial alignment, and the tsunami will re-fetch the pte from
memory automatically when we do the I/O through an invalid pte. If the
tsunami gets fixed by it, I bet we can drop the align_entry field from
the pci_iommu_arena structure altogether, and what was referred to as a
hardware bug on the other platforms would in fact be a software bug in
the iommu code.

I am optimistic this is the definitive fix, so for now I will leave out
the so far absolutely necessary artificial alignment on the tsunami and
put in this critical fix (until I get confirmation), and if it works I
will drop the align_entry field from the pci_iommu_arena structure
altogether.

Ivan, could you test the above fix on the platforms that need the
align_entry hack?

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 14:29                               ` Andrea Arcangeli
@ 2001-05-22 14:44                                 ` Ivan Kokshaysky
  2001-05-22 15:00                                   ` Andrea Arcangeli
  2001-05-22 15:18                                   ` Andrea Arcangeli
  0 siblings, 2 replies; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-22 14:44 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, rth, David S. Miller

On Tue, May 22, 2001 at 04:29:16PM +0200, Andrea Arcangeli wrote:
> Ivan, could you test the above fix on the platforms that need the
> align_entry hack?

That was one of the first things I noticed, and I've tried exactly
that (2 instead of ~1UL).
No, it wasn't the cause of the crashes on pyxis, so I left it as is.
But it's probably worth changing, at least for correctness.

Ivan.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 14:44                                 ` Ivan Kokshaysky
@ 2001-05-22 15:00                                   ` Andrea Arcangeli
  2001-05-22 20:28                                     ` Richard Henderson
  2001-05-22 20:48                                     ` Jonathan Lundell
  2001-05-22 15:18                                   ` Andrea Arcangeli
  1 sibling, 2 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-22 15:00 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: linux-kernel, rth, David S. Miller

On Tue, May 22, 2001 at 06:44:09PM +0400, Ivan Kokshaysky wrote:
> No, it wasn't the cause of the crashes on pyxis, so I left it as is.

Ok.

> But it's probably worth changing, at least for correctness.

It must be changed: the specs say "must be zero", so if the machine
crashes when you set those bits to 1 it has every right to. I am waiting
to hear whether it helps on the es40 that so far needed the artificial
alignment (the artificial alignment incidentally works around the above
bug completely, because with it the iommu will never see a corrupted pte
of that kind [~1UL], so I'm still optimistic even if we cannot drop the
align_entry altogether ;).

I'm also wondering if ISA needs the sg to start on a 64k boundary,
as core_titan assumes, but I don't see why core_titan would need it
and not the others. Not being an ISA expert I'm not sure, but it sounds
like potentially all platforms with real ISA (i.e. where sg_isa isn't
only used for generating low pci addresses for broken pci32 devices)
need that.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 14:44                                 ` Ivan Kokshaysky
  2001-05-22 15:00                                   ` Andrea Arcangeli
@ 2001-05-22 15:18                                   ` Andrea Arcangeli
  2001-05-22 15:55                                     ` Ivan Kokshaysky
  1 sibling, 1 reply; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-22 15:18 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: linux-kernel, rth, David S. Miller

On Tue, May 22, 2001 at 06:44:09PM +0400, Ivan Kokshaysky wrote:
> On Tue, May 22, 2001 at 04:29:16PM +0200, Andrea Arcangeli wrote:
> > Ivan, could you test the above fix on the platforms that need the
> > align_entry hack?
> 
> That was one of the first things I noticed, and I've tried exactly
> that (2 instead of ~1UL).

Just in case (I guess it wouldn't matter much), are you sure you
tried it with the locking fixes applied too?

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 15:18                                   ` Andrea Arcangeli
@ 2001-05-22 15:55                                     ` Ivan Kokshaysky
  2001-05-22 16:06                                       ` Andrea Arcangeli
  0 siblings, 1 reply; 97+ messages in thread
From: Ivan Kokshaysky @ 2001-05-22 15:55 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, rth, David S. Miller

On Tue, May 22, 2001 at 05:18:15PM +0200, Andrea Arcangeli wrote:
> Just in case (I guess it wouldn't matter much), are you sure you
> tried it with the locking fixes applied too?

Yes. Though those races would more likely cause silent data
corruption, not an immediate crash.

Ivan.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 15:55                                     ` Ivan Kokshaysky
@ 2001-05-22 16:06                                       ` Andrea Arcangeli
  0 siblings, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-22 16:06 UTC (permalink / raw)
  To: Ivan Kokshaysky; +Cc: linux-kernel, rth, David S. Miller

On Tue, May 22, 2001 at 07:55:18PM +0400, Ivan Kokshaysky wrote:
> Yes. Though those races would more likely cause silent data
> corruption, not an immediate crash.

Ok. I wasn't sure if it was crashing or not for you.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21 10:19                     ` David S. Miller
                                         ` (2 preceding siblings ...)
  2001-05-22 11:12                       ` Chris Wedgwood
@ 2001-05-22 17:51                       ` Jonathan Lundell
  3 siblings, 0 replies; 97+ messages in thread
From: Jonathan Lundell @ 2001-05-22 17:51 UTC (permalink / raw)
  To: Chris Wedgwood, David S. Miller; +Cc: linux-kernel

At 11:12 PM +1200 2001-05-22, Chris Wedgwood wrote:
>On Mon, May 21, 2001 at 03:19:54AM -0700, David S. Miller wrote:
>
>     Electrically (someone correct me, I'm probably wrong) PCI is
>     limited to 6 physical plug-in slots I believe, let's say it's 8
>     to choose an arbitrary larger number to be safe.
>
>Minor nit... it can in fact be higher than this, but typically it is
>not. CompactPCI implementations may go higher (different electrical
>characteristics allow for this).

Compact PCI specifies a max of 8 slots (one of which is typically the 
system board). Regular PCI doesn't have a hard and fast slot limit 
(except for the logical limit of 32 devices per bus); the limits are 
driven by electrical loading concerns. As I recall, a bus of typical 
length can accommodate 10 "loads", where a load is either a device 
or a slot connector (that is, an expansion card counts as two 
loads, one for the device and one for the connector). (I take this to 
be a rule of thumb, not a hard spec, based on the detailed electrical 
requirements in the PCI spec.)

Still, the presence of bridges opens up the number of devices on a 
root PCI bus to a very high number, logically. Certainly having three 
or four quad Ethernet cards, so 12 or 16 devices, is a plausible 
configuration. As for bandwidth, a 64x66 PCI bus has a nominal burst 
bandwidth of 533 MB/second, which would be saturated by 20 full 
duplex 100baseT ports that were themselves saturated in both 
directions (all ignoring overhead). Full saturation is not reasonable 
for either PCI or Ethernet; I'm just looking at order-of-magnitude 
numbers here.

The bottom line is: don't make any hard and fast assumption about the 
number of devices connected to a root PCI bus.
-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 15:00                                   ` Andrea Arcangeli
@ 2001-05-22 20:28                                     ` Richard Henderson
  2001-05-22 20:40                                       ` Jeff Garzik
  2001-05-22 21:08                                       ` Alan Cox
  2001-05-22 20:48                                     ` Jonathan Lundell
  1 sibling, 2 replies; 97+ messages in thread
From: Richard Henderson @ 2001-05-22 20:28 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Ivan Kokshaysky, linux-kernel, David S. Miller

On Tue, May 22, 2001 at 05:00:16PM +0200, Andrea Arcangeli wrote:
> I'm also wondering if ISA needs the sg to start on a 64k boundary,

Traditionally, ISA could not do DMA across a 64k boundary.

The only ISA card I have (a soundblaster compatible) appears
to work without caring for this, but I suppose we should pay
lip service to pedantics.


r~

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 20:28                                     ` Richard Henderson
@ 2001-05-22 20:40                                       ` Jeff Garzik
  2001-05-22 20:52                                         ` Andrea Arcangeli
                                                           ` (2 more replies)
  2001-05-22 21:08                                       ` Alan Cox
  1 sibling, 3 replies; 97+ messages in thread
From: Jeff Garzik @ 2001-05-22 20:40 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Andrea Arcangeli, Ivan Kokshaysky, linux-kernel, David S. Miller

Richard Henderson wrote:
> 
> On Tue, May 22, 2001 at 05:00:16PM +0200, Andrea Arcangeli wrote:
> > I'm also wondering if ISA needs the sg to start on a 64k boundary,
> 
> Traditionally, ISA could not do DMA across a 64k boundary.
> 
> The only ISA card I have (a soundblaster compatible) appears
> to work without caring for this, but I suppose we should pay
> lip service to pedantics.

ISA cards can do sg?

[sure it's theoretically possible... do any drivers use it though, if
such hardware even exists?]

-- 
Jeff Garzik      | "Are you the police?"
Building 1024    | "No, ma'am.  We're musicians."
MandrakeSoft     |

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 15:00                                   ` Andrea Arcangeli
  2001-05-22 20:28                                     ` Richard Henderson
@ 2001-05-22 20:48                                     ` Jonathan Lundell
  2001-05-22 21:02                                       ` Richard Henderson
  2001-05-22 21:17                                       ` Jonathan Lundell
  1 sibling, 2 replies; 97+ messages in thread
From: Jonathan Lundell @ 2001-05-22 20:48 UTC (permalink / raw)
  To: Richard Henderson, Andrea Arcangeli
  Cc: Ivan Kokshaysky, linux-kernel, David S. Miller

At 1:28 PM -0700 2001-05-22, Richard Henderson wrote:
>On Tue, May 22, 2001 at 05:00:16PM +0200, Andrea Arcangeli wrote:
>>  I'm also wondering if ISA needs the sg to start on a 64k boundary,
>
>Traditionally, ISA could not do DMA across a 64k boundary.
>
>The only ISA card I have (a soundblaster compatible) appears
>to work without caring for this, but I suppose we should pay
>lip service to pedantics.

64KB for 8-bit DMA; 128KB for 16-bit DMA. It's a limitation of the 
legacy third-party-DMA controllers, which had only 16-bit address 
registers (the high part of the address lives in a non-counting 
register). This doesn't apply to bus-master DMA, just the legacy 
(8237) stuff. There was also a 24-bit address limitation.
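
Concretely, the constraint is just that a single transfer cannot
straddle one of those counter pages. A quick illustrative check (not
from any kernel code):

	/* would a transfer of 'len' bytes at bus address 'addr' cross a
	   64K boundary that the legacy 8237 engine cannot carry over? */
	static int crosses_64k(unsigned long addr, unsigned long len)
	{
		return ((addr & 0xffff) + len) > 0x10000;
	}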
-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 20:40                                       ` Jeff Garzik
@ 2001-05-22 20:52                                         ` Andrea Arcangeli
  2001-05-22 20:57                                         ` Richard Henderson
  2001-05-22 21:09                                         ` Alan Cox
  2 siblings, 0 replies; 97+ messages in thread
From: Andrea Arcangeli @ 2001-05-22 20:52 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Richard Henderson, Ivan Kokshaysky, linux-kernel, David S. Miller

On Tue, May 22, 2001 at 04:40:17PM -0400, Jeff Garzik wrote:
> ISA cards can do sg?

Whether it's sg or a single physically contiguous buffer in the first
place doesn't matter. The only point here is that, right now, even from
a physically aligned piece of ram you can end up with a misaligned
virtual pci address.

Andrea

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 20:40                                       ` Jeff Garzik
  2001-05-22 20:52                                         ` Andrea Arcangeli
@ 2001-05-22 20:57                                         ` Richard Henderson
  2001-05-22 21:09                                         ` Alan Cox
  2 siblings, 0 replies; 97+ messages in thread
From: Richard Henderson @ 2001-05-22 20:57 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Andrea Arcangeli, Ivan Kokshaysky, linux-kernel, David S. Miller

On Tue, May 22, 2001 at 04:40:17PM -0400, Jeff Garzik wrote:
> ISA cards can do sg?

No, but the host iommu can.  The isa card sees whatever
view of memory is presented to it by the iommu.


r~

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 20:48                                     ` Jonathan Lundell
@ 2001-05-22 21:02                                       ` Richard Henderson
  2001-05-22 21:10                                         ` Alan Cox
  2001-05-22 21:17                                       ` Jonathan Lundell
  1 sibling, 1 reply; 97+ messages in thread
From: Richard Henderson @ 2001-05-22 21:02 UTC (permalink / raw)
  To: Jonathan Lundell
  Cc: Andrea Arcangeli, Ivan Kokshaysky, linux-kernel, David S. Miller

On Tue, May 22, 2001 at 01:48:23PM -0700, Jonathan Lundell wrote:
> 64KB for 8-bit DMA; 128KB for 16-bit DMA. [...]  This doesn't
> apply to bus-master DMA, just the legacy (8237) stuff.

Would this 8237 be something on the ISA card, or something on
the old pc mainboards?  I'm wondering if we can safely ignore
this issue altogether here...

> There was also a 24-bit address limitation.

Yes, that comes from the number of address lines going to the isa card.
We work around that one by having an iommu arena from 8M to 16M
and forcing all ISA traffic to go through there.
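
From memory, that setup looks roughly like the line below (the exact
arguments vary per core_*.c file, so treat it as a sketch):

	/* 8MB SG arena based at bus address 8MB, so ISA DMA stays below
	   the 16MB / 24-bit limit */
	hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 32768);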


r~

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 20:28                                     ` Richard Henderson
  2001-05-22 20:40                                       ` Jeff Garzik
@ 2001-05-22 21:08                                       ` Alan Cox
  1 sibling, 0 replies; 97+ messages in thread
From: Alan Cox @ 2001-05-22 21:08 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Andrea Arcangeli, Ivan Kokshaysky, linux-kernel, David S. Miller

> On Tue, May 22, 2001 at 05:00:16PM +0200, Andrea Arcangeli wrote:
> > I'm also wondering if ISA needs the sg to start on a 64k boundary,
> Traditionally, ISA could not do DMA across a 64k boundary.

The ISA dmac on the x86 can't cross a 64K boundary (128K for 16bit) because it
did not carry the 16 bit address into the top latch byte.



^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 20:40                                       ` Jeff Garzik
  2001-05-22 20:52                                         ` Andrea Arcangeli
  2001-05-22 20:57                                         ` Richard Henderson
@ 2001-05-22 21:09                                         ` Alan Cox
  2 siblings, 0 replies; 97+ messages in thread
From: Alan Cox @ 2001-05-22 21:09 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Richard Henderson, Andrea Arcangeli, Ivan Kokshaysky,
	linux-kernel, David S. Miller

> ISA cards can do sg?

AHA1542 scsi for one. It wasn't that uncommon.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 21:02                                       ` Richard Henderson
@ 2001-05-22 21:10                                         ` Alan Cox
  0 siblings, 0 replies; 97+ messages in thread
From: Alan Cox @ 2001-05-22 21:10 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Jonathan Lundell, Andrea Arcangeli, Ivan Kokshaysky,
	linux-kernel, David S. Miller

> On Tue, May 22, 2001 at 01:48:23PM -0700, Jonathan Lundell wrote:
> > 64KB for 8-bit DMA; 128KB for 16-bit DMA. [...]  This doesn't
> > apply to bus-master DMA, just the legacy (8237) stuff.
> 
> Would this 8237 be something on the ISA card, or something on
> the old pc mainboards?  I'm wondering if we can safely ignore
> this issue altogether here...

On the mainboards. Bus mastering ISA controllers have various limits of their
own, but I don't believe the 64/128K one applies to them.


^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 20:48                                     ` Jonathan Lundell
  2001-05-22 21:02                                       ` Richard Henderson
@ 2001-05-22 21:17                                       ` Jonathan Lundell
  2001-05-22 21:24                                         ` Alan Cox
  1 sibling, 1 reply; 97+ messages in thread
From: Jonathan Lundell @ 2001-05-22 21:17 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Andrea Arcangeli, Ivan Kokshaysky, linux-kernel, David S. Miller

At 2:02 PM -0700 2001-05-22, Richard Henderson wrote:
>On Tue, May 22, 2001 at 01:48:23PM -0700, Jonathan Lundell wrote:
>>  64KB for 8-bit DMA; 128KB for 16-bit DMA. [...]  This doesn't
>>  apply to bus-master DMA, just the legacy (8237) stuff.
>
>Would this 8237 be something on the ISA card, or something on
>the old pc mainboards?  I'm wondering if we can safely ignore
>this issue altogether here...

On the main board, and not just the old ones. These days it's 
typically in the chipset's south bridge. "Third-party DMA" is 
sometimes called "fly-by DMA". The ISA card is a slave, as is memory, 
and the DMA chip reads from one and writes to the other.

IDE didn't originally use DMA at all (but floppies did), just 
programmed IO. These days, PC chipsets mostly have some form of 
extended higher-performance DMA facilities for stuff like IDE, but 
I'm not really familiar with the details.

<aside>I do wish Linux didn't have so much PC legacy sh^Htuff 
embedded into the i386 architecture.</aside>

>  > There was also a 24-bit address limitation.
>
>Yes, that's in the number of address lines going to the isa card.
>We work around that one by having an iommu arena from 8M to 16M
>and forcing all ISA traffic to go through there.


-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 21:17                                       ` Jonathan Lundell
@ 2001-05-22 21:24                                         ` Alan Cox
  2001-05-22 21:34                                           ` Jonathan Lundell
  0 siblings, 1 reply; 97+ messages in thread
From: Alan Cox @ 2001-05-22 21:24 UTC (permalink / raw)
  To: Jonathan Lundell
  Cc: Richard Henderson, Andrea Arcangeli, Ivan Kokshaysky,
	linux-kernel, David S. Miller

> On the main board, and not just the old ones. These days it's 
> typically in the chipset's south bridge. "Third-party DMA" is 
> sometimes called "fly-by DMA". The ISA card is a slave, as is memory, 
> and the DMA chip reads from one ands writes to the other.

There is also another mode which will give the Alpha kittens, I suspect. A
few PCI cards do SB emulation by snooping the PCI bus. So the kernel writes
to the ISA DMA controller, which does a pointless ISA transfer, and the PCI
card sniffs the DMA controller setup (the writes go to pci first and, when
nobody claims them, on to the isa bridge), then does bus mastering DMA of
its own to fake the ISA dma.


Alan


^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-22 21:24                                         ` Alan Cox
@ 2001-05-22 21:34                                           ` Jonathan Lundell
  0 siblings, 0 replies; 97+ messages in thread
From: Jonathan Lundell @ 2001-05-22 21:34 UTC (permalink / raw)
  To: Alan Cox
  Cc: Richard Henderson, Andrea Arcangeli, Ivan Kokshaysky,
	linux-kernel, David S. Miller

At 10:24 PM +0100 2001-05-22, Alan Cox wrote:
>  > On the main board, and not just the old ones. These days it's
>>  typically in the chipset's south bridge. "Third-party DMA" is
>>  sometimes called "fly-by DMA". The ISA card is a slave, as is memory,
>>  and the DMA chip reads from one ands writes to the other.
>
>There is also another mode which will give the Alpha kittens, I suspect. A
>few PCI cards do SB emulation by snooping the PCI bus. So the kernel writes
>to the ISA DMA controller, which does a pointless ISA transfer, and the PCI
>card sniffs the DMA controller setup (the writes go to pci first and, when
>nobody claims them, on to the isa bridge), then does bus mastering DMA of
>its own to fake the ISA dma.

That's sick.
-- 
/Jonathan Lundell.

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  1:07                   ` David S. Miller
                                       ` (2 preceding siblings ...)
  2001-05-22 13:11                     ` Pavel Machek
@ 2001-05-22 23:02                     ` David S. Miller
  3 siblings, 0 replies; 97+ messages in thread
From: David S. Miller @ 2001-05-22 23:02 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel


Pavel Machek writes:
 > What stops you from plugging PCI-to-PCI bridges in order to create
 > some large number of slots, like 128?

Desire for decent bandwidth usage on that PCI segment perhaps ;-)

Later,
David S. Miller
davem@redhat.com

^ permalink raw reply	[flat|nested] 97+ messages in thread

* Re: alpha iommu fixes
  2001-05-21  6:53                     ` David S. Miller
                                         ` (2 preceding siblings ...)
  2001-05-21  8:09                       ` David S. Miller
@ 2001-05-23  0:05                       ` Albert D. Cahalan
  3 siblings, 0 replies; 97+ messages in thread
From: Albert D. Cahalan @ 2001-05-23  0:05 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrea Arcangeli, Andrew Morton, Ivan Kokshaysky,
	Richard Henderson, linux-kernel

David S. Miller writes:

> What are these "devices", and what drivers "just program the cards to
> start the dma on those hundred mbyte of ram"?

Hmmm, I have a few cards that are used that way. They are used
for communication between nodes of a cluster.

One might put 16 cards in a system. The cards are quite happy to
do a 2 GB DMA transfer. Scatter-gather is possible, but it cuts
performance. Typically the driver would provide a huge chunk
of memory for an app to use, mapped using large pages on x86 or
using BAT registers on ppc. (reserved during boot of course)
The app would crunch numbers using the CPU (with AltiVec, VIS,
3dnow, etc.) and instruct the device to transfer data to/from
the memory region.

Remote nodes initiate DMA too, even supplying the PCI bus address
on both sides of the interconnect. :-) No IOMMU problems with
that one, eh? The other node may transfer data at will.







^ permalink raw reply	[flat|nested] 97+ messages in thread

end of thread, other threads:[~2001-05-23  0:07 UTC | newest]

Thread overview: 97+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-05-18 17:46 alpha iommu fixes Ivan Kokshaysky
2001-05-19  2:34 ` Tom Vier
2001-05-19 10:48   ` Ivan Kokshaysky
2001-05-19 20:58     ` Tom Vier
2001-05-19 13:55 ` Andrea Arcangeli
2001-05-19 19:11   ` Ivan Kokshaysky
2001-05-20  2:40     ` Andrea Arcangeli
2001-05-20 12:12       ` Ivan Kokshaysky
2001-05-20 13:40         ` Andrea Arcangeli
2001-05-20 14:23         ` Gérard Roudier
     [not found]       ` <3B07AF49.5A85205F@uow.edu.au>
2001-05-20 13:49         ` Andrea Arcangeli
2001-05-20 14:05           ` Andrew Morton
2001-05-20 14:33             ` Andrea Arcangeli
2001-05-21  1:01             ` David S. Miller
2001-05-21  1:47               ` Andrea Arcangeli
2001-05-21  7:05               ` David S. Miller
2001-05-21  8:59                 ` Andrea Arcangeli
2001-05-21  9:50                   ` Gerd Knorr
2001-05-21  9:02                 ` David S. Miller
2001-05-21  9:23                   ` Andi Kleen
2001-05-21  9:30                   ` David S. Miller
2001-05-21  9:42                     ` Andi Kleen
2001-05-21 10:02                       ` Andrea Arcangeli
2001-05-21 10:17                         ` Alan Cox
2001-05-21 10:00                     ` David S. Miller
2001-05-21 10:27                       ` Andi Kleen
2001-05-21 22:22                         ` Jens Axboe
2001-05-21 10:34                       ` David S. Miller
2001-05-21 10:42                         ` Andi Kleen
2001-05-21 10:55                         ` David S. Miller
2001-05-21 11:08                           ` Andi Kleen
2001-05-21 11:36                           ` David S. Miller
2001-05-21 11:41                             ` Andi Kleen
2001-05-21  9:56                   ` Andrea Arcangeli
2001-05-21 10:19                     ` David S. Miller
2001-05-21 11:00                       ` Andrea Arcangeli
2001-05-21 11:04                       ` David S. Miller
2001-05-21 11:27                         ` Andrea Arcangeli
2001-05-21 12:16                           ` Peter Rival
2001-05-22 11:12                       ` Chris Wedgwood
2001-05-22 17:51                       ` Jonathan Lundell
2001-05-21 10:11                   ` David S. Miller
2001-05-21 10:50                     ` Andrea Arcangeli
2001-05-21 10:59                     ` David S. Miller
2001-05-21 11:19                       ` Andrea Arcangeli
2001-05-21 11:51                         ` Ivan Kokshaysky
2001-05-21 17:53                           ` Richard Henderson
2001-05-22  0:56                             ` Andrea Arcangeli
2001-05-22 14:29                               ` Andrea Arcangeli
2001-05-22 14:44                                 ` Ivan Kokshaysky
2001-05-22 15:00                                   ` Andrea Arcangeli
2001-05-22 20:28                                     ` Richard Henderson
2001-05-22 20:40                                       ` Jeff Garzik
2001-05-22 20:52                                         ` Andrea Arcangeli
2001-05-22 20:57                                         ` Richard Henderson
2001-05-22 21:09                                         ` Alan Cox
2001-05-22 21:08                                       ` Alan Cox
2001-05-22 20:48                                     ` Jonathan Lundell
2001-05-22 21:02                                       ` Richard Henderson
2001-05-22 21:10                                         ` Alan Cox
2001-05-22 21:17                                       ` Jonathan Lundell
2001-05-22 21:24                                         ` Alan Cox
2001-05-22 21:34                                           ` Jonathan Lundell
2001-05-22 15:18                                   ` Andrea Arcangeli
2001-05-22 15:55                                     ` Ivan Kokshaysky
2001-05-22 16:06                                       ` Andrea Arcangeli
2001-05-22 13:22                             ` Andrea Arcangeli
2001-05-21 13:55                     ` Jonathan Lundell
2001-05-21 14:17                       ` Ivan Kokshaysky
2001-05-21 15:47                       ` Jonathan Lundell
2001-05-20 16:18           ` Andrea Arcangeli
2001-05-20 16:21             ` Andrew Morton
2001-05-20 16:44               ` Andrea Arcangeli
2001-05-20 16:54                 ` Andrew Morton
2001-05-20 17:12                   ` Andrea Arcangeli
2001-05-21  1:07                   ` David S. Miller
2001-05-21  1:37                     ` Andrea Arcangeli
2001-05-21  6:53                     ` David S. Miller
2001-05-21  7:59                       ` Alan Cox
2001-05-21  8:06                       ` Chris Wedgwood
2001-05-21  8:09                       ` David S. Miller
2001-05-21  8:09                         ` Alan Cox
2001-05-23  0:05                       ` Albert D. Cahalan
2001-05-22 13:11                     ` Pavel Machek
2001-05-22 23:02                     ` David S. Miller
2001-05-20 17:16             ` Jeff Garzik
2001-05-20 17:37               ` Andrea Arcangeli
2001-05-21  1:00           ` David S. Miller
2001-05-21  7:47             ` Alan Cox
2001-05-21  7:53             ` David S. Miller
2001-05-21  8:03               ` Alan Cox
2001-05-21  8:11               ` David S. Miller
2001-05-21  1:03           ` David S. Miller
2001-05-21  1:58             ` Richard Henderson
2001-05-20  1:11 ` Richard Henderson
2001-05-20 12:05   ` Ivan Kokshaysky
2001-05-21  0:37     ` Richard Henderson
