* [PATCH 4/4] amd iommu: Large io page support - implementation
@ 2010-12-03 16:03 Wei Wang2
  2010-12-07  0:47 ` Kay, Allen M
  0 siblings, 1 reply; 12+ messages in thread
From: Wei Wang2 @ 2010-12-03 16:03 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 284 bytes --]

This is the implementation.

Thanks,
Wei
Signed-off-by: Wei Wang <wei.wang2@amd.com>
--
Legal Information:
Advanced Micro Devices GmbH
Sitz: Dornach, Gemeinde Aschheim, 
Landkreis München Registergericht München, 
HRB Nr. 43632
Geschäftsführer:
Alberto Bozzo, Andrew Bowd

[-- Attachment #2: iommu_spage_impl.patch --]
[-- Type: text/x-diff, Size: 19017 bytes --]

diff -r 1f6773300a7b xen/drivers/passthrough/amd/iommu_map.c
--- a/xen/drivers/passthrough/amd/iommu_map.c	Thu Dec 02 16:06:30 2010 +0100
+++ b/xen/drivers/passthrough/amd/iommu_map.c	Fri Dec 03 15:48:50 2010 +0100
@@ -71,11 +71,36 @@ int send_iommu_command(struct amd_iommu 
     return 0;
 }
 
-static void invalidate_iommu_page(struct amd_iommu *iommu,
-                                  u64 io_addr, u16 domain_id)
+static bool_t check_order(int order)
+{
+    return ( order == 0 || order == 9 || order == 18 );
+}
+
+void invalidate_iommu_pages(struct amd_iommu *iommu, u64 io_addr,
+                            u16 domain_id, int order)
 {
     u64 addr_lo, addr_hi;
     u32 cmd[4], entry;
+    u64 mask = 0;
+    int sflag = 0, pde = 0;
+
+    BUG_ON( !check_order(order) );
+
+    /* If sflag == 1, the size of the invalidate command is determined
+     * by the first zero bit in the address starting from Address[12] */
+    if ( order == 9 || order == 18 )
+    {
+        mask = ((1ULL << (order - 1)) - 1) << PAGE_SHIFT;
+        io_addr |= mask;
+        sflag = 1;
+    }
+
+    /* All pages associated with the domainID are invalidated */
+    else if ( io_addr == 0x7FFFFFFFFFFFF000ULL )
+    {
+        sflag = 1;
+        pde = 1;
+    }
 
     addr_lo = io_addr & DMA_32BIT_MASK;
     addr_hi = io_addr >> 32;
@@ -88,10 +113,10 @@ static void invalidate_iommu_page(struct
                          &entry);
     cmd[1] = entry;
 
-    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, 0,
+    set_field_in_reg_u32(sflag, 0,
                          IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK,
                          IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry,
+    set_field_in_reg_u32(pde, entry,
                          IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK,
                          IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry);
     set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry,
@@ -149,37 +174,40 @@ void flush_command_buffer(struct amd_iom
     AMD_IOMMU_DEBUG("Warning: ComWaitInt bit did not assert!\n");
 }
 
-static void clear_iommu_l1e_present(u64 l2e, unsigned long gfn)
-{
-    u32 *l1e;
+static void clear_iommu_pte_present(u64 pde, unsigned long gfn, int order)
+{
+    u32 *pte;
     int offset;
-    void *l1_table;
-
-    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
-
-    offset = gfn & (~PTE_PER_TABLE_MASK);
-    l1e = (u32*)(l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
+    void *pd_table;
+
+    BUG_ON( !check_order(order) );
+
+    pd_table = map_domain_page(pde >> PAGE_SHIFT);
+
+    offset = (gfn >> order) & (~PTE_PER_TABLE_MASK);
+    pte = (u32*)(pd_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
 
     /* clear l1 entry */
-    l1e[0] = l1e[1] = 0;
-
-    unmap_domain_page(l1_table);
-}
-
-static void set_iommu_l1e_present(u64 l2e, unsigned long gfn,
-                                 u64 maddr, int iw, int ir)
+    pte[0] = pte[1] = 0;
+
+    unmap_domain_page(pd_table);
+}
+
+static void set_iommu_pte_present(u64 pde, unsigned long gfn,
+                                 u64 maddr, int iw, int ir, int order)
 {
     u64 addr_lo, addr_hi;
     u32 entry;
-    void *l1_table;
+    void *pd_table;
     int offset;
-    u32 *l1e;
-
-    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
-
-    offset = gfn & (~PTE_PER_TABLE_MASK);
-    l1e = (u32*)((u8*)l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
-
+    u32 *pte;
+
+    BUG_ON( !check_order(order) );
+
+    pd_table = map_domain_page(pde >> PAGE_SHIFT);
+
+    offset = (gfn >> order) & (~PTE_PER_TABLE_MASK);
+    pte = (u32*)((u8*)pd_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
     addr_lo = maddr & DMA_32BIT_MASK;
     addr_hi = maddr >> 32;
 
@@ -194,7 +222,7 @@ static void set_iommu_l1e_present(u64 l2
                          IOMMU_CONTROL_DISABLED, entry,
                          IOMMU_PTE_IO_READ_PERMISSION_MASK,
                          IOMMU_PTE_IO_READ_PERMISSION_SHIFT, &entry);
-    l1e[1] = entry;
+    pte[1] = entry;
 
     set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
                          IOMMU_PTE_ADDR_LOW_MASK,
@@ -205,13 +233,12 @@ static void set_iommu_l1e_present(u64 l2
     set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
                          IOMMU_PTE_PRESENT_MASK,
                          IOMMU_PTE_PRESENT_SHIFT, &entry);
-    l1e[0] = entry;
-
-    unmap_domain_page(l1_table);
-}
-
-static void amd_iommu_set_page_directory_entry(u32 *pde, 
-                                               u64 next_ptr, u8 next_level)
+    pte[0] = entry;
+
+    unmap_domain_page(pd_table);
+}
+
+static void set_iommu_pde_present(u32 *pde, u64 next_ptr, u8 next_level)
 {
     u64 addr_lo, addr_hi;
     u32 entry;
@@ -360,29 +387,11 @@ void amd_iommu_add_dev_table_entry(
     dte[3] = entry;
 }
 
-u64 amd_iommu_get_next_table_from_pte(u32 *entry)
-{
-    u64 addr_lo, addr_hi, ptr;
-
-    addr_lo = get_field_from_reg_u32(
-        entry[0],
-        IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK,
-        IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT);
-
-    addr_hi = get_field_from_reg_u32(
-        entry[1],
-        IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK,
-        IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT);
-
-    ptr = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
-    return ptr;
-}
-
-static int amd_iommu_is_pte_present(u32 *entry)
-{
-    return (get_field_from_reg_u32(entry[0],
-                                   IOMMU_PDE_PRESENT_MASK,
-                                   IOMMU_PDE_PRESENT_SHIFT));
+static int is_pde_present(u32 *entry)
+{
+    return get_field_from_reg_u32(entry[0],
+                                  IOMMU_PDE_PRESENT_MASK,
+                                  IOMMU_PDE_PRESENT_SHIFT);
 }
 
 void invalidate_dev_table_entry(struct amd_iommu *iommu,
@@ -404,17 +413,27 @@ void invalidate_dev_table_entry(struct a
     send_iommu_command(iommu, cmd);
 }
 
-static u64 iommu_l2e_from_pfn(struct page_info *table, int level,
-                              unsigned long io_pfn)
+static u64 iommu_pt_from_pfn(struct page_info *table, int level,
+                             unsigned long io_pfn,
+                             int order, struct domain *d)
 {
     unsigned long offset;
     void *pde = NULL;
     void *table_vaddr;
     u64 next_table_maddr = 0;
-
-    BUG_ON( table == NULL || level == 0 );
-
-    while ( level > 1 )
+    unsigned long flags;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct amd_iommu *iommu;
+    u64 spfn, mask;
+    unsigned int lowest = order / PTE_PER_TABLE_SHIFT + 1;
+
+    BUG_ON( table == NULL || level == 0 ||
+           level < lowest || !check_order(order) );
+
+    if ( level == lowest )
+        return page_to_maddr(table);
+
+    while ( level > lowest )
     {
         offset = io_pfn >> ((PTE_PER_TABLE_SHIFT *
                              (level - IOMMU_PAGING_MODE_LEVEL_1)));
@@ -422,9 +441,9 @@ static u64 iommu_l2e_from_pfn(struct pag
 
         table_vaddr = __map_domain_page(table);
         pde = table_vaddr + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE);
-        next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
-
-        if ( !amd_iommu_is_pte_present(pde) )
+        next_table_maddr = get_next_table_from_pte(pde);
+
+        if ( !is_pde_present(pde) )
         {
             if ( next_table_maddr == 0 )
             {
@@ -435,13 +454,41 @@ static u64 iommu_l2e_from_pfn(struct pag
                     return 0;
                 }
                 next_table_maddr = page_to_maddr(table);
-                amd_iommu_set_page_directory_entry(
-                    (u32 *)pde, next_table_maddr, level - 1);
+                set_iommu_pde_present((u32 *)pde,
+                                      next_table_maddr, level - 1);
             }
             else /* should never reach here */
                 return 0;
         }
 
+        /* The same gfn has been remapped as smaller page size */
+        else if ( next_table_maddr && is_pte((u32 *)pde) )
+        {
+            table = alloc_amd_iommu_pgtable();
+            if ( table == NULL )
+            {
+                printk("AMD-Vi: Cannot allocate I/O page table\n");
+                return 0;
+            }
+            next_table_maddr = page_to_maddr(table);
+            set_iommu_pde_present((u32 *)pde, next_table_maddr, level - 1);
+
+            /* The entire super page must be invalidated */
+            for_each_amd_iommu ( iommu )
+            {
+                spin_lock_irqsave(&iommu->lock, flags);
+
+                /* Round io_pfn to super page boundary */
+                mask = ~((1ULL << ((level - 1) * PTE_PER_TABLE_SHIFT)) - 1);
+                spfn = io_pfn & mask;
+                invalidate_iommu_pages(iommu, spfn << PAGE_SHIFT,
+                                       hd->domain_id,
+                                       (level - 1) * PTE_PER_TABLE_SHIFT);
+                flush_command_buffer(iommu);
+                spin_unlock_irqrestore(&iommu->lock, flags);
+            }
+        }
+
         unmap_domain_page(table_vaddr);
         table = maddr_to_page(next_table_maddr);
         level--;
@@ -453,60 +500,76 @@ int amd_iommu_map_page(struct domain *d,
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                        unsigned int flags)
 {
-    u64 iommu_l2e;
+    return amd_iommu_map_pages(d, gfn, mfn, 0, flags);
+}
+
+int amd_iommu_unmap_page(struct domain *d, unsigned long gfn)
+{
+    return amd_iommu_unmap_pages(d, gfn, 0);
+}
+
+int amd_iommu_map_pages(struct domain *d, unsigned long gfn,
+                        unsigned long mfn, unsigned int order,
+                        unsigned int flags)
+{
+    u64 iommu_pt;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
 
-    BUG_ON( !hd->root_table );
+    BUG_ON( !hd->root_table || !check_order(order) );
 
     spin_lock(&hd->mapping_lock);
 
-    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
-    if ( iommu_l2e == 0 )
+    iommu_pt = iommu_pt_from_pfn(hd->root_table,
+                                 hd->paging_mode, gfn, order, d);
+    if ( iommu_pt == 0 )
     {
         spin_unlock(&hd->mapping_lock);
-        AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
+        AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx, order = %d\n",
+                        gfn, order);
         domain_crash(d);
         return -EFAULT;
     }
-
-    set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT,
+    set_iommu_pte_present(iommu_pt, gfn, (u64)mfn << PAGE_SHIFT,
                           !!(flags & IOMMUF_writable),
-                          !!(flags & IOMMUF_readable));
+                          !!(flags & IOMMUF_readable), order);
 
     spin_unlock(&hd->mapping_lock);
     return 0;
 }
 
-int amd_iommu_unmap_page(struct domain *d, unsigned long gfn)
-{
-    u64 iommu_l2e;
+int amd_iommu_unmap_pages(struct domain *d, unsigned long gfn,
+                         unsigned int order)
+{
+    u64 iommu_pt;
     unsigned long flags;
     struct amd_iommu *iommu;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
 
-    BUG_ON( !hd->root_table );
+    BUG_ON( !hd->root_table || !check_order(order) );
 
     spin_lock(&hd->mapping_lock);
 
-    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
-
-    if ( iommu_l2e == 0 )
+    iommu_pt = iommu_pt_from_pfn(hd->root_table,
+                                 hd->paging_mode, gfn, order, d);
+    if ( iommu_pt == 0 )
     {
         spin_unlock(&hd->mapping_lock);
-        AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
+        AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx, order = %d\n",
+                        gfn, order);
         domain_crash(d);
         return -EFAULT;
     }
 
     /* mark PTE as 'page not present' */
-    clear_iommu_l1e_present(iommu_l2e, gfn);
+    clear_iommu_pte_present(iommu_pt, gfn, order);
     spin_unlock(&hd->mapping_lock);
 
     /* send INVALIDATE_IOMMU_PAGES command */
     for_each_amd_iommu ( iommu )
     {
         spin_lock_irqsave(&iommu->lock, flags);
-        invalidate_iommu_page(iommu, (u64)gfn << PAGE_SHIFT, hd->domain_id);
+        invalidate_iommu_pages(iommu, (u64)gfn << PAGE_SHIFT,
+                               hd->domain_id, order);
         flush_command_buffer(iommu);
         spin_unlock_irqrestore(&iommu->lock, flags);
     }
@@ -514,80 +577,38 @@ int amd_iommu_unmap_page(struct domain *
     return 0;
 }
 
-int amd_iommu_reserve_domain_unity_map(
-    struct domain *domain,
-    unsigned long phys_addr,
-    unsigned long size, int iw, int ir)
-{
-    u64 iommu_l2e;
+int amd_iommu_reserve_domain_unity_map(struct domain *domain,
+                                       u64 phys_addr,
+                                       unsigned long size, int iw, int ir)
+{
     unsigned long npages, i;
-    struct hvm_iommu *hd = domain_hvm_iommu(domain);
+    unsigned long gfn;
+    unsigned int flags = !!ir;
+    int rt = 0;
 
     npages = region_to_pages(phys_addr, size);
-
-    spin_lock(&hd->mapping_lock);
-    for ( i = 0; i < npages; ++i )
-    {
-        iommu_l2e = iommu_l2e_from_pfn(
-            hd->root_table, hd->paging_mode, phys_addr >> PAGE_SHIFT);
-
-        if ( iommu_l2e == 0 )
-        {
-            spin_unlock(&hd->mapping_lock);
-            AMD_IOMMU_DEBUG("Invalid IO pagetable entry phys_addr = %lx\n",
-                          phys_addr);
-            domain_crash(domain);
-            return -EFAULT;
-        }
-
-        set_iommu_l1e_present(iommu_l2e,
-            (phys_addr >> PAGE_SHIFT), phys_addr, iw, ir);
-
-        phys_addr += PAGE_SIZE;
-    }
-    spin_unlock(&hd->mapping_lock);
+    if ( iw )
+        flags |= IOMMUF_writable;
+    gfn = phys_addr >> PAGE_SHIFT;
+    for ( i = 0; i < npages; i++ )
+    {
+        rt = amd_iommu_map_pages(domain, gfn + i, gfn + i, 0, flags);
+        if ( rt != 0 )
+            return rt;
+    }
     return 0;
 }
 
 void invalidate_all_iommu_pages(struct domain *d)
 {
-    u32 cmd[4], entry;
     unsigned long flags;
     struct amd_iommu *iommu;
-    int domain_id = d->domain_id;
-    u64 addr_lo = 0x7FFFFFFFFFFFF000ULL & DMA_32BIT_MASK;
-    u64 addr_hi = 0x7FFFFFFFFFFFF000ULL >> 32;
-
-    set_field_in_reg_u32(domain_id, 0,
-                         IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK,
-                         IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_PAGES, entry,
-                         IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT,
-                         &entry);
-    cmd[1] = entry;
-
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0,
-                         IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK,
-                         IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
-                         IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK,
-                         IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry);
-    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry,
-                         IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK,
-                         IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT, &entry);
-    cmd[2] = entry;
-
-    set_field_in_reg_u32((u32)addr_hi, 0,
-                         IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK,
-                         IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT, &entry);
-    cmd[3] = entry;
-
-    cmd[0] = 0;
 
     for_each_amd_iommu ( iommu )
     {
         spin_lock_irqsave(&iommu->lock, flags);
-        send_iommu_command(iommu, cmd);
+        invalidate_iommu_pages(iommu, 0x7FFFFFFFFFFFF000ULL,
+                               d->domain_id, 0);
         flush_command_buffer(iommu);
         spin_unlock_irqrestore(&iommu->lock, flags);
     }
diff -r 1f6773300a7b xen/drivers/passthrough/amd/pci_amd_iommu.c
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c	Thu Dec 02 16:06:30 2010 +0100
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c	Fri Dec 03 15:48:50 2010 +0100
@@ -224,9 +224,7 @@ static int amd_iommu_domain_init(struct 
         return -ENOMEM;
     }
 
-    hd->paging_mode = is_hvm_domain(d) ?
-        IOMMU_PAGE_TABLE_LEVEL_4 : get_paging_mode(max_page);
-
+    hd->paging_mode = get_paging_mode(max_page);
     hd->domain_id = d->domain_id;
 
     return 0;
@@ -337,8 +335,8 @@ static void deallocate_next_page_table(s
         for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ )
         {
             pde = table_vaddr + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE);
-            next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
-            if ( next_table_maddr != 0 )
+            next_table_maddr = get_next_table_from_pte((u32*)pde);
+            if ( next_table_maddr != 0 && !is_pte((u32*)pde) )
             {
                 deallocate_next_page_table(
                     maddr_to_page(next_table_maddr), level - 1);
@@ -362,7 +360,6 @@ static void deallocate_iommu_page_tables
     }
     spin_unlock(&hd->mapping_lock);
 }
-
 
 static void amd_iommu_domain_destroy(struct domain *d)
 {
diff -r 1f6773300a7b xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h	Thu Dec 02 16:06:30 2010 +0100
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h	Fri Dec 03 15:48:50 2010 +0100
@@ -51,9 +51,8 @@ int amd_iommu_map_page(struct domain *d,
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                        unsigned int flags);
 int amd_iommu_unmap_page(struct domain *d, unsigned long gfn);
-u64 amd_iommu_get_next_table_from_pte(u32 *entry);
 int amd_iommu_reserve_domain_unity_map(struct domain *domain,
-        unsigned long phys_addr, unsigned long size, int iw, int ir);
+        u64 phys_addr, unsigned long size, int iw, int ir);
 void invalidate_all_iommu_pages(struct domain *d);
 
 int amd_iommu_map_pages(struct domain *d, unsigned long gfn, unsigned long mfn,
@@ -158,4 +157,28 @@ static inline void __free_amd_iommu_tabl
     free_xenheap_pages(table, order);
 }
 
+static inline bool_t is_pte(u32 *pde)
+{
+    u32 next_level = get_field_from_reg_u32(pde[0],
+                                            IOMMU_PDE_NEXT_LEVEL_MASK,
+                                            IOMMU_PDE_NEXT_LEVEL_SHIFT);
+    return ( next_level == 7 || next_level == 0 );
+}
+
+static inline u64 get_next_table_from_pte(u32 *entry)
+{
+    u64 addr_lo, addr_hi, ptr;
+
+    addr_lo = get_field_from_reg_u32(entry[0],
+                                     IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK,
+                                     IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT);
+
+    addr_hi = get_field_from_reg_u32(entry[1],
+                                     IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK,
+                                     IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT);
+
+    ptr = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
+    return ptr;
+}
+
 #endif /* _ASM_X86_64_AMD_IOMMU_PROTO_H */
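
The size encoding used by invalidate_iommu_pages() above is easy to misread, so
here is a minimal standalone sketch of just the address/mask computation,
assuming 4K base pages (PAGE_SHIFT = 12); encode_inv_addr() is an illustrative
name, not a function from the patch:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

/* With S=1 the hardware derives the invalidation size from the first
 * zero bit at or above bit 12 of the address: to cover 2^order pages,
 * set bits 12 .. (12 + order - 2) and leave bit (12 + order - 1)
 * clear, which is exactly the mask invalidate_iommu_pages() builds. */
static uint64_t encode_inv_addr(uint64_t io_addr, int order)
{
    if (order == 0)
        return io_addr;                  /* S=0: a single 4K page */
    return io_addr | ((((uint64_t)1 << (order - 1)) - 1) << PAGE_SHIFT);
}

int main(void)
{
    /* order 9 (a 2MB superpage) at io address 0x200000: bits 12-19
     * become 1 and bit 20 stays 0, giving 0x2ff000. */
    printf("%#llx\n", (unsigned long long)encode_inv_addr(0x200000, 9));
    return 0;
}

The orders check_order() accepts (0, 9 and 18) correspond to the 4K, 2M and 1G
page sizes respectively.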

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel


* RE: [PATCH 4/4] amd iommu: Large io page support - implementation
  2010-12-03 16:03 [PATCH 4/4] amd iommu: Large io page support - implementation Wei Wang2
@ 2010-12-07  0:47 ` Kay, Allen M
  2010-12-07 11:20   ` Wei Wang2
  0 siblings, 1 reply; 12+ messages in thread
From: Kay, Allen M @ 2010-12-07  0:47 UTC (permalink / raw)
  To: Wei Wang2, xen-devel

Hi Wei,

My understanding is that both EPT and NPT already support 2M and 1G page sizes.  If this is true and if NPT supports the same page table format as the AMD iommu, shouldn't iommu 2M and 1G support just be a matter of pointing the iommu page table pointer to the NPT page table of the same guest OS, thus sharing the same page table between NPT and the AMD iommu?

This should save a lot of code changes in iommu code.  We just need to flush the iommu page table entries cached in the IOTLB at appropriate places.

Allen

-----Original Message-----
From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Wei Wang2
Sent: Friday, December 03, 2010 8:04 AM
To: xen-devel@lists.xensource.com
Subject: [Xen-devel] [PATCH 4/4] amd iommu: Large io page support - implementation

This is the implementation.

Thanks,
Wei
Signed-off-by: Wei Wang <wei.wang2@amd.com>
--
Legal Information:
Advanced Micro Devices GmbH
Sitz: Dornach, Gemeinde Aschheim,
Landkreis München Registergericht München, HRB Nr. 43632
Geschäftsführer:
Alberto Bozzo, Andrew Bowd


* Re: [PATCH 4/4] amd iommu: Large io page support - implementation
  2010-12-07  0:47 ` Kay, Allen M
@ 2010-12-07 11:20   ` Wei Wang2
  2010-12-07 18:00     ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Wei Wang2 @ 2010-12-07 11:20 UTC (permalink / raw)
  To: Kay, Allen M; +Cc: xen-devel

Hi Allen,
Actually, each amd iommu pde entry uses bits 9-11 to encode the next page table
level, but these bits are also used as AVL bits by the p2m table to encode
different page types. So, it might not be easy to share the NPT table with the
amd iommu unless we change the p2m table encoding for this first.
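
A minimal sketch of the clash, assuming only the bit positions described above
(the helper names and the 3-bit field width are illustrative, not Xen's):

#include <stdint.h>

#define BITS_9_11_SHIFT  9
#define BITS_9_11_MASK   0x7ULL

/* AMD IOMMU reading of bits 9-11: the level of the next (lower)
 * page table that this pde points to. */
static inline unsigned int iommu_next_level(uint64_t pde)
{
    return (unsigned int)((pde >> BITS_9_11_SHIFT) & BITS_9_11_MASK);
}

/* p2m/NPT reading of the very same bits: the software-available
 * (AVL) field carrying the p2m page type. */
static inline unsigned int p2m_avl_type(uint64_t pte)
{
    return (unsigned int)((pte >> BITS_9_11_SHIFT) & BITS_9_11_MASK);
}

One entry cannot satisfy both readers unless the two encodings are made to
agree, which is the change discussed in the rest of this thread.
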
Thanks,
Wei

On Tuesday 07 December 2010 01:47:22 Kay, Allen M wrote:
> Hi Wei,
>
> My understanding is that both EPT/NPT already supports 2M and 1G page
> sizes.  If this is true and if NPT supports the same page table format as
> AMD iommu, shouldn't iommu 2M and 1G support just a matter of pointing
> iommu page table pointer to NPT page table of the same guest OS thus
> sharing the same page table between NPT and AMD iommu?
>
> This should save a lot code changes in iommu code.  We just need to flush
> iommu page table in IOTLB at appropriate places.
>
> Allen
>
> -----Original Message-----
> From: xen-devel-bounces@lists.xensource.com
> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Wei Wang2 Sent:
> Friday, December 03, 2010 8:04 AM
> To: xen-devel@lists.xensource.com
> Subject: [Xen-devel] [PATCH 4/4] amd iommu: Large io page support -
> implementation
>
> This is the implementation.
>
> Thanks,
> Wei
> Signed-off-by: Wei Wang <wei.wang2@amd.com>
> --
> Legal Information:
> Advanced Micro Devices GmbH
> Sitz: Dornach, Gemeinde Aschheim,
> Landkreis München Registergericht München, HRB Nr. 43632
> Geschäftsführer:
> Alberto Bozzo, Andrew Bowd


* Re: [PATCH 4/4] amd iommu: Large io page support - implementation
  2010-12-07 11:20   ` Wei Wang2
@ 2010-12-07 18:00     ` Keir Fraser
  2010-12-07 18:21       ` Kay, Allen M
  2010-12-08 10:03       ` Tim Deegan
  0 siblings, 2 replies; 12+ messages in thread
From: Keir Fraser @ 2010-12-07 18:00 UTC (permalink / raw)
  To: Wei Wang2, Kay, Allen M; +Cc: Tim Deegan, xen-devel


Cc'ing Tim -- he could advise how plausible it is to find other available
bits to move the p2m types to. Also I'm not sure whether the p2m tables ever
get used as host pagetables these days (e.g., when guest has CR0.PG=0). That
could affect how difficult it is to mess with the p2m format.

If it's possible, though, it's probably worth pursuing. Sharing the tables
uses less memory, and the code could be less complicated too.

 -- Keir

On 07/12/2010 11:20, "Wei Wang2" <wei.wang2@amd.com> wrote:

> Hi Allen,
> Actually, each amd iommu pde entry uses bit 9-11 to encode next page table
> level, but these bits are also used as AVL bits by p2m table to encode
> different page types...So, it might not be quite easy to share NPT table with
> amd iommu unless we change p2m table encoding for this first.
> Thanks,
> Wei
> 
> On Tuesday 07 December 2010 01:47:22 Kay, Allen M wrote:
>> Hi Wei,
>> 
>> My understanding is that both EPT/NPT already supports 2M and 1G page
>> sizes.  If this is true and if NPT supports the same page table format as
>> AMD iommu, shouldn't iommu 2M and 1G support just a matter of pointing
>> iommu page table pointer to NPT page table of the same guest OS thus
>> sharing the same page table between NPT and AMD iommu?
>> 
>> This should save a lot code changes in iommu code.  We just need to flush
>> iommu page table in IOTLB at appropriate places.
>> 
>> Allen
>> 
>> -----Original Message-----
>> From: xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Wei Wang2 Sent:
>> Friday, December 03, 2010 8:04 AM
>> To: xen-devel@lists.xensource.com
>> Subject: [Xen-devel] [PATCH 4/4] amd iommu: Large io page support -
>> implementation
>> 
>> This is the implementation.
>> 
>> Thanks,
>> Wei
>> Signed-off-by: Wei Wang <wei.wang2@amd.com>
>> --
>> Legal Information:
>> Advanced Micro Devices GmbH
>> Sitz: Dornach, Gemeinde Aschheim,
>> Landkreis München Registergericht München, HRB Nr. 43632
>> Geschäftsführer:
>> Alberto Bozzo, Andrew Bowd
> 
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel


* RE: [PATCH 4/4] amd iommu: Large io page support - implementation
  2010-12-07 18:00     ` Keir Fraser
@ 2010-12-07 18:21       ` Kay, Allen M
  2010-12-08 10:03       ` Tim Deegan
  1 sibling, 0 replies; 12+ messages in thread
From: Kay, Allen M @ 2010-12-07 18:21 UTC (permalink / raw)
  To: Keir Fraser, Wei Wang2; +Cc: Tim Deegan, xen-devel

FYI.  We are currently testing a patch that shares EPT and VT-d page tables on newer systems where both EPT and VT-d have the same page table format.  There is very little change in VT-d specific code.  We just need to do IOTLB flushes in appropriate places to support this.

-----Original Message-----
From: Keir Fraser [mailto:keir.xen@gmail.com] On Behalf Of Keir Fraser
Sent: Tuesday, December 07, 2010 10:01 AM
To: Wei Wang2; Kay, Allen M
Cc: xen-devel@lists.xensource.com; Tim Deegan
Subject: Re: [Xen-devel] [PATCH 4/4] amd iommu: Large io page support - implementation


Cc'ing Tim -- he could advise how plausible it is to find other available
bits to move the p2m types to. Also I'm not sure whether the p2m tables ever
get used as host pagetables these days (e.g., when guest has CR0.PG=0). That
could affect how difficult it is to mess with the p2m format.

If it's possible, though, it's probably worth pursuing. Sharing the tables
uses less memory, and could be less complicated code too.

 -- Keir

On 07/12/2010 11:20, "Wei Wang2" <wei.wang2@amd.com> wrote:

> Hi Allen,
> Actually, each amd iommu pde entry uses bit 9-11 to encode next page table
> level, but these bits are also used as AVL bits by p2m table to encode
> different page types...So, it might not be quite easy to share NPT table with
> amd iommu unless we change p2m table encoding for this first.
> Thanks,
> Wei
> 
> On Tuesday 07 December 2010 01:47:22 Kay, Allen M wrote:
>> Hi Wei,
>> 
>> My understanding is that both EPT/NPT already supports 2M and 1G page
>> sizes.  If this is true and if NPT supports the same page table format as
>> AMD iommu, shouldn't iommu 2M and 1G support just a matter of pointing
>> iommu page table pointer to NPT page table of the same guest OS thus
>> sharing the same page table between NPT and AMD iommu?
>> 
>> This should save a lot code changes in iommu code.  We just need to flush
>> iommu page table in IOTLB at appropriate places.
>> 
>> Allen
>> 
>> -----Original Message-----
>> From: xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Wei Wang2 Sent:
>> Friday, December 03, 2010 8:04 AM
>> To: xen-devel@lists.xensource.com
>> Subject: [Xen-devel] [PATCH 4/4] amd iommu: Large io page support -
>> implementation
>> 
>> This is the implementation.
>> 
>> Thanks,
>> Wei
>> Signed-off-by: Wei Wang <wei.wang2@amd.com>
>> --
>> Legal Information:
>> Advanced Micro Devices GmbH
>> Sitz: Dornach, Gemeinde Aschheim,
>> Landkreis München Registergericht München, HRB Nr. 43632
>> Geschäftsführer:
>> Alberto Bozzo, Andrew Bowd
> 
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel


* Re: [PATCH 4/4] amd iommu: Large io page support - implementation
  2010-12-07 18:00     ` Keir Fraser
  2010-12-07 18:21       ` Kay, Allen M
@ 2010-12-08 10:03       ` Tim Deegan
  2010-12-08 13:02         ` Wei Wang2
  1 sibling, 1 reply; 12+ messages in thread
From: Tim Deegan @ 2010-12-08 10:03 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Wei Wang2, xen-devel, Kay, Allen M

Hi, 

At 18:00 +0000 on 07 Dec (1291744845), Keir Fraser wrote:
> Cc'ing Tim -- he could advise how plausible it is to find other available
> bits to move the p2m types to.

On PAE Xen we need to use bits 9-11 because there are no other bits
available.  On 64-bit we could just shift all the type bits up by three;
I think there's plenty of space above bit 52 for all the types we need.
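
As an illustration of that 64-bit option, a sketch with an assumed shift and
width (the EPT/VT-d sharing patch later in this thread takes the same
approach, moving its avail2 type field to bits 61:52):

#include <stdint.h>

#define P2M_TYPE_SHIFT  52
#define P2M_TYPE_BITS   10   /* assumed width: software bits above 52 */

/* Park the p2m type above bit 52, leaving bits 9-11 free for the
 * IOMMU next-level field. */
static inline uint64_t entry_set_p2m_type(uint64_t e, unsigned int t)
{
    uint64_t mask = (((uint64_t)1 << P2M_TYPE_BITS) - 1) << P2M_TYPE_SHIFT;
    return (e & ~mask) | ((uint64_t)t << P2M_TYPE_SHIFT);
}

static inline unsigned int entry_get_p2m_type(uint64_t e)
{
    return (unsigned int)((e >> P2M_TYPE_SHIFT) &
                          (((uint64_t)1 << P2M_TYPE_BITS) - 1));
}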

Even on PAE we really only need to store a p2m type in leaf nodes (l1
entries and superpage entries) so bits 9-11 are free on internal nodes.
I don't understand why the iommu tables have a level stored in every
entry, though - is that a hardware requirement?

> Also I'm not sure whether the p2m tables ever
> get used as host pagetables these days (e.g., when guest has CR0.PG=0). That
> could affect how difficult it is to mess with the p2m format.

They never get used for PG=0 any more, we have a page in the firmware
with a 1-1 4GB mapping instead.  But they do get used as pagetables in
order to use a linear mapping to read the p2m quickly. 

> If it's possible, though, it's probably worth pursuing. Sharing the tables
> uses less memory, and could be less complicated code too.

Yep, sounds like a great idea. 

Cheers,

Tim.

> On 07/12/2010 11:20, "Wei Wang2" <wei.wang2@amd.com> wrote:
> 
> > Hi Allen,
> > Actually, each amd iommu pde entry uses bit 9-11 to encode next page table
> > level, but these bits are also used as AVL bits by p2m table to encode
> > different page types...So, it might not be quite easy to share NPT table with
> > amd iommu unless we change p2m table encoding for this first.
> > Thanks,
> > Wei
> > 
> > On Tuesday 07 December 2010 01:47:22 Kay, Allen M wrote:
> >> Hi Wei,
> >> 
> >> My understanding is that both EPT/NPT already supports 2M and 1G page
> >> sizes.  If this is true and if NPT supports the same page table format as
> >> AMD iommu, shouldn't iommu 2M and 1G support just a matter of pointing
> >> iommu page table pointer to NPT page table of the same guest OS thus
> >> sharing the same page table between NPT and AMD iommu?
> >> 
> >> This should save a lot code changes in iommu code.  We just need to flush
> >> iommu page table in IOTLB at appropriate places.
> >> 
> >> Allen
> >> 
> >> -----Original Message-----
> >> From: xen-devel-bounces@lists.xensource.com
> >> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Wei Wang2 Sent:
> >> Friday, December 03, 2010 8:04 AM
> >> To: xen-devel@lists.xensource.com
> >> Subject: [Xen-devel] [PATCH 4/4] amd iommu: Large io page support -
> >> implementation
> >> 
> >> This is the implementation.
> >> 
> >> Thanks,
> >> Wei
> >> Signed-off-by: Wei Wang <wei.wang2@amd.com>
> >> --
> >> Legal Information:
> >> Advanced Micro Devices GmbH
> >> Sitz: Dornach, Gemeinde Aschheim,
> >> Landkreis München Registergericht München, HRB Nr. 43632
> >> Geschäftsführer:
> >> Alberto Bozzo, Andrew Bowd
> > 
> > 
> > 
> > 
> > _______________________________________________
> > Xen-devel mailing list
> > Xen-devel@lists.xensource.com
> > http://lists.xensource.com/xen-devel
> 
> 

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Xen Platform Team
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)


* Re: [PATCH 4/4] amd iommu: Large io page support - implementation
  2010-12-08 10:03       ` Tim Deegan
@ 2010-12-08 13:02         ` Wei Wang2
  2010-12-08 13:11           ` Tim Deegan
  0 siblings, 1 reply; 12+ messages in thread
From: Wei Wang2 @ 2010-12-08 13:02 UTC (permalink / raw)
  To: Tim Deegan; +Cc: Kay, Allen M, xen-devel, Keir Fraser

On Wednesday 08 December 2010 11:03:18 Tim Deegan wrote:
> Hi,
>
> At 18:00 +0000 on 07 Dec (1291744845), Keir Fraser wrote:
> > Cc'ing Tim -- he could advise how plausible it is to find other available
> > bits to move the p2m types to.
>
> PAE Xen we need to use bits 9-11 because there are no other bits
> available.  On 64-bit we could just shift all the type bits up by three;
> I think there's plenty of space above bit 52 for all the types we need.
>
> Even on PAE we really only need to store a p2m type in leaf nodes (l1
> entries and superpage entries) so bits 9-11 are free on internal nodes.
> I don't understand why the iommu tables have a level stored in every
> entry, though - is that a hardware requirement?

Hi Tim,
AMD IOMMU supports a "skip level" feature, which allows hardware to skip page
table walks if the io address contains a long string of 0 bits. To do this, the
iommu page table encodes the level of the lower page table directly in bits
9-11 of every pde entry. Therefore, to share the p2m table with the iommu,
either the p2m code or the iommu code has to maintain bits 9-11 in p2m entries
as the "next level" field.

Also, even in leaf entries, amd iommu hardware requires the content of the
"next level" field to be '0' or '7'. So, in order to be more compatible with
the iommu, we should at least use '0' or '7' in bits 9-11 to represent the p2m
type for normal guest ram in the case of PAE.
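
A compact restatement of that leaf rule, as a sketch only (it mirrors the
is_pte() helper the patch above adds; the function name here is illustrative):

#include <stdint.h>

/* The AMD IOMMU accepts an entry as a leaf PTE only when its
 * next-level field (bits 9-11) is 0 (an ordinary 4K PTE) or 7
 * (a PTE that terminates the walk at this level). */
static inline int iommu_leaf_ok(uint64_t entry)
{
    unsigned int next_level = (unsigned int)((entry >> 9) & 0x7);
    return next_level == 0 || next_level == 7;
}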

Thanks,
Wei
>
> > Also I'm not sure whether the p2m tables ever
> > get used as host pagetables these days (e.g., when guest has CR0.PG=0).
> > That could affect how difficult it is to mess with the p2m format.
>
> They never get used for PG=0 any more, we have a page in the firmware
> with a 1-1 4GB mapping instead.  But they do get used as pagetables in
> order to use a linear mapping to read the p2m quickly.
>
> > If it's possible, though, it's probably worth pursuing. Sharing the
> > tables uses less memory, and could be less complicated code too.
>
> Yep, sounds like a great idea.
>
> Cheers,
>
> Tim.
>
> > On 07/12/2010 11:20, "Wei Wang2" <wei.wang2@amd.com> wrote:
> > > Hi Allen,
> > > Actually, each amd iommu pde entry uses bit 9-11 to encode next page
> > > table level, but these bits are also used as AVL bits by p2m table to
> > > encode different page types...So, it might not be quite easy to share
> > > NPT table with amd iommu unless we change p2m table encoding for this
> > > first.
> > > Thanks,
> > > Wei
> > >
> > > On Tuesday 07 December 2010 01:47:22 Kay, Allen M wrote:
> > >> Hi Wei,
> > >>
> > >> My understanding is that both EPT/NPT already supports 2M and 1G page
> > >> sizes.  If this is true and if NPT supports the same page table format
> > >> as AMD iommu, shouldn't iommu 2M and 1G support just a matter of
> > >> pointing iommu page table pointer to NPT page table of the same guest
> > >> OS thus sharing the same page table between NPT and AMD iommu?
> > >>
> > >> This should save a lot code changes in iommu code.  We just need to
> > >> flush iommu page table in IOTLB at appropriate places.
> > >>
> > >> Allen
> > >>
> > >> -----Original Message-----
> > >> From: xen-devel-bounces@lists.xensource.com
> > >> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Wei Wang2
> > >> Sent: Friday, December 03, 2010 8:04 AM
> > >> To: xen-devel@lists.xensource.com
> > >> Subject: [Xen-devel] [PATCH 4/4] amd iommu: Large io page support -
> > >> implementation
> > >>
> > >> This is the implementation.
> > >>
> > >> Thanks,
> > >> Wei
> > >> Signed-off-by: Wei Wang <wei.wang2@amd.com>
> > >> --
> > >> Legal Information:
> > >> Advanced Micro Devices GmbH
> > >> Sitz: Dornach, Gemeinde Aschheim,
> > >> Landkreis München Registergericht München, HRB Nr. 43632
> > >> Geschäftsführer:
> > >> Alberto Bozzo, Andrew Bowd
> > >
> > > _______________________________________________
> > > Xen-devel mailing list
> > > Xen-devel@lists.xensource.com
> > > http://lists.xensource.com/xen-devel


* Re: [PATCH 4/4] amd iommu: Large io page support - implementation
  2010-12-08 13:02         ` Wei Wang2
@ 2010-12-08 13:11           ` Tim Deegan
  2010-12-08 18:38             ` [RFC][PATCH][VTD] EPT/VT-d page table sharing Kay, Allen M
  0 siblings, 1 reply; 12+ messages in thread
From: Tim Deegan @ 2010-12-08 13:11 UTC (permalink / raw)
  To: Wei Wang2; +Cc: Kay, Allen M, xen-devel, Keir Fraser

At 13:02 +0000 on 08 Dec (1291813359), Wei Wang2 wrote:
> AMD IOMMU supports "skip level" feature, which allows hardware to skip page 
> table walks if io address contains long string of 0 bits. To do this, iommu 
> page table encodes the level of lower page table directly in bit 9- 11 of 
> every pde entry. Therefore, to share p2m table with iommu, either p2m code or 
> iommu code has to maintain bit 9- bit 11 in p2m entries as "next  level" 
> field. 

Oh, nice!  Thanks for clarifying. 

> Also even in leaf entries, amd iommu hardware requires the content of "next 
> level" field to be '0' or '7'. So, in order to be more compatible with iommu, 
> we should at least use '0' or '7' in bit 9- bit 11 to represent p2m type for 
> normal guest ram in the case of PAE.

That's not going to be enough; there are more than two types of memory
that are marked as 'present' in the p2m.  I think that we just can't
have IOMMU and p2m sharing a pagetable on PAE builds.

I don't think that having IOMMU support be 64-bit only would be too
terrible, though.  A number of other features are 64-bit only, and the
64-bit hypervisor is generally better anyway.  Are there going to be
many machines with IOMMUs that don't have 64-bit processors?

Cheers,

Tim.

> > > Also I'm not sure whether the p2m tables ever
> > > get used as host pagetables these days (e.g., when guest has CR0.PG=0).
> > > That could affect how difficult it is to mess with the p2m format.
> >
> > They never get used for PG=0 any more, we have a page in the firmware
> > with a 1-1 4GB mapping instead.  But they do get used as pagetables in
> > order to use a linear mapping to read the p2m quickly.
> >
> > > If it's possible, though, it's probably worth pursuing. Sharing the
> > > tables uses less memory, and could be less complicated code too.
> >
> > Yep, sounds like a great idea.
> >
> > Cheers,
> >
> > Tim.
> >
> > > On 07/12/2010 11:20, "Wei Wang2" <wei.wang2@amd.com> wrote:
> > > > Hi Allen,
> > > > Actually, each amd iommu pde entry uses bit 9-11 to encode next page
> > > > table level, but these bits are also used as AVL bits by p2m table to
> > > > encode different page types...So, it might not be quite easy to share
> > > > NPT table with amd iommu unless we change p2m table encoding for this
> > > > first.
> > > > Thanks,
> > > > Wei
> > > >
> > > > On Tuesday 07 December 2010 01:47:22 Kay, Allen M wrote:
> > > >> Hi Wei,
> > > >>
> > > >> My understanding is that both EPT/NPT already supports 2M and 1G page
> > > >> sizes.  If this is true and if NPT supports the same page table format
> > > >> as AMD iommu, shouldn't iommu 2M and 1G support just a matter of
> > > >> pointing iommu page table pointer to NPT page table of the same guest
> > > >> OS thus sharing the same page table between NPT and AMD iommu?
> > > >>
> > > >> This should save a lot code changes in iommu code.  We just need to
> > > >> flush iommu page table in IOTLB at appropriate places.
> > > >>
> > > >> Allen
> > > >>
> > > >> -----Original Message-----
> > > >> From: xen-devel-bounces@lists.xensource.com
> > > >> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Wei Wang2
> > > >> Sent: Friday, December 03, 2010 8:04 AM
> > > >> To: xen-devel@lists.xensource.com
> > > >> Subject: [Xen-devel] [PATCH 4/4] amd iommu: Large io page support -
> > > >> implementation
> > > >>
> > > >> This is the implementation.
> > > >>
> > > >> Thanks,
> > > >> Wei
> > > >> Signed-off-by: Wei Wang <wei.wang2@amd.com>
> > > >> --
> > > >> Legal Information:
> > > >> Advanced Micro Devices GmbH
> > > >> Sitz: Dornach, Gemeinde Aschheim,
> > > >> Landkreis München Registergericht München, HRB Nr. 43632
> > > >> Geschäftsführer:
> > > >> Alberto Bozzo, Andrew Bowd
> > > >
> > > > _______________________________________________
> > > > Xen-devel mailing list
> > > > Xen-devel@lists.xensource.com
> > > > http://lists.xensource.com/xen-devel
> 
> 
> 

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Xen Platform Team
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)


* [RFC][PATCH][VTD] EPT/VT-d page table sharing
  2010-12-08 13:11           ` Tim Deegan
@ 2010-12-08 18:38             ` Kay, Allen M
  2010-12-09 10:12               ` Tim Deegan
  0 siblings, 1 reply; 12+ messages in thread
From: Kay, Allen M @ 2010-12-08 18:38 UTC (permalink / raw)
  To: Wei Wang2, Tim Deegan, Keir Fraser; +Cc: xen-devel, Han, Weidong

[-- Attachment #1: Type: text/plain, Size: 415 bytes --]

Attached is the EPT/VT-d page table sharing patch I mentioned earlier.  The basic idea is to leverage the 2MB and 1GB page size support in EPT by having VT-d use the same page tables as EPT.  When the EPT page table changes, we flush the VT-d IOTLB cache.

We are still testing this patch further, but would appreciate feedback.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Allen Kay <allen.m.kay@intel.com>

[-- Attachment #2: share12083.patch --]
[-- Type: application/octet-stream, Size: 17148 bytes --]

diff -r bfd13358b8bf xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Thu Nov 25 05:00:06 2010 -0800
@@ -44,7 +44,7 @@
     p2m_lock(p2m);
 
     /* Check to make sure this is still PoD */
-    if ( entry->avail1 != p2m_populate_on_demand )
+    if ( entry->avail2 != p2m_populate_on_demand )
     {
         p2m_unlock(p2m);
         return 0;
@@ -110,13 +110,8 @@
     if ( pg == NULL )
         return 0;
 
-    ept_entry->emt = 0;
-    ept_entry->ipat = 0;
-    ept_entry->sp = 0;
-    ept_entry->avail1 = 0;
+    ept_entry->epte = 0;
     ept_entry->mfn = page_to_mfn(pg);
-    ept_entry->avail2 = 0;
-
     ept_entry->r = ept_entry->w = ept_entry->x = 1;
 
     return 1;
@@ -166,14 +161,15 @@
     {
         ept_entry_t *epte = table + i;
 
+        epte->epte = 0;
         epte->emt = ept_entry->emt;
         epte->ipat = ept_entry->ipat;
         epte->sp = (level > 1) ? 1 : 0;
-        epte->avail1 = ept_entry->avail1;
-        epte->avail2 = 0;
+        epte->avail2 = ept_entry->avail2;
         epte->mfn = ept_entry->mfn + i * trunk;
+        epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
 
-        ept_p2m_type_to_flags(epte, epte->avail1);
+        ept_p2m_type_to_flags(epte, epte->avail2);
 
         if ( (level - 1) == target )
             continue;
@@ -225,7 +221,7 @@
 
     if ( !is_epte_present(ept_entry) )
     {
-        if ( ept_entry->avail1 == p2m_populate_on_demand )
+        if ( ept_entry->avail2 == p2m_populate_on_demand )
             return GUEST_TABLE_POD_PAGE;
 
         if ( read_only )
@@ -254,7 +250,7 @@
 ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
               unsigned int order, p2m_type_t p2mt)
 {
-    ept_entry_t *table, *ept_entry;
+    ept_entry_t *table, *ept_entry = NULL;
     unsigned long gfn_remainder = gfn;
     unsigned long offset = 0;
     u32 index;
@@ -264,6 +260,7 @@
     bool_t direct_mmio = (p2mt == p2m_mmio_direct);
     uint8_t ipat = 0;
     int need_modify_vtd_table = 1;
+    int ept_present = 0;
     int needs_sync = 1;
     struct domain *d = p2m->domain;
 
@@ -301,6 +298,7 @@
     offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
 
     ept_entry = table + index;
+    ept_present = (ept_entry->epte & 0x7) ? 1 : 0;
 
     /*
      * When we are here, we must be on a leaf ept entry
@@ -318,12 +316,13 @@
         if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
              (p2mt == p2m_ram_paging_in_start) )
         {
+            ept_entry->epte = 0;
             ept_entry->emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
                                                 direct_mmio);
             ept_entry->ipat = ipat;
             ept_entry->sp = order ? 1 : 0;
-            ept_entry->avail1 = p2mt;
-            ept_entry->avail2 = 0;
+            ept_entry->avail2 = p2mt;
+            ept_entry->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
 
             if ( ept_entry->mfn == mfn_x(mfn) )
                 need_modify_vtd_table = 0;
@@ -365,11 +364,12 @@
 
         ept_entry = table + index;
 
+        ept_entry->epte = 0;
         ept_entry->emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
         ept_entry->ipat = ipat;
         ept_entry->sp = i ? 1 : 0;
-        ept_entry->avail1 = p2mt;
-        ept_entry->avail2 = 0;
+        ept_entry->avail2 = p2mt;
+        ept_entry->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
 
         if ( ept_entry->mfn == mfn_x(mfn) )
              need_modify_vtd_table = 0;
@@ -393,31 +393,35 @@
     if ( needs_sync )
         ept_sync_domain(p2m->domain);
 
-    /* Now the p2m table is not shared with vt-d page table */
     if ( rv && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table )
     {
-        if ( p2mt == p2m_ram_rw )
-        {
-            if ( order == EPT_TABLE_ORDER )
-            {
-                for ( i = 0; i < (1 << order); i++ )
-                    iommu_map_page(
-                        p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
-                        IOMMUF_readable | IOMMUF_writable);
-            }
-            else if ( !order )
-                iommu_map_page(
-                    p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
-        }
+        if ( iommu_hap_pt_share )
+            iommu_pte_flush(d, gfn, (u64*)ept_entry, ept_present);
         else
         {
-            if ( order == EPT_TABLE_ORDER )
+            if ( p2mt == p2m_ram_rw )
             {
-                for ( i = 0; i < (1 << order); i++ )
-                    iommu_unmap_page(p2m->domain, gfn - offset + i);
+                if ( order == EPT_TABLE_ORDER )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_map_page(
+                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
+                            IOMMUF_readable | IOMMUF_writable);
+                }
+                else if ( !order )
+                    iommu_map_page(
+                        p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
             }
-            else if ( !order )
-                iommu_unmap_page(p2m->domain, gfn);
+            else
+            {
+                if ( order == EPT_TABLE_ORDER )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_unmap_page(p2m->domain, gfn - offset + i);
+                }
+                else if ( !order )
+                    iommu_unmap_page(p2m->domain, gfn);
+            }
         }
     }
 
@@ -483,7 +487,7 @@
     index = gfn_remainder >> (i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
-    if ( ept_entry->avail1 == p2m_populate_on_demand )
+    if ( ept_entry->avail2 == p2m_populate_on_demand )
     {
         if ( q == p2m_query )
         {
@@ -499,9 +503,9 @@
     }
 
 
-    if ( ept_entry->avail1 != p2m_invalid )
+    if ( ept_entry->avail2 != p2m_invalid )
     {
-        *t = ept_entry->avail1;
+        *t = ept_entry->avail2;
         mfn = _mfn(ept_entry->mfn);
         if ( i )
         {
@@ -654,7 +658,7 @@
         uint64_t trunk = 0;
 
         e = ept_get_entry_content(p2m, gfn, &level);
-        if ( !p2m_has_emt(e.avail1) )
+        if ( !p2m_has_emt(e.avail2) )
             continue;
 
         order = 0;
@@ -673,8 +677,8 @@
                      */
                     order = level * EPT_TABLE_ORDER;
                     if ( need_modify_ept_entry(p2m, gfn, mfn, 
-                          e.ipat, e.emt, e.avail1) )
-                        ept_set_entry(p2m, gfn, mfn, order, e.avail1);
+                          e.ipat, e.emt, e.avail2) )
+                        ept_set_entry(p2m, gfn, mfn, order, e.avail2);
                     gfn += trunk;
                     break;
                 }
@@ -683,8 +687,8 @@
         }
         else /* gfn assigned with 4k */
         {
-            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.avail1) )
-                ept_set_entry(p2m, gfn, mfn, order, e.avail1);
+            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.avail2) )
+                ept_set_entry(p2m, gfn, mfn, order, e.avail2);
         }
     }
     p2m_unlock(p2m);
@@ -710,10 +714,10 @@
                                        ept_page_level - 1, ot, nt);
         else
         {
-            if ( epte[i].avail1 != ot )
+            if ( epte[i].avail2 != ot )
                 continue;
 
-            epte[i].avail1 = nt;
+            epte[i].avail2 = nt;
             ept_p2m_type_to_flags(epte + i, nt);
         }
     }
@@ -787,9 +791,9 @@
 
             index = gfn_remainder >> order;
             ept_entry = table + index;
-            if ( ept_entry->avail1 != p2m_invalid )
+            if ( ept_entry->avail2 != p2m_invalid )
             {
-                ( ept_entry->avail1 == p2m_populate_on_demand ) ? 
+                ( ept_entry->avail2 == p2m_populate_on_demand ) ? 
                 ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
                 ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
 
diff -r bfd13358b8bf xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/arch/x86/mm/p2m.c	Thu Nov 25 05:00:06 2010 -0800
@@ -1800,6 +1800,7 @@
     struct page_info *page, *p2m_top;
     unsigned int page_count = 0;
     unsigned long gfn = -1UL;
+    struct domain *d = p2m->domain;
 
     p2m_lock(p2m);
 
@@ -1828,6 +1829,10 @@
 
     p2m->phys_table = pagetable_from_mfn(page_to_mfn(p2m_top));
 
+    if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled &&
+         (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
+        iommu_set_pgd(d);
+
     P2M_PRINTK("populating p2m table\n");
 
     /* Initialise physmap tables for slot zero. Other code assumes this. */
diff -r bfd13358b8bf xen/drivers/passthrough/iommu.c
--- a/xen/drivers/passthrough/iommu.c	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/drivers/passthrough/iommu.c	Thu Nov 25 05:00:06 2010 -0800
@@ -47,6 +47,7 @@
 bool_t __read_mostly iommu_snoop = 1;
 bool_t __read_mostly iommu_qinval = 1;
 bool_t __read_mostly iommu_intremap = 1;
+bool_t __read_mostly iommu_hap_pt_share;
 bool_t __read_mostly amd_iommu_debug;
 bool_t __read_mostly amd_iommu_perdev_intremap;
 
@@ -174,10 +175,11 @@
     if ( has_arch_pdevs(d) && !need_iommu(d) )
     {
         d->need_iommu = 1;
-        rc = iommu_populate_page_table(d);
+        if ( !iommu_hap_pt_share )
+            rc = iommu_populate_page_table(d);
         goto done;
     }
-done:    
+done:
     spin_unlock(&pcidevs_lock);
     return rc;
 }
diff -r bfd13358b8bf xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/drivers/passthrough/vtd/iommu.c	Thu Nov 25 05:00:06 2010 -0800
@@ -33,6 +33,8 @@
 #include <xen/keyhandler.h>
 #include <asm/msi.h>
 #include <asm/irq.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/p2m.h>
 #include <mach_apic.h>
 #include "iommu.h"
 #include "dmar.h"
@@ -1627,6 +1629,9 @@
     if ( list_empty(&acpi_drhd_units) )
         return;
 
+    if ( iommu_hap_pt_share )
+        return;
+
     spin_lock(&hd->mapping_lock);
     iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
     hd->pgd_maddr = 0;
@@ -1645,6 +1650,10 @@
     int flush_dev_iotlb;
     int iommu_domid;
 
+    /* Do nothing if VT-d shares EPT page table */
+    if ( iommu_hap_pt_share )
+        return 0;
+
     /* do nothing if dom0 and iommu supports pass thru */
     if ( iommu_passthrough && (d->domain_id == 0) )
         return 0;
@@ -1715,6 +1724,83 @@
     return 0;
 }
 
+void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int present)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu = NULL;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    int flush_dev_iotlb;
+    int iommu_domid;
+
+    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
+            continue;
+
+        flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
+        iommu_domid = domain_iommu_domid(d, iommu);
+        if ( iommu_domid == -1 )
+            continue;
+        if ( iommu_flush_iotlb_psi(iommu, iommu_domid,
+                                   (paddr_t)gfn << PAGE_SHIFT_4K, 1,
+                                   !present, flush_dev_iotlb) )
+            iommu_flush_write_buffer(iommu);
+    }
+}
+
+static int vtd_ept_page_compatible(struct iommu *iommu)
+{
+    u64 cap = iommu->cap;
+
+    if ( ept_has_2mb(cpu_has_vmx_ept_2mb) != cap_sps_2mb(cap) )
+        return 0;
+
+    if ( ept_has_1gb(cpu_has_vmx_ept_1gb) != cap_sps_1gb(cap) )
+        return 0;
+
+    return 1;
+}
+
+/*
+ * set VT-d page table directory to EPT table if they can share
+ */
+
+static int sharept;
+boolean_param("sharept", sharept);
+
+void iommu_set_pgd(struct domain *d)
+{
+    struct acpi_drhd_unit *drhd;
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+    mfn_t pgd_mfn;
+
+    ASSERT( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled );
+
+    if ( !sharept )
+        goto out;
+
+    for_each_drhd_unit ( drhd )
+    {
+        if ( drhd->iommu->nr_pt_levels != VTD_PAGE_TABLE_LEVEL_4 )
+            goto out;
+
+        if ( !vtd_ept_page_compatible(drhd->iommu) )
+            goto out;
+    }
+    iommu_hap_pt_share = TRUE;
+
+    pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
+    hd->pgd_maddr = pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
+
+out:
+    gdprintk(XENLOG_INFO VTDPREFIX,
+             "VT-d page table %s with EPT table\n",
+             iommu_hap_pt_share ? "shares" : "does not share");
+}
+
 static int domain_rmrr_mapped(struct domain *d,
                               struct acpi_rmrr_unit *rmrr)
 {
diff -r bfd13358b8bf xen/drivers/passthrough/vtd/iommu.h
--- a/xen/drivers/passthrough/vtd/iommu.h	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/drivers/passthrough/vtd/iommu.h	Thu Nov 25 05:00:06 2010 -0800
@@ -68,15 +68,19 @@
 /*
  * Decoding Capability Register
  */
-#define cap_read_drain(c)    (((c) >> 55) & 1)
-#define cap_write_drain(c)    (((c) >> 54) & 1)
-#define cap_max_amask_val(c)    (((c) >> 48) & 0x3f)
-#define cap_num_fault_regs(c)    ((((c) >> 40) & 0xff) + 1)
+#define cap_read_drain(c)      (((c) >> 55) & 1)
+#define cap_write_drain(c)     (((c) >> 54) & 1)
+#define cap_max_amask_val(c)   (((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)  ((((c) >> 40) & 0xff) + 1)
 #define cap_pgsel_inv(c)       (((c) >> 39) & 1)
 
-#define cap_super_page_val(c)    (((c) >> 34) & 0xf)
+#define cap_super_page_val(c)  (((c) >> 34) & 0xf)
 #define cap_super_offset(c)    (((find_first_bit(&cap_super_page_val(c), 4)) \
-                    * OFFSET_STRIDE) + 21)
+                                 * OFFSET_STRIDE) + 21)
+#define cap_sps_2mb(c)         ((c >> 34) & 1)
+#define cap_sps_1gb(c)         ((c >> 35) & 1)
+#define cap_sps_512gb(c)       ((c >> 36) & 1)
+#define cap_sps_1tb(c)         ((c >> 37) & 1)
 
 #define cap_fault_reg_offset(c)    ((((c) >> 24) & 0x3ff) * 16)
 
diff -r bfd13358b8bf xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h	Thu Nov 25 05:00:06 2010 -0800
@@ -30,15 +30,21 @@
 
 typedef union {
     struct {
-        u64 r       :   1,
-        w           :   1,
-        x           :   1,
-        emt         :   3, /* EPT Memory type */
-        ipat        :   1, /* Ignore PAT memory type */
-        sp          :   1, /* Is this a superpage? */
-        avail1      :   4,
-        mfn         :   40,
-        avail2      :   12;
+        u64 r       :   1,  /* bit 0 - Read permission */
+        w           :   1,  /* bit 1 - Write permission */
+        x           :   1,  /* bit 2 - Execute permission */
+        emt         :   3,  /* bits 5:3 - EPT Memory type */
+        ipat        :   1,  /* bit 6 - Ignore PAT memory type */
+        sp          :   1,  /* bit 7 - Is this a superpage? */
+        rsvd1       :   2,  /* bits 9:8 - Reserved for future use */
+        avail1      :   1,  /* bit 10 - Software available 1 */
+        rsvd2_snp   :   1,  /* bit 11 - Used for VT-d snoop control
+                               in shared EPT/VT-d usage */
+        mfn         :   40, /* bits 51:12 - Machine physical frame number */
+        avail2      :   10, /* bits 61:52 - Software available 2 */
+        rsvd3_tm    :   1,  /* bit 62 - Used for VT-d transient-mapping
+                               hint in shared EPT/VT-d usage */
+        avail3      :   1;  /* bit 63 - Software available 3 */
     };
     u64 epte;
 } ept_entry_t;
@@ -208,6 +214,11 @@
 #define cpu_has_vmx_ept_invept_single_context   \
     (vmx_ept_vpid_cap & VMX_EPT_INVEPT_SINGLE_CONTEXT)
 
+#define EPT_2MB_SHIFT     16
+#define EPT_1GB_SHIFT     17
+#define ept_has_2mb(c)    ((c >> EPT_2MB_SHIFT) & 1)
+#define ept_has_1gb(c)    ((c >> EPT_1GB_SHIFT) & 1)
+
 #define INVEPT_SINGLE_CONTEXT   1
 #define INVEPT_ALL_CONTEXT      2
 
diff -r bfd13358b8bf xen/include/xen/iommu.h
--- a/xen/include/xen/iommu.h	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/include/xen/iommu.h	Thu Nov 25 05:00:06 2010 -0800
@@ -30,6 +30,7 @@
 extern bool_t force_iommu, iommu_verbose;
 extern bool_t iommu_workaround_bios_bug, iommu_passthrough;
 extern bool_t iommu_snoop, iommu_qinval, iommu_intremap;
+extern bool_t iommu_hap_pt_share;
 extern bool_t amd_iommu_debug;
 extern bool_t amd_iommu_perdev_intremap;
 
@@ -85,7 +86,8 @@
 int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                    unsigned int flags);
 int iommu_unmap_page(struct domain *d, unsigned long gfn);
-
+void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int present);
+void iommu_set_pgd(struct domain *d);
 void iommu_domain_teardown(struct domain *d);
 int hvm_do_IRQ_dpci(struct domain *d, unsigned int irq);
 int dpci_ioport_intercept(ioreq_t *p);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
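
The compatibility check in vtd_ept_page_compatible() above boils down to
comparing one bit per superpage size across two capability words.  A minimal
standalone sketch of that decoding (bit positions as in the patch; register
values are made up, and the raw capability word is read directly rather than
via the cpu_has_vmx_ept_* macros):

    /* cc -o caps caps.c && ./caps */
    #include <stdio.h>
    #include <stdint.h>

    #define cap_sps_2mb(c)  (((c) >> 34) & 1)  /* VT-d CAP bit 34     */
    #define cap_sps_1gb(c)  (((c) >> 35) & 1)  /* VT-d CAP bit 35     */
    #define ept_has_2mb(c)  (((c) >> 16) & 1)  /* EPT/VPID cap bit 16 */
    #define ept_has_1gb(c)  (((c) >> 17) & 1)  /* EPT/VPID cap bit 17 */

    int main(void)
    {
        uint64_t vtd_cap = (1ULL << 34) | (1ULL << 35); /* 2MB + 1GB SPS */
        uint64_t ept_cap = (1ULL << 16);                /* 2MB only      */

        int ok = ept_has_2mb(ept_cap) == cap_sps_2mb(vtd_cap) &&
                 ept_has_1gb(ept_cap) == cap_sps_1gb(vtd_cap);
        printf("page tables shareable: %d\n", ok);      /* prints 0 */
        return 0;
    }

Sharing is refused here because the sample VT-d unit advertises 1GB
superpages while the sample EPT capability does not.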


* Re: [RFC][PATCH][VTD] EPT/VT-d page table sharing
  2010-12-08 18:38             ` [RFC][PATCH][VTD] EPT/VT-d page table sharing Kay, Allen M
@ 2010-12-09 10:12               ` Tim Deegan
  2010-12-11  2:01                 ` Kay, Allen M
  0 siblings, 1 reply; 12+ messages in thread
From: Tim Deegan @ 2010-12-09 10:12 UTC (permalink / raw)
  To: Kay, Allen M; +Cc: Wei Wang2, Han, Weidong, xen-devel, Keir Fraser

At 18:38 +0000 on 08 Dec (1291833518), Kay, Allen M wrote:
> Attached is the EPT/VT-d page table sharing patch I mentioned earlier.  The basic idea is to leverage the 2MB and 1GB page size support in EPT by having VT-d use the same page tables as EPT.  When the EPT page table changes, the VT-d IOTLB cache is flushed.
> 
> We are still testing this patch further but would appreciate feedback.

Looks good!  Two minor nits:
 - the path through iommu_set_pgd where the feature gets turned on is a
   bit confusing.  It would be cleaner to have the global flag enabled 
   as part of the general machine init.
 - while you're moving the p2m bits into avail2, could you rename the
   avail2 field to p2mt or similar?  It would make the rest of the code
   clearer.

Cheers,

Tim.

> Signed-off-by: Weidong Han <weidong.han@intel.com>
> Signed-off-by: Allen Kay <allen.m.kay@intel.com>



-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Xen Platform Team
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)


* RE: [RFC][PATCH][VTD] EPT/VT-d page table sharing
  2010-12-09 10:12               ` Tim Deegan
@ 2010-12-11  2:01                 ` Kay, Allen M
  2010-12-21 17:08                   ` Ian Campbell
  0 siblings, 1 reply; 12+ messages in thread
From: Kay, Allen M @ 2010-12-11  2:01 UTC (permalink / raw)
  To: Tim Deegan; +Cc: Wei Wang2, Han, Weidong, xen-devel, Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 1647 bytes --]

Hi Tim,

Thanks for your feedback.  In the attached updated patch, I have:

   1) moved setting of the iommu_hap_pt_share flag to init_vtd_hw(), which initializes the other VT-d specific features.  Let me know if you know of a better place.
   2) renamed the avail2 field to sa_p2mt.

Let me know if you have any additional comments.
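
For reference, the rename does not move any bits: sa_p2mt still occupies
bits 61:52, where avail2 sat in the earlier version.  A standalone layout
check, illustrative only and not part of the patch (assumes the compiler
packs u64 bitfields LSB-first, as gcc does on x86):

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors the ept_entry_t bitfield from the patch. */
    typedef union {
        struct {
            uint64_t r        :  1, w : 1, x : 1,
                     emt      :  3, ipat : 1, sp : 1,
                     rsvd1    :  2, avail1 : 1, rsvd2_snp : 1,
                     mfn      : 40,
                     sa_p2mt  : 10,   /* was avail2 */
                     rsvd3_tm :  1, avail3 : 1;
        };
        uint64_t epte;
    } ept_entry_t;

    int main(void)
    {
        ept_entry_t e = { .epte = 0 };
        e.sa_p2mt = 0x3ff;    /* set all ten type bits */
        printf("epte = %#llx\n", (unsigned long long)e.epte);
        return 0;             /* prints epte = 0x3ff0000000000000 */
    }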

Allen

-----Original Message-----
From: Tim Deegan [mailto:Tim.Deegan@citrix.com] 
Sent: Thursday, December 09, 2010 2:13 AM
To: Kay, Allen M
Cc: Wei Wang2; Keir Fraser; Han, Weidong; xen-devel@lists.xensource.com
Subject: Re: [RFC][PATCH][VTD] EPT/VT-d page table sharing

At 18:38 +0000 on 08 Dec (1291833518), Kay, Allen M wrote:
> Attached is the EPT/VT-d page table sharing patch I mentioned earlier.  The basic idea is to leverage the 2MB and 1GB page size support in EPT by having VT-d use the same page tables as EPT.  When the EPT page table changes, the VT-d IOTLB cache is flushed.
> 
> We are still testing this patch further but would appreciate feedback.

Looks good!  Two minor nits:
 - the path through iommu_set_pgd where the feature gets turned on is a
   bit confusing.  It would be cleaner to have the global flag enabled 
   as part of the general machine init.
 - while you're moving the p2m bits into avail2, could you rename the
   avail2 field to p2mt or similar?  It would make the rest of the code
   clearer.

Cheers,

Tim.

> Signed-off-by: Weidong Han <weidong.han@intel.com>
> Signed-off-by: Allen Kay <allen.m.kay@intel.com>



-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Xen Platform Team
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)

[-- Attachment #2: share1210.patch --]
[-- Type: application/octet-stream, Size: 18876 bytes --]

diff -r bfd13358b8bf xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Sat Nov 27 12:08:57 2010 -0800
@@ -44,7 +44,7 @@
     p2m_lock(p2m);
 
     /* Check to make sure this is still PoD */
-    if ( entry->avail1 != p2m_populate_on_demand )
+    if ( entry->sa_p2mt != p2m_populate_on_demand )
     {
         p2m_unlock(p2m);
         return 0;
@@ -110,13 +110,8 @@
     if ( pg == NULL )
         return 0;
 
-    ept_entry->emt = 0;
-    ept_entry->ipat = 0;
-    ept_entry->sp = 0;
-    ept_entry->avail1 = 0;
+    ept_entry->epte = 0;
     ept_entry->mfn = page_to_mfn(pg);
-    ept_entry->avail2 = 0;
-
     ept_entry->r = ept_entry->w = ept_entry->x = 1;
 
     return 1;
@@ -166,14 +161,15 @@
     {
         ept_entry_t *epte = table + i;
 
+        epte->epte = 0;
         epte->emt = ept_entry->emt;
         epte->ipat = ept_entry->ipat;
         epte->sp = (level > 1) ? 1 : 0;
-        epte->avail1 = ept_entry->avail1;
-        epte->avail2 = 0;
+        epte->sa_p2mt = ept_entry->sa_p2mt;
         epte->mfn = ept_entry->mfn + i * trunk;
+        epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
 
-        ept_p2m_type_to_flags(epte, epte->avail1);
+        ept_p2m_type_to_flags(epte, epte->sa_p2mt);
 
         if ( (level - 1) == target )
             continue;
@@ -225,7 +221,7 @@
 
     if ( !is_epte_present(ept_entry) )
     {
-        if ( ept_entry->avail1 == p2m_populate_on_demand )
+        if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
             return GUEST_TABLE_POD_PAGE;
 
         if ( read_only )
@@ -254,7 +250,7 @@
 ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
               unsigned int order, p2m_type_t p2mt)
 {
-    ept_entry_t *table, *ept_entry;
+    ept_entry_t *table, *ept_entry = NULL;
     unsigned long gfn_remainder = gfn;
     unsigned long offset = 0;
     u32 index;
@@ -264,6 +260,7 @@
     bool_t direct_mmio = (p2mt == p2m_mmio_direct);
     uint8_t ipat = 0;
     int need_modify_vtd_table = 1;
+    int vtd_pte_present = 0;
     int needs_sync = 1;
     struct domain *d = p2m->domain;
 
@@ -302,6 +299,9 @@
 
     ept_entry = table + index;
 
+    /* If VT-d shares this page table, it needs to know whether the PTE was present */
+    vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0;
+
     /*
      * When we are here, we must be on a leaf ept entry
      * with i == target or i > target.
@@ -318,12 +318,13 @@
         if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
              (p2mt == p2m_ram_paging_in_start) )
         {
+            ept_entry->epte = 0;
             ept_entry->emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
                                                 direct_mmio);
             ept_entry->ipat = ipat;
             ept_entry->sp = order ? 1 : 0;
-            ept_entry->avail1 = p2mt;
-            ept_entry->avail2 = 0;
+            ept_entry->sa_p2mt = p2mt;
+            ept_entry->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
 
             if ( ept_entry->mfn == mfn_x(mfn) )
                 need_modify_vtd_table = 0;
@@ -365,11 +366,12 @@
 
         ept_entry = table + index;
 
+        ept_entry->epte = 0;
         ept_entry->emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
         ept_entry->ipat = ipat;
         ept_entry->sp = i ? 1 : 0;
-        ept_entry->avail1 = p2mt;
-        ept_entry->avail2 = 0;
+        ept_entry->sa_p2mt = p2mt;
+        ept_entry->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
 
         if ( ept_entry->mfn == mfn_x(mfn) )
              need_modify_vtd_table = 0;
@@ -393,31 +395,35 @@
     if ( needs_sync )
         ept_sync_domain(p2m->domain);
 
-    /* Now the p2m table is not shared with vt-d page table */
     if ( rv && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table )
     {
-        if ( p2mt == p2m_ram_rw )
-        {
-            if ( order == EPT_TABLE_ORDER )
-            {
-                for ( i = 0; i < (1 << order); i++ )
-                    iommu_map_page(
-                        p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
-                        IOMMUF_readable | IOMMUF_writable);
-            }
-            else if ( !order )
-                iommu_map_page(
-                    p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
-        }
+        if ( iommu_hap_pt_share )
+            iommu_pte_flush(d, gfn, (u64*)ept_entry, vtd_pte_present);
         else
         {
-            if ( order == EPT_TABLE_ORDER )
+            if ( p2mt == p2m_ram_rw )
             {
-                for ( i = 0; i < (1 << order); i++ )
-                    iommu_unmap_page(p2m->domain, gfn - offset + i);
+                if ( order == EPT_TABLE_ORDER )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_map_page(
+                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
+                            IOMMUF_readable | IOMMUF_writable);
+                }
+                else if ( !order )
+                    iommu_map_page(
+                        p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
             }
-            else if ( !order )
-                iommu_unmap_page(p2m->domain, gfn);
+            else
+            {
+                if ( order == EPT_TABLE_ORDER )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_unmap_page(p2m->domain, gfn - offset + i);
+                }
+                else if ( !order )
+                    iommu_unmap_page(p2m->domain, gfn);
+            }
         }
     }
 
@@ -483,7 +489,7 @@
     index = gfn_remainder >> (i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
-    if ( ept_entry->avail1 == p2m_populate_on_demand )
+    if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
     {
         if ( q == p2m_query )
         {
@@ -499,9 +505,9 @@
     }
 
 
-    if ( ept_entry->avail1 != p2m_invalid )
+    if ( ept_entry->sa_p2mt != p2m_invalid )
     {
-        *t = ept_entry->avail1;
+        *t = ept_entry->sa_p2mt;
         mfn = _mfn(ept_entry->mfn);
         if ( i )
         {
@@ -654,7 +660,7 @@
         uint64_t trunk = 0;
 
         e = ept_get_entry_content(p2m, gfn, &level);
-        if ( !p2m_has_emt(e.avail1) )
+        if ( !p2m_has_emt(e.sa_p2mt) )
             continue;
 
         order = 0;
@@ -673,8 +679,8 @@
                      */
                     order = level * EPT_TABLE_ORDER;
                     if ( need_modify_ept_entry(p2m, gfn, mfn, 
-                          e.ipat, e.emt, e.avail1) )
-                        ept_set_entry(p2m, gfn, mfn, order, e.avail1);
+                          e.ipat, e.emt, e.sa_p2mt) )
+                        ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt);
                     gfn += trunk;
                     break;
                 }
@@ -683,8 +689,8 @@
         }
         else /* gfn assigned with 4k */
         {
-            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.avail1) )
-                ept_set_entry(p2m, gfn, mfn, order, e.avail1);
+            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.sa_p2mt) )
+                ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt);
         }
     }
     p2m_unlock(p2m);
@@ -710,10 +716,10 @@
                                        ept_page_level - 1, ot, nt);
         else
         {
-            if ( epte[i].avail1 != ot )
+            if ( epte[i].sa_p2mt != ot )
                 continue;
 
-            epte[i].avail1 = nt;
+            epte[i].sa_p2mt = nt;
             ept_p2m_type_to_flags(epte + i, nt);
         }
     }
@@ -787,9 +793,9 @@
 
             index = gfn_remainder >> order;
             ept_entry = table + index;
-            if ( ept_entry->avail1 != p2m_invalid )
+            if ( ept_entry->sa_p2mt != p2m_invalid )
             {
-                ( ept_entry->avail1 == p2m_populate_on_demand ) ? 
+                ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ? 
                 ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
                 ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
 
diff -r bfd13358b8bf xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/arch/x86/mm/p2m.c	Sat Nov 27 12:08:57 2010 -0800
@@ -1800,6 +1800,7 @@
     struct page_info *page, *p2m_top;
     unsigned int page_count = 0;
     unsigned long gfn = -1UL;
+    struct domain *d = p2m->domain;
 
     p2m_lock(p2m);
 
@@ -1828,6 +1829,10 @@
 
     p2m->phys_table = pagetable_from_mfn(page_to_mfn(p2m_top));
 
+    if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled &&
+         (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
+        iommu_set_pgd(d);
+
     P2M_PRINTK("populating p2m table\n");
 
     /* Initialise physmap tables for slot zero. Other code assumes this. */
diff -r bfd13358b8bf xen/drivers/passthrough/iommu.c
--- a/xen/drivers/passthrough/iommu.c	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/drivers/passthrough/iommu.c	Sat Nov 27 12:08:57 2010 -0800
@@ -47,6 +47,7 @@
 bool_t __read_mostly iommu_snoop = 1;
 bool_t __read_mostly iommu_qinval = 1;
 bool_t __read_mostly iommu_intremap = 1;
+bool_t __read_mostly iommu_hap_pt_share;
 bool_t __read_mostly amd_iommu_debug;
 bool_t __read_mostly amd_iommu_perdev_intremap;
 
@@ -174,10 +175,11 @@
     if ( has_arch_pdevs(d) && !need_iommu(d) )
     {
         d->need_iommu = 1;
-        rc = iommu_populate_page_table(d);
+        if ( !iommu_hap_pt_share )
+            rc = iommu_populate_page_table(d);
         goto done;
     }
-done:    
+done:
     spin_unlock(&pcidevs_lock);
     return rc;
 }
diff -r bfd13358b8bf xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/drivers/passthrough/vtd/iommu.c	Sat Nov 27 12:08:57 2010 -0800
@@ -33,6 +33,8 @@
 #include <xen/keyhandler.h>
 #include <asm/msi.h>
 #include <asm/irq.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/p2m.h>
 #include <mach_apic.h>
 #include "iommu.h"
 #include "dmar.h"
@@ -43,6 +45,9 @@
 #define nr_ioapics              iosapic_get_nr_iosapics()
 #endif
 
+static int sharept = 1;
+boolean_param("sharept", sharept);
+
 int nr_iommus;
 
 static void setup_dom0_devices(struct domain *d);
@@ -1627,6 +1632,9 @@
     if ( list_empty(&acpi_drhd_units) )
         return;
 
+    if ( iommu_hap_pt_share )
+        return;
+
     spin_lock(&hd->mapping_lock);
     iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
     hd->pgd_maddr = 0;
@@ -1645,6 +1653,10 @@
     int flush_dev_iotlb;
     int iommu_domid;
 
+    /* Do nothing if VT-d shares the EPT page table */
+    if ( iommu_hap_pt_share )
+        return 0;
+
     /* do nothing if dom0 and iommu supports pass thru */
     if ( iommu_passthrough && (d->domain_id == 0) )
         return 0;
@@ -1715,6 +1727,63 @@
     return 0;
 }
 
+void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int present)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu = NULL;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    int flush_dev_iotlb;
+    int iommu_domid;
+
+    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
+            continue;
+
+        flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
+        iommu_domid = domain_iommu_domid(d, iommu);
+        if ( iommu_domid == -1 )
+            continue;
+        if ( iommu_flush_iotlb_psi(iommu, iommu_domid,
+                                   (paddr_t)gfn << PAGE_SHIFT_4K, 1,
+                                   !present, flush_dev_iotlb) )
+            iommu_flush_write_buffer(iommu);
+    }
+}
+
+static int vtd_ept_page_compatible(struct iommu *iommu)
+{
+    u64 cap = iommu->cap;
+
+    if ( ept_has_2mb(cpu_has_vmx_ept_2mb) != cap_sps_2mb(cap) )
+        return 0;
+
+    if ( ept_has_1gb(cpu_has_vmx_ept_1gb) != cap_sps_1gb(cap) )
+        return 0;
+
+    return 1;
+}
+
+/*
+ * Set the VT-d page table root to the EPT table if sharing is allowed.
+ */
+void iommu_set_pgd(struct domain *d)
+{
+    struct hvm_iommu *hd  = domain_hvm_iommu(d);
+    mfn_t pgd_mfn;
+
+    ASSERT( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled );
+
+    if ( !iommu_hap_pt_share )
+        return;
+
+    pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
+    hd->pgd_maddr = pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
+}
+
 static int domain_rmrr_mapped(struct domain *d,
                               struct acpi_rmrr_unit *rmrr)
 {
@@ -1871,6 +1940,9 @@
     unsigned long flags;
     struct irq_cfg *cfg;
 
+    /*
+     * Basic VT-d HW init: set up the VT-d interrupt and clear pending faults.
+     */
     for_each_drhd_unit ( drhd )
     {
         iommu = drhd->iommu;
@@ -1895,6 +1967,9 @@
         spin_unlock_irqrestore(&iommu->register_lock, flags);
     }
 
+    /*
+     * Enable queued invalidation
+     */
     for_each_drhd_unit ( drhd )
     {
         iommu = drhd->iommu;
@@ -1910,6 +1985,9 @@
         }
     }
 
+    /*
+     * Enable interrupt remapping
+     */  
     if ( iommu_intremap )
     {
         int apic;
@@ -1926,7 +2004,6 @@
             }
         }
     }
-
     if ( iommu_intremap )
     {
         for_each_drhd_unit ( drhd )
@@ -1941,6 +2018,11 @@
         }
     }
 
+    /*
+     * Set root entries for each VT-d engine.  After setting the root
+     * entry, we must globally invalidate the context cache and then
+     * globally invalidate the IOTLB.
+     */
     for_each_drhd_unit ( drhd )
     {
         iommu = drhd->iommu;
@@ -1951,12 +2033,27 @@
             return -EIO;
         }
     }
+    iommu_flush_all();
 
     /*
-     * After set root entry, must globally invalidate context cache, and
-     * then globally invalidate IOTLB
+     * Determine whether EPT and VT-d page tables can be shared or not.
      */
-    iommu_flush_all();
+    iommu_hap_pt_share = TRUE;
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        if ( (drhd->iommu->nr_pt_levels != VTD_PAGE_TABLE_LEVEL_4) ||
+              !vtd_ept_page_compatible(drhd->iommu) )
+            iommu_hap_pt_share = FALSE;
+    }
+
+    /* Keep the sharept boot flag as a safe fallback; remove once the feature matures. */
+    if ( !sharept )
+        iommu_hap_pt_share = FALSE;
+
+    gdprintk(XENLOG_INFO VTDPREFIX,
+             "VT-d page table %s with EPT table\n",
+             iommu_hap_pt_share ? "shares" : "does not share");
 
     return 0;
 }
diff -r bfd13358b8bf xen/drivers/passthrough/vtd/iommu.h
--- a/xen/drivers/passthrough/vtd/iommu.h	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/drivers/passthrough/vtd/iommu.h	Sat Nov 27 12:08:57 2010 -0800
@@ -68,15 +68,19 @@
 /*
  * Decoding Capability Register
  */
-#define cap_read_drain(c)    (((c) >> 55) & 1)
-#define cap_write_drain(c)    (((c) >> 54) & 1)
-#define cap_max_amask_val(c)    (((c) >> 48) & 0x3f)
-#define cap_num_fault_regs(c)    ((((c) >> 40) & 0xff) + 1)
+#define cap_read_drain(c)      (((c) >> 55) & 1)
+#define cap_write_drain(c)     (((c) >> 54) & 1)
+#define cap_max_amask_val(c)   (((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)  ((((c) >> 40) & 0xff) + 1)
 #define cap_pgsel_inv(c)       (((c) >> 39) & 1)
 
-#define cap_super_page_val(c)    (((c) >> 34) & 0xf)
+#define cap_super_page_val(c)  (((c) >> 34) & 0xf)
 #define cap_super_offset(c)    (((find_first_bit(&cap_super_page_val(c), 4)) \
-                    * OFFSET_STRIDE) + 21)
+                                 * OFFSET_STRIDE) + 21)
+#define cap_sps_2mb(c)         ((c >> 34) & 1)
+#define cap_sps_1gb(c)         ((c >> 35) & 1)
+#define cap_sps_512gb(c)       ((c >> 36) & 1)
+#define cap_sps_1tb(c)         ((c >> 37) & 1)
 
 #define cap_fault_reg_offset(c)    ((((c) >> 24) & 0x3ff) * 16)
 
diff -r bfd13358b8bf xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h	Sat Nov 27 12:08:57 2010 -0800
@@ -30,15 +30,21 @@
 
 typedef union {
     struct {
-        u64 r       :   1,
-        w           :   1,
-        x           :   1,
-        emt         :   3, /* EPT Memory type */
-        ipat        :   1, /* Ignore PAT memory type */
-        sp          :   1, /* Is this a superpage? */
-        avail1      :   4,
-        mfn         :   40,
-        avail2      :   12;
+        u64 r       :   1,  /* bit 0 - Read permission */
+        w           :   1,  /* bit 1 - Write permission */
+        x           :   1,  /* bit 2 - Execute permission */
+        emt         :   3,  /* bits 5:3 - EPT Memory type */
+        ipat        :   1,  /* bit 6 - Ignore PAT memory type */
+        sp          :   1,  /* bit 7 - Is this a superpage? */
+        rsvd1       :   2,  /* bits 9:8 - Reserved for future use */
+        avail1      :   1,  /* bit 10 - Software available 1 */
+        rsvd2_snp   :   1,  /* bit 11 - Used for VT-d snoop control
+                               in shared EPT/VT-d usage */
+        mfn         :   40, /* bits 51:12 - Machine physical frame number */
+        sa_p2mt     :   10, /* bits 61:52 - Software available, holds p2m type */
+        rsvd3_tm    :   1,  /* bit 62 - Used for VT-d transient-mapping
+                               hint in shared EPT/VT-d usage */
+        avail3      :   1;  /* bit 63 - Software available 3 */
     };
     u64 epte;
 } ept_entry_t;
@@ -208,6 +214,11 @@
 #define cpu_has_vmx_ept_invept_single_context   \
     (vmx_ept_vpid_cap & VMX_EPT_INVEPT_SINGLE_CONTEXT)
 
+#define EPT_2MB_SHIFT     16
+#define EPT_1GB_SHIFT     17
+#define ept_has_2mb(c)    ((c >> EPT_2MB_SHIFT) & 1)
+#define ept_has_1gb(c)    ((c >> EPT_1GB_SHIFT) & 1)
+
 #define INVEPT_SINGLE_CONTEXT   1
 #define INVEPT_ALL_CONTEXT      2
 
diff -r bfd13358b8bf xen/include/xen/iommu.h
--- a/xen/include/xen/iommu.h	Tue Dec 07 18:32:04 2010 +0000
+++ b/xen/include/xen/iommu.h	Sat Nov 27 12:08:57 2010 -0800
@@ -30,6 +30,7 @@
 extern bool_t force_iommu, iommu_verbose;
 extern bool_t iommu_workaround_bios_bug, iommu_passthrough;
 extern bool_t iommu_snoop, iommu_qinval, iommu_intremap;
+extern bool_t iommu_hap_pt_share;
 extern bool_t amd_iommu_debug;
 extern bool_t amd_iommu_perdev_intremap;
 
@@ -85,7 +86,8 @@
 int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                    unsigned int flags);
 int iommu_unmap_page(struct domain *d, unsigned long gfn);
-
+void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int present);
+void iommu_set_pgd(struct domain *d);
 void iommu_domain_teardown(struct domain *d);
 int hvm_do_IRQ_dpci(struct domain *d, unsigned int irq);
 int dpci_ioport_intercept(ioreq_t *p);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
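
A note on what sharing buys in ept_set_entry() above: on the non-shared
path, every 2MB EPT mapping fans out into 512 individual 4K IOMMU mappings
relative to the superpage-aligned base frame, whereas the shared path needs
only a single iommu_pte_flush() call.  A small standalone sketch of the
order arithmetic (the gfn value is made up):

    #include <stdio.h>

    #define EPT_TABLE_ORDER 9   /* 512 x 4K = 2MB, as in the patch */

    int main(void)
    {
        unsigned long gfn = 0x12345;   /* made-up guest frame number */
        unsigned long offset = gfn & ((1UL << EPT_TABLE_ORDER) - 1);

        /* Non-shared path: one iommu_map_page() per 4K frame, each
         * mapping gfn - offset + i for i in [0, 512).              */
        printf("base gfn = %#lx, iommu_map_page calls = %lu\n",
               gfn - offset, 1UL << EPT_TABLE_ORDER);
        return 0;   /* prints base gfn = 0x12200, 512 calls */
    }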


* Re: RE: [RFC][PATCH][VTD] EPT/VT-d page table sharing
  2010-12-11  2:01                 ` Kay, Allen M
@ 2010-12-21 17:08                   ` Ian Campbell
  0 siblings, 0 replies; 12+ messages in thread
From: Ian Campbell @ 2010-12-21 17:08 UTC (permalink / raw)
  To: Kay, Allen M; +Cc: Tim Deegan, Wei Wang2, Keir Fraser, xen-devel, Han, Weidong

On Sat, 2010-12-11 at 02:01 +0000, Kay, Allen M wrote:
> Hi Tim,
> 
> Thanks for your feedback.  In the attached updated patch, I have:
> 
>    1) moved setting of the iommu_hap_pt_share flag to init_vtd_hw(), which initializes the other VT-d specific features.  Let me know if you know of a better place.
>    2) renamed the avail2 field to sa_p2mt.
> 
> Let me know if you have any additional comments.

Hi Allen,

This change appears to be responsible for the recent failures to boot
xen-unstable on Intel hardware, which is stopping the regression test
runs.

The symptom is a hang just before the hypervisor scrubs the pages, e.g.
http://www.chiark.greenend.org.uk/~xensrcts/logs/3470/test-amd64-i386-rhel6hvm-intel/
was successful while
http://www.chiark.greenend.org.uk/~xensrcts/logs/3863/test-amd64-i386-rhel6hvm-intel/
was not.

Booting with "sharept=0" resolves the boot failure.

It's been quite a long time (~2 weeks) since we last had a test case
(not all down to this issue), so unless you have a quick fix I propose
the temporary workaround below.

Thanks,
Ian.

EPT/VT-d: disable page sharing by default.

Currently sharing these page tables causes a hang on boot on some
hardware. Disable by default until this is resolved.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

diff -r ff1b80ccecd9 xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c	Fri Dec 17 16:12:37 2010 +0000
+++ b/xen/drivers/passthrough/vtd/iommu.c	Tue Dec 21 16:24:34 2010 +0000
@@ -45,7 +45,7 @@
 #define nr_ioapics              iosapic_get_nr_iosapics()
 #endif
 
-static int sharept = 1;
+static int sharept = 0;
 boolean_param("sharept", sharept);
 
 int nr_iommus;

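The "sharept" toggle relies on Xen's boolean_param() machinery, which
accepts forms such as "sharept", "sharept=0" and "no-sharept" on the
hypervisor command line.  A rough standalone stand-in for that parsing
(Xen's real parser lives in its common command-line handling code):

    #include <stdio.h>
    #include <string.h>

    /* Returns 1/0 if arg toggles the named option, -1 if unrelated. */
    static int parse_bool_opt(const char *arg, const char *name)
    {
        size_t n = strlen(name);

        if ( !strncmp(arg, "no-", 3) && !strcmp(arg + 3, name) )
            return 0;
        if ( strncmp(arg, name, n) )
            return -1;
        if ( arg[n] == '\0' )
            return 1;                           /* bare "sharept" */
        if ( arg[n] != '=' )
            return -1;
        return strcmp(arg + n + 1, "0") && strcmp(arg + n + 1, "no") &&
               strcmp(arg + n + 1, "off") && strcmp(arg + n + 1, "false");
    }

    int main(void)
    {
        printf("%d %d %d\n",
               parse_bool_opt("sharept=0", "sharept"),
               parse_bool_opt("sharept", "sharept"),
               parse_bool_opt("no-sharept", "sharept")); /* 0 1 0 */
        return 0;
    }
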

end of thread, other threads:[~2010-12-21 17:08 UTC | newest]

Thread overview: 12+ messages
2010-12-03 16:03 [PATCH 4/4] amd iommu: Large io page support - implementation Wei Wang2
2010-12-07  0:47 ` Kay, Allen M
2010-12-07 11:20   ` Wei Wang2
2010-12-07 18:00     ` Keir Fraser
2010-12-07 18:21       ` Kay, Allen M
2010-12-08 10:03       ` Tim Deegan
2010-12-08 13:02         ` Wei Wang2
2010-12-08 13:11           ` Tim Deegan
2010-12-08 18:38             ` [RFC][PATCH][VTD] EPT/VT-d page table sharing Kay, Allen M
2010-12-09 10:12               ` Tim Deegan
2010-12-11  2:01                 ` Kay, Allen M
2010-12-21 17:08                   ` Ian Campbell
