* [PATCH v3 0/2] x86/mm: get/set PoD target related adjustments
@ 2022-01-04  9:39 Jan Beulich
  2022-01-04  9:41 ` [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Jan Beulich @ 2022-01-04  9:39 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, George Dunlap

1: IOMMU/x86: disallow device assignment to PoD guests
2: x86/mm: tidy XENMEM_{get,set}_pod_target handling

Jan




* [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-01-04  9:39 [PATCH v3 0/2] x86/mm: get/set PoD target related adjustments Jan Beulich
@ 2022-01-04  9:41 ` Jan Beulich
  2022-01-27 15:02   ` Ping: " Jan Beulich
                     ` (2 more replies)
  2022-01-04  9:41 ` [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling Jan Beulich
  2022-04-11  9:47 ` [PATCH v4] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
  2 siblings, 3 replies; 20+ messages in thread
From: Jan Beulich @ 2022-01-04  9:41 UTC (permalink / raw)
  To: xen-devel
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné,
	George Dunlap, Paul Durrant, Tamas K Lengyel, Petre Pircalabu,
	Alexandru Isaila

While it is okay for IOMMU page tables to get set up for guests starting
in PoD mode, actual device assignment may only occur once all PoD
entries have been removed from the P2M. So far this was enforced only
for boot-time assignment, and only in the tool stack.

Also use the new function to replace p2m_pod_entry_count(): Its unlocked
access to p2m->pod.entry_count wasn't really okay (irrespective of the
result being stale by the time the caller gets to see it).

To allow the tool stack to see a consistent snapshot of PoD state, move
the tail of XENMEM_{get,set}_pod_target handling into a function, adding
proper locking there.

In libxl take the liberty to use the new local variable r also for a
pre-existing call into libxc.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
If p2m->pod.entry_count == p2m->pod.count it is in principle possible to
permit device assignment by actively resolving all remaining PoD entries.
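
For illustration only, a sketch of what such active resolving might look
like, assuming a hypothetical helper resolve_pod_entry() (no such
function exists in the tree) which backs a single outstanding PoD entry
with a page taken from the cache:

    /* Hypothetical sketch, not part of this patch. */
    static int p2m_pod_resolve_all(struct domain *d)
    {
        struct p2m_domain *p2m = p2m_get_hostp2m(d);
        int rc = 0;

        pod_lock(p2m);

        /* Only feasible when every entry can be backed from the cache. */
        if ( p2m->pod.entry_count != p2m->pod.count )
            rc = -EBUSY;

        while ( !rc && p2m->pod.entry_count )
            rc = resolve_pod_entry(p2m); /* hypothetical helper */

        pod_unlock(p2m);

        return rc;
    }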

Initially I thought this was introduced by f89f555827a6 ("remove late
(on-demand) construction of IOMMU page tables"), but without
arch_iommu_use_permitted() checking for PoD I think the issue has been
there before that.
---
v3: In p2m_pod_set_mem_target() move check down.
v2: New.

--- a/tools/libs/light/libxl_pci.c
+++ b/tools/libs/light/libxl_pci.c
@@ -1619,8 +1619,13 @@ void libxl__device_pci_add(libxl__egc *e
     pas->callback = device_pci_add_stubdom_done;
 
     if (libxl__domain_type(gc, domid) == LIBXL_DOMAIN_TYPE_HVM) {
-        rc = xc_test_assign_device(ctx->xch, domid, pci_encode_bdf(pci));
-        if (rc) {
+        int r;
+        uint64_t cache, ents;
+
+        rc = ERROR_FAIL;
+
+        r = xc_test_assign_device(ctx->xch, domid, pci_encode_bdf(pci));
+        if (r) {
             LOGD(ERROR, domid,
                  "PCI device %04x:%02x:%02x.%u %s?",
                  pci->domain, pci->bus, pci->dev, pci->func,
@@ -1628,6 +1633,22 @@ void libxl__device_pci_add(libxl__egc *e
                  : "already assigned to a different guest");
             goto out;
         }
+
+        r = xc_domain_get_pod_target(ctx->xch, domid, NULL, &cache, &ents);
+        if (r) {
+            LOGED(ERROR, domid, "Cannot determine PoD status");
+            goto out;
+        }
+        /*
+         * In principle it is sufficient for the domain to have ballooned down
+         * enough such that ents <= cache.  But any remaining entries would
+         * need resolving first.  Until such time when this gets effected,
+         * refuse assignment as long as any entries are left.
+         */
+        if (ents /* > cache */) {
+            LOGD(ERROR, domid, "Cannot assign device with PoD still active");
+            goto out;
+        }
     }
 
     rc = libxl__device_pci_setdefault(gc, domid, pci, !starting);
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -20,6 +20,7 @@
  */
 
 #include <xen/event.h>
+#include <xen/iocap.h>
 #include <xen/ioreq.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
@@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
 
     ASSERT( pod_target >= p2m->pod.count );
 
-    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
+        ret = -ENOTEMPTY;
+    else
+        ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
 
 out:
     pod_unlock(p2m);
@@ -367,6 +371,23 @@ out:
     return ret;
 }
 
+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    ASSERT(is_hvm_domain(d));
+
+    pod_lock(p2m);
+    lock_page_alloc(p2m);
+
+    target->tot_pages       = domain_tot_pages(d);
+    target->pod_cache_pages = p2m->pod.count;
+    target->pod_entries     = p2m->pod.entry_count;
+
+    unlock_page_alloc(p2m);
+    pod_unlock(p2m);
+}
+
 int p2m_pod_empty_cache(struct domain *d)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
@@ -1384,6 +1405,9 @@ guest_physmap_mark_populate_on_demand(st
     if ( !paging_mode_translate(d) )
         return -EINVAL;
 
+    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
+        return -ENOTEMPTY;
+
     do {
         rc = mark_populate_on_demand(d, gfn, chunk_order);
 
@@ -1405,3 +1429,20 @@ void p2m_pod_init(struct p2m_domain *p2m
     for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
         p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN);
 }
+
+bool p2m_pod_active(const struct domain *d)
+{
+    struct p2m_domain *p2m;
+    bool res;
+
+    if ( !is_hvm_domain(d) )
+        return false;
+
+    p2m = p2m_get_hostp2m(d);
+
+    pod_lock(p2m);
+    res = p2m->pod.entry_count | p2m->pod.count;
+    pod_unlock(p2m);
+
+    return res;
+}
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4778,7 +4778,6 @@ long arch_memory_op(unsigned long cmd, X
     {
         xen_pod_target_t target;
         struct domain *d;
-        struct p2m_domain *p2m;
 
         if ( copy_from_guest(&target, arg, 1) )
             return -EFAULT;
@@ -4787,7 +4786,9 @@ long arch_memory_op(unsigned long cmd, X
         if ( d == NULL )
             return -ESRCH;
 
-        if ( cmd == XENMEM_set_pod_target )
+        if ( !is_hvm_domain(d) )
+            rc = -EINVAL;
+        else if ( cmd == XENMEM_set_pod_target )
             rc = xsm_set_pod_target(XSM_PRIV, d);
         else
             rc = xsm_get_pod_target(XSM_PRIV, d);
@@ -4813,10 +4814,7 @@ long arch_memory_op(unsigned long cmd, X
         }
         else if ( rc >= 0 )
         {
-            p2m = p2m_get_hostp2m(d);
-            target.tot_pages       = domain_tot_pages(d);
-            target.pod_cache_pages = p2m->pod.count;
-            target.pod_entries     = p2m->pod.entry_count;
+            p2m_pod_get_mem_target(d, &target);
 
             if ( __copy_to_guest(arg, &target, 1) )
             {
--- a/xen/common/vm_event.c
+++ b/xen/common/vm_event.c
@@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
 
             rc = -EXDEV;
             /* Disallow paging in a PoD guest */
-            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
+            if ( p2m_pod_active(d) )
                 break;
 
             /* domain_pause() not required here, see XSA-99 */
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -454,11 +454,12 @@ bool arch_iommu_use_permitted(const stru
 {
     /*
      * Prevent device assign if mem paging, mem sharing or log-dirty
-     * have been enabled for this domain.
+     * have been enabled for this domain, or if PoD is still in active use.
      */
     return d == dom_io ||
            (likely(!mem_sharing_enabled(d)) &&
             likely(!mem_paging_enabled(d)) &&
+            likely(!p2m_pod_active(d)) &&
             likely(!p2m_get_hostp2m(d)->global_logdirty));
 }
 
--- a/xen/arch/x86/include/asm/p2m.h
+++ b/xen/arch/x86/include/asm/p2m.h
@@ -661,6 +661,12 @@ int p2m_pod_empty_cache(struct domain *d
  * domain matches target */
 int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
 
+/* Obtain a consistent snapshot of PoD related domain state. */
+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target);
+
+/* Check whether PoD is (still) active in a domain. */
+bool p2m_pod_active(const struct domain *d);
+
 /* Scan pod cache when offline/broken page triggered */
 int
 p2m_pod_offline_or_broken_hit(struct page_info *p);
@@ -669,11 +675,6 @@ p2m_pod_offline_or_broken_hit(struct pag
 void
 p2m_pod_offline_or_broken_replace(struct page_info *p);
 
-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
-{
-    return p2m->pod.entry_count;
-}
-
 void p2m_pod_init(struct p2m_domain *p2m);
 
 #else
@@ -689,6 +690,11 @@ static inline int p2m_pod_empty_cache(st
     return 0;
 }
 
+static inline bool p2m_pod_active(const struct domain *d)
+{
+    return false;
+}
+
 static inline int p2m_pod_offline_or_broken_hit(struct page_info *p)
 {
     return 0;
@@ -699,11 +705,6 @@ static inline void p2m_pod_offline_or_br
     ASSERT_UNREACHABLE();
 }
 
-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
-{
-    return 0;
-}
-
 static inline void p2m_pod_init(struct p2m_domain *p2m) {}
 
 #endif
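
As an aside, a self-contained sketch (not part of the patch) of how a
toolstack component would consume the now properly locked snapshot via
libxc; the domid handling is purely illustrative:

    /* Build against libxenctrl, e.g. gcc query-pod.c -lxenctrl. */
    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <xenctrl.h>

    int main(int argc, char *argv[])
    {
        xc_interface *xch = xc_interface_open(NULL, NULL, 0);
        uint32_t domid = argc > 1 ? atoi(argv[1]) : 1;
        uint64_t tot, cache, ents;
        int rc = 1;

        if ( !xch )
            return 1;

        if ( xc_domain_get_pod_target(xch, domid, &tot, &cache, &ents) )
            fprintf(stderr, "cannot query PoD state of domain %" PRIu32 "\n",
                    domid);
        else
        {
            printf("tot=%" PRIu64 " cache=%" PRIu64 " entries=%" PRIu64 "\n",
                   tot, cache, ents);
            rc = 0;
        }

        xc_interface_close(xch);

        return rc;
    }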




* [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling
  2022-01-04  9:39 [PATCH v3 0/2] x86/mm: get/set PoD target related adjustments Jan Beulich
  2022-01-04  9:41 ` [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
@ 2022-01-04  9:41 ` Jan Beulich
  2022-01-27 15:03   ` Ping: " Jan Beulich
  2022-02-02 15:14   ` Roger Pau Monné
  2022-04-11  9:47 ` [PATCH v4] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
  2 siblings, 2 replies; 20+ messages in thread
From: Jan Beulich @ 2022-01-04  9:41 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, George Dunlap

Do away with the "pod_target_out_unlock" label. In particular by folding
if()-s, the logic can be expressed with less code (and no goto-s) this
way.

Limit scope of "p2m", constifying it at the same time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Re-base over new earlier patch.

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4789,23 +4789,17 @@ long arch_memory_op(unsigned long cmd, X
         if ( !is_hvm_domain(d) )
             rc = -EINVAL;
         else if ( cmd == XENMEM_set_pod_target )
-            rc = xsm_set_pod_target(XSM_PRIV, d);
-        else
-            rc = xsm_get_pod_target(XSM_PRIV, d);
-
-        if ( rc != 0 )
-            goto pod_target_out_unlock;
-
-        if ( cmd == XENMEM_set_pod_target )
         {
-            if ( target.target_pages > d->max_pages )
-            {
+            rc = xsm_set_pod_target(XSM_PRIV, d);
+            if ( rc )
+                ASSERT(rc < 0);
+            else if ( target.target_pages > d->max_pages )
                 rc = -EINVAL;
-                goto pod_target_out_unlock;
-            }
-
-            rc = p2m_pod_set_mem_target(d, target.target_pages);
+            else
+                rc = p2m_pod_set_mem_target(d, target.target_pages);
         }
+        else
+            rc = xsm_get_pod_target(XSM_PRIV, d);
 
         if ( rc == -ERESTART )
         {
@@ -4817,13 +4811,9 @@ long arch_memory_op(unsigned long cmd, X
             p2m_pod_get_mem_target(d, &target);
 
             if ( __copy_to_guest(arg, &target, 1) )
-            {
-                rc= -EFAULT;
-                goto pod_target_out_unlock;
-            }
+                rc = -EFAULT;
         }
 
-    pod_target_out_unlock:
         rcu_unlock_domain(d);
         return rc;
     }
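
For completeness, a hedged sketch of the toolstack-side counterpart of
the tidied handler; an open xch handle, a domid and a target_pages value
are assumed to exist in the caller:

    /* Sketch only, minimal error handling.  With the earlier patch in
     * place this now fails cleanly for non-HVM domains rather than
     * operating on a meaningless p2m. */
    uint64_t tot, cache, ents;
    int rc = xc_domain_set_pod_target(xch, domid, target_pages,
                                      &tot, &cache, &ents);

    if ( rc )
        fprintf(stderr, "setting PoD target failed: %d\n", rc);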




* Ping: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-01-04  9:41 ` [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
@ 2022-01-27 15:02   ` Jan Beulich
  2022-01-28 16:07   ` Anthony PERARD
  2022-02-02 16:13   ` Roger Pau Monné
  2 siblings, 0 replies; 20+ messages in thread
From: Jan Beulich @ 2022-01-27 15:02 UTC (permalink / raw)
  To: Wei Liu, Anthony Perard, Andrew Cooper, Tamas K Lengyel, Paul Durrant
  Cc: Roger Pau Monné,
	George Dunlap, Petre Pircalabu, Alexandru Isaila, xen-devel

On 04.01.2022 10:41, Jan Beulich wrote:
> While it is okay for IOMMU page tables to get set up for guests starting
> in PoD mode, actual device assignment may only occur once all PoD
> entries have been removed from the P2M. So far this was enforced only
> for boot-time assignment, and only in the tool stack.
> 
> Also use the new function to replace p2m_pod_entry_count(): Its unlocked
> access to p2m->pod.entry_count wasn't really okay (irrespective of the
> result being stale by the time the caller gets to see it).
> 
> To allow the tool stack to see a consistent snapshot of PoD state, move
> the tail of XENMEM_{get,set}_pod_target handling into a function, adding
> proper locking there.
> 
> In libxl take the liberty to use the new local variable r also for a
> pre-existing call into libxc.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Ping? (Anthony, I realize you weren't Cc-ed on the original submission.)

Jan

> [...]




* Ping: [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling
  2022-01-04  9:41 ` [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling Jan Beulich
@ 2022-01-27 15:03   ` Jan Beulich
  2022-02-02 15:14   ` Roger Pau Monné
  1 sibling, 0 replies; 20+ messages in thread
From: Jan Beulich @ 2022-01-27 15:03 UTC (permalink / raw)
  To: Andrew Cooper, Roger Pau Monné; +Cc: Wei Liu, George Dunlap, xen-devel

On 04.01.2022 10:41, Jan Beulich wrote:
> Do away with the "pod_target_out_unlock" label. In particular by folding
> if()-s, the logic can be expressed with less code (and no goto-s) this
> way.
> 
> Limit scope of "p2m", constifying it at the same time.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Ping?

Jan

> [...]




* Re: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-01-04  9:41 ` [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
  2022-01-27 15:02   ` Ping: " Jan Beulich
@ 2022-01-28 16:07   ` Anthony PERARD
  2022-02-02 16:13   ` Roger Pau Monné
  2 siblings, 0 replies; 20+ messages in thread
From: Anthony PERARD @ 2022-01-28 16:07 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Andrew Cooper, Wei Liu, Roger Pau Monné,
	George Dunlap, Paul Durrant, Tamas K Lengyel, Petre Pircalabu,
	Alexandru Isaila

On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> While it is okay for IOMMU page tables to get set up for guests starting
> in PoD mode, actual device assignment may only occur once all PoD
> entries have been removed from the P2M. So far this was enforced only
> for boot-time assignment, and only in the tool stack.
> 
> Also use the new function to replace p2m_pod_entry_count(): Its unlocked
> access to p2m->pod.entry_count wasn't really okay (irrespective of the
> result being stale by the time the caller gets to see it).
> 
> To allow the tool stack to see a consistent snapshot of PoD state, move
> the tail of XENMEM_{get,set}_pod_target handling into a function, adding
> proper locking there.
> 
> In libxl take the liberty to use the new local variable r also for a
> pre-existing call into libxc.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Anthony PERARD <anthony.perard@citrix.com>

Thanks,

-- 
Anthony PERARD



* Re: [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling
  2022-01-04  9:41 ` [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling Jan Beulich
  2022-01-27 15:03   ` Ping: " Jan Beulich
@ 2022-02-02 15:14   ` Roger Pau Monné
  2022-02-02 15:29     ` Jan Beulich
  1 sibling, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2022-02-02 15:14 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap

On Tue, Jan 04, 2022 at 10:41:53AM +0100, Jan Beulich wrote:
> Do away with the "pod_target_out_unlock" label. In particular by folding
> if()-s, the logic can be expressed with less code (and no goto-s) this
> way.
> 
> Limit scope of "p2m", constifying it at the same time.

Is this stale? I cannot find any reference to a p2m variable in the
chunks below.

> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Code LGTM, but the commit message likely needs that last sentence
dropped, or the block using p2m needs to be adjusted.

Thanks, Roger.



* Re: [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling
  2022-02-02 15:14   ` Roger Pau Monné
@ 2022-02-02 15:29     ` Jan Beulich
  2022-02-04  9:28       ` Roger Pau Monné
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2022-02-02 15:29 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap

On 02.02.2022 16:14, Roger Pau Monné wrote:
> On Tue, Jan 04, 2022 at 10:41:53AM +0100, Jan Beulich wrote:
>> Do away with the "pod_target_out_unlock" label. In particular by folding
>> if()-s, the logic can be expressed with less code (and no goto-s) this
>> way.
>>
>> Limit scope of "p2m", constifying it at the same time.
> 
> Is this stale? I cannot find any reference to a p2m variable in the
> chunks below.

Indeed it is, leftover from rebasing over the introduction of
p2m_pod_get_mem_target() in what is now patch 1. Dropped.

Jan




* Re: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-01-04  9:41 ` [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
  2022-01-27 15:02   ` Ping: " Jan Beulich
  2022-01-28 16:07   ` Anthony PERARD
@ 2022-02-02 16:13   ` Roger Pau Monné
  2022-02-03  8:31     ` Jan Beulich
  2 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2022-02-02 16:13 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> While it is okay for IOMMU page tables to get set up for guests starting
> in PoD mode, actual device assignment may only occur once all PoD
> entries have been removed from the P2M. So far this was enforced only
> for boot-time assignment, and only in the tool stack.
> 
> Also use the new function to replace p2m_pod_entry_count(): Its unlocked
> access to p2m->pod.entry_count wasn't really okay (irrespective of the
> result being stale by the time the caller gets to see it).
> 
> To allow the tool stack to see a consistent snapshot of PoD state, move
> the tail of XENMEM_{get,set}_pod_target handling into a function, adding
> proper locking there.
> 
> In libxl take the liberty to use the new local variable r also for a
> pre-existing call into libxc.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> [...]
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -20,6 +20,7 @@
>   */
>  
>  #include <xen/event.h>
> +#include <xen/iocap.h>
>  #include <xen/ioreq.h>
>  #include <xen/mm.h>
>  #include <xen/sched.h>
> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
>  
>      ASSERT( pod_target >= p2m->pod.count );
>  
> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )

Is it possible to have cache flush allowed without any PCI device
assigned? AFAICT the iomem/ioport_caps would only get set up when there
are devices passed through?

TBH I would be fine if we just say that PoD cannot be used in
conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.

I understand it's technically possible for PoD to be used together
with a domain that will later get a device passed through once PoD is
no longer in use, but I doubt there's much value in supporting that
use case, and I fear we might be introducing corner cases that could
create issues in the future. Overall I think it would be safer to just
disable PoD in conjunction with an IOMMU.
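
A minimal sketch of that alternative check (untested; it assumes the
existing is_iommu_enabled() predicate is the right gate):

    /* Refuse PoD operations outright for any domain with an IOMMU. */
    if ( is_iommu_enabled(d) )
        ret = -ENOTEMPTY;
    else
        ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);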

> +        ret = -ENOTEMPTY;
> +    else
> +        ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>  
>  out:
>      pod_unlock(p2m);
> @@ -367,6 +371,23 @@ out:
>      return ret;
>  }
>  
> [...]
> --- a/xen/common/vm_event.c
> +++ b/xen/common/vm_event.c
> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
>  
>              rc = -EXDEV;
>              /* Disallow paging in a PoD guest */
> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
> +            if ( p2m_pod_active(d) )

Isn't it fine to just check for entry_count like you suggest in the
change to libxl? This is what p2m_pod_entry_count actually does
(rather than entry_count | count).

Thanks, Roger.



* Re: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-02-02 16:13   ` Roger Pau Monné
@ 2022-02-03  8:31     ` Jan Beulich
  2022-02-03  9:04       ` Roger Pau Monné
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2022-02-03  8:31 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On 02.02.2022 17:13, Roger Pau Monné wrote:
> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
>> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
>>  
>>      ASSERT( pod_target >= p2m->pod.count );
>>  
>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> 
> Is it possible to have cache flush allowed without any PCI device
> assigned? AFAICT the iomem/ioport_caps would only get set up when there
> are devices passed through?

One can assign MMIO or ports to a guest the raw way. That's not
secure, but functionally explicitly permitted.
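
For instance, a guest config fragment along these lines (values made up
for illustration) grants iomem/ioport capabilities, and hence makes
cache_flush_permitted() true, without any PCI device being assigned:

    ioports = [ "2f8-2ff" ]   # raw I/O port range
    iomem   = [ "fed00,1" ]   # one page of raw MMIO (hex page frame)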

> TBH I would be fine if we just say that PoD cannot be used in
> conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
> 
> I understand it's technically possible for PoD to be used together
> with a domain that will later get a device passed through once PoD is
> no longer in use, but I doubt there's much value in supporting that
> use case, and I fear we might be introducing corner cases that could
> create issues in the future. Overall I think it would be safer to just
> disable PoD in conjunction with an IOMMU.

I consider it wrong to put in place such a restriction, but I could
perhaps accept you and Andrew thinking this way if this was the only
aspect playing into here. However, this would then want an equivalent
tools side check, and while hunting down where to make the change as
done here, I wasn't able to figure out where that alternative
adjustment would need doing. Hence I would possibly(!) buy into this
only if someone else took care of doing so properly in the tool stack
(including the emission of a sensible error message).

Finally this still leaves out the "raw MMIO / ports" case mentioned
above.

>> --- a/xen/common/vm_event.c
>> +++ b/xen/common/vm_event.c
>> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
>>  
>>              rc = -EXDEV;
>>              /* Disallow paging in a PoD guest */
>> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
>> +            if ( p2m_pod_active(d) )
> 
> Isn't it fine to just check for entry_count like you suggest in the
> change to libxl?

I didn't think it would be, but I'm not entirely sure: If paging was
enabled before a guest actually starts, it wouldn't have any entries
but still be a PoD guest if it has a non-empty cache. The VM event
folks may be able to clarify this either way. But ...

> This is what p2m_pod_entry_count actually does (rather than entry_count | count).

... you really mean "did" here, as I'm removing p2m_pod_entry_count()
in this patch. Of course locking could be added to it instead (or
p2m_pod_get_mem_target() be used in its place), but I'd prefer if we
could go with just the check which precisely matches what the comment
says (IOW otherwise I'd need to additionally know what exactly the
comment is to say).
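
Expressed as a sketch, the condition the comment is after is therefore
the combined one, which is exactly what the new p2m_pod_active()
computes under the PoD lock:

    /* "Is this (still) a PoD guest?" */
    bool pod = p2m->pod.entry_count   /* outstanding PoD entries */
               || p2m->pod.count;     /* or a non-empty PoD cache */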

Jan




* Re: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-02-03  8:31     ` Jan Beulich
@ 2022-02-03  9:04       ` Roger Pau Monné
  2022-02-03  9:21         ` Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2022-02-03  9:04 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
> On 02.02.2022 17:13, Roger Pau Monné wrote:
> > On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> >> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
> >>  
> >>      ASSERT( pod_target >= p2m->pod.count );
> >>  
> >> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> >> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> > 
> > Is it possible to have cache flush allowed without any PCI device
> > assigned? AFAICT the iomem/ioport_caps would only get set up when there
> > are devices passed through?
> 
> One can assign MMIO or ports to a guest the raw way. That's not
> secure, but functionally explicitly permitted.
> 
> > TBH I would be fine if we just say that PoD cannot be used in
> > conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
> > 
> > I understand it's technically possible for PoD to be used together
> > with a domain that will later get a device passed through once PoD is
> > no longer in use, but I doubt there's much value in supporting that
> > use case, and I fear we might be introducing corner cases that could
> > create issues in the future. Overall I think it would be safer to just
> > disable PoD in conjunction with an IOMMU.
> 
> I consider it wrong to put in place such a restriction, but I could
> perhaps accept you and Andrew thinking this way if this was the only
> aspect playing into here. However, this would then want an equivalent
> tools side check, and while hunting down where to make the change as
> done here, I wasn't able to figure out where that alternative
> adjustment would need doing. Hence I would possibly(!) buy into this
> only if someone else took care of doing so properly in the tool stack
> (including the emission of a sensible error message).

What about the (completely untested) chunk below:

diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
index d7a40d7550..e585ef4c5c 100644
--- a/tools/libs/light/libxl_create.c
+++ b/tools/libs/light/libxl_create.c
@@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
     pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
         (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
 
-    /* We cannot have PoD and PCI device assignment at the same time
+    /* We cannot have PoD and an active IOMMU at the same time
      * for HVM guest. It was reported that IOMMU cannot work with PoD
      * enabled because it needs to populated entire page table for
-     * guest. To stay on the safe side, we disable PCI device
-     * assignment when PoD is enabled.
+     * guest.
      */
     if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
-        d_config->num_pcidevs && pod_enabled) {
+        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
+        pod_enabled) {
         ret = ERROR_INVAL;
-        LOGD(ERROR, domid,
-             "PCI device assignment for HVM guest failed due to PoD enabled");
+        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
         goto error_out;
     }
 


> Finally this still leaves out the "raw MMIO / ports" case mentioned
> above.

But the raw MMIO 'mode' doesn't care much about PoD, because if
there's no PCI device assigned there's no IOMMU setup, and thus such
raw MMIO regions (could?) belong to a device that's not constrained by
the guest p2m anyway?

> >> --- a/xen/common/vm_event.c
> >> +++ b/xen/common/vm_event.c
> >> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
> >>  
> >>              rc = -EXDEV;
> >>              /* Disallow paging in a PoD guest */
> >> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
> >> +            if ( p2m_pod_active(d) )
> > 
> > Isn't it fine to just check for entry_count like you suggest in the
> > change to libxl?
> 
> I didn't think it would be, but I'm not entirely sure: If paging was
> enabled before a guest actually starts, it wouldn't have any entries
> but still be a PoD guest if it has a non-empty cache. The VM event
> folks may be able to clarify this either way. But ...
> 
> > This is what p2m_pod_entry_count actually does (rather than entry_count | count).
> 
> ... you really mean "did" here, as I'm removing p2m_pod_entry_count()
> in this patch. Of course locking could be added to it instead (or
> p2m_pod_get_mem_target() be used in its place), but I'd prefer if we
> could go with just the check which precisely matches what the comment
> says (IOW otherwise I'd need to additionally know what exactly the
> comment is to say).

Could you briefly mention this in the commit message? Ie: VM event
code is also adjusted to make sure PoD is not in use and cannot be
used during the guest lifetime?

Thanks, Roger.



* Re: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-02-03  9:04       ` Roger Pau Monné
@ 2022-02-03  9:21         ` Jan Beulich
  2022-02-03  9:52           ` Roger Pau Monné
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2022-02-03  9:21 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On 03.02.2022 10:04, Roger Pau Monné wrote:
> On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
>> On 02.02.2022 17:13, Roger Pau Monné wrote:
>>> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
>>> [...]
>>
>> I consider it wrong to put in place such a restriction, but I could
>> perhaps accept you and Andrew thinking this way if this was the only
>> aspect playing into here. However, this would then want an equivalent
>> tools side check, and while hunting down where to make the change as
>> done here, I wasn't able to figure out where that alternative
>> adjustment would need doing. Hence I would possibly(!) buy into this
>> only if someone else took care of doing so properly in the tool stack
>> (including the emission of a sensible error message).
> 
> What about the (completely untested) chunk below:
> 
> diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
> index d7a40d7550..e585ef4c5c 100644
> --- a/tools/libs/light/libxl_create.c
> +++ b/tools/libs/light/libxl_create.c
> @@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
>      pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
>          (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
>  
> -    /* We cannot have PoD and PCI device assignment at the same time
> +    /* We cannot have PoD and an active IOMMU at the same time
>       * for HVM guest. It was reported that IOMMU cannot work with PoD
>       * enabled because it needs to populated entire page table for
> -     * guest. To stay on the safe side, we disable PCI device
> -     * assignment when PoD is enabled.
> +     * guest.
>       */
>      if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
> -        d_config->num_pcidevs && pod_enabled) {
> +        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
> +        pod_enabled) {
>          ret = ERROR_INVAL;
> -        LOGD(ERROR, domid,
> -             "PCI device assignment for HVM guest failed due to PoD enabled");
> +        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
>          goto error_out;
>      }

Perhaps. Seeing this I actually recall coming across this check during
my investigation. That I didn't change it along those lines was mainly
because I wasn't convinced of the extra restriction; I clearly
misremembered when writing the earlier reply.
If we were to do what you suggest, I'd like to ask that the comment be
changed differently, though: "We cannot ..." then isn't really true
anymore. We choose not to permit this mode; "cannot" only applies to
actual device assignment (and of course only as long as there aren't
restartable IOMMU faults).

>> Finally this still leaves out the "raw MMIO / ports" case mentioned
>> above.
> 
> But the raw MMIO 'mode' doesn't care much about PoD, because if
> there's no PCI device assigned there's no IOMMU setup, and thus such
> raw MMIO regions (could?) belong to a device that's not constrained by
> the guest p2m anyway?

Hmm, yes, true.

>>>> --- a/xen/common/vm_event.c
>>>> +++ b/xen/common/vm_event.c
>>>> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
>>>>  
>>>>              rc = -EXDEV;
>>>>              /* Disallow paging in a PoD guest */
>>>> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
>>>> +            if ( p2m_pod_active(d) )
>>>
>>> Isn't it fine to just check for entry_count like you suggest in the
>>> change to libxl?
>>
>> I didn't think it would be, but I'm not entirely sure: If paging was
>> enabled before a guest actually starts, it wouldn't have any entries
>> but still be a PoD guest if it has a non-empty cache. The VM event
>> folks may be able to clarify this either way. But ...
>>
>>> This is what p2m_pod_entry_count actually does (rather than entry_count | count).
>>
>> ... you really mean "did" here, as I'm removing p2m_pod_entry_count()
>> in this patch. Of course locking could be added to it instead (or
>> p2m_pod_get_mem_target() be used in its place), but I'd prefer if we
>> could go with just the check which precisely matches what the comment
>> says (IOW otherwise I'd need to additionally know what exactly the
>> comment is to say).
> 
> Could you briefly mention this in the commit message? Ie: VM event
> code is also adjusted to make sure PoD is not in use and cannot be
> used during the guest lifetime?

I've added

"Nor was the use of that function in line with the immediately preceding
 comment: A PoD guest isn't just one with a non-zero entry count, but
 also one with a non-empty cache (e.g. prior to actually launching the
 guest)."

to the already existing paragraph about the removal of that function.

Jan




* Re: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-02-03  9:21         ` Jan Beulich
@ 2022-02-03  9:52           ` Roger Pau Monné
  2022-02-03 10:20             ` Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2022-02-03  9:52 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On Thu, Feb 03, 2022 at 10:21:54AM +0100, Jan Beulich wrote:
> On 03.02.2022 10:04, Roger Pau Monné wrote:
> > On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
> >> On 02.02.2022 17:13, Roger Pau Monné wrote:
> >>> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> > [...]
> 
> Perhaps. Seeing this I actually recall coming across this check during
> my investigation. That I didn't change it along those lines was mainly
> because I wasn't convinced of the extra restriction; I clearly
> misremembered when writing the earlier reply.
> If we were to do what you suggest, I'd like to ask that the comment be
> changed differently, though: "We cannot ..." then isn't really true
> anymore. We choose not to permit this mode; "cannot" only applies to
> actual device assignment (and of course only as long as there aren't
> restartable IOMMU faults).

I'm fine with an adjusted wording here. This was mostly a placement
suggestion, but I didn't give much thought to the error message.

> >> Finally this still leaves out the "raw MMIO / ports" case mentioned
> >> above.
> > 
> > But the raw MMIO 'mode' doesn't care much about PoD, because if
> > there's no PCI device assigned there's no IOMMU setup, and thus such
> > raw MMIO regions (could?) belong to a device that's not constrained by
> > the guest p2m anyway?
> 
> Hmm, yes, true.
> 
> >>>> --- a/xen/common/vm_event.c
> >>>> +++ b/xen/common/vm_event.c
> >>>> @@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
> >>>>  
> >>>>              rc = -EXDEV;
> >>>>              /* Disallow paging in a PoD guest */
> >>>> -            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
> >>>> +            if ( p2m_pod_active(d) )
> >>>
> >>> Isn't it fine to just check for entry_count like you suggest in the
> >>> change to libxl?
> >>
> >> I didn't think it would be, but I'm not entirely sure: If paging was
> >> enabled before a guest actually starts, it wouldn't have any entries
> >> but still be a PoD guest if it has a non-empty cache. The VM event
> >> folks may be able to clarify this either way. But ...
> >>
> >>> This is what p2m_pod_entry_count actually does (rather than entry_count | count).
> >>
> >> ... you really mean "did" here, as I'm removing p2m_pod_entry_count()
> >> in this patch. Of course locking could be added to it instead (or
> >> p2m_pod_get_mem_target() be used in its place), but I'd prefer if we
> >> could go with just the check which precisely matches what the comment
> >> says (IOW otherwise I'd additionally need to know what exactly the
> >> comment should say).
> > 
> > Could you briefly mention this in the commit message? Ie: VM event
> > code is also adjusted to make sure PoD is not in use and cannot be
> > used during the guest lifetime?
> 
> I've added
> 
> "Nor was the use of that function in line with the immediately preceding
>  comment: A PoD guest isn't just one with a non-zero entry count, but
>  also one with a non-empty cache (e.g. prior to actually launching the
>  guest)."
> 
> to the already existing paragraph about the removal of that function.

Thanks, Roger.
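
As an illustration of the distinction discussed above — why a check on
the entry count alone misses a guest whose PoD cache was filled before
launch — here is a minimal C sketch. The names are simplified stand-ins
for Xen's real structures, and the real code additionally holds the PoD
lock around the reads:

#include <stdbool.h>

struct pod_state {
    long entry_count;  /* outstanding PoD entries in the P2M */
    long count;        /* pages currently held in the PoD cache */
};

/* Old-style check (what p2m_pod_entry_count() amounted to): reports
 * "not a PoD guest" for a pre-launch guest with a filled cache. */
static bool is_pod_guest_entries_only(const struct pod_state *pod)
{
    return pod->entry_count != 0;
}

/* New-style check (the shape of p2m_pod_active()): PoD counts as
 * active while either entries or cache pages remain. */
static bool is_pod_guest_active(const struct pod_state *pod)
{
    return (pod->entry_count | pod->count) != 0;
}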


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-02-03  9:52           ` Roger Pau Monné
@ 2022-02-03 10:20             ` Jan Beulich
  2022-02-03 10:23               ` Roger Pau Monné
  0 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2022-02-03 10:20 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On 03.02.2022 10:52, Roger Pau Monné wrote:
> On Thu, Feb 03, 2022 at 10:21:54AM +0100, Jan Beulich wrote:
>> On 03.02.2022 10:04, Roger Pau Monné wrote:
>>> On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
>>>> On 02.02.2022 17:13, Roger Pau Monné wrote:
>>>>> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
>>>>>> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
>>>>>>  
>>>>>>      ASSERT( pod_target >= p2m->pod.count );
>>>>>>  
>>>>>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>>>>>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
>>>>>
>>>>> Is it possible to have cache flush allowed without any PCI device
> >>>>> assigned? AFAICT the iomem/ioport_caps would only get set up when there
> >>>>> are devices passed through?
>>>>
>>>> One can assign MMIO or ports to a guest the raw way. That's not
>>>> secure, but functionally explicitly permitted.
>>>>
>>>>> TBH I would be fine if we just say that PoD cannot be used in
> >>>>> conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
>>>>>
>>>>> I understand it's technically possible for PoD to be used together
>>>>> with a domain that will later get a device passed through once PoD is
>>>>> no longer in use, but I doubt there's much value in supporting that
>>>>> use case, and I fear we might be introducing corner cases that could
>>>>> create issues in the future. Overall I think it would be safer to just
>>>>> disable PoD in conjunction with an IOMMU.
>>>>
>>>> I consider it wrong to put in place such a restriction, but I could
>>>> perhaps accept you and Andrew thinking this way if this was the only
>>>> aspect playing into here. However, this would then want an equivalent
>>>> tools side check, and while hunting down where to make the change as
>>>> done here, I wasn't able to figure out where that alternative
>>>> adjustment would need doing. Hence I would possibly(!) buy into this
>>>> only if someone else took care of doing so properly in the tool stack
>>>> (including the emission of a sensible error message).
>>>
>>> What about the (completely untested) chunk below:
>>>
>>> diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
>>> index d7a40d7550..e585ef4c5c 100644
>>> --- a/tools/libs/light/libxl_create.c
>>> +++ b/tools/libs/light/libxl_create.c
>>> @@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
>>>      pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
>>>          (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
>>>  
>>> -    /* We cannot have PoD and PCI device assignment at the same time
>>> +    /* We cannot have PoD and an active IOMMU at the same time
>>>       * for HVM guest. It was reported that IOMMU cannot work with PoD
>>>       * enabled because it needs to populated entire page table for
>>> -     * guest. To stay on the safe side, we disable PCI device
>>> -     * assignment when PoD is enabled.
>>> +     * guest.
>>>       */
>>>      if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
>>> -        d_config->num_pcidevs && pod_enabled) {
>>> +        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
>>> +        pod_enabled) {
>>>          ret = ERROR_INVAL;
>>> -        LOGD(ERROR, domid,
>>> -             "PCI device assignment for HVM guest failed due to PoD enabled");
>>> +        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
>>>          goto error_out;
>>>      }
>>
>> Perhaps. Seeing this I actually recall coming across this check during
>> my investigation. Not changing it along the lines of what you do was
>> then really more because of me not being convinced of the extra
>> restriction; I clearly misremembered when writing the earlier reply.
>> If we were to do what you suggest, I'd like to ask that the comment be
>> changed differently, though: "We cannot ..." then isn't really true
>> anymore. We choose not to permit this mode; "cannot" only applies to
>> actual device assignment (and of course only as long as there aren't
>> restartable IOMMU faults).
> 
> I'm fine with an adjusted wording here. This was mostly a placement
> > suggestion, but I didn't give much thought to the error message.

FTAOD: Are you going to transform this into a proper patch then? While
I wouldn't object to such a behavioral change, I also wouldn't want to
put my name under it. But if it went in, I think I might be able to
then drop the libxl adjustment from my patch.

Jan



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests
  2022-02-03 10:20             ` Jan Beulich
@ 2022-02-03 10:23               ` Roger Pau Monné
  0 siblings, 0 replies; 20+ messages in thread
From: Roger Pau Monné @ 2022-02-03 10:23 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On Thu, Feb 03, 2022 at 11:20:15AM +0100, Jan Beulich wrote:
> On 03.02.2022 10:52, Roger Pau Monné wrote:
> > On Thu, Feb 03, 2022 at 10:21:54AM +0100, Jan Beulich wrote:
> >> On 03.02.2022 10:04, Roger Pau Monné wrote:
> >>> On Thu, Feb 03, 2022 at 09:31:03AM +0100, Jan Beulich wrote:
> >>>> On 02.02.2022 17:13, Roger Pau Monné wrote:
> >>>>> On Tue, Jan 04, 2022 at 10:41:32AM +0100, Jan Beulich wrote:
> >>>>>> @@ -359,7 +360,10 @@ p2m_pod_set_mem_target(struct domain *d,
> >>>>>>  
> >>>>>>      ASSERT( pod_target >= p2m->pod.count );
> >>>>>>  
> >>>>>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> >>>>>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> >>>>>
> >>>>> Is it possible to have cache flush allowed without any PCI device
> >>>>> assigned? AFAICT the iomem/ioport_caps would only get set up when there
> >>>>> are devices passed through?
> >>>>
> >>>> One can assign MMIO or ports to a guest the raw way. That's not
> >>>> secure, but functionally explicitly permitted.
> >>>>
> >>>>> TBH I would be fine if we just say that PoD cannot be used in
> >>>>> conjunction with an IOMMU, and just check for is_iommu_enabled(d) here.
> >>>>>
> >>>>> I understand it's technically possible for PoD to be used together
> >>>>> with a domain that will later get a device passed through once PoD is
> >>>>> no longer in use, but I doubt there's much value in supporting that
> >>>>> use case, and I fear we might be introducing corner cases that could
> >>>>> create issues in the future. Overall I think it would be safer to just
> >>>>> disable PoD in conjunction with an IOMMU.
> >>>>
> >>>> I consider it wrong to put in place such a restriction, but I could
> >>>> perhaps accept you and Andrew thinking this way if this was the only
> >>>> aspect playing into here. However, this would then want an equivalent
> >>>> tools side check, and while hunting down where to make the change as
> >>>> done here, I wasn't able to figure out where that alternative
> >>>> adjustment would need doing. Hence I would possibly(!) buy into this
> >>>> only if someone else took care of doing so properly in the tool stack
> >>>> (including the emission of a sensible error message).
> >>>
> >>> What about the (completely untested) chunk below:
> >>>
> >>> diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
> >>> index d7a40d7550..e585ef4c5c 100644
> >>> --- a/tools/libs/light/libxl_create.c
> >>> +++ b/tools/libs/light/libxl_create.c
> >>> @@ -1160,17 +1160,16 @@ int libxl__domain_config_setdefault(libxl__gc *gc,
> >>>      pod_enabled = (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV) &&
> >>>          (d_config->b_info.target_memkb < d_config->b_info.max_memkb);
> >>>  
> >>> -    /* We cannot have PoD and PCI device assignment at the same time
> >>> +    /* We cannot have PoD and an active IOMMU at the same time
> >>>       * for HVM guest. It was reported that IOMMU cannot work with PoD
> >>>       * enabled because it needs to populated entire page table for
> >>> -     * guest. To stay on the safe side, we disable PCI device
> >>> -     * assignment when PoD is enabled.
> >>> +     * guest.
> >>>       */
> >>>      if (d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV &&
> >>> -        d_config->num_pcidevs && pod_enabled) {
> >>> +        d_config->c_info.passthrough != LIBXL_PASSTHROUGH_DISABLED &&
> >>> +        pod_enabled) {
> >>>          ret = ERROR_INVAL;
> >>> -        LOGD(ERROR, domid,
> >>> -             "PCI device assignment for HVM guest failed due to PoD enabled");
> >>> +        LOGD(ERROR, domid, "IOMMU cannot be enabled together with PoD");
> >>>          goto error_out;
> >>>      }
> >>
> >> Perhaps. Seeing this I actually recall coming across this check during
> >> my investigation. Not changing it along the lines of what you do was
> >> then really more because of me not being convinced of the extra
> >> restriction; I clearly misremembered when writing the earlier reply.
> >> If we were to do what you suggest, I'd like to ask that the comment be
> >> changed differently, though: "We cannot ..." then isn't really true
> >> anymore. We choose not to permit this mode; "cannot" only applies to
> >> actual device assignment (and of course only as long as there aren't
> >> restartable IOMMU faults).
> > 
> > I'm fine with an adjusted wording here. This was mostly a placement
> > suggestion, but I didn't give much thought to the error message.
> 
> FTAOD: Are you going to transform this into a proper patch then? While
> I wouldn't object to such a behavioral change, I also wouldn't want to
> put my name under it. But if it went in, I think I might be able to
> then drop the libxl adjustment from my patch.

Oh, I somewhat assumed you would integrate this check into the patch.
I can send a standalone patch myself if that's your preference. Let me
do that now.

Roger.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling
  2022-02-02 15:29     ` Jan Beulich
@ 2022-02-04  9:28       ` Roger Pau Monné
  2022-02-04  9:36         ` Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2022-02-04  9:28 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap

On Wed, Feb 02, 2022 at 04:29:37PM +0100, Jan Beulich wrote:
> On 02.02.2022 16:14, Roger Pau Monné wrote:
> > On Tue, Jan 04, 2022 at 10:41:53AM +0100, Jan Beulich wrote:
> >> Do away with the "pod_target_out_unlock" label. In particular by folding
> >> if()-s, the logic can be expressed with less code (and no goto-s) this
> >> way.
> >>
> >> Limit scope of "p2m", constifying it at the same time.
> > 
> > Is this stale? I cannot find any reference to a p2m variable in the
> > chunks below.
> 
> Indeed it is, leftover from rebasing over the introduction of
> p2m_pod_get_mem_target() in what is now patch 1. Dropped.

I'm happy with this change with the commit adjusted:

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Not sure if you can commit this now regardless of patch 1?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling
  2022-02-04  9:28       ` Roger Pau Monné
@ 2022-02-04  9:36         ` Jan Beulich
  0 siblings, 0 replies; 20+ messages in thread
From: Jan Beulich @ 2022-02-04  9:36 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap

On 04.02.2022 10:28, Roger Pau Monné wrote:
> On Wed, Feb 02, 2022 at 04:29:37PM +0100, Jan Beulich wrote:
>> On 02.02.2022 16:14, Roger Pau Monné wrote:
>>> On Tue, Jan 04, 2022 at 10:41:53AM +0100, Jan Beulich wrote:
>>>> Do away with the "pod_target_out_unlock" label. In particular by folding
>>>> if()-s, the logic can be expressed with less code (and no goto-s) this
>>>> way.
>>>>
>>>> Limit scope of "p2m", constifying it at the same time.
>>>
>>> Is this stale? I cannot find any reference to a p2m variable in the
>>> chunks below.
>>
>> Indeed it is, leftover from rebasing over the introduction of
>> p2m_pod_get_mem_target() in what is now patch 1. Dropped.
> 
> I'm happy with this change with the commit adjusted:
> 
> Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

> Not sure if you can commit this now regardless of patch 1?

I think it can be moved ahead; there looks to be only a minor contextual
dependency.

Jan



^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v4] IOMMU/x86: disallow device assignment to PoD guests
  2022-01-04  9:39 [PATCH v3 0/2] x86/mm: get/set PoD target related adjustments Jan Beulich
  2022-01-04  9:41 ` [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
  2022-01-04  9:41 ` [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling Jan Beulich
@ 2022-04-11  9:47 ` Jan Beulich
  2022-04-11 10:29   ` Roger Pau Monné
  2 siblings, 1 reply; 20+ messages in thread
From: Jan Beulich @ 2022-04-11  9:47 UTC (permalink / raw)
  To: xen-devel
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné,
	George Dunlap, Paul Durrant, Tamas K Lengyel, Petre Pircalabu,
	Alexandru Isaila

While it is okay for IOMMU page tables to be set up for guests starting
in PoD mode, actual device assignment may only occur once all PoD
entries have been removed from the P2M. So far this was enforced only
for boot-time assignment, and only in the tool stack.

Also use the new function to replace p2m_pod_entry_count(): Its unlocked
access to p2m->pod.entry_count wasn't really okay (irrespective of the
result being stale by the time the caller gets to see it). Nor was the
use of that function in line with the immediately preceding comment: A
PoD guest isn't just one with a non-zero entry count, but also one with
a non-empty cache (e.g. prior to actually launching the guest).

To allow the tool stack to see a consistent snapshot of PoD state, move
the tail of XENMEM_{get,set}_pod_target handling into a function, adding
proper locking there.

In libxl take the liberty to use the new local variable r also for a
pre-existing call into libxc.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
If p2m->pod.entry_count == p2m->pod.count it is in principle possible to
permit device assignment by actively resolving all remaining PoD entries.

Initially I thought this was introduced by f89f555827a6 ("remove late
(on-demand) construction of IOMMU page tables"), but without
arch_iommu_use_permitted() checking for PoD I think the issue has been
there before that.
---
v4: Drop tool stack side change (superseded by 07449ecfa425). Extend VM
    event related paragraph of description.
v3: In p2m_pod_set_mem_target() move check down.
v2: New.

--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -20,6 +20,7 @@
  */
 
 #include <xen/event.h>
+#include <xen/iocap.h>
 #include <xen/ioreq.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
@@ -360,7 +361,10 @@ p2m_pod_set_mem_target(struct domain *d,
 
     ASSERT( pod_target >= p2m->pod.count );
 
-    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
+        ret = -ENOTEMPTY;
+    else
+        ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
 
 out:
     pod_unlock(p2m);
@@ -368,6 +372,23 @@ out:
     return ret;
 }
 
+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    ASSERT(is_hvm_domain(d));
+
+    pod_lock(p2m);
+    lock_page_alloc(p2m);
+
+    target->tot_pages       = domain_tot_pages(d);
+    target->pod_cache_pages = p2m->pod.count;
+    target->pod_entries     = p2m->pod.entry_count;
+
+    unlock_page_alloc(p2m);
+    pod_unlock(p2m);
+}
+
 int p2m_pod_empty_cache(struct domain *d)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
@@ -1391,6 +1412,9 @@ guest_physmap_mark_populate_on_demand(st
     if ( !paging_mode_translate(d) )
         return -EINVAL;
 
+    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
+        return -ENOTEMPTY;
+
     do {
         rc = mark_populate_on_demand(d, gfn, chunk_order);
 
@@ -1412,3 +1436,20 @@ void p2m_pod_init(struct p2m_domain *p2m
     for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
         p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN);
 }
+
+bool p2m_pod_active(const struct domain *d)
+{
+    struct p2m_domain *p2m;
+    bool res;
+
+    if ( !is_hvm_domain(d) )
+        return false;
+
+    p2m = p2m_get_hostp2m(d);
+
+    pod_lock(p2m);
+    res = p2m->pod.entry_count | p2m->pod.count;
+    pod_unlock(p2m);
+
+    return res;
+}
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4804,7 +4804,6 @@ long arch_memory_op(unsigned long cmd, X
     {
         xen_pod_target_t target;
         struct domain *d;
-        struct p2m_domain *p2m;
 
         if ( copy_from_guest(&target, arg, 1) )
             return -EFAULT;
@@ -4835,10 +4834,7 @@ long arch_memory_op(unsigned long cmd, X
         }
         else if ( rc >= 0 )
         {
-            p2m = p2m_get_hostp2m(d);
-            target.tot_pages       = domain_tot_pages(d);
-            target.pod_cache_pages = p2m->pod.count;
-            target.pod_entries     = p2m->pod.entry_count;
+            p2m_pod_get_mem_target(d, &target);
 
             if ( __copy_to_guest(arg, &target, 1) )
                 rc = -EFAULT;
--- a/xen/common/vm_event.c
+++ b/xen/common/vm_event.c
@@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, st
 
             rc = -EXDEV;
             /* Disallow paging in a PoD guest */
-            if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
+            if ( p2m_pod_active(d) )
                 break;
 
             /* domain_pause() not required here, see XSA-99 */
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -513,11 +513,12 @@ bool arch_iommu_use_permitted(const stru
 {
     /*
      * Prevent device assign if mem paging, mem sharing or log-dirty
-     * have been enabled for this domain.
+     * have been enabled for this domain, or if PoD is still in active use.
      */
     return d == dom_io ||
            (likely(!mem_sharing_enabled(d)) &&
             likely(!mem_paging_enabled(d)) &&
+            likely(!p2m_pod_active(d)) &&
             likely(!p2m_is_global_logdirty(d)));
 }
 
--- a/xen/arch/x86/include/asm/p2m.h
+++ b/xen/arch/x86/include/asm/p2m.h
@@ -701,6 +701,12 @@ int p2m_pod_empty_cache(struct domain *d
  * domain matches target */
 int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
 
+/* Obtain a consistent snapshot of PoD related domain state. */
+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target);
+
+/* Check whether PoD is (still) active in a domain. */
+bool p2m_pod_active(const struct domain *d);
+
 /* Scan pod cache when offline/broken page triggered */
 int
 p2m_pod_offline_or_broken_hit(struct page_info *p);
@@ -709,11 +715,6 @@ p2m_pod_offline_or_broken_hit(struct pag
 void
 p2m_pod_offline_or_broken_replace(struct page_info *p);
 
-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
-{
-    return p2m->pod.entry_count;
-}
-
 #else
 
 static inline bool
@@ -727,6 +728,11 @@ static inline int p2m_pod_empty_cache(st
     return 0;
 }
 
+static inline bool p2m_pod_active(const struct domain *d)
+{
+    return false;
+}
+
 static inline int p2m_pod_offline_or_broken_hit(struct page_info *p)
 {
     return 0;
@@ -737,11 +743,6 @@ static inline void p2m_pod_offline_or_br
     ASSERT_UNREACHABLE();
 }
 
-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
-{
-    return 0;
-}
-
 #endif
 
 



^ permalink raw reply	[flat|nested] 20+ messages in thread
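
A rough illustration of the consistency issue the new
p2m_pod_get_mem_target() in the patch above addresses. This is plain
userspace C with a pthread mutex standing in for Xen's pod_lock /
page-alloc lock pair; only the shape of the problem carries over:

#include <pthread.h>

struct pod_state {
    pthread_mutex_t lock;  /* stand-in for the real locks */
    long tot_pages, count, entry_count;
};

/* Old shape: three independent reads.  A concurrent PoD sweep can
 * update the fields in between, so the copied trio may describe a
 * state the domain was never actually in. */
static void snapshot_unlocked(struct pod_state *s, long out[3])
{
    out[0] = s->tot_pages;
    out[1] = s->count;
    out[2] = s->entry_count;
}

/* New shape: take the lock(s) the writers hold, so the three values
 * form one consistent snapshot. */
static void snapshot_locked(struct pod_state *s, long out[3])
{
    pthread_mutex_lock(&s->lock);
    out[0] = s->tot_pages;
    out[1] = s->count;
    out[2] = s->entry_count;
    pthread_mutex_unlock(&s->lock);
}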

* Re: [PATCH v4] IOMMU/x86: disallow device assignment to PoD guests
  2022-04-11  9:47 ` [PATCH v4] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
@ 2022-04-11 10:29   ` Roger Pau Monné
  2022-04-11 10:43     ` Jan Beulich
  0 siblings, 1 reply; 20+ messages in thread
From: Roger Pau Monné @ 2022-04-11 10:29 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On Mon, Apr 11, 2022 at 11:47:46AM +0200, Jan Beulich wrote:
> While it is okay for IOMMU page tables to be set up for guests starting
> in PoD mode, actual device assignment may only occur once all PoD
> entries have been removed from the P2M. So far this was enforced only
> for boot-time assignment, and only in the tool stack.
> 
> Also use the new function to replace p2m_pod_entry_count(): Its unlocked
> access to p2m->pod.entry_count wasn't really okay (irrespective of the
> result being stale by the time the caller gets to see it). Nor was the
> use of that function in line with the immediately preceding comment: A
> PoD guest isn't just one with a non-zero entry count, but also one with
> a non-empty cache (e.g. prior to actually launching the guest).
> 
> To allow the tool stack to see a consistent snapshot of PoD state, move
> the tail of XENMEM_{get,set}_pod_target handling into a function, adding
> proper locking there.
> 
> In libxl take the liberty to use the new local variable r also for a
> pre-existing call into libxc.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Just one comment below.

> ---
> If p2m->pod.entry_count == p2m->pod.count it is in principle possible to
> permit device assignment by actively resolving all remaining PoD entries.
> 
> Initially I thought this was introduced by f89f555827a6 ("remove late
> (on-demand) construction of IOMMU page tables"), but without
> arch_iommu_use_permitted() checking for PoD I think the issue has been
> there before that.
> ---
> v4: Drop tool stack side change (superseded by 07449ecfa425). Extend VM
>     event related paragraph of description.
> v3: In p2m_pod_set_mem_target() move check down.
> v2: New.
> 
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -20,6 +20,7 @@
>   */
>  
>  #include <xen/event.h>
> +#include <xen/iocap.h>
>  #include <xen/ioreq.h>
>  #include <xen/mm.h>
>  #include <xen/sched.h>
> @@ -360,7 +361,10 @@ p2m_pod_set_mem_target(struct domain *d,
>  
>      ASSERT( pod_target >= p2m->pod.count );
>  
> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
> +        ret = -ENOTEMPTY;

ENOTEMPTY seems weird here.  I think the reasoning is that the set of
passthrough devices is not empty? IMO it's confusing as the function
itself is related to buffer management, so returning ENOTEMPTY could
be confused with some other condition.

Might be less ambiguous to use EXDEV.

Thanks, Roger.
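
For reference on the two candidate codes (a small, self-contained aside,
not part of the patch): their conventional POSIX meanings, as printed by
strerror() on e.g. glibc:

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* Typically "Directory not empty" ... */
    printf("ENOTEMPTY: %s\n", strerror(ENOTEMPTY));
    /* ... and "Invalid cross-device link". */
    printf("EXDEV:     %s\n", strerror(EXDEV));
    return 0;
}

Neither was designed for "domain still has PoD state in use", so the
choice comes down to taste and to avoiding collisions with other uses of
the same code nearby (vm_event, for one, already returns -EXDEV).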


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] IOMMU/x86: disallow device assignment to PoD guests
  2022-04-11 10:29   ` Roger Pau Monné
@ 2022-04-11 10:43     ` Jan Beulich
  0 siblings, 0 replies; 20+ messages in thread
From: Jan Beulich @ 2022-04-11 10:43 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel, Andrew Cooper, Wei Liu, George Dunlap, Paul Durrant,
	Tamas K Lengyel, Petre Pircalabu, Alexandru Isaila

On 11.04.2022 12:29, Roger Pau Monné wrote:
> On Mon, Apr 11, 2022 at 11:47:46AM +0200, Jan Beulich wrote:
>> While it is okay for IOMMU page tables to be set up for guests starting
>> in PoD mode, actual device assignment may only occur once all PoD
>> entries have been removed from the P2M. So far this was enforced only
>> for boot-time assignment, and only in the tool stack.
>>
>> Also use the new function to replace p2m_pod_entry_count(): Its unlocked
>> access to p2m->pod.entry_count wasn't really okay (irrespective of the
>> result being stale by the time the caller gets to see it). Nor was the
>> use of that function in line with the immediately preceding comment: A
>> PoD guest isn't just one with a non-zero entry count, but also one with
>> a non-empty cache (e.g. prior to actually launching the guest).
>>
>> To allow the tool stack to see a consistent snapshot of PoD state, move
>> the tail of XENMEM_{get,set}_pod_target handling into a function, adding
>> proper locking there.
>>
>> In libxl take the liberty to use the new local variable r also for a
>> pre-existing call into libxc.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

>> --- a/xen/arch/x86/mm/p2m-pod.c
>> +++ b/xen/arch/x86/mm/p2m-pod.c
>> @@ -20,6 +20,7 @@
>>   */
>>  
>>  #include <xen/event.h>
>> +#include <xen/iocap.h>
>>  #include <xen/ioreq.h>
>>  #include <xen/mm.h>
>>  #include <xen/sched.h>
>> @@ -360,7 +361,10 @@ p2m_pod_set_mem_target(struct domain *d,
>>  
>>      ASSERT( pod_target >= p2m->pod.count );
>>  
>> -    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>> +    if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
>> +        ret = -ENOTEMPTY;
> 
> ENOTEMPTY seems weird here.  I think the reasoning is that the set of
> passthrough devices is not empty?

Yes.

> IMO it's confusing as the function
> itself is related to buffer management, so returning ENOTEMPTY could
> be confused with some other condition.
> 
> Might be less ambiguous to use EXDEV.

I don't think there's any particularly good error code to use here.
Hence I've tried to pick one that makes some sense, and which isn't
widely used (this latter aspect would make EXDEV slightly less desirable).

Jan



^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-04-11 10:43 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-04  9:39 [PATCH v3 0/2] x86/mm: get/set PoD target related adjustments Jan Beulich
2022-01-04  9:41 ` [PATCH v3 1/2] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
2022-01-27 15:02   ` Ping: " Jan Beulich
2022-01-28 16:07   ` Anthony PERARD
2022-02-02 16:13   ` Roger Pau Monné
2022-02-03  8:31     ` Jan Beulich
2022-02-03  9:04       ` Roger Pau Monné
2022-02-03  9:21         ` Jan Beulich
2022-02-03  9:52           ` Roger Pau Monné
2022-02-03 10:20             ` Jan Beulich
2022-02-03 10:23               ` Roger Pau Monné
2022-01-04  9:41 ` [PATCH v3 2/2] x86/mm: tidy XENMEM_{get,set}_pod_target handling Jan Beulich
2022-01-27 15:03   ` Ping: " Jan Beulich
2022-02-02 15:14   ` Roger Pau Monné
2022-02-02 15:29     ` Jan Beulich
2022-02-04  9:28       ` Roger Pau Monné
2022-02-04  9:36         ` Jan Beulich
2022-04-11  9:47 ` [PATCH v4] IOMMU/x86: disallow device assignment to PoD guests Jan Beulich
2022-04-11 10:29   ` Roger Pau Monné
2022-04-11 10:43     ` Jan Beulich
