xen-devel.lists.xenproject.org archive mirror
* [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
@ 2015-07-24  9:41 Jan Beulich
  2015-07-24 10:26 ` Wei Liu
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Jan Beulich @ 2015-07-24  9:41 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, Keir Fraser,
	Boris Ostrovsky

... and its callers.

While all non-nested users are made fully honor the semantics of that
type, doing so in the nested case seemed insane (if doable at all,
considering VMCS shadowing), and hence there the respective operations
are simply made fail.

One case not (yet) taken care of is that of a page getting transitioned
to this type after a mapping got established.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
between the setting of the dirty flag and the actual write happening?
I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?

Note that this is conflicting with the altp2m series (adding another
call to hvm_map_guest_frame_rw(), reviewing of which made me spot the
issue in the first place).
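
To illustrate the suspected race (a purely illustrative timeline):

    vCPU / Xen                          toolstack
    ----------                          ---------
    _hvm_map_guest_frame()
      paging_mark_dirty(d, mfn)
                                        XEN_DOMCTL_SHADOW_OP_CLEAN
                                        (dirty bit read and cleared)
    write through the mapping
      (page modified, dirty bit clear)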

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3594,8 +3594,8 @@ int hvm_virtual_to_linear_addr(
 
 /* On non-NULL return, we leave this function holding an additional 
  * ref on the underlying mfn, if any */
-static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable,
-                                   bool_t permanent)
+static void *_hvm_map_guest_frame(unsigned long gfn, bool_t permanent,
+                                  bool_t *writable)
 {
     void *map;
     p2m_type_t p2mt;
@@ -3618,7 +3618,12 @@ static void *__hvm_map_guest_frame(unsig
     }
 
     if ( writable )
-        paging_mark_dirty(d, page_to_mfn(page));
+    {
+        if ( !p2m_is_discard_write(p2mt) )
+            paging_mark_dirty(d, page_to_mfn(page));
+        else
+            *writable = 0;
+    }
 
     if ( !permanent )
         return __map_domain_page(page);
@@ -3630,14 +3635,16 @@ static void *__hvm_map_guest_frame(unsig
     return map;
 }
 
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent)
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+                             bool_t *writable)
 {
-    return __hvm_map_guest_frame(gfn, 1, permanent);
+    *writable = 1;
+    return _hvm_map_guest_frame(gfn, permanent, writable);
 }
 
 void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent)
 {
-    return __hvm_map_guest_frame(gfn, 0, permanent);
+    return _hvm_map_guest_frame(gfn, permanent, NULL);
 }
 
 void hvm_unmap_guest_frame(void *p, bool_t permanent)
@@ -3657,7 +3664,7 @@ void hvm_unmap_guest_frame(void *p, bool
     put_page(mfn_to_page(mfn));
 }
 
-static void *hvm_map_entry(unsigned long va)
+static void *hvm_map_entry(unsigned long va, bool_t *writable)
 {
     unsigned long gfn;
     uint32_t pfec;
@@ -3680,7 +3687,7 @@ static void *hvm_map_entry(unsigned long
     if ( (pfec == PFEC_page_paged) || (pfec == PFEC_page_shared) )
         goto fail;
 
-    v = hvm_map_guest_frame_rw(gfn, 0);
+    v = hvm_map_guest_frame_rw(gfn, 0, writable);
     if ( v == NULL )
         goto fail;
 
@@ -3702,6 +3709,7 @@ static int hvm_load_segment_selector(
     struct segment_register desctab, cs, segr;
     struct desc_struct *pdesc, desc;
     u8 dpl, rpl, cpl;
+    bool_t writable;
     int fault_type = TRAP_invalid_tss;
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct vcpu *v = current;
@@ -3738,7 +3746,7 @@ static int hvm_load_segment_selector(
     if ( ((sel & 0xfff8) + 7) > desctab.limit )
         goto fail;
 
-    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
+    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8), &writable);
     if ( pdesc == NULL )
         goto hvm_map_fail;
 
@@ -3797,6 +3805,7 @@ static int hvm_load_segment_selector(
             break;
         }
     } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
+              writable && /* except if we are to discard writes */
               (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
 
     /* Force the Accessed flag in our local copy. */
@@ -3834,6 +3843,7 @@ void hvm_task_switch(
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct segment_register gdt, tr, prev_tr, segr;
     struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
+    bool_t otd_writable, ntd_writable;
     unsigned long eflags;
     int exn_raised, rc;
     struct {
@@ -3860,11 +3870,12 @@ void hvm_task_switch(
         goto out;
     }
 
-    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8)); 
+    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8),
+                               &otd_writable);
     if ( optss_desc == NULL )
         goto out;
 
-    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8)); 
+    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8), &ntd_writable);
     if ( nptss_desc == NULL )
         goto out;
 
@@ -3996,11 +4007,11 @@ void hvm_task_switch(
     v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
     hvm_update_guest_cr(v, 0);
 
-    if ( (taskswitch_reason == TSW_iret) ||
-         (taskswitch_reason == TSW_jmp) )
+    if ( (taskswitch_reason == TSW_iret ||
+          taskswitch_reason == TSW_jmp) && otd_writable )
         clear_bit(41, optss_desc); /* clear B flag of old task */
 
-    if ( taskswitch_reason != TSW_iret )
+    if ( taskswitch_reason != TSW_iret && ntd_writable )
         set_bit(41, nptss_desc); /* set B flag of new task */
 
     if ( errcode >= 0 )
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -75,10 +75,20 @@ int nestedsvm_vmcb_map(struct vcpu *v, u
         nv->nv_vvmcxaddr = VMCX_EADDR;
     }
 
-    if (nv->nv_vvmcx == NULL) {
-        nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT, 1);
-        if (nv->nv_vvmcx == NULL)
+    if ( !nv->nv_vvmcx )
+    {
+        bool_t writable;
+        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(vmcbaddr), 1,
+                                             &writable);
+
+        if ( !vvmcx )
             return 0;
+        if ( !writable )
+        {
+            hvm_unmap_guest_frame(vvmcx, 1);
+            return 0;
+        }
+        nv->nv_vvmcx = vvmcx;
         nv->nv_vvmcxaddr = vmcbaddr;
     }
 
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1619,10 +1619,23 @@ int nvmx_handle_vmptrld(struct cpu_user_
 
     if ( nvcpu->nv_vvmcxaddr == VMCX_EADDR )
     {
-        nvcpu->nv_vvmcx = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 1);
-        if ( nvcpu->nv_vvmcx )
-            nvcpu->nv_vvmcxaddr = gpa;
-        if ( !nvcpu->nv_vvmcx ||
+        bool_t writable;
+        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
+
+        if ( vvmcx )
+        {
+            if ( writable )
+            {
+                nvcpu->nv_vvmcx = vvmcx;
+                nvcpu->nv_vvmcxaddr = gpa;
+            }
+            else
+            {
+                hvm_unmap_guest_frame(vvmcx, 1);
+                vvmcx = NULL;
+            }
+        }
+        if ( !vvmcx ||
              !map_io_bitmap_all(v) ||
              !_map_msr_bitmap(v) )
         {
@@ -1676,13 +1689,10 @@ int nvmx_handle_vmclear(struct cpu_user_
     if ( rc != X86EMUL_OKAY )
         return rc;
 
+    BUILD_BUG_ON(X86EMUL_OKAY != VMSUCCEED); /* rc = VMSUCCEED; */
     if ( gpa & 0xfff )
-    {
-        vmreturn(regs, VMFAIL_INVALID);
-        return X86EMUL_OKAY;
-    }
-    
-    if ( gpa == nvcpu->nv_vvmcxaddr ) 
+        rc = VMFAIL_INVALID;
+    else if ( gpa == nvcpu->nv_vvmcxaddr )
     {
         if ( cpu_has_vmx_vmcs_shadowing )
             nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx);
@@ -1693,14 +1703,22 @@ int nvmx_handle_vmclear(struct cpu_user_
     else 
     {
         /* Even if this VMCS isn't the current one, we must clear it. */
-        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
+        bool_t writable;
+
+        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0, &writable);
         if ( vvmcs ) 
-            clear_vvmcs_launched(&nvmx->launched_list,
-                domain_page_map_to_mfn(vvmcs));
-        hvm_unmap_guest_frame(vvmcs, 0);
+        {
+            if ( writable )
+                clear_vvmcs_launched(&nvmx->launched_list,
+                                     domain_page_map_to_mfn(vvmcs));
+            else
+                rc = VMFAIL_VALID;
+            hvm_unmap_guest_frame(vvmcs, 0);
+        }
     }
 
-    vmreturn(regs, VMSUCCEED);
+    vmreturn(regs, rc);
+
     return X86EMUL_OKAY;
 }
 
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -431,7 +431,8 @@ int hvm_virtual_to_linear_addr(
     unsigned int addr_size,
     unsigned long *linear_addr);
 
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent);
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+                             bool_t *writable);
 void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent);
 void hvm_unmap_guest_frame(void *p, bool_t permanent);
 



[-- Attachment #2: x86-HVM-map-gf-honor-ro.patch --]
[-- Type: text/plain, Size: 9232 bytes --]

x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()

... and its callers.

While all non-nested users are made fully honor the semantics of that
type, doing so in the nested case seemed insane (if doable at all,
considering VMCS shadowing), and hence there the respective operations
are simply made fail.

One case not (yet) taken care of is that of a page getting transitioned
to this type after a mapping got established.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
between the setting of the dirty flag and the actual write happening?
I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?

Note that this is conflicting with the altp2m series (adding another
call to hvm_map_guest_frame_rw(), reviewing of which made me spot the
issue in the first place).

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3594,8 +3594,8 @@ int hvm_virtual_to_linear_addr(
 
 /* On non-NULL return, we leave this function holding an additional 
  * ref on the underlying mfn, if any */
-static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable,
-                                   bool_t permanent)
+static void *_hvm_map_guest_frame(unsigned long gfn, bool_t permanent,
+                                  bool_t *writable)
 {
     void *map;
     p2m_type_t p2mt;
@@ -3618,7 +3618,12 @@ static void *__hvm_map_guest_frame(unsig
     }
 
     if ( writable )
-        paging_mark_dirty(d, page_to_mfn(page));
+    {
+        if ( !p2m_is_discard_write(p2mt) )
+            paging_mark_dirty(d, page_to_mfn(page));
+        else
+            *writable = 0;
+    }
 
     if ( !permanent )
         return __map_domain_page(page);
@@ -3630,14 +3635,16 @@ static void *__hvm_map_guest_frame(unsig
     return map;
 }
 
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent)
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+                             bool_t *writable)
 {
-    return __hvm_map_guest_frame(gfn, 1, permanent);
+    *writable = 1;
+    return _hvm_map_guest_frame(gfn, permanent, writable);
 }
 
 void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent)
 {
-    return __hvm_map_guest_frame(gfn, 0, permanent);
+    return _hvm_map_guest_frame(gfn, permanent, NULL);
 }
 
 void hvm_unmap_guest_frame(void *p, bool_t permanent)
@@ -3657,7 +3664,7 @@ void hvm_unmap_guest_frame(void *p, bool
     put_page(mfn_to_page(mfn));
 }
 
-static void *hvm_map_entry(unsigned long va)
+static void *hvm_map_entry(unsigned long va, bool_t *writable)
 {
     unsigned long gfn;
     uint32_t pfec;
@@ -3680,7 +3687,7 @@ static void *hvm_map_entry(unsigned long
     if ( (pfec == PFEC_page_paged) || (pfec == PFEC_page_shared) )
         goto fail;
 
-    v = hvm_map_guest_frame_rw(gfn, 0);
+    v = hvm_map_guest_frame_rw(gfn, 0, writable);
     if ( v == NULL )
         goto fail;
 
@@ -3702,6 +3709,7 @@ static int hvm_load_segment_selector(
     struct segment_register desctab, cs, segr;
     struct desc_struct *pdesc, desc;
     u8 dpl, rpl, cpl;
+    bool_t writable;
     int fault_type = TRAP_invalid_tss;
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct vcpu *v = current;
@@ -3738,7 +3746,7 @@ static int hvm_load_segment_selector(
     if ( ((sel & 0xfff8) + 7) > desctab.limit )
         goto fail;
 
-    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
+    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8), &writable);
     if ( pdesc == NULL )
         goto hvm_map_fail;
 
@@ -3797,6 +3805,7 @@ static int hvm_load_segment_selector(
             break;
         }
     } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
+              writable && /* except if we are to discard writes */
               (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
 
     /* Force the Accessed flag in our local copy. */
@@ -3834,6 +3843,7 @@ void hvm_task_switch(
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct segment_register gdt, tr, prev_tr, segr;
     struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
+    bool_t otd_writable, ntd_writable;
     unsigned long eflags;
     int exn_raised, rc;
     struct {
@@ -3860,11 +3870,12 @@ void hvm_task_switch(
         goto out;
     }
 
-    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8)); 
+    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8),
+                               &otd_writable);
     if ( optss_desc == NULL )
         goto out;
 
-    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8)); 
+    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8), &ntd_writable);
     if ( nptss_desc == NULL )
         goto out;
 
@@ -3996,11 +4007,11 @@ void hvm_task_switch(
     v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
     hvm_update_guest_cr(v, 0);
 
-    if ( (taskswitch_reason == TSW_iret) ||
-         (taskswitch_reason == TSW_jmp) )
+    if ( (taskswitch_reason == TSW_iret ||
+          taskswitch_reason == TSW_jmp) && otd_writable)
         clear_bit(41, optss_desc); /* clear B flag of old task */
 
-    if ( taskswitch_reason != TSW_iret )
+    if ( taskswitch_reason != TSW_iret && ntd_writable )
         set_bit(41, nptss_desc); /* set B flag of new task */
 
     if ( errcode >= 0 )
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -75,10 +75,20 @@ int nestedsvm_vmcb_map(struct vcpu *v, u
         nv->nv_vvmcxaddr = VMCX_EADDR;
     }
 
-    if (nv->nv_vvmcx == NULL) {
-        nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT, 1);
-        if (nv->nv_vvmcx == NULL)
+    if ( !nv->nv_vvmcx )
+    {
+        bool_t writable;
+        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(vmcbaddr), 1,
+                                             &writable);
+
+        if ( !vvmcx )
             return 0;
+        if ( !writable )
+        {
+            hvm_unmap_guest_frame(vvmcx, 1);
+            return 0;
+        }
+        nv->nv_vvmcx = vvmcx;
         nv->nv_vvmcxaddr = vmcbaddr;
     }
 
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1619,10 +1619,23 @@ int nvmx_handle_vmptrld(struct cpu_user_
 
     if ( nvcpu->nv_vvmcxaddr == VMCX_EADDR )
     {
-        nvcpu->nv_vvmcx = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 1);
-        if ( nvcpu->nv_vvmcx )
-            nvcpu->nv_vvmcxaddr = gpa;
-        if ( !nvcpu->nv_vvmcx ||
+        bool_t writable;
+        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
+
+        if ( vvmcx )
+        {
+            if ( writable )
+            {
+                nvcpu->nv_vvmcx = vvmcx;
+                nvcpu->nv_vvmcxaddr = gpa;
+            }
+            else
+            {
+                hvm_unmap_guest_frame(vvmcx, 1);
+                vvmcx = NULL;
+            }
+        }
+        if ( !vvmcx ||
              !map_io_bitmap_all(v) ||
              !_map_msr_bitmap(v) )
         {
@@ -1676,13 +1689,10 @@ int nvmx_handle_vmclear(struct cpu_user_
     if ( rc != X86EMUL_OKAY )
         return rc;
 
+    BUILD_BUG_ON(X86EMUL_OKAY != VMSUCCEED); /* rc = VMSUCCEED; */
     if ( gpa & 0xfff )
-    {
-        vmreturn(regs, VMFAIL_INVALID);
-        return X86EMUL_OKAY;
-    }
-    
-    if ( gpa == nvcpu->nv_vvmcxaddr ) 
+        rc = VMFAIL_INVALID;
+    else if ( gpa == nvcpu->nv_vvmcxaddr )
     {
         if ( cpu_has_vmx_vmcs_shadowing )
             nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx);
@@ -1693,14 +1703,22 @@ int nvmx_handle_vmclear(struct cpu_user_
     else 
     {
         /* Even if this VMCS isn't the current one, we must clear it. */
-        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
+        bool_t writable;
+
+        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0, &writable);
         if ( vvmcs ) 
-            clear_vvmcs_launched(&nvmx->launched_list,
-                domain_page_map_to_mfn(vvmcs));
-        hvm_unmap_guest_frame(vvmcs, 0);
+        {
+            if ( writable )
+                clear_vvmcs_launched(&nvmx->launched_list,
+                                     domain_page_map_to_mfn(vvmcs));
+            else
+                rc = VMFAIL_VALID;
+            hvm_unmap_guest_frame(vvmcs, 0);
+        }
     }
 
-    vmreturn(regs, VMSUCCEED);
+    vmreturn(regs, rc);
+
     return X86EMUL_OKAY;
 }
 
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -431,7 +431,8 @@ int hvm_virtual_to_linear_addr(
     unsigned int addr_size,
     unsigned long *linear_addr);
 
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent);
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+                             bool_t *writable);
 void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent);
 void hvm_unmap_guest_frame(void *p, bool_t permanent);
 

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-24  9:41 [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw() Jan Beulich
@ 2015-07-24 10:26 ` Wei Liu
  2015-07-24 10:37   ` Jan Beulich
  2015-07-24 12:02 ` Andrew Cooper
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Wei Liu @ 2015-07-24 10:26 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Wei Liu, Jun Nakajima, Andrew Cooper, Eddie Dong,
	Aravind Gopalakrishnan, suravee.suthikulpanit, xen-devel,
	Boris Ostrovsky, Keir Fraser

On Fri, Jul 24, 2015 at 03:41:26AM -0600, Jan Beulich wrote:
> ... and its callers.
> 
> While all non-nested users are made fully honor the semantics of that
> type, doing so in the nested case seemed insane (if doable at all,
> considering VMCS shadowing), and hence there the respective operations
> are simply made fail.
> 
> One case not (yet) taken care of is that of a page getting transitioned
> to this type after a mapping got established.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
> too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
> between the setting of the dirty flag and the actual write happening?
> I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?
> 
> Note that this is conflicting with the altp2m series (adding another
> call to hvm_map_guest_frame_rw(), reviewing of which made me spot the
> issue in the first place).
> 
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -3594,8 +3594,8 @@ int hvm_virtual_to_linear_addr(
>  
>  /* On non-NULL return, we leave this function holding an additional 
>   * ref on the underlying mfn, if any */
> -static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable,
> -                                   bool_t permanent)
> +static void *_hvm_map_guest_frame(unsigned long gfn, bool_t permanent,
> +                                  bool_t *writable)
>  {
>      void *map;
>      p2m_type_t p2mt;
> @@ -3618,7 +3618,12 @@ static void *__hvm_map_guest_frame(unsig
>      }
>  
>      if ( writable )

I don't claim I know this piece of code, but checking the pointer but
not the content looks suspicious.

> -        paging_mark_dirty(d, page_to_mfn(page));
> +    {
> +        if ( !p2m_is_discard_write(p2mt) )
> +            paging_mark_dirty(d, page_to_mfn(page));
> +        else
> +            *writable = 0;

You then set *writable here, which makes it even more suspicious.

Wei.

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-24 10:26 ` Wei Liu
@ 2015-07-24 10:37   ` Jan Beulich
  2015-07-24 10:41     ` Wei Liu
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2015-07-24 10:37 UTC (permalink / raw)
  To: Wei Liu
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Boris Ostrovsky

>>> On 24.07.15 at 12:26, <wei.liu2@citrix.com> wrote:
> On Fri, Jul 24, 2015 at 03:41:26AM -0600, Jan Beulich wrote:
>> @@ -3618,7 +3618,12 @@ static void *__hvm_map_guest_frame(unsig
>>      }
>>  
>>      if ( writable )
> 
> I don't claim I know this piece of code, but checking the pointer but
> not the content looks suspicious.
> 
>> -        paging_mark_dirty(d, page_to_mfn(page));
>> +    {
>> +        if ( !p2m_is_discard_write(p2mt) )
>> +            paging_mark_dirty(d, page_to_mfn(page));
>> +        else
>> +            *writable = 0;
> 
> You then set *writable here, which makes it even more suspicious.

Why? A caller _wanting_ a writable mapping passes non-NULL as
the pointer argument (pre-initialized to point to a variable holding
TRUE aka 1). Upon return the variable will have been set to FALSE
aka 0 if the page shouldn't be written to.
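
As a minimal sketch of a transient-mapping user (the caller itself,
its name, and its return values are made up for illustration):

    /* Hypothetical caller wanting to write to a guest frame. */
    static bool_t write_to_gfn(unsigned long gfn)
    {
        bool_t writable;
        void *va = hvm_map_guest_frame_rw(gfn, 0, &writable);

        if ( va == NULL )
            return 0;                    /* not mappable at all */
        if ( !writable )
        {
            /* Write-discard page: back off rather than write. */
            hvm_unmap_guest_frame(va, 0);
            return 0;
        }
        /* ... write through va ... */
        hvm_unmap_guest_frame(va, 0);
        return 1;
    }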

Jan

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-24 10:37   ` Jan Beulich
@ 2015-07-24 10:41     ` Wei Liu
  0 siblings, 0 replies; 17+ messages in thread
From: Wei Liu @ 2015-07-24 10:41 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Boris Ostrovsky, Keir Fraser

On Fri, Jul 24, 2015 at 04:37:25AM -0600, Jan Beulich wrote:
> >>> On 24.07.15 at 12:26, <wei.liu2@citrix.com> wrote:
> > On Fri, Jul 24, 2015 at 03:41:26AM -0600, Jan Beulich wrote:
> >> @@ -3618,7 +3618,12 @@ static void *__hvm_map_guest_frame(unsig
> >>      }
> >>  
> >>      if ( writable )
> > 
> > I don't claim I know this piece of code, but checking the pointer but
> > not the content looks suspicious.
> > 
> >> -        paging_mark_dirty(d, page_to_mfn(page));
> >> +    {
> >> +        if ( !p2m_is_discard_write(p2mt) )
> >> +            paging_mark_dirty(d, page_to_mfn(page));
> >> +        else
> >> +            *writable = 0;
> > 
> > You then set *writable here, which makes it even more suspicious.
> 
> Why? A caller _wanting_ a writable mapping passes non-NULL as
> the pointer argument (pre-initialized to point to a variable holding
> TRUE aka 1). Upon return the variable will have been set to FALSE
> aka 0 if the page shouldn't be written to.
> 

If this is the convention for using this function, then the code you
wrote is of course fine.

Wei.

> Jan

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-24  9:41 [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw() Jan Beulich
  2015-07-24 10:26 ` Wei Liu
@ 2015-07-24 12:02 ` Andrew Cooper
  2015-07-24 12:33   ` Jan Beulich
  2015-07-27 11:09   ` Tim Deegan
  2015-07-31  1:41 ` Tian, Kevin
  2015-07-31 16:06 ` Boris Ostrovsky
  3 siblings, 2 replies; 17+ messages in thread
From: Andrew Cooper @ 2015-07-24 12:02 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Eddie Dong,
	Aravind Gopalakrishnan, Jun Nakajima, Keir Fraser,
	Boris Ostrovsky

On 24/07/15 10:41, Jan Beulich wrote:
> ... and its callers.
>
> While all non-nested users are made fully honor the semantics of that
> type, doing so in the nested case seemed insane (if doable at all,
> considering VMCS shadowing), and hence there the respective operations
> are simply made fail.

Sorry, but I can't parse this sentence.  Surely in the nested case, it 
is the host p2m type which is relevant to whether a mapping should be 
forced read only?

>
> One case not (yet) taken care of is that of a page getting transitioned
> to this type after a mapping got established.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
> too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
> between the setting of the dirty flag and the actual write happening?
> I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?

It does indeed.  (Ideally the dirty bit should probably be held high for 
the duration that a mapping exists, but that is absolutely infeasible to 
do).

This is definitely not the only issue between the p2m and logdirty. At 
some point I need to see about making migration safe to use while the 
guest is ballooning.  Both PV and HVM guests break in different ways if 
they actually perform ballooning or physmap alteration while logdirty is 
active for live migration purposes.

>
> Note that this is conflicting with the altp2m series (adding another
> call to hvm_map_guest_frame_rw(), reviewing of which made me spot the
> issue in the first place).
>
>
> @@ -3797,6 +3805,7 @@ static int hvm_load_segment_selector(
>               break;
>           }
>       } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
> +              writable && /* except if we are to discard writes */
>                 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );

I can't recall where I read it in the manual, but I believe it is a 
faultable error to load a descriptor from RO memory if the accessed bit 
is not already set.  This was to prevent a processor livelock when 
running with gdtr pointing into ROM (which was a considered use case).

Otherwise, the rest of the patch looks fine.

~Andrew

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-24 12:02 ` Andrew Cooper
@ 2015-07-24 12:33   ` Jan Beulich
  2015-07-27 11:09   ` Tim Deegan
  1 sibling, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2015-07-24 12:33 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Eddie Dong,
	Aravind Gopalakrishnan, Jun Nakajima, xen-devel, Boris Ostrovsky,
	Keir Fraser

>>> On 24.07.15 at 14:02, <andrew.cooper3@citrix.com> wrote:
> On 24/07/15 10:41, Jan Beulich wrote:
>> ... and its callers.
>>
>> While all non-nested users are made fully honor the semantics of that
>> type, doing so in the nested case seemed insane (if doable at all,
>> considering VMCS shadowing), and hence there the respective operations
>> are simply made fail.
> 
> Sorry, but I can't parse this sentence.  Surely in the nested case, it 
> is the host p2m type which is relevant to whether a mapping should be 
> forced read only?

No, what I mean to say is:
- callers outside of nested-HVM code properly obey the write-ignore
  semantics;
- callers inside nested-HVM code would be too cumbersome (and
  maybe impossible) to fix, and hence they're made to return
  failure to their callers.

>> Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
>> too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
>> between the setting of the dirty flag and the actual write happening?
>> I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?
> 
> It does indeed.  (Ideally the dirty bit should probably be held high for 
> the duration that a mapping exists, but that is absolutely infeasible to 
> do).

I don't see this being too difficult, especially since for transient
mappings it doesn't really matter (if there's a race, then setting
the flag after the write(s) is good enough). For permanent
mappings I can't see why we wouldn't be able to add a (short)
linked list of pages whose dirty flags paging_log_dirty_op()
should always set.

>> @@ -3797,6 +3805,7 @@ static int hvm_load_segment_selector(
>>               break;
>>           }
>>       } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
>> +              writable && /* except if we are to discard writes */
>>                 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
> 
> I can't recall where I read it in the manual, but I believe it is a 
> faultable error to load a descriptor from RO memory if the accessed bit 
> is not already set.  This was to prevent a processor livelock when 
> running with gdtr pointing into ROM (which was a considered use case).

I don't see why a processor would live-lock in such a case. It can do
the write, and ignore whether it actually too effect. I don't see why
it would e.g. spin until it sees the flag set. (Note that a cmpxchg()
like loop alone wouldn't have that problem, i.e. for a live lock to occur
there would still need to be an outer loop doing the checking).

But even if there was such (perhaps even model-specific) behavior,
without having a pointer to where this is specified (and hence what
precise fault [and error code] to raise), I wouldn't want to go that
route here.

Jan

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-24 12:02 ` Andrew Cooper
  2015-07-24 12:33   ` Jan Beulich
@ 2015-07-27 11:09   ` Tim Deegan
  2015-08-11 13:51     ` Jan Beulich
  1 sibling, 1 reply; 17+ messages in thread
From: Tim Deegan @ 2015-07-27 11:09 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Kevin Tian, Wei Liu, Jan Beulich, Jun Nakajima, Eddie Dong,
	Aravind Gopalakrishnan, suravee.suthikulpanit, xen-devel,
	Boris Ostrovsky, Keir Fraser

At 13:02 +0100 on 24 Jul (1437742964), Andrew Cooper wrote:
> On 24/07/15 10:41, Jan Beulich wrote:
> > Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
> > too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
> > between the setting of the dirty flag and the actual write happening?
> > I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?
> 
> It does indeed.  (Ideally the dirty bit should probably be held high for 
> the duration that a mapping exists, but that is absolutely infeasible to 
> do).

IMO that would not be very useful -- a well-behaved toolstack will
have to make sure that relevant mappings are torn down before
stop-and-copy.  Forcing the dirty bit high in the meantime just makes
every intermediate pass send a wasted copy of the page, without
actually closing the race window if the tools are buggy.

If we want to catch these bugs, it might be useful to have a flag
that the tools can set when stop-and-copy begins, to indicate any
subsequent mark_dirty() calls are "too late".
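
Something along these lines, perhaps (a rough sketch only; the
final_pass flag, its location, and how the tools would set it are all
invented here):

    /* "final_pass" is an invented per-domain flag, set by the tools
     * (e.g. via a new domctl) once stop-and-copy has begun. */
    void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
    {
        if ( unlikely(d->arch.paging.log_dirty.final_pass) )
            gdprintk(XENLOG_WARNING,
                     "mfn %#lx dirtied after stop-and-copy began\n",
                     guest_mfn);
        /* ... existing marking logic ... */
    }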

Cheers,

Tim.

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-24  9:41 [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw() Jan Beulich
  2015-07-24 10:26 ` Wei Liu
  2015-07-24 12:02 ` Andrew Cooper
@ 2015-07-31  1:41 ` Tian, Kevin
  2015-07-31 16:06 ` Boris Ostrovsky
  3 siblings, 0 replies; 17+ messages in thread
From: Tian, Kevin @ 2015-07-31  1:41 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Wei Liu, suravee.suthikulpanit, Andrew Cooper, Dong, Eddie,
	Aravind Gopalakrishnan, Nakajima, Jun, Keir Fraser,
	Boris Ostrovsky

> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Friday, July 24, 2015 5:41 PM
> 
> ... and its callers.
> 
> While all non-nested users are made fully honor the semantics of that
> type, doing so in the nested case seemed insane (if doable at all,
> considering VMCS shadowing), and hence there the respective operations
> are simply made fail.
> 
> One case not (yet) taken care of is that of a page getting transitioned
> to this type after a mapping got established.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Kevin Tian <kevin.tian@intel.com>

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-24  9:41 [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw() Jan Beulich
                   ` (2 preceding siblings ...)
  2015-07-31  1:41 ` Tian, Kevin
@ 2015-07-31 16:06 ` Boris Ostrovsky
  2015-08-11 10:32   ` Jan Beulich
  2015-08-14 10:38   ` Jan Beulich
  3 siblings, 2 replies; 17+ messages in thread
From: Boris Ostrovsky @ 2015-07-31 16:06 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, Keir Fraser

On 07/24/2015 05:41 AM, Jan Beulich wrote:
>   
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -1619,10 +1619,23 @@ int nvmx_handle_vmptrld(struct cpu_user_
>   
>       if ( nvcpu->nv_vvmcxaddr == VMCX_EADDR )
>       {
> -        nvcpu->nv_vvmcx = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 1);
> -        if ( nvcpu->nv_vvmcx )
> -            nvcpu->nv_vvmcxaddr = gpa;
> -        if ( !nvcpu->nv_vvmcx ||
> +        bool_t writable;
> +        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
> +

...

> @@ -1693,14 +1703,22 @@ int nvmx_handle_vmclear(struct cpu_user_
>       else
>       {
>           /* Even if this VMCS isn't the current one, we must clear it. */
> -        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
> +        bool_t writable;
> +
> +        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0, &writable);

Since you replaced 'gpa >> PAGE_SHIFT' with 'paddr_to_pfn(gpa)' above,
perhaps it should be replaced here too.

Other than that,
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-31 16:06 ` Boris Ostrovsky
@ 2015-08-11 10:32   ` Jan Beulich
  2015-08-14 10:38   ` Jan Beulich
  1 sibling, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2015-08-11 10:32 UTC (permalink / raw)
  To: Boris Ostrovsky
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Keir Fraser

>>> On 31.07.15 at 18:06, <boris.ostrovsky@oracle.com> wrote:
> On 07/24/2015 05:41 AM, Jan Beulich wrote:
>> @@ -1693,14 +1703,22 @@ int nvmx_handle_vmclear(struct cpu_user_
>>       else
>>       {
>>           /* Even if this VMCS isn't the current one, we must clear it. */
>> -        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
>> +        bool_t writable;
>> +
>> +        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0, &writable);
> 
> Since you replaced 'gpa >> PAGE_SHIFT' with 'paddr_to_pfn(gpa)' above,
> perhaps it should be replaced here too.

Yes indeed.

> Other than that,
> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

Thanks, Jan

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-27 11:09   ` Tim Deegan
@ 2015-08-11 13:51     ` Jan Beulich
  2015-08-11 14:34       ` Tim Deegan
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2015-08-11 13:51 UTC (permalink / raw)
  To: Tim Deegan
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Boris Ostrovsky, Keir Fraser

>>> On 27.07.15 at 13:09, <tim@xen.org> wrote:
> At 13:02 +0100 on 24 Jul (1437742964), Andrew Cooper wrote:
>> On 24/07/15 10:41, Jan Beulich wrote:
>> > Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
>> > too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
>> > between the setting of the dirty flag and the actual write happening?
>> > I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?
>> 
>> It does indeed.  (Ideally the dirty bit should probably be held high for 
>> the duration that a mapping exists, but that is absolutely infeasible to 
>> do).
> 
> IMO that would not be very useful -- a well-behaved toolstack will
> have to make sure that relevant mappings are torn down before
> stop-and-copy.  Forcing the dirty bit high in the meantime just makes
> every intermediate pass send a wasted copy of the page, without
> actually closing the race window if the tools are buggy.

Making sure such mappings got torn down in time doesn't help
when the most recent write happened _after_ the most recent
clearing of the dirty flag in a pass prior to stop-and-copy. But
yes, holding the dirty bit high would cause overhead. Yet
setting it only in hvm_unmap_guest_frame() wouldn't, as I now
realize, address the problem either, as that may happen e.g.
only upon guest destruction (i.e. after the stop-and-copy pass).
I.e. for guest pages currently mapped this way we'd really need
a mechanism to avoid their dirty flags getting set for initial passes,
but to force them set on the final one.

And contrary to what Andrew says, I think tracking these mappings
(namely permanent ones) isn't infeasible, especially since there
shouldn't be that many of them. With them tracked, the model
would then be to set the dirty flag when removing a page from the
tracking set, and to report the dirty flags as set on the final pass
for all pages still in the tracking set (to make this work without
interface changes we could use the suspended state of the domain
as an indicator that the final pass is in progress).
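
Roughly like this (all of the names below are invented):

    struct track_ent {
        struct list_head list;
        unsigned long mfn;
    };

    /* When a tracked (permanent) mapping is torn down: */
    static void track_del(struct domain *d, struct track_ent *ent)
    {
        list_del(&ent->list);
        paging_mark_dirty(d, ent->mfn); /* write(s) may just have occurred */
        xfree(ent);
    }

    /* In paging_log_dirty_op(), on the final pass (domain suspended): */
    struct track_ent *ent;

    list_for_each_entry ( ent, &d->arch.paging.tracked_maps, list )
        paging_mark_dirty(d, ent->mfn);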

Jan

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-08-11 13:51     ` Jan Beulich
@ 2015-08-11 14:34       ` Tim Deegan
  2015-08-11 15:37         ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Tim Deegan @ 2015-08-11 14:34 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Boris Ostrovsky, Keir Fraser

At 07:51 -0600 on 11 Aug (1439279513), Jan Beulich wrote:
> >>> On 27.07.15 at 13:09, <tim@xen.org> wrote:
> > At 13:02 +0100 on 24 Jul (1437742964), Andrew Cooper wrote:
> >> On 24/07/15 10:41, Jan Beulich wrote:
> >> > Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
> >> > too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
> >> > between the setting of the dirty flag and the actual write happening?
> >> > I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?
> >> 
> >> It does indeed.  (Ideally the dirty bit should probably be held high for 
> >> the duration that a mapping exists, but that is absolutely infeasible to 
> >> do).
> > 
> > IMO that would not be very useful -- a well-behaved toolstack will
> > have to make sure that relevant mappings are torn down before
> > stop-and-copy.  Forcing the dirty bit high in the meantime just makes
> > every intermediate pass send a wasted copy of the page, without
> > actually closing the race window if the tools are buggy.
> 
> Making sure such mappings got torn down in time doesn't help
> when the most recent write happened _after_ the most recent
> clearing of the dirty flag in a pass prior to stop-and-copy.

This is why e.g. __gnttab_unmap_common sets the dirty bit again
as it unmaps.

Cheers,

Tim.

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-08-11 14:34       ` Tim Deegan
@ 2015-08-11 15:37         ` Jan Beulich
  2015-08-11 15:45           ` Tim Deegan
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2015-08-11 15:37 UTC (permalink / raw)
  To: Tim Deegan
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Boris Ostrovsky, Keir Fraser

>>> On 11.08.15 at 16:34, <tim@xen.org> wrote:
> At 07:51 -0600 on 11 Aug (1439279513), Jan Beulich wrote:
>> >>> On 27.07.15 at 13:09, <tim@xen.org> wrote:
>> > At 13:02 +0100 on 24 Jul (1437742964), Andrew Cooper wrote:
>> >> On 24/07/15 10:41, Jan Beulich wrote:
>> >> > Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
>> >> > too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
>> >> > between the setting of the dirty flag and the actual write happening?
>> >> > I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?
>> >> 
>> >> It does indeed.  (Ideally the dirty bit should probably be held high for 
>> >> the duration that a mapping exists, but that is absolutely infeasible to 
>> >> do).
>> > 
>> > IMO that would not be very useful -- a well-behaved toolstack will
>> > have to make sure that relevant mappings are torn down before
>> > stop-and-copy.  Forcing the dirty bit high in the meantime just makes
>> > every intermediate pass send a wasted copy of the page, without
>> > actually closing the race window if the tools are buggy.
>> 
>> Making sure such mappings got torn down in time doesn't help
>> when the most recent write happened _after_ the most recent
>> clearing of the dirty flag in a pass prior to stop-and-copy.
> 
> This is why e.g. __gnttab_unmap_common sets the dirty bit again
> as it unmaps.

And how does this help when the mapping survives until the guest
gets suspended? And why would doing it _again_ when unmapping
be better than doing it _only_ then?

But in any event I read this as agreement that moving (or in the
worst case replicating) the paging_mark_dirty() call from
hvm_map_guest_frame_rw() into hvm_unmap_guest_frame() would be
an appropriate thing to do.

Jan

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-08-11 15:37         ` Jan Beulich
@ 2015-08-11 15:45           ` Tim Deegan
  2015-08-11 16:01             ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Tim Deegan @ 2015-08-11 15:45 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Boris Ostrovsky, Keir Fraser

At 09:37 -0600 on 11 Aug (1439285833), Jan Beulich wrote:
> >>> On 11.08.15 at 16:34, <tim@xen.org> wrote:
> > At 07:51 -0600 on 11 Aug (1439279513), Jan Beulich wrote:
> >> >>> On 27.07.15 at 13:09, <tim@xen.org> wrote:
> >> > At 13:02 +0100 on 24 Jul (1437742964), Andrew Cooper wrote:
> >> >> On 24/07/15 10:41, Jan Beulich wrote:
> >> >> > Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
> >> >> > too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
> >> >> > between the setting of the dirty flag and the actual write happening?
> >> >> > I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?
> >> >> 
> >> >> It does indeed.  (Ideally the dirty bit should probably be held high for 
> >> >> the duration that a mapping exists, but that is absolutely infeasible to 
> >> >> do).
> >> > 
> >> > IMO that would not be very useful -- a well-behaved toolstack will
> >> > have to make sure that relevant mappings are torn down before
> >> > stop-and-copy.  Forcing the dirty bit high in the meantime just makes
> >> > every intermediate pass send a wasted copy of the page, without
> >> > actually closing the race window if the tools are buggy.
> >> 
> >> Making sure such mappings got torn down in time doesn't help
> >> when the most recent write happened _after_ the most recent
> >> clearing of the dirty flag in a pass prior to stop-and-copy.
> > 
> > This is why e.g. __gnttab_unmap_common sets the dirty bit again
> > as it unmaps.
> 
> And how does this help when the mapping survives until the guest
> gets suspended?

Suspended is fine, so long as it happens before the final read of the
bitmap.

> And why would doing it _again_ when unmapping
> be better than doing it _only_ then?

My mistake - it is of course done _only_ then.

> But in any event I read this as agreement that moving (or in the
> worst case replicating) the paging_mark_dirty() call from
> hvm_map_guest_frame_rw() into hvm_unmap_guest_frame() would be
> an appropriate thing to do.

Yep!

Tim.

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-08-11 15:45           ` Tim Deegan
@ 2015-08-11 16:01             ` Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2015-08-11 16:01 UTC (permalink / raw)
  To: Tim Deegan
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Boris Ostrovsky, Keir Fraser

>>> On 11.08.15 at 17:45, <tim@xen.org> wrote:
> At 09:37 -0600 on 11 Aug (1439285833), Jan Beulich wrote:
>> >>> On 11.08.15 at 16:34, <tim@xen.org> wrote:
>> > At 07:51 -0600 on 11 Aug (1439279513), Jan Beulich wrote:
>> >> >>> On 27.07.15 at 13:09, <tim@xen.org> wrote:
>> >> > At 13:02 +0100 on 24 Jul (1437742964), Andrew Cooper wrote:
>> >> >> On 24/07/15 10:41, Jan Beulich wrote:
>> >> >> > Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
>> >> >> > too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
>> >> >> > between the setting of the dirty flag and the actual write happening?
>> >> >> > I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?
>> >> >> 
>> >> >> It does indeed.  (Ideally the dirty bit should probably be held high for 
>> >> >> the duration that a mapping exists, but that is absolutely infeasible to 
>> >> >> do).
>> >> > 
>> >> > IMO that would not be very useful -- a well-behaved toolstack will
>> >> > have to make sure that relevant mappings are torn down before
>> >> > stop-and-copy.  Forcing the dirty bit high in the meantime just makes
>> >> > every intermediate pass send a wasted copy of the page, without
>> >> > actually closing the race window if the tools are buggy.
>> >> 
>> >> Making sure such mappings got torn down in time doesn't help
>> >> when the most recent write happened _after_ the most recent
>> >> clearing of the dirty flag in a pass prior to stop-and-copy.
>> > 
>> > This is why e.g. __gnttab_unmap_common sets the dirty bit again
>> > as it unmaps.
>> 
>> And how does this help when the mapping survives until the guest
>> gets suspended?
> 
> Suspended is fine, so long as it happens before the final read of the
> bitmap.

I don't follow - nothing unmaps the page in that case, so it wouldn't
get marked dirty. But I now realize this is the wrong example: The
mapping domain wouldn't get a mapped grant saved, and the
granting domain has to rely on the mappings of the grants to be
properly torn down before the final pass anyway. Whereas here
we're talking about mappings held and dirtying done by Xen itself.

Jan

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-07-31 16:06 ` Boris Ostrovsky
  2015-08-11 10:32   ` Jan Beulich
@ 2015-08-14 10:38   ` Jan Beulich
  2015-08-14 13:26     ` Boris Ostrovsky
  1 sibling, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2015-08-14 10:38 UTC (permalink / raw)
  To: Boris Ostrovsky
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Keir Fraser

>>> On 31.07.15 at 18:06, <boris.ostrovsky@oracle.com> wrote:
> On 07/24/2015 05:41 AM, Jan Beulich wrote:
>> @@ -1693,14 +1703,22 @@ int nvmx_handle_vmclear(struct cpu_user_
>>       else
>>       {
>>           /* Even if this VMCS isn't the current one, we must clear it. */
>> -        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
>> +        bool_t writable;
>> +
>> +        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0, &writable);
> 
> Since you replaced 'gpa >> PAGE_SHIFT' with 'paddr_to_pfn(gpa)' above,
> perhaps it should be replaced here too.
> 
> Other than that,
> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

I took the liberty of downgrading this to an ack (covering SVM) on
v2 (since the SVM part didn't change).

Jan

* Re: [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
  2015-08-14 10:38   ` Jan Beulich
@ 2015-08-14 13:26     ` Boris Ostrovsky
  0 siblings, 0 replies; 17+ messages in thread
From: Boris Ostrovsky @ 2015-08-14 13:26 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, xen-devel,
	Keir Fraser

On 08/14/2015 06:38 AM, Jan Beulich wrote:
>>>> On 31.07.15 at 18:06, <boris.ostrovsky@oracle.com> wrote:
>> On 07/24/2015 05:41 AM, Jan Beulich wrote:
>>> @@ -1693,14 +1703,22 @@ int nvmx_handle_vmclear(struct cpu_user_
>>>        else
>>>        {
>>>            /* Even if this VMCS isn't the current one, we must clear it. */
>>> -        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
>>> +        bool_t writable;
>>> +
>>> +        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0, &writable);
>> Since you replaced 'gpa >> PAGE_SHIFT' with 'paddr_to_pfn(gpa)' above,
>> perhaps it should be replaced here too.
>>
>> Other than that,
>> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> I took the liberty of downgrading this to an ack (covering SVM) on
> v2 (since the SVM part didn't change).

Sure. (I also reviewed v2 so you can use either tag).

-boris
