[PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()

* [PATCH] x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()
@ 2015-07-24  9:41 Jan Beulich
  2015-07-24 10:26 ` Wei Liu
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Jan Beulich @ 2015-07-24  9:41 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Wei Liu, suravee.suthikulpanit, Andrew Cooper,
	Eddie Dong, Aravind Gopalakrishnan, Jun Nakajima, Keir Fraser,
	Boris Ostrovsky

[-- Attachment #1: Type: text/plain, Size: 9179 bytes --]

... and its callers.

While all non-nested users are made fully honor the semantics of that
type, doing so in the nested case seemed insane (if doable at all,
considering VMCS shadowing), and hence there the respective operations
are simply made fail.

One case not (yet) taken care of is that of a page getting transitioned
to this type after a mapping got established.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
between the setting of the dirty flag and the actual write happening?
I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?

Note that this is conflicting with the altp2m series (adding another
call to hvm_map_guest_frame_rw(), reviewing of which made me spot the
issue in the first place).

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3594,8 +3594,8 @@ int hvm_virtual_to_linear_addr(
 
 /* On non-NULL return, we leave this function holding an additional 
  * ref on the underlying mfn, if any */
-static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable,
-                                   bool_t permanent)
+static void *_hvm_map_guest_frame(unsigned long gfn, bool_t permanent,
+                                  bool_t *writable)
 {
     void *map;
     p2m_type_t p2mt;
@@ -3618,7 +3618,12 @@ static void *__hvm_map_guest_frame(unsig
     }
 
     if ( writable )
-        paging_mark_dirty(d, page_to_mfn(page));
+    {
+        if ( !p2m_is_discard_write(p2mt) )
+            paging_mark_dirty(d, page_to_mfn(page));
+        else
+            *writable = 0;
+    }
 
     if ( !permanent )
         return __map_domain_page(page);
@@ -3630,14 +3635,16 @@ static void *__hvm_map_guest_frame(unsig
     return map;
 }
 
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent)
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+                             bool_t *writable)
 {
-    return __hvm_map_guest_frame(gfn, 1, permanent);
+    *writable = 1;
+    return _hvm_map_guest_frame(gfn, permanent, writable);
 }
 
 void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent)
 {
-    return __hvm_map_guest_frame(gfn, 0, permanent);
+    return _hvm_map_guest_frame(gfn, permanent, NULL);
 }
 
 void hvm_unmap_guest_frame(void *p, bool_t permanent)
@@ -3657,7 +3664,7 @@ void hvm_unmap_guest_frame(void *p, bool
     put_page(mfn_to_page(mfn));
 }
 
-static void *hvm_map_entry(unsigned long va)
+static void *hvm_map_entry(unsigned long va, bool_t *writable)
 {
     unsigned long gfn;
     uint32_t pfec;
@@ -3680,7 +3687,7 @@ static void *hvm_map_entry(unsigned long
     if ( (pfec == PFEC_page_paged) || (pfec == PFEC_page_shared) )
         goto fail;
 
-    v = hvm_map_guest_frame_rw(gfn, 0);
+    v = hvm_map_guest_frame_rw(gfn, 0, writable);
     if ( v == NULL )
         goto fail;
 
@@ -3702,6 +3709,7 @@ static int hvm_load_segment_selector(
     struct segment_register desctab, cs, segr;
     struct desc_struct *pdesc, desc;
     u8 dpl, rpl, cpl;
+    bool_t writable;
     int fault_type = TRAP_invalid_tss;
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct vcpu *v = current;
@@ -3738,7 +3746,7 @@ static int hvm_load_segment_selector(
     if ( ((sel & 0xfff8) + 7) > desctab.limit )
         goto fail;
 
-    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
+    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8), &writable);
     if ( pdesc == NULL )
         goto hvm_map_fail;
 
@@ -3797,6 +3805,7 @@ static int hvm_load_segment_selector(
             break;
         }
     } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
+              writable && /* except if we are to discard writes */
               (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
 
     /* Force the Accessed flag in our local copy. */
@@ -3834,6 +3843,7 @@ void hvm_task_switch(
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct segment_register gdt, tr, prev_tr, segr;
     struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
+    bool_t otd_writable, ntd_writable;
     unsigned long eflags;
     int exn_raised, rc;
     struct {
@@ -3860,11 +3870,12 @@ void hvm_task_switch(
         goto out;
     }
 
-    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8)); 
+    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8),
+                               &otd_writable);
     if ( optss_desc == NULL )
         goto out;
 
-    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8)); 
+    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8), &ntd_writable);
     if ( nptss_desc == NULL )
         goto out;
 
@@ -3996,11 +4007,11 @@ void hvm_task_switch(
     v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
     hvm_update_guest_cr(v, 0);
 
-    if ( (taskswitch_reason == TSW_iret) ||
-         (taskswitch_reason == TSW_jmp) )
+    if ( (taskswitch_reason == TSW_iret ||
+          taskswitch_reason == TSW_jmp) && otd_writable)
         clear_bit(41, optss_desc); /* clear B flag of old task */
 
-    if ( taskswitch_reason != TSW_iret )
+    if ( taskswitch_reason != TSW_iret && ntd_writable )
         set_bit(41, nptss_desc); /* set B flag of new task */
 
     if ( errcode >= 0 )
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -75,10 +75,20 @@ int nestedsvm_vmcb_map(struct vcpu *v, u
         nv->nv_vvmcxaddr = VMCX_EADDR;
     }
 
-    if (nv->nv_vvmcx == NULL) {
-        nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT, 1);
-        if (nv->nv_vvmcx == NULL)
+    if ( !nv->nv_vvmcx )
+    {
+        bool_t writable;
+        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(vmcbaddr), 1,
+                                             &writable);
+
+        if ( !vvmcx )
             return 0;
+        if ( !writable )
+        {
+            hvm_unmap_guest_frame(vvmcx, 1);
+            return 0;
+        }
+        nv->nv_vvmcx = vvmcx;
         nv->nv_vvmcxaddr = vmcbaddr;
     }
 
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1619,10 +1619,23 @@ int nvmx_handle_vmptrld(struct cpu_user_
 
     if ( nvcpu->nv_vvmcxaddr == VMCX_EADDR )
     {
-        nvcpu->nv_vvmcx = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 1);
-        if ( nvcpu->nv_vvmcx )
-            nvcpu->nv_vvmcxaddr = gpa;
-        if ( !nvcpu->nv_vvmcx ||
+        bool_t writable;
+        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
+
+        if ( vvmcx )
+        {
+            if ( writable )
+            {
+                nvcpu->nv_vvmcx = vvmcx;
+                nvcpu->nv_vvmcxaddr = gpa;
+            }
+            else
+            {
+                hvm_unmap_guest_frame(vvmcx, 1);
+                vvmcx = NULL;
+            }
+        }
+        if ( !vvmcx ||
              !map_io_bitmap_all(v) ||
              !_map_msr_bitmap(v) )
         {
@@ -1676,13 +1689,10 @@ int nvmx_handle_vmclear(struct cpu_user_
     if ( rc != X86EMUL_OKAY )
         return rc;
 
+    BUILD_BUG_ON(X86EMUL_OKAY != VMSUCCEED); /* rc = VMSUCCEED; */
     if ( gpa & 0xfff )
-    {
-        vmreturn(regs, VMFAIL_INVALID);
-        return X86EMUL_OKAY;
-    }
-    
-    if ( gpa == nvcpu->nv_vvmcxaddr ) 
+        rc = VMFAIL_INVALID;
+    else if ( gpa == nvcpu->nv_vvmcxaddr )
     {
         if ( cpu_has_vmx_vmcs_shadowing )
             nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx);
@@ -1693,14 +1703,22 @@ int nvmx_handle_vmclear(struct cpu_user_
     else 
     {
         /* Even if this VMCS isn't the current one, we must clear it. */
-        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
+        bool_t writable;
+
+        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0, &writable);
         if ( vvmcs ) 
-            clear_vvmcs_launched(&nvmx->launched_list,
-                domain_page_map_to_mfn(vvmcs));
-        hvm_unmap_guest_frame(vvmcs, 0);
+        {
+            if ( writable )
+                clear_vvmcs_launched(&nvmx->launched_list,
+                                     domain_page_map_to_mfn(vvmcs));
+            else
+                rc = VMFAIL_VALID;
+            hvm_unmap_guest_frame(vvmcs, 0);
+        }
     }
 
-    vmreturn(regs, VMSUCCEED);
+    vmreturn(regs, rc);
+
     return X86EMUL_OKAY;
 }
 
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -431,7 +431,8 @@ int hvm_virtual_to_linear_addr(
     unsigned int addr_size,
     unsigned long *linear_addr);
 
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent);
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+                             bool_t *writable);
 void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent);
 void hvm_unmap_guest_frame(void *p, bool_t permanent);
 



[-- Attachment #2: x86-HVM-map-gf-honor-ro.patch --]
[-- Type: text/plain, Size: 9232 bytes --]

x86/HVM: honor p2m_ram_ro in hvm_map_guest_frame_rw()

... and its callers.

While all non-nested users are made fully honor the semantics of that
type, doing so in the nested case seemed insane (if doable at all,
considering VMCS shadowing), and hence there the respective operations
are simply made fail.

One case not (yet) taken care of is that of a page getting transitioned
to this type after a mapping got established.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Beyond that log-dirty handling in _hvm_map_guest_frame() looks bogus
too: What if a XEN_DOMCTL_SHADOW_OP_* gets issued and acted upon
between the setting of the dirty flag and the actual write happening?
I.e. shouldn't the flag instead be set in hvm_unmap_guest_frame()?

Note that this is conflicting with the altp2m series (adding another
call to hvm_map_guest_frame_rw(), reviewing of which made me spot the
issue in the first place).

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3594,8 +3594,8 @@ int hvm_virtual_to_linear_addr(
 
 /* On non-NULL return, we leave this function holding an additional 
  * ref on the underlying mfn, if any */
-static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable,
-                                   bool_t permanent)
+static void *_hvm_map_guest_frame(unsigned long gfn, bool_t permanent,
+                                  bool_t *writable)
 {
     void *map;
     p2m_type_t p2mt;
@@ -3618,7 +3618,12 @@ static void *__hvm_map_guest_frame(unsig
     }
 
     if ( writable )
-        paging_mark_dirty(d, page_to_mfn(page));
+    {
+        if ( !p2m_is_discard_write(p2mt) )
+            paging_mark_dirty(d, page_to_mfn(page));
+        else
+            *writable = 0;
+    }
 
     if ( !permanent )
         return __map_domain_page(page);
@@ -3630,14 +3635,16 @@ static void *__hvm_map_guest_frame(unsig
     return map;
 }
 
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent)
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+                             bool_t *writable)
 {
-    return __hvm_map_guest_frame(gfn, 1, permanent);
+    *writable = 1;
+    return _hvm_map_guest_frame(gfn, permanent, writable);
 }
 
 void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent)
 {
-    return __hvm_map_guest_frame(gfn, 0, permanent);
+    return _hvm_map_guest_frame(gfn, permanent, NULL);
 }
 
 void hvm_unmap_guest_frame(void *p, bool_t permanent)
@@ -3657,7 +3664,7 @@ void hvm_unmap_guest_frame(void *p, bool
     put_page(mfn_to_page(mfn));
 }
 
-static void *hvm_map_entry(unsigned long va)
+static void *hvm_map_entry(unsigned long va, bool_t *writable)
 {
     unsigned long gfn;
     uint32_t pfec;
@@ -3680,7 +3687,7 @@ static void *hvm_map_entry(unsigned long
     if ( (pfec == PFEC_page_paged) || (pfec == PFEC_page_shared) )
         goto fail;
 
-    v = hvm_map_guest_frame_rw(gfn, 0);
+    v = hvm_map_guest_frame_rw(gfn, 0, writable);
     if ( v == NULL )
         goto fail;
 
@@ -3702,6 +3709,7 @@ static int hvm_load_segment_selector(
     struct segment_register desctab, cs, segr;
     struct desc_struct *pdesc, desc;
     u8 dpl, rpl, cpl;
+    bool_t writable;
     int fault_type = TRAP_invalid_tss;
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct vcpu *v = current;
@@ -3738,7 +3746,7 @@ static int hvm_load_segment_selector(
     if ( ((sel & 0xfff8) + 7) > desctab.limit )
         goto fail;
 
-    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
+    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8), &writable);
     if ( pdesc == NULL )
         goto hvm_map_fail;
 
@@ -3797,6 +3805,7 @@ static int hvm_load_segment_selector(
             break;
         }
     } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
+              writable && /* except if we are to discard writes */
               (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
 
     /* Force the Accessed flag in our local copy. */
@@ -3834,6 +3843,7 @@ void hvm_task_switch(
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct segment_register gdt, tr, prev_tr, segr;
     struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
+    bool_t otd_writable, ntd_writable;
     unsigned long eflags;
     int exn_raised, rc;
     struct {
@@ -3860,11 +3870,12 @@ void hvm_task_switch(
         goto out;
     }
 
-    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8)); 
+    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8),
+                               &otd_writable);
     if ( optss_desc == NULL )
         goto out;
 
-    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8)); 
+    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8), &ntd_writable);
     if ( nptss_desc == NULL )
         goto out;
 
@@ -3996,11 +4007,11 @@ void hvm_task_switch(
     v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
     hvm_update_guest_cr(v, 0);
 
-    if ( (taskswitch_reason == TSW_iret) ||
-         (taskswitch_reason == TSW_jmp) )
+    if ( (taskswitch_reason == TSW_iret ||
+          taskswitch_reason == TSW_jmp) && otd_writable)
         clear_bit(41, optss_desc); /* clear B flag of old task */
 
-    if ( taskswitch_reason != TSW_iret )
+    if ( taskswitch_reason != TSW_iret && ntd_writable )
         set_bit(41, nptss_desc); /* set B flag of new task */
 
     if ( errcode >= 0 )
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -75,10 +75,20 @@ int nestedsvm_vmcb_map(struct vcpu *v, u
         nv->nv_vvmcxaddr = VMCX_EADDR;
     }
 
-    if (nv->nv_vvmcx == NULL) {
-        nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT, 1);
-        if (nv->nv_vvmcx == NULL)
+    if ( !nv->nv_vvmcx )
+    {
+        bool_t writable;
+        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(vmcbaddr), 1,
+                                             &writable);
+
+        if ( !vvmcx )
             return 0;
+        if ( !writable )
+        {
+            hvm_unmap_guest_frame(vvmcx, 1);
+            return 0;
+        }
+        nv->nv_vvmcx = vvmcx;
         nv->nv_vvmcxaddr = vmcbaddr;
     }
 
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1619,10 +1619,23 @@ int nvmx_handle_vmptrld(struct cpu_user_
 
     if ( nvcpu->nv_vvmcxaddr == VMCX_EADDR )
     {
-        nvcpu->nv_vvmcx = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 1);
-        if ( nvcpu->nv_vvmcx )
-            nvcpu->nv_vvmcxaddr = gpa;
-        if ( !nvcpu->nv_vvmcx ||
+        bool_t writable;
+        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
+
+        if ( vvmcx )
+        {
+            if ( writable )
+            {
+                nvcpu->nv_vvmcx = vvmcx;
+                nvcpu->nv_vvmcxaddr = gpa;
+            }
+            else
+            {
+                hvm_unmap_guest_frame(vvmcx, 1);
+                vvmcx = NULL;
+            }
+        }
+        if ( !vvmcx ||
              !map_io_bitmap_all(v) ||
              !_map_msr_bitmap(v) )
         {
@@ -1676,13 +1689,10 @@ int nvmx_handle_vmclear(struct cpu_user_
     if ( rc != X86EMUL_OKAY )
         return rc;
 
+    BUILD_BUG_ON(X86EMUL_OKAY != VMSUCCEED); /* rc = VMSUCCEED; */
     if ( gpa & 0xfff )
-    {
-        vmreturn(regs, VMFAIL_INVALID);
-        return X86EMUL_OKAY;
-    }
-    
-    if ( gpa == nvcpu->nv_vvmcxaddr ) 
+        rc = VMFAIL_INVALID;
+    else if ( gpa == nvcpu->nv_vvmcxaddr )
     {
         if ( cpu_has_vmx_vmcs_shadowing )
             nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx);
@@ -1693,14 +1703,22 @@ int nvmx_handle_vmclear(struct cpu_user_
     else 
     {
         /* Even if this VMCS isn't the current one, we must clear it. */
-        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
+        bool_t writable;
+
+        vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0, &writable);
         if ( vvmcs ) 
-            clear_vvmcs_launched(&nvmx->launched_list,
-                domain_page_map_to_mfn(vvmcs));
-        hvm_unmap_guest_frame(vvmcs, 0);
+        {
+            if ( writable )
+                clear_vvmcs_launched(&nvmx->launched_list,
+                                     domain_page_map_to_mfn(vvmcs));
+            else
+                rc = VMFAIL_VALID;
+            hvm_unmap_guest_frame(vvmcs, 0);
+        }
     }
 
-    vmreturn(regs, VMSUCCEED);
+    vmreturn(regs, rc);
+
     return X86EMUL_OKAY;
 }
 
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -431,7 +431,8 @@ int hvm_virtual_to_linear_addr(
     unsigned int addr_size,
     unsigned long *linear_addr);
 
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent);
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+                             bool_t *writable);
 void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent);
 void hvm_unmap_guest_frame(void *p, bool_t permanent);
 

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 17+ messages in thread