All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
@ 2018-09-12  9:47 Alexandru Isaila
  2018-09-13 14:17 ` Jan Beulich
  0 siblings, 1 reply; 16+ messages in thread
From: Alexandru Isaila @ 2018-09-12  9:47 UTC (permalink / raw)
  To: xen-devel
  Cc: tamas, wei.liu2, rcojocaru, george.dunlap, andrew.cooper3, tim,
	jbeulich, Alexandru Isaila

The original version of the patch emulated the current instruction
(which, as a side-effect, emulated the page-walk as well), however we
need finer-grained control. We want to emulate the page-walk, but still
get an EPT violation event if the current instruction would trigger one.
This patch performs just the page-walk emulation.

Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>

---
Changes since V1:
	- Changed guest_walk_tables() to set A bit on each level and
          check if there was any A set. If not, then it will set the D bit
          according to the write flags and cr0.wp
---
 xen/arch/x86/mm/guest_walk.c     | 23 ++++++++++++++++++++++-
 xen/arch/x86/mm/hap/guest_walk.c | 32 +++++++++++++++++++++++++++++++-
 xen/arch/x86/mm/hap/hap.c        | 12 ++++++++----
 xen/arch/x86/mm/hap/private.h    | 10 ++++++++++
 xen/arch/x86/mm/mem_access.c     |  5 ++++-
 xen/arch/x86/mm/shadow/multi.c   |  6 +++---
 xen/include/asm-x86/guest_pt.h   |  3 ++-
 xen/include/asm-x86/paging.h     |  5 ++++-
 8 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
index f67aeda3d0..c99c48fa8a 100644
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -82,7 +82,8 @@ static bool set_ad_bits(guest_intpte_t *guest_p, guest_intpte_t *walk_p,
 bool
 guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                   unsigned long va, walk_t *gw,
-                  uint32_t walk, mfn_t top_mfn, void *top_map)
+                  uint32_t walk, mfn_t top_mfn, void *top_map,
+                  bool set_ad)
 {
     struct domain *d = v->domain;
     p2m_type_t p2mt;
@@ -95,6 +96,7 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     uint32_t gflags, rc;
     unsigned int leaf_level;
     p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;
+    bool accessed = false;
 
 #define AR_ACCUM_AND (_PAGE_USER | _PAGE_RW)
 #define AR_ACCUM_OR  (_PAGE_NX_BIT)
@@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     ar_and &= gflags;
     ar_or  |= gflags;
 
+    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
+                               &gw->l4e.l4, false) )
+        accessed = true;
+
     /* Map the l3 table */
     l3p = map_domain_gfn(p2m,
                          guest_l4e_get_gfn(gw->l4e),
@@ -179,6 +185,10 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     ar_and &= gflags;
     ar_or  |= gflags;
 
+    if ( set_ad && set_ad_bits(&l3p[guest_l3_table_offset(va)].l3,
+                               &gw->l3e.l3, false) )
+        accessed = true;
+
     if ( gflags & _PAGE_PSE )
     {
         /*
@@ -278,6 +288,10 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     ar_and &= gflags;
     ar_or  |= gflags;
 
+    if ( set_ad && set_ad_bits(&l2p[guest_l2_table_offset(va)].l2,
+                               &gw->l2e.l2, false) )
+        accessed = true;
+
     if ( gflags & _PAGE_PSE )
     {
         /*
@@ -362,6 +376,13 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
      */
     ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);
 
+    if ( set_ad )
+    {
+        set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw->l1e.l1,
+                    (ar & _PAGE_RW) && !accessed && !guest_wp_enabled(v));
+        goto out;
+    }
+
     /*
      * Sanity check.  If EFER.NX is disabled, _PAGE_NX_BIT is reserved and
      * should have caused a translation failure before we get here.
diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c
index 3b8ee2efce..4cbbf69095 100644
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -29,6 +29,10 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
 #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
 
+#define _hap_page_walk_set_ad_bits(levels)                                     \
+        hap_page_walk_set_ad_bits_##levels##_levels
+#define hap_page_walk_set_ad_bits(levels) _hap_page_walk_set_ad_bits(levels)
+
 #define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
 #define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
 
@@ -39,6 +43,32 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <asm/guest_pt.h>
 #include <asm/p2m.h>
 
+void hap_page_walk_set_ad_bits(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, struct p2m_domain *p2m,
+    unsigned long va, uint32_t walk, unsigned long cr3)
+{
+    walk_t gw;
+    mfn_t top_mfn;
+    void *top_map;
+    gfn_t top_gfn;
+    struct page_info *top_page;
+    p2m_type_t p2mt;
+
+    top_gfn = _gfn(cr3 >> PAGE_SHIFT);
+    top_page = p2m_get_page_from_gfn(p2m, top_gfn, &p2mt, NULL,
+                                     P2M_ALLOC | P2M_UNSHARE);
+    top_mfn = page_to_mfn(top_page);
+
+    /* Map the top-level table and call the tree-walker */
+    ASSERT(mfn_valid(top_mfn));
+    top_map = map_domain_page(top_mfn);
+#if GUEST_PAGING_LEVELS == 3
+    top_map += (cr3 & ~(PAGE_MASK | 31));
+#endif
+
+    guest_walk_tables(v, p2m, va, &gw, walk, top_mfn, top_map, true);
+}
+
 unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
     struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
 {
@@ -91,7 +121,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map);
+    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map, false);
     unmap_domain_page(top_map);
     put_page(top_page);
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 3d651b94c3..ca046b78df 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -768,7 +768,8 @@ static const struct paging_mode hap_paging_real_mode = {
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
-    .guest_levels           = 1
+    .guest_levels           = 1,
+    .page_walk_set_ad_bits  = hap_page_walk_set_ad_bits_2_levels
 };
 
 static const struct paging_mode hap_paging_protected_mode = {
@@ -779,7 +780,8 @@ static const struct paging_mode hap_paging_protected_mode = {
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
-    .guest_levels           = 2
+    .guest_levels           = 2,
+    .page_walk_set_ad_bits  = hap_page_walk_set_ad_bits_2_levels
 };
 
 static const struct paging_mode hap_paging_pae_mode = {
@@ -790,7 +792,8 @@ static const struct paging_mode hap_paging_pae_mode = {
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
-    .guest_levels           = 3
+    .guest_levels           = 3,
+    .page_walk_set_ad_bits  = hap_page_walk_set_ad_bits_3_levels
 };
 
 static const struct paging_mode hap_paging_long_mode = {
@@ -801,7 +804,8 @@ static const struct paging_mode hap_paging_long_mode = {
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
-    .guest_levels           = 4
+    .guest_levels           = 4,
+    .page_walk_set_ad_bits  = hap_page_walk_set_ad_bits_4_levels
 };
 
 /*
diff --git a/xen/arch/x86/mm/hap/private.h b/xen/arch/x86/mm/hap/private.h
index 973fbe8be5..abb933c4f8 100644
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -47,4 +47,14 @@ unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
     struct p2m_domain *p2m, unsigned long cr3,
     paddr_t ga, uint32_t *pfec, unsigned int *page_order);
 
+void hap_page_walk_set_ad_bits_2_levels(struct vcpu *v, struct p2m_domain *p2m,
+                                        unsigned long va, uint32_t walk,
+                                        unsigned long cr3);
+void hap_page_walk_set_ad_bits_3_levels(struct vcpu *v, struct p2m_domain *p2m,
+                                        unsigned long va, uint32_t walk,
+                                        unsigned long cr3);
+void hap_page_walk_set_ad_bits_4_levels(struct vcpu *v, struct p2m_domain *p2m,
+                                        unsigned long va,  uint32_t walk,
+                                        unsigned long cr3);
+
 #endif /* __HAP_PRIVATE_H__ */
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index a8b3e99ec4..8b644946f0 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -214,7 +214,10 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
          d->arch.monitor.inguest_pagefault_disabled &&
          npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
     {
-        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
+        struct hvm_hw_cpu ctxt;
+
+        hvm_funcs.save_cpu_ctxt(v, &ctxt);
+        paging_get_hostmode(v)->page_walk_set_ad_bits(v, p2m, gla, 0, ctxt.cr3);
 
         return true;
     }
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 5cb216f0db..d3df9be57c 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -178,12 +178,12 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
     return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
 #if GUEST_PAGING_LEVELS == 3 /* PAE */
                              INVALID_MFN,
-                             v->arch.paging.shadow.gl3e
+                             v->arch.paging.shadow.gl3e,
 #else /* 32 or 64 */
                              pagetable_get_mfn(v->arch.guest_table),
-                             v->arch.paging.shadow.guest_vtable
+                             v->arch.paging.shadow.guest_vtable,
 #endif
-                             );
+                            false);
 }
 
 /* This validation is called with lock held, and after write permission
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
index 8684b83fd6..3dfb7fa966 100644
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -425,7 +425,8 @@ static inline unsigned int guest_walk_to_page_order(const walk_t *gw)
 
 bool
 guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va,
-                  walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map);
+                  walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map,
+                  bool set_ad);
 
 /* Pretty-print the contents of a guest-walk */
 static inline void print_gw(const walk_t *gw)
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index b51e1709d3..076ca204f5 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -127,7 +127,10 @@ struct paging_mode {
     void          (*write_p2m_entry       )(struct domain *d, unsigned long gfn,
                                             l1_pgentry_t *p, l1_pgentry_t new,
                                             unsigned int level);
-
+    void          (*page_walk_set_ad_bits )(struct vcpu *v,
+                                            struct p2m_domain *p2m,
+                                            unsigned long va, uint32_t walk,
+                                            unsigned long cr3);
     unsigned int guest_levels;
 
     /* paging support extension */
-- 
2.17.1


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-09-12  9:47 [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks Alexandru Isaila
@ 2018-09-13 14:17 ` Jan Beulich
  2018-09-18  9:47   ` Isaila Alexandru
  0 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2018-09-13 14:17 UTC (permalink / raw)
  To: aisaila
  Cc: Tamas K Lengyel, Wei Liu, Razvan Cojocaru, George Dunlap,
	Andrew Cooper, Tim Deegan, xen-devel

>>> On 12.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
> The original version of the patch emulated the current instruction
> (which, as a side-effect, emulated the page-walk as well), however we
> need finer-grained control. We want to emulate the page-walk, but still
> get an EPT violation event if the current instruction would trigger one.
> This patch performs just the page-walk emulation.

Rather than making this basically a revision log, could you please focus
on what you actually want to achieve?

As to the title: "Suppress ..." please.

> @@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
>      ar_and &= gflags;
>      ar_or  |= gflags;
>  
> +    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
> +                               &gw->l4e.l4, false) )
> +        accessed = true;

It is in particular this seemingly odd (and redundant with what's done
later in the function) which needs thorough explanation.

> @@ -362,6 +376,13 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
>       */
>      ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);
>  
> +    if ( set_ad )
> +    {
> +        set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw->l1e.l1,
> +                    (ar & _PAGE_RW) && !accessed && !guest_wp_enabled(v));
> +        goto out;
> +    }

Why does A bits being set (or not) in non-leaf (possibly, but even that's
inconsistent in your handling) tables have any meaning for whether the
D bit wants setting in the leaf page table? Similarly, why does it matter
whether the accumulated permissions allow writing, irrespective of
whether the operation was actually a write? Same for CR0.WP.

> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -768,7 +768,8 @@ static const struct paging_mode hap_paging_real_mode = {
>      .update_cr3             = hap_update_cr3,
>      .update_paging_modes    = hap_update_paging_modes,
>      .write_p2m_entry        = hap_write_p2m_entry,
> -    .guest_levels           = 1
> +    .guest_levels           = 1,
> +    .page_walk_set_ad_bits  = hap_page_walk_set_ad_bits_2_levels
>  };

Here and elsewhere, please add new hooks next to other hooks,
not at the end.

> --- a/xen/arch/x86/mm/mem_access.c
> +++ b/xen/arch/x86/mm/mem_access.c
> @@ -214,7 +214,10 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>           d->arch.monitor.inguest_pagefault_disabled &&
>           npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>      {
> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
> +        struct hvm_hw_cpu ctxt;
> +
> +        hvm_funcs.save_cpu_ctxt(v, &ctxt);

Pretty expensive an operation when all you're after is
v->arch.hvm.guest_cr[3].

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-09-13 14:17 ` Jan Beulich
@ 2018-09-18  9:47   ` Isaila Alexandru
  2018-09-18 10:17     ` Jan Beulich
  0 siblings, 1 reply; 16+ messages in thread
From: Isaila Alexandru @ 2018-09-18  9:47 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Tamas K Lengyel, Wei Liu, Razvan Cojocaru, George Dunlap,
	Andrew Cooper, Tim Deegan, xen-devel

On Thu, 2018-09-13 at 08:17 -0600, Jan Beulich wrote:
> > > > On 12.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
> > 
> > The original version of the patch emulated the current instruction
> > (which, as a side-effect, emulated the page-walk as well), however
> > we
> > need finer-grained control. We want to emulate the page-walk, but
> > still
> > get an EPT violation event if the current instruction would trigger
> > one.
> > This patch performs just the page-walk emulation.
> 
> Rather than making this basically a revision log, could you please
> focus
> on what you actually want to achieve?
> 
> As to the title: "Suppress ..." please.
> 
> > @@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct
> > p2m_domain *p2m,
> >      ar_and &= gflags;
> >      ar_or  |= gflags;
> >  
> > +    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
> > +                               &gw->l4e.l4, false) )
> > +        accessed = true;
> 
> It is in particular this seemingly odd (and redundant with what's
> done
> later in the function) which needs thorough explanation.

On this patch I've followed Andrew Cooper's suggestion on how to set
A/D Bits:

"While walking down the levels, set any missing A bits and remember if
we
set any.  If we set A bits, consider ourselves complete and exit back
to
the guest.  If no A bits were set, and the access was a write (which we
know from the EPT violation information), then set the leaf D bit."

If I misunderstood the comment please clarify.

Thanks, 
Alex

> 
> > @@ -362,6 +376,13 @@ guest_walk_tables(struct vcpu *v, struct
> > p2m_domain *p2m,
> >       */
> >      ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);
> >  
> > +    if ( set_ad )
> > +    {
> > +        set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw-
> > >l1e.l1,
> > +                    (ar & _PAGE_RW) && !accessed &&
> > !guest_wp_enabled(v));
> > +        goto out;
> > +    }
> 
> Why does A bits being set (or not) in non-leaf (possibly, but even
> that's
> inconsistent in your handling) tables have any meaning for whether
> the
> D bit wants setting in the leaf page table? Similarly, why does it
> matter
> whether the accumulated permissions allow writing, irrespective of
> whether the operation was actually a write? Same for CR0.WP.
> 
> > --- a/xen/arch/x86/mm/hap/hap.c
> > +++ b/xen/arch/x86/mm/hap/hap.c
> > @@ -768,7 +768,8 @@ static const struct paging_mode
> > hap_paging_real_mode = {
> >      .update_cr3             = hap_update_cr3,
> >      .update_paging_modes    = hap_update_paging_modes,
> >      .write_p2m_entry        = hap_write_p2m_entry,
> > -    .guest_levels           = 1
> > +    .guest_levels           = 1,
> > +    .page_walk_set_ad_bits  = hap_page_walk_set_ad_bits_2_levels
> >  };
> 
> Here and elsewhere, please add new hooks next to other hooks,
> not at the end.
> 
> > --- a/xen/arch/x86/mm/mem_access.c
> > +++ b/xen/arch/x86/mm/mem_access.c
> > @@ -214,7 +214,10 @@ bool p2m_mem_access_check(paddr_t gpa,
> > unsigned long gla,
> >           d->arch.monitor.inguest_pagefault_disabled &&
> >           npfec.kind != npfec_kind_with_gla ) /* don't send a
> > mem_event */
> >      {
> > -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL,
> > TRAP_invalid_op, X86_EVENT_NO_EC);
> > +        struct hvm_hw_cpu ctxt;
> > +
> > +        hvm_funcs.save_cpu_ctxt(v, &ctxt);
> 
> Pretty expensive an operation when all you're after is
> v->arch.hvm.guest_cr[3].
> 
> Jan
> 
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-09-18  9:47   ` Isaila Alexandru
@ 2018-09-18 10:17     ` Jan Beulich
  2018-09-18 18:20       ` Andrew Cooper
  0 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2018-09-18 10:17 UTC (permalink / raw)
  To: aisaila
  Cc: Tamas K Lengyel, Wei Liu, Razvan Cojocaru, George Dunlap,
	Andrew Cooper, Tim Deegan, xen-devel

>>> On 18.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
> On Thu, 2018-09-13 at 08:17 -0600, Jan Beulich wrote:
>> > > > On 12.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>> > 
>> > The original version of the patch emulated the current instruction
>> > (which, as a side-effect, emulated the page-walk as well), however
>> > we
>> > need finer-grained control. We want to emulate the page-walk, but
>> > still
>> > get an EPT violation event if the current instruction would trigger
>> > one.
>> > This patch performs just the page-walk emulation.
>> 
>> Rather than making this basically a revision log, could you please
>> focus
>> on what you actually want to achieve?
>> 
>> As to the title: "Suppress ..." please.
>> 
>> > @@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct
>> > p2m_domain *p2m,
>> >      ar_and &= gflags;
>> >      ar_or  |= gflags;
>> >  
>> > +    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
>> > +                               &gw->l4e.l4, false) )
>> > +        accessed = true;
>> 
>> It is in particular this seemingly odd (and redundant with what's
>> done
>> later in the function) which needs thorough explanation.
> 
> On this patch I've followed Andrew Cooper's suggestion on how to set
> A/D Bits:
> 
> "While walking down the levels, set any missing A bits and remember if we
> set any.  If we set A bits, consider ourselves complete and exit back to
> the guest.  If no A bits were set, and the access was a write (which we
> know from the EPT violation information), then set the leaf D bit."
> 
> If I misunderstood the comment please clarify.

It doesn't look to me as if you misunderstood anything, but only Andrew
can say for sure. However, none of this was in the description of your
patch (neither as part of the description, nor as code comment), and I
think you'd even have to greatly extend on this in order to explain to
everyone why the resulting behavior is still architecturally correct. In no
case should you assume anyone reading your patch (now or in the
future) has participated in the earlier discussion.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-09-18 10:17     ` Jan Beulich
@ 2018-09-18 18:20       ` Andrew Cooper
  2018-09-18 18:58         ` Razvan Cojocaru
  2018-09-19  8:53         ` Jan Beulich
  0 siblings, 2 replies; 16+ messages in thread
From: Andrew Cooper @ 2018-09-18 18:20 UTC (permalink / raw)
  To: Jan Beulich, aisaila
  Cc: Tamas K Lengyel, Wei Liu, Razvan Cojocaru, George Dunlap,
	Tim Deegan, xen-devel

On 18/09/18 11:17, Jan Beulich wrote:
>>>> On 18.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>> On Thu, 2018-09-13 at 08:17 -0600, Jan Beulich wrote:
>>>>>> On 12.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>>> The original version of the patch emulated the current instruction
>>>> (which, as a side-effect, emulated the page-walk as well), however
>>>> we
>>>> need finer-grained control. We want to emulate the page-walk, but
>>>> still
>>>> get an EPT violation event if the current instruction would trigger
>>>> one.
>>>> This patch performs just the page-walk emulation.
>>> Rather than making this basically a revision log, could you please
>>> focus
>>> on what you actually want to achieve?
>>>
>>> As to the title: "Suppress ..." please.
>>>
>>>> @@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct
>>>> p2m_domain *p2m,
>>>>      ar_and &= gflags;
>>>>      ar_or  |= gflags;
>>>>  
>>>> +    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
>>>> +                               &gw->l4e.l4, false) )
>>>> +        accessed = true;
>>> It is in particular this seemingly odd (and redundant with what's
>>> done
>>> later in the function) which needs thorough explanation.
>> On this patch I've followed Andrew Cooper's suggestion on how to set
>> A/D Bits:
>>
>> "While walking down the levels, set any missing A bits and remember if we
>> set any.  If we set A bits, consider ourselves complete and exit back to
>> the guest.  If no A bits were set, and the access was a write (which we
>> know from the EPT violation information), then set the leaf D bit."
>>
>> If I misunderstood the comment please clarify.
> It doesn't look to me as if you misunderstood anything, but only Andrew
> can say for sure. However, none of this was in the description of your
> patch (neither as part of the description, nor as code comment), and I
> think you'd even have to greatly extend on this in order to explain to
> everyone why the resulting behavior is still architecturally correct. In no
> case should you assume anyone reading your patch (now or in the
> future) has participated in the earlier discussion.

The problem we have is that, while we know the EPT Violation was for a
write of an A or D bit to a write-protected guest pagetable, we don't
know if it was the A or the D bit which was attempting to be set.

Furthermore (without emulating the instruction, which is what we are
trying to avoid), we can't reconstruct the access.

Access bits are only written if they were missing before, but may be set
speculatively.  Dirty bits are only set when a write is retired.  From a
practical point of view, the pipeline sets A and D bits as separate actions.

Following this logic (and assuming for now a single vcpu), if we get a
GPT EPT Violation, and there are missing access bits on the walk, then
the fault is definitely from setting an access bit.  Set all access bits
and call it done.  If we get a GPT EPT Violation and all access bits
were set, then it was definitely from setting the Dirty bit.

For multi-vcpu scenarios, things get racy.  Setting all the Access bits
is safe because it's a speculative action, but a speculative load on
one vcpu can race with a write (to a read-only mapping) on the other
vcpu, and would trick this algorithm into setting the dirty bit when the
write would have faulted (and not set the dirty bit).

Do we have numbers on how many of the GPT EPT Violations are for (only)
access sets, and how many are for dirty sets?  Would the first half of
the algorithm (which is definitely not racy) still be a net perf win?

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-09-18 18:20       ` Andrew Cooper
@ 2018-09-18 18:58         ` Razvan Cojocaru
  2018-09-19  8:53         ` Jan Beulich
  1 sibling, 0 replies; 16+ messages in thread
From: Razvan Cojocaru @ 2018-09-18 18:58 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich, aisaila
  Cc: George Dunlap, Tim Deegan, Tamas K Lengyel, Wei Liu, xen-devel

On 9/18/18 9:20 PM, Andrew Cooper wrote:
> On 18/09/18 11:17, Jan Beulich wrote:
>>>>> On 18.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>> On Thu, 2018-09-13 at 08:17 -0600, Jan Beulich wrote:
>>>>>>> On 12.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>>>> The original version of the patch emulated the current instruction
>>>>> (which, as a side-effect, emulated the page-walk as well), however
>>>>> we
>>>>> need finer-grained control. We want to emulate the page-walk, but
>>>>> still
>>>>> get an EPT violation event if the current instruction would trigger
>>>>> one.
>>>>> This patch performs just the page-walk emulation.
>>>> Rather than making this basically a revision log, could you please
>>>> focus
>>>> on what you actually want to achieve?
>>>>
>>>> As to the title: "Suppress ..." please.
>>>>
>>>>> @@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct
>>>>> p2m_domain *p2m,
>>>>>      ar_and &= gflags;
>>>>>      ar_or  |= gflags;
>>>>>  
>>>>> +    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
>>>>> +                               &gw->l4e.l4, false) )
>>>>> +        accessed = true;
>>>> It is in particular this seemingly odd (and redundant with what's
>>>> done
>>>> later in the function) which needs thorough explanation.
>>> On this patch I've followed Andrew Cooper's suggestion on how to set
>>> A/D Bits:
>>>
>>> "While walking down the levels, set any missing A bits and remember if we
>>> set any.  If we set A bits, consider ourselves complete and exit back to
>>> the guest.  If no A bits were set, and the access was a write (which we
>>> know from the EPT violation information), then set the leaf D bit."
>>>
>>> If I misunderstood the comment please clarify.
>> It doesn't look to me as if you misunderstood anything, but only Andrew
>> can say for sure. However, none of this was in the description of your
>> patch (neither as part of the description, nor as code comment), and I
>> think you'd even have to greatly extend on this in order to explain to
>> everyone why the resulting behavior is still architecturally correct. In no
>> case should you assume anyone reading your patch (now or in the
>> future) has participated in the earlier discussion.
> 
> The problem we have is that, while we know the EPT Violation was for a
> write of an A or D bit to a write-protected guest pagetable, we don't
> know if it was the A or the D bit which was attempting to be set.
> 
> Furthermore (without emulating the instruction, which is what we are
> trying to avoid), we can't reconstruct the access.
> 
> Access bits are only written if they were missing before, but may be set
> speculatively.  Dirty bits are only set when a write is retired.  From a
> practical point of view, the pipeline sets A and D bits as separate actions.
> 
> Following this logic (and assuming for now a single vcpu), if we get a
> GPT EPT Violation, and there are missing access bits on the walk, then
> the fault is definitely from setting an access bit.  Set all access bits
> and call it done.  If we get a GPT EPT Violation and all access bits
> were set, then it was definitely from setting the Dirty bit.
> 
> For multi-vcpu scenarios, things get racy.  Setting all the Access bits
> is safe because its a speculative action, but a speculatively load on
> one vcpu can race with a write (to a read-only mapping) on the other
> vcpu, and would trick this algorithm into setting the dirty bit when the
> write would have faulted (and not set the dirty bit).
> 
> Do we have numbers on how many of the GPT EPT Violations are for (only)
> access sets, and how many are for dirty sets?  Would the first half of
> the algorithm (which is definitely not racy) still be a net perf win?

Last time I've counted with a simple test there were 25 Ds to 19062 As,
so yes, most of these are setting access bits, and yes, it looks like
it's still worth doing even when setting the A bits alone - though of
course we'd prefer to avoid vm_events for both.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-09-18 18:20       ` Andrew Cooper
  2018-09-18 18:58         ` Razvan Cojocaru
@ 2018-09-19  8:53         ` Jan Beulich
  2018-09-19 13:41           ` Andrew Cooper
  1 sibling, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2018-09-19  8:53 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Tamas K Lengyel, Wei Liu, Razvan Cojocaru, George Dunlap,
	Tim Deegan, xen-devel, aisaila

>>> On 18.09.18 at 20:20, <andrew.cooper3@citrix.com> wrote:
> On 18/09/18 11:17, Jan Beulich wrote:
>>>>> On 18.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>> On Thu, 2018-09-13 at 08:17 -0600, Jan Beulich wrote:
>>>>>>> On 12.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>>>> The original version of the patch emulated the current instruction
>>>>> (which, as a side-effect, emulated the page-walk as well), however
>>>>> we
>>>>> need finer-grained control. We want to emulate the page-walk, but
>>>>> still
>>>>> get an EPT violation event if the current instruction would trigger
>>>>> one.
>>>>> This patch performs just the page-walk emulation.
>>>> Rather than making this basically a revision log, could you please
>>>> focus
>>>> on what you actually want to achieve?
>>>>
>>>> As to the title: "Suppress ..." please.
>>>>
>>>>> @@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct
>>>>> p2m_domain *p2m,
>>>>>      ar_and &= gflags;
>>>>>      ar_or  |= gflags;
>>>>>  
>>>>> +    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
>>>>> +                               &gw->l4e.l4, false) )
>>>>> +        accessed = true;
>>>> It is in particular this seemingly odd (and redundant with what's
>>>> done
>>>> later in the function) which needs thorough explanation.
>>> On this patch I've followed Andrew Cooper's suggestion on how to set
>>> A/D Bits:
>>>
>>> "While walking down the levels, set any missing A bits and remember if we
>>> set any.  If we set A bits, consider ourselves complete and exit back to
>>> the guest.  If no A bits were set, and the access was a write (which we
>>> know from the EPT violation information), then set the leaf D bit."
>>>
>>> If I misunderstood the comment please clarify.
>> It doesn't look to me as if you misunderstood anything, but only Andrew
>> can say for sure. However, none of this was in the description of your
>> patch (neither as part of the description, nor as code comment), and I
>> think you'd even have to greatly extend on this in order to explain to
>> everyone why the resulting behavior is still architecturally correct. In no
>> case should you assume anyone reading your patch (now or in the
>> future) has participated in the earlier discussion.
> 
> The problem we have is that, while we know the EPT Violation was for a
> write of an A or D bit to a write-protected guest pagetable, we don't
> know if it was the A or the D bit which was attempting to be set.
> 
> Furthermore (without emulating the instruction, which is what we are
> trying to avoid), we can't reconstruct the access.
> 
> Access bits are only written if they were missing before, but may be set
> speculatively.  Dirty bits are only set when a write is retired.  From a
> practical point of view, the pipeline sets A and D bits as separate actions.
> 
> Following this logic (and assuming for now a single vcpu), if we get a
> GPT EPT Violation, and there are missing access bits on the walk, then
> the fault is definitely from setting an access bit.

Definitely? Is there anything guaranteeing architecturally that an access
bit related EPT violation would be delivered earlier than any other one
on that same or a lower page table level? It doesn't matter much for
the implementation (because of it being permissible to set the A bits
speculatively, as you also say further down, and any other violation
then re-occurring after exiting back to the guest once the A bits are
all set), but since we're discussing here what exactly the patch
description should contain, I think I'd prefer this to be fully correct there.

Or wait - I think I can agree with "definitely", provided you further
restrict the context: "..., if we get a GPT EPT Write Violation ...". But
> from what I can tell the patch's change to p2m_mem_access_check()
doesn't apply (or pass on) any of these qualifications at all.

>  Set all access bits
> and call it done.  If we get a GPT EPT Violation and all access bits
> were set, then it was definitely from setting the Dirty bit.
> 
> For multi-vcpu scenarios, things get racy.  Setting all the Access bits
>> is safe because it's a speculative action, but a speculative load on
> one vcpu can race with a write (to a read-only mapping) on the other
> vcpu, and would trick this algorithm into setting the dirty bit when the
> write would have faulted (and not set the dirty bit).
> 
>> Do we have numbers on how many of the GPT EPT Violations are for (only)
>> access sets, and how many are for dirty sets?  Would the first half of
> the algorithm (which is definitely not racy) still be a net perf win?

Does Windows make use of A bits at all? I'd expect most OSes to
simply set them right away, and actively use the D bits.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-09-19  8:53         ` Jan Beulich
@ 2018-09-19 13:41           ` Andrew Cooper
  2018-09-19 14:31             ` Jan Beulich
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Cooper @ 2018-09-19 13:41 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Tamas K Lengyel, Wei Liu, Razvan Cojocaru, George Dunlap,
	Tim Deegan, xen-devel, aisaila

On 19/09/18 09:53, Jan Beulich wrote:
>>>> On 18.09.18 at 20:20, <andrew.cooper3@citrix.com> wrote:
>> On 18/09/18 11:17, Jan Beulich wrote:
>>>>>> On 18.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>>> On Thu, 2018-09-13 at 08:17 -0600, Jan Beulich wrote:
>>>>>>>> On 12.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>>>>> The original version of the patch emulated the current instruction
>>>>>> (which, as a side-effect, emulated the page-walk as well), however
>>>>>> we
>>>>>> need finer-grained control. We want to emulate the page-walk, but
>>>>>> still
>>>>>> get an EPT violation event if the current instruction would trigger
>>>>>> one.
>>>>>> This patch performs just the page-walk emulation.
>>>>> Rather than making this basically a revision log, could you please
>>>>> focus
>>>>> on what you actually want to achieve?
>>>>>
>>>>> As to the title: "Suppress ..." please.
>>>>>
>>>>>> @@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct
>>>>>> p2m_domain *p2m,
>>>>>>      ar_and &= gflags;
>>>>>>      ar_or  |= gflags;
>>>>>>  
>>>>>> +    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
>>>>>> +                               &gw->l4e.l4, false) )
>>>>>> +        accessed = true;
>>>>> It is in particular this seemingly odd (and redundant with what's
>>>>> done
>>>>> later in the function) which needs thorough explanation.
>>>> On this patch I've followed Andrew Cooper's suggestion on how to set
>>>> A/D Bits:
>>>>
>>>> "While walking down the levels, set any missing A bits and remember if we
>>>> set any.  If we set A bits, consider ourselves complete and exit back to
>>>> the guest.  If no A bits were set, and the access was a write (which we
>>>> know from the EPT violation information), then set the leaf D bit."
>>>>
>>>> If I misunderstood the comment please clarify.
>>> It doesn't look to me as if you misunderstood anything, but only Andrew
>>> can say for sure. However, none of this was in the description of your
>>> patch (neither as part of the description, nor as code comment), and I
>>> think you'd even have to greatly extend on this in order to explain to
>>> everyone why the resulting behavior is still architecturally correct. In no
>>> case should you assume anyone reading your patch (now or in the
>>> future) has participated in the earlier discussion.
>> The problem we have is that, while we know the EPT Violation was for a
>> write of an A or D bit to a write-protected guest pagetable, we don't
>> know if it was the A or the D bit which was attempting to be set.
>>
>> Furthermore (without emulating the instruction, which is what we are
>> trying to avoid), we can't reconstruct the access.
>>
>> Access bits are only written if they were missing before, but may be set
>> speculatively.  Dirty bits are only set when a write is retired.  From a
>> practical point of view, the pipeline sets A and D bits as separate actions.
>>
>> Following this logic (and assuming for now a single vcpu), if we get a
>> GPT EPT Violation, and there are missing access bits on the walk, then
>> the fault is definitely from setting an access bit.
> Definitely?

Yes

>  Is there anything guaranteeing architecturally that an access
> bit related EPT violation would be delivered earlier than any other one
> on that same or a lower page table level?

No, but why does that matter?

Architecturally defined or not, we know that the action the processor
was trying to perform was to set an A/D bit, because we got a vmexit
telling us so.

>  It doesn't matter much for
> the implementation (because of it being permissible to set the A bits
> speculatively, as you also say further down, and any other violation
> then re-occurring after exiting back to the guest once the A bits are
> all set), but since we're discussing here what exactly the patch
> description should contain, I think I'd prefer this to be fully correct there.
>
> Or wait - I think I can agree with "definitely", provided you further
> restrict the context: "..., if we get a GPT EPT Write Violation ...". But
> from what I can tell the patch's change to p2m_mem_access_check()
> doesn't apply (or pass on) any of these qualifications at all.

I've not looked at the patch in detail yet.  I'm tempted to suggest
rearranging guest_walk_tables() to just set the access bits on the
descent, rather than at the end.  This matches how some hardware behaves
when pulling entries into the paging structure cache.

>
>>  Set all access bits
>> and call it done.  If we get a GPT EPT Violation and all access bits
>> were set, then it was definitely from setting the Dirty bit.
>>
>> For multi-vcpu scenarios, things get racy.  Setting all the Access bits
>> is safe because it's a speculative action, but a speculative load on
>> one vcpu can race with a write (to a read-only mapping) on the other
>> vcpu, and would trick this algorithm into setting the dirty bit when the
>> write would have faulted (and not set the dirty bit).
>>
>> Do we have numbers on how many of the GPT EPT Violations are for (only)
>> access sets, and how many are for dirty sets?  Would the first half of
>> the algorithm (which is definitely not racy) still be a net perf win?
> Does Windows make use of A bits at all? I'd expect most OSes to
> simply set them right away, and actively use the D bits.

What gives you the expectation that OSes wouldn't use A bits?

For paging out, the best options are non-accessed non-dirty page because
their contents can be discarded immediately and reread from disk at a
later point.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-09-19 13:41           ` Andrew Cooper
@ 2018-09-19 14:31             ` Jan Beulich
  0 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2018-09-19 14:31 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Tamas K Lengyel, Wei Liu, Razvan Cojocaru, George Dunlap,
	Tim Deegan, xen-devel, aisaila

>>> On 19.09.18 at 15:41, <andrew.cooper3@citrix.com> wrote:
> On 19/09/18 09:53, Jan Beulich wrote:
>>>>> On 18.09.18 at 20:20, <andrew.cooper3@citrix.com> wrote:
>>> On 18/09/18 11:17, Jan Beulich wrote:
>>>>>>> On 18.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>>>> On Thu, 2018-09-13 at 08:17 -0600, Jan Beulich wrote:
>>>>>>>>> On 12.09.18 at 11:47, <aisaila@bitdefender.com> wrote:
>>>>>>> The original version of the patch emulated the current instruction
>>>>>>> (which, as a side-effect, emulated the page-walk as well), however
>>>>>>> we
>>>>>>> need finer-grained control. We want to emulate the page-walk, but
>>>>>>> still
>>>>>>> get an EPT violation event if the current instruction would trigger
>>>>>>> one.
>>>>>>> This patch performs just the page-walk emulation.
>>>>>> Rather than making this basically a revision log, could you please
>>>>>> focus
>>>>>> on what you actually want to achieve?
>>>>>>
>>>>>> As to the title: "Suppress ..." please.
>>>>>>
>>>>>>> @@ -149,6 +151,10 @@ guest_walk_tables(struct vcpu *v, struct
>>>>>>> p2m_domain *p2m,
>>>>>>>      ar_and &= gflags;
>>>>>>>      ar_or  |= gflags;
>>>>>>>  
>>>>>>> +    if ( set_ad && set_ad_bits(&l4p[guest_l4_table_offset(va)].l4,
>>>>>>> +                               &gw->l4e.l4, false) )
>>>>>>> +        accessed = true;
>>>>>> It is in particular this seemingly odd (and redundant with what's
>>>>>> done
>>>>>> later in the function) which needs thorough explanation.
>>>>> On this patch I've followed Andrew Cooper's suggestion on how to set
>>>>> A/D Bits:
>>>>>
>>>>> "While walking down the levels, set any missing A bits and remember if we
>>>>> set any.  If we set A bits, consider ourselves complete and exit back to
>>>>> the guest.  If no A bits were set, and the access was a write (which we
>>>>> know from the EPT violation information), then set the leaf D bit."
>>>>>
>>>>> If I misunderstood the comment please clarify.
>>>> It doesn't look to me as if you misunderstood anything, but only Andrew
>>>> can say for sure. However, none of this was in the description of your
>>>> patch (neither as part of the description, nor as code comment), and I
>>>> think you'd even have to greatly extend on this in order to explain to
>>>> everyone why the resulting behavior is still architecturally correct. In no
>>>> case should you assume anyone reading your patch (now or in the
>>>> future) has participated in the earlier discussion.
>>> The problem we have is that, while we know the EPT Violation was for a
>>> write of an A or D bit to a write-protected guest pagetable, we don't
>>> know if it was the A or the D bit which was attempting to be set.
>>>
>>> Furthermore (without emulating the instruction, which is what we are
>>> trying to avoid), we can't reconstruct the access.
>>>
>>> Access bits are only written if they were missing before, but may be set
>>> speculatively.  Dirty bits are only set when a write is retired.  From a
>>> practical point of view, the pipeline sets A and D bits as separate actions.
>>>
>>> Following this logic (and assuming for now a single vcpu), if we get a
>>> GPT EPT Violation, and there are missing access bits on the walk, then
>>> the fault is definitely from setting an access bit.
>> Definitely?
> 
> Yes
> 
>>  Is there anything guaranteeing architecturally that an access
>> bit related EPT violation would be delivered earlier than any other one
>> on that same or a lower page table level?
> 
> No, but why does that matter?
> 
> Architecturally defined or not, we know that the action the processor
> was trying to perform was to set an A/D bit, because we got a vmexit
> telling us so.

Well, as per what I had written further down in my earlier reply (starting
with "Or wait ..."), nothing in what you said about the EPT violation made
explicit that this was a violation from a write attempt. Without that, it
very much matters (afaict), as there are other reasons for getting EPT
violations from page table accesses.

>>  It doesn't matter much for
>> the implementation (because of it being permissible to set the A bits
>> speculatively, as you also say further down, and any other violation
>> then re-occurring after exiting back to the guest once the A bits are
>> all set), but since we're discussing here what exactly the patch
>> description should contain, I think I'd prefer this to be fully correct there.
>>
>> Or wait - I think I can agree with "definitely", provided you further
>> restrict the context: "..., if we get a GPT EPT Write Violation ...". But
>> from what I can tell the patch's change to p2m_mem_access_check()
>> doesn't apply (or pass on) any of these qualifications at all.
> 
> I've not looked at the patch in detail yet.  I'm tempted to suggest
> rearranging guest_walk_tables() to just set the access bits on the
> descent, rather than at the end.  This matches how some hardware behaves
> when pulling entries into the paging structure cache.

I'm not opposed to that; I believe this property has changed over the
years, as I certainly do recall that early (386ish, 486ish) descriptions
of page walks suggested that A bits indeed got set after the full walk
only, and I further believe the original implementation simply took this
behavior as reference.

What I am not very happy about is the addition of a (poorly described)
new parameter to the function, making it sort of bail in the middle.

>>>  Set all access bits
>>> and call it done.  If we get a GPT EPT Violation and all access bits
>>> were set, then it was definitely from setting the Dirty bit.
>>>
>>> For multi-vcpu scenarios, things get racy.  Setting all the Access bits
>>> is safe because it's a speculative action, but a speculative load on
>>> one vcpu can race with a write (to a read-only mapping) on the other
>>> vcpu, and would trick this algorithm into setting the dirty bit when the
>>> write would have faulted (and not set the dirty bit).
>>>
>>> Do we have numbers on how many of the GPT EPT Violations are for (only)
>>> access sets, and how many are for dirty sets?  Would the first half of
>>> the algorithm (which is definitely not racy) still be a net perf win?
>> Does Windows make use of A bits at all? I'd expect most OSes to
>> simply set them right away, and actively use the D bits.
> 
> What gives you the expectation that OSes wouldn't use A bits?
> 
> For paging out, the best options are non-accessed non-dirty page because
> their contents can be discarded immediately and reread from disk at a
> later point.

Discard-immediately is solely bound to the D bit. The A bit only tells an
OS whether a page was accessed after the bit was cleared the last time.
I will surely agree this is a possible strategy, and I was merely wondering
whether Windows (other than iirc Linux) actually makes use of this. The
reason I wouldn't really expect their use is because repeatedly (and
perhaps frequently) clearing A bits from all PTEs is sort of work intensive
(as is locating a PTE with its A bit clear). Yet if you don't clear them
frequently, the information a set A bit carries does not look to be overly
useful.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-02-23 17:46 ` Wei Liu
@ 2018-02-26  8:15   ` Jan Beulich
  0 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2018-02-26  8:15 UTC (permalink / raw)
  To: wei.liu2
  Cc: tim, sstabellini, rcojocaru, George.Dunlap, andrew.cooper3,
	ian.jackson, xen-devel, tamas, Alexandru Isaila

>>> On 23.02.18 at 18:46, <wei.liu2@citrix.com> wrote:
> On Mon, Jan 08, 2018 at 02:49:44PM +0200, Alexandru Isaila wrote:
>> ---
>>  tools/libxc/include/xenctrl.h |  2 ++
>>  tools/libxc/xc_monitor.c      | 14 ++++++++++++++
> 
> These changes look sensible to me.
> 
>>  xen/arch/x86/mm/mem_access.c  | 27 +++++++++++++++++++++++++++
>>  xen/arch/x86/monitor.c        | 13 +++++++++++++
>>  xen/include/asm-x86/domain.h  |  6 ++++++
>>  xen/include/asm-x86/monitor.h |  3 ++-
>>  xen/include/public/domctl.h   |  2 ++
> 
> You might want to bump the domctl version number if it has not been done
> already in this release.

For one this sentence confuses me by its wording alone. And then
all the patch does is add a #define, which doesn't alter the
interface in an incompatible way (which is what the interface
version tries to express, even if - I agree with Andrew here - with
limitations).

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-02-23 22:31     ` Tamas K Lengyel
@ 2018-02-23 22:36       ` Razvan Cojocaru
  0 siblings, 0 replies; 16+ messages in thread
From: Razvan Cojocaru @ 2018-02-23 22:36 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Stefano Stabellini, wei.liu2, George Dunlap, Andrew Cooper, tim,
	Xen-devel, Jan Beulich, Alexandru Isaila, Ian Jackson

On 02/24/2018 12:31 AM, Tamas K Lengyel wrote:
> On Fri, Feb 23, 2018 at 3:25 PM, Razvan Cojocaru
> <rcojocaru@bitdefender.com> wrote:
>> On 02/24/2018 12:06 AM, Tamas K Lengyel wrote:
>>> On Mon, Jan 8, 2018 at 5:49 AM, Alexandru Isaila
>>> <aisaila@bitdefender.com> wrote:
>>>> This patch is adding a way to enable/disable nested pagefault
>>>> events. It introduces the xc_monitor_nested_pagefault function
>>>> and adds the nested_pagefault_disabled in the monitor structure.
>>>> This is needed by the introspection so it will only get gla
>>>> faults and not get spammed with other faults.
>>>> In p2m_set_ad_bits the v->arch.sse_pg_dirty.eip and
>>>> v->arch.sse_pg_dirty.gla are used to mark that this is the
>>>> second time a fault occurs and the dirty bit is set.
>>>>
>>>> Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
>>>>
>>>> ---
>>>> Changes since V1:
>>>>         - Rb V1
>>>>         - Add comment in domctl.h
>>>> ---
>>>>  tools/libxc/include/xenctrl.h |  2 ++
>>>>  tools/libxc/xc_monitor.c      | 14 ++++++++++++++
>>>>  xen/arch/x86/mm/mem_access.c  | 27 +++++++++++++++++++++++++++
>>>>  xen/arch/x86/monitor.c        | 13 +++++++++++++
>>>>  xen/include/asm-x86/domain.h  |  6 ++++++
>>>>  xen/include/asm-x86/monitor.h |  3 ++-
>>>>  xen/include/public/domctl.h   |  2 ++
>>>>  7 files changed, 66 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
>>>> index 09e1363..112c974 100644
>>>> --- a/tools/libxc/include/xenctrl.h
>>>> +++ b/tools/libxc/include/xenctrl.h
>>>> @@ -2056,6 +2056,8 @@ int xc_monitor_descriptor_access(xc_interface *xch, uint32_t domain_id,
>>>>                                   bool enable);
>>>>  int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id,
>>>>                               bool enable, bool sync, bool allow_userspace);
>>>> +int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
>>>> +                                bool disable);
>>>>  int xc_monitor_debug_exceptions(xc_interface *xch, uint32_t domain_id,
>>>>                                  bool enable, bool sync);
>>>>  int xc_monitor_cpuid(xc_interface *xch, uint32_t domain_id, bool enable);
>>>> diff --git a/tools/libxc/xc_monitor.c b/tools/libxc/xc_monitor.c
>>>> index 0233b87..e96c56d 100644
>>>> --- a/tools/libxc/xc_monitor.c
>>>> +++ b/tools/libxc/xc_monitor.c
>>>> @@ -163,6 +163,20 @@ int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id, bool enable,
>>>>      return do_domctl(xch, &domctl);
>>>>  }
>>>>
>>>> +int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
>>>> +                                bool disable)
>>>> +{
>>>> +    DECLARE_DOMCTL;
>>>> +
>>>> +    domctl.cmd = XEN_DOMCTL_monitor_op;
>>>> +    domctl.domain = domain_id;
>>>> +    domctl.u.monitor_op.op = disable ? XEN_DOMCTL_MONITOR_OP_ENABLE
>>>> +                                    : XEN_DOMCTL_MONITOR_OP_DISABLE;
>>>> +    domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT;
>>>> +
>>>> +    return do_domctl(xch, &domctl);
>>>> +}
>>>> +
>>>>  int xc_monitor_emulate_each_rep(xc_interface *xch, uint32_t domain_id,
>>>>                                  bool enable)
>>>>  {
>>>> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
>>>> index c0cd017..07a334b 100644
>>>> --- a/xen/arch/x86/mm/mem_access.c
>>>> +++ b/xen/arch/x86/mm/mem_access.c
>>>> @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
>>>>      return violation;
>>>>  }
>>>>
>>>> +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
>>>> +{
>>>> +    struct hvm_hw_cpu ctxt;
>>>> +    uint32_t pfec = 0;
>>>> +
>>>> +    hvm_funcs.save_cpu_ctxt(v, &ctxt);
>>>> +
>>>> +    if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
>>>> +         && ga == v->arch.pg_dirty.gla )
>>>> +        pfec = PFEC_write_access;
>>>> +
>>>> +    paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
>>>> +
>>>> +    v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
>>>> +    v->arch.pg_dirty.gla = ga;
>>>> +}
>>>> +
>>>>  bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>>>>                            struct npfec npfec,
>>>>                            vm_event_request_t **req_ptr)
>>>> @@ -208,6 +225,16 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>>>>          }
>>>>      }
>>>>
>>>> +    if ( vm_event_check_ring(d->vm_event_monitor) &&
>>>> +         d->arch.monitor.nested_pagefault_disabled &&
>>>> +         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>>> +    {
>>>> +        v->arch.vm_event->emulate_flags = 0;
>>>> +        p2m_set_ad_bits(v, gla);
>>>> +
>>>> +        return true;
>>>> +    }
>>>> +
>>>>      *req_ptr = NULL;
>>>>      req = xzalloc(vm_event_request_t);
>>>>      if ( req )
>>>> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
>>>> index f229e69..e35b619 100644
>>>> --- a/xen/arch/x86/monitor.c
>>>> +++ b/xen/arch/x86/monitor.c
>>>> @@ -241,6 +241,19 @@ int arch_monitor_domctl_event(struct domain *d,
>>>>          break;
>>>>      }
>>>>
>>>> +    case XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT:
>>>> +    {
>>>> +        bool old_status = ad->monitor.nested_pagefault_disabled;
>>>> +
>>>> +        if ( unlikely(old_status == requested_status) )
>>>> +            return -EEXIST;
>>>> +
>>>> +        domain_pause(d);
>>>> +        ad->monitor.nested_pagefault_disabled = requested_status;
>>>> +        domain_unpause(d);
>>>> +        break;
>>>> +    }
>>>> +
>>>>      case XEN_DOMCTL_MONITOR_EVENT_DESC_ACCESS:
>>>>      {
>>>>          bool old_status = ad->monitor.descriptor_access_enabled;
>>>> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
>>>> index 4679d54..099af7c 100644
>>>> --- a/xen/include/asm-x86/domain.h
>>>> +++ b/xen/include/asm-x86/domain.h
>>>> @@ -412,6 +412,7 @@ struct arch_domain
>>>>          unsigned int descriptor_access_enabled                             : 1;
>>>>          unsigned int guest_request_userspace_enabled                       : 1;
>>>>          unsigned int emul_unimplemented_enabled                            : 1;
>>>> +        unsigned int nested_pagefault_disabled                             : 1;
>>>
>>> All other options are "_enabled" here, so adding one that's flipped
>>> just looks out of place. Any objections to making this match the rest?
>>> Also, naming it "nested" just makes me think this is somehow would be
>>> related to nested virtualization, but that's not the case. These would
>>> be just regular pagefaults in the guest, so naming the monitor option
>>> simply "pagefault" would look better to me in general.
>> Hello Tamas,
>>
>> Here's the thinking behind preferring "disabled" to "enabled": we want
>> to keep the default behaviour as it is currently, and the current
>> behaviour is to send out _all_ EPT fault vm_events (caused by page walks
>> or not).
>>
>> Now, struct arch_domain is being zeroed out on init, so if we name this
>> "enabled", then that's the behaviour we're starting out with. We have no
>> problem with that, but it changes the current default behaviour.
> 
> We can keep the "disabled" naming but then please add a comment to the field
> saying that by default all events are sent, this is used to filter
> pagefaults out.

Will do, no problem.

>> So either we name this new field "disabled", or we rename it to
>> "enabled" (if we rename it, we either need to set it as a special case
>> on init, or modify the default behaviour to be _not_ sending out
>> page-walk-caused EPT events).
>>
>> If you feel strongly about options 2.A or 2.C we don't have a problem
>> changing the code.
>>
>> About "pagefault", it reads as more confusing to me, since all EPT-related
>> vm_events are basically page faults. But maybe that's just me.
> 
> True. It's just confusing with "nested" also having multiple meanings.
> Perhaps "inguest_pagefaults"?

Sure, if nobody else objects, we'll change "nested" to "inguest".


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-02-23 22:25   ` Razvan Cojocaru
@ 2018-02-23 22:31     ` Tamas K Lengyel
  2018-02-23 22:36       ` Razvan Cojocaru
  0 siblings, 1 reply; 16+ messages in thread
From: Tamas K Lengyel @ 2018-02-23 22:31 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Stefano Stabellini, wei.liu2, George Dunlap, Andrew Cooper, tim,
	Xen-devel, Jan Beulich, Alexandru Isaila, Ian Jackson

On Fri, Feb 23, 2018 at 3:25 PM, Razvan Cojocaru
<rcojocaru@bitdefender.com> wrote:
> On 02/24/2018 12:06 AM, Tamas K Lengyel wrote:
>> On Mon, Jan 8, 2018 at 5:49 AM, Alexandru Isaila
>> <aisaila@bitdefender.com> wrote:
>>> This patch is adding a way to enable/disable nested pagefault
>>> events. It introduces the xc_monitor_nested_pagefault function
>>> and adds the nested_pagefault_disabled in the monitor structure.
>>> This is needed by the introspection so it will only get gla
>>> faults and not get spammed with other faults.
>>> In p2m_set_ad_bits the v->arch.sse_pg_dirty.eip and
>>> v->arch.sse_pg_dirty.gla are used to mark that this is the
>>> second time a fault occurs and the dirty bit is set.
>>>
>>> Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
>>>
>>> ---
>>> Changes since V1:
>>>         - Rb V1
>>>         - Add comment in domctl.h
>>> ---
>>>  tools/libxc/include/xenctrl.h |  2 ++
>>>  tools/libxc/xc_monitor.c      | 14 ++++++++++++++
>>>  xen/arch/x86/mm/mem_access.c  | 27 +++++++++++++++++++++++++++
>>>  xen/arch/x86/monitor.c        | 13 +++++++++++++
>>>  xen/include/asm-x86/domain.h  |  6 ++++++
>>>  xen/include/asm-x86/monitor.h |  3 ++-
>>>  xen/include/public/domctl.h   |  2 ++
>>>  7 files changed, 66 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
>>> index 09e1363..112c974 100644
>>> --- a/tools/libxc/include/xenctrl.h
>>> +++ b/tools/libxc/include/xenctrl.h
>>> @@ -2056,6 +2056,8 @@ int xc_monitor_descriptor_access(xc_interface *xch, uint32_t domain_id,
>>>                                   bool enable);
>>>  int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id,
>>>                               bool enable, bool sync, bool allow_userspace);
>>> +int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
>>> +                                bool disable);
>>>  int xc_monitor_debug_exceptions(xc_interface *xch, uint32_t domain_id,
>>>                                  bool enable, bool sync);
>>>  int xc_monitor_cpuid(xc_interface *xch, uint32_t domain_id, bool enable);
>>> diff --git a/tools/libxc/xc_monitor.c b/tools/libxc/xc_monitor.c
>>> index 0233b87..e96c56d 100644
>>> --- a/tools/libxc/xc_monitor.c
>>> +++ b/tools/libxc/xc_monitor.c
>>> @@ -163,6 +163,20 @@ int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id, bool enable,
>>>      return do_domctl(xch, &domctl);
>>>  }
>>>
>>> +int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
>>> +                                bool disable)
>>> +{
>>> +    DECLARE_DOMCTL;
>>> +
>>> +    domctl.cmd = XEN_DOMCTL_monitor_op;
>>> +    domctl.domain = domain_id;
>>> +    domctl.u.monitor_op.op = disable ? XEN_DOMCTL_MONITOR_OP_ENABLE
>>> +                                    : XEN_DOMCTL_MONITOR_OP_DISABLE;
>>> +    domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT;
>>> +
>>> +    return do_domctl(xch, &domctl);
>>> +}
>>> +
>>>  int xc_monitor_emulate_each_rep(xc_interface *xch, uint32_t domain_id,
>>>                                  bool enable)
>>>  {
>>> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
>>> index c0cd017..07a334b 100644
>>> --- a/xen/arch/x86/mm/mem_access.c
>>> +++ b/xen/arch/x86/mm/mem_access.c
>>> @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
>>>      return violation;
>>>  }
>>>
>>> +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
>>> +{
>>> +    struct hvm_hw_cpu ctxt;
>>> +    uint32_t pfec = 0;
>>> +
>>> +    hvm_funcs.save_cpu_ctxt(v, &ctxt);
>>> +
>>> +    if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
>>> +         && ga == v->arch.pg_dirty.gla )
>>> +        pfec = PFEC_write_access;
>>> +
>>> +    paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
>>> +
>>> +    v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
>>> +    v->arch.pg_dirty.gla = ga;
>>> +}
>>> +
>>>  bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>>>                            struct npfec npfec,
>>>                            vm_event_request_t **req_ptr)
>>> @@ -208,6 +225,16 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>>>          }
>>>      }
>>>
>>> +    if ( vm_event_check_ring(d->vm_event_monitor) &&
>>> +         d->arch.monitor.nested_pagefault_disabled &&
>>> +         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>> +    {
>>> +        v->arch.vm_event->emulate_flags = 0;
>>> +        p2m_set_ad_bits(v, gla);
>>> +
>>> +        return true;
>>> +    }
>>> +
>>>      *req_ptr = NULL;
>>>      req = xzalloc(vm_event_request_t);
>>>      if ( req )
>>> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
>>> index f229e69..e35b619 100644
>>> --- a/xen/arch/x86/monitor.c
>>> +++ b/xen/arch/x86/monitor.c
>>> @@ -241,6 +241,19 @@ int arch_monitor_domctl_event(struct domain *d,
>>>          break;
>>>      }
>>>
>>> +    case XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT:
>>> +    {
>>> +        bool old_status = ad->monitor.nested_pagefault_disabled;
>>> +
>>> +        if ( unlikely(old_status == requested_status) )
>>> +            return -EEXIST;
>>> +
>>> +        domain_pause(d);
>>> +        ad->monitor.nested_pagefault_disabled = requested_status;
>>> +        domain_unpause(d);
>>> +        break;
>>> +    }
>>> +
>>>      case XEN_DOMCTL_MONITOR_EVENT_DESC_ACCESS:
>>>      {
>>>          bool old_status = ad->monitor.descriptor_access_enabled;
>>> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
>>> index 4679d54..099af7c 100644
>>> --- a/xen/include/asm-x86/domain.h
>>> +++ b/xen/include/asm-x86/domain.h
>>> @@ -412,6 +412,7 @@ struct arch_domain
>>>          unsigned int descriptor_access_enabled                             : 1;
>>>          unsigned int guest_request_userspace_enabled                       : 1;
>>>          unsigned int emul_unimplemented_enabled                            : 1;
>>> +        unsigned int nested_pagefault_disabled                             : 1;
>>
>> All other options are "_enabled" here, so adding one that's flipped
>> just looks out of place. Any objections to making this match the rest?
>> Also, naming it "nested" just makes me think this would somehow be
>> related to nested virtualization, but that's not the case. These would
>> be just regular pagefaults in the guest, so naming the monitor option
>> simply "pagefault" would look better to me in general.
> Hello Tamas,
>
> Here's the thinking behind preferring "disabled" to "enabled": we want
> to keep the default behaviour as it is currently, and the current
> behaviour is to send out _all_ EPT fault vm_events (caused by page walks
> or not).
>
> Now, struct arch_domain is being zeroed out on init, so if we name this
> "enabled", then that's the behaviour we're starting out with. We have no
> problem with that, but it changes the current default behaviour.

We can keep the "disabled" naming but then please add a comment to the field
saying that by default all events are sent, and that this option is used to
filter pagefaults out.

>
> So either we name this new field "disabled", or we rename it to
> "enabled" (if we rename it, we either need to set it as a special case
> on init, or modify the default behaviour to be _not_ sending out
> page-walk-caused EPT events).
>
> If you feel strongly about options 2.A or 2.C we don't have a problem
> changing the code.
>
> About "pagefault", it reads as more confusing to me, since all EPT-related
> vm_events are basically page faults. But maybe that's just me.

True. It's just confusing with "nested" also having multiple meanings.
Perhaps "inguest_pagefaults"?

Tamas

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-02-23 22:06 ` Tamas K Lengyel
@ 2018-02-23 22:25   ` Razvan Cojocaru
  2018-02-23 22:31     ` Tamas K Lengyel
  0 siblings, 1 reply; 16+ messages in thread
From: Razvan Cojocaru @ 2018-02-23 22:25 UTC (permalink / raw)
  To: Tamas K Lengyel, Alexandru Isaila
  Cc: Stefano Stabellini, wei.liu2, George Dunlap, Ian Jackson, tim,
	Xen-devel, Jan Beulich, Andrew Cooper

On 02/24/2018 12:06 AM, Tamas K Lengyel wrote:
> On Mon, Jan 8, 2018 at 5:49 AM, Alexandru Isaila
> <aisaila@bitdefender.com> wrote:
>> This patch is adding a way to enable/disable nested pagefault
>> events. It introduces the xc_monitor_nested_pagefault function
>> and adds the nested_pagefault_disabled in the monitor structure.
>> This is needed by the introspection so it will only get gla
>> faults and not get spammed with other faults.
>> In p2m_set_ad_bits the v->arch.sse_pg_dirty.eip and
>> v->arch.sse_pg_dirty.gla are used to mark that this is the
>> second time a fault occurs and the dirty bit is set.
>>
>> Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
>>
>> ---
>> Changes since V1:
>>         - Rb V1
>>         - Add comment in domctl.h
>> ---
>>  tools/libxc/include/xenctrl.h |  2 ++
>>  tools/libxc/xc_monitor.c      | 14 ++++++++++++++
>>  xen/arch/x86/mm/mem_access.c  | 27 +++++++++++++++++++++++++++
>>  xen/arch/x86/monitor.c        | 13 +++++++++++++
>>  xen/include/asm-x86/domain.h  |  6 ++++++
>>  xen/include/asm-x86/monitor.h |  3 ++-
>>  xen/include/public/domctl.h   |  2 ++
>>  7 files changed, 66 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
>> index 09e1363..112c974 100644
>> --- a/tools/libxc/include/xenctrl.h
>> +++ b/tools/libxc/include/xenctrl.h
>> @@ -2056,6 +2056,8 @@ int xc_monitor_descriptor_access(xc_interface *xch, uint32_t domain_id,
>>                                   bool enable);
>>  int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id,
>>                               bool enable, bool sync, bool allow_userspace);
>> +int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
>> +                                bool disable);
>>  int xc_monitor_debug_exceptions(xc_interface *xch, uint32_t domain_id,
>>                                  bool enable, bool sync);
>>  int xc_monitor_cpuid(xc_interface *xch, uint32_t domain_id, bool enable);
>> diff --git a/tools/libxc/xc_monitor.c b/tools/libxc/xc_monitor.c
>> index 0233b87..e96c56d 100644
>> --- a/tools/libxc/xc_monitor.c
>> +++ b/tools/libxc/xc_monitor.c
>> @@ -163,6 +163,20 @@ int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id, bool enable,
>>      return do_domctl(xch, &domctl);
>>  }
>>
>> +int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
>> +                                bool disable)
>> +{
>> +    DECLARE_DOMCTL;
>> +
>> +    domctl.cmd = XEN_DOMCTL_monitor_op;
>> +    domctl.domain = domain_id;
>> +    domctl.u.monitor_op.op = disable ? XEN_DOMCTL_MONITOR_OP_ENABLE
>> +                                    : XEN_DOMCTL_MONITOR_OP_DISABLE;
>> +    domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT;
>> +
>> +    return do_domctl(xch, &domctl);
>> +}
>> +
>>  int xc_monitor_emulate_each_rep(xc_interface *xch, uint32_t domain_id,
>>                                  bool enable)
>>  {
>> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
>> index c0cd017..07a334b 100644
>> --- a/xen/arch/x86/mm/mem_access.c
>> +++ b/xen/arch/x86/mm/mem_access.c
>> @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
>>      return violation;
>>  }
>>
>> +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
>> +{
>> +    struct hvm_hw_cpu ctxt;
>> +    uint32_t pfec = 0;
>> +
>> +    hvm_funcs.save_cpu_ctxt(v, &ctxt);
>> +
>> +    if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
>> +         && ga == v->arch.pg_dirty.gla )
>> +        pfec = PFEC_write_access;
>> +
>> +    paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
>> +
>> +    v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
>> +    v->arch.pg_dirty.gla = ga;
>> +}
>> +
>>  bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>>                            struct npfec npfec,
>>                            vm_event_request_t **req_ptr)
>> @@ -208,6 +225,16 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>>          }
>>      }
>>
>> +    if ( vm_event_check_ring(d->vm_event_monitor) &&
>> +         d->arch.monitor.nested_pagefault_disabled &&
>> +         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>> +    {
>> +        v->arch.vm_event->emulate_flags = 0;
>> +        p2m_set_ad_bits(v, gla);
>> +
>> +        return true;
>> +    }
>> +
>>      *req_ptr = NULL;
>>      req = xzalloc(vm_event_request_t);
>>      if ( req )
>> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
>> index f229e69..e35b619 100644
>> --- a/xen/arch/x86/monitor.c
>> +++ b/xen/arch/x86/monitor.c
>> @@ -241,6 +241,19 @@ int arch_monitor_domctl_event(struct domain *d,
>>          break;
>>      }
>>
>> +    case XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT:
>> +    {
>> +        bool old_status = ad->monitor.nested_pagefault_disabled;
>> +
>> +        if ( unlikely(old_status == requested_status) )
>> +            return -EEXIST;
>> +
>> +        domain_pause(d);
>> +        ad->monitor.nested_pagefault_disabled = requested_status;
>> +        domain_unpause(d);
>> +        break;
>> +    }
>> +
>>      case XEN_DOMCTL_MONITOR_EVENT_DESC_ACCESS:
>>      {
>>          bool old_status = ad->monitor.descriptor_access_enabled;
>> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
>> index 4679d54..099af7c 100644
>> --- a/xen/include/asm-x86/domain.h
>> +++ b/xen/include/asm-x86/domain.h
>> @@ -412,6 +412,7 @@ struct arch_domain
>>          unsigned int descriptor_access_enabled                             : 1;
>>          unsigned int guest_request_userspace_enabled                       : 1;
>>          unsigned int emul_unimplemented_enabled                            : 1;
>> +        unsigned int nested_pagefault_disabled                             : 1;
> 
> All other options are "_enabled" here, so adding one that's flipped
> just looks out of place. Any objections to making this match the rest?
> Also, naming it "nested" just makes me think this would somehow be
> related to nested virtualization, but that's not the case. These would
> be just regular pagefaults in the guest, so naming the monitor option
> simply "pagefault" would look better to me in general.
Hello Tamas,

Here's the thinking behind preferring "disabled" to "enabled": we want
to keep the default behaviour as it is currently, and the current
behaviour is to send out _all_ EPT fault vm_events (caused by page walks
or not).

Now, struct arch_domain is being zeroed out on init, so if we name this
"enabled", then that's the behaviour we're starting out with. We have no
problem with that, but it changes the current default behaviour.

So either we name this new field "disabled", or we rename it to
"enabled" (if we rename it, we either need to set it as a special case
on init, or modify the default behaviour to be _not_ sending out
page-walk-caused EPT events).

If you feel strongly about options 2.A or 2.C we don't have a problem
changing the code.

About "pagefault", it reads as more confusing to me, since all EPT-related
vm_events are basically page faults. But maybe that's just me.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-01-08 12:49 Alexandru Isaila
  2018-02-23 17:46 ` Wei Liu
@ 2018-02-23 22:06 ` Tamas K Lengyel
  2018-02-23 22:25   ` Razvan Cojocaru
  1 sibling, 1 reply; 16+ messages in thread
From: Tamas K Lengyel @ 2018-02-23 22:06 UTC (permalink / raw)
  To: Alexandru Isaila
  Cc: tim, Stefano Stabellini, wei.liu2, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Ian Jackson, Xen-devel,
	Jan Beulich

On Mon, Jan 8, 2018 at 5:49 AM, Alexandru Isaila
<aisaila@bitdefender.com> wrote:
> This patch is adding a way to enable/disable nested pagefault
> events. It introduces the xc_monitor_nested_pagefault function
> and adds the nested_pagefault_disabled in the monitor structure.
> This is needed by the introspection so it will only get gla
> faults and not get spammed with other faults.
> In p2m_set_ad_bits the v->arch.sse_pg_dirty.eip and
> v->arch.sse_pg_dirty.gla are used to mark that this is the
> second time a fault occurs and the dirty bit is set.
>
> Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
>
> ---
> Changes since V1:
>         - Rb V1
>         - Add comment in domctl.h
> ---
>  tools/libxc/include/xenctrl.h |  2 ++
>  tools/libxc/xc_monitor.c      | 14 ++++++++++++++
>  xen/arch/x86/mm/mem_access.c  | 27 +++++++++++++++++++++++++++
>  xen/arch/x86/monitor.c        | 13 +++++++++++++
>  xen/include/asm-x86/domain.h  |  6 ++++++
>  xen/include/asm-x86/monitor.h |  3 ++-
>  xen/include/public/domctl.h   |  2 ++
>  7 files changed, 66 insertions(+), 1 deletion(-)
>
> diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
> index 09e1363..112c974 100644
> --- a/tools/libxc/include/xenctrl.h
> +++ b/tools/libxc/include/xenctrl.h
> @@ -2056,6 +2056,8 @@ int xc_monitor_descriptor_access(xc_interface *xch, uint32_t domain_id,
>                                   bool enable);
>  int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id,
>                               bool enable, bool sync, bool allow_userspace);
> +int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
> +                                bool disable);
>  int xc_monitor_debug_exceptions(xc_interface *xch, uint32_t domain_id,
>                                  bool enable, bool sync);
>  int xc_monitor_cpuid(xc_interface *xch, uint32_t domain_id, bool enable);
> diff --git a/tools/libxc/xc_monitor.c b/tools/libxc/xc_monitor.c
> index 0233b87..e96c56d 100644
> --- a/tools/libxc/xc_monitor.c
> +++ b/tools/libxc/xc_monitor.c
> @@ -163,6 +163,20 @@ int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id, bool enable,
>      return do_domctl(xch, &domctl);
>  }
>
> +int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
> +                                bool disable)
> +{
> +    DECLARE_DOMCTL;
> +
> +    domctl.cmd = XEN_DOMCTL_monitor_op;
> +    domctl.domain = domain_id;
> +    domctl.u.monitor_op.op = disable ? XEN_DOMCTL_MONITOR_OP_ENABLE
> +                                    : XEN_DOMCTL_MONITOR_OP_DISABLE;
> +    domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT;
> +
> +    return do_domctl(xch, &domctl);
> +}
> +
>  int xc_monitor_emulate_each_rep(xc_interface *xch, uint32_t domain_id,
>                                  bool enable)
>  {
> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
> index c0cd017..07a334b 100644
> --- a/xen/arch/x86/mm/mem_access.c
> +++ b/xen/arch/x86/mm/mem_access.c
> @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
>      return violation;
>  }
>
> +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
> +{
> +    struct hvm_hw_cpu ctxt;
> +    uint32_t pfec = 0;
> +
> +    hvm_funcs.save_cpu_ctxt(v, &ctxt);
> +
> +    if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
> +         && ga == v->arch.pg_dirty.gla )
> +        pfec = PFEC_write_access;
> +
> +    paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
> +
> +    v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
> +    v->arch.pg_dirty.gla = ga;
> +}
> +
>  bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>                            struct npfec npfec,
>                            vm_event_request_t **req_ptr)
> @@ -208,6 +225,16 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>          }
>      }
>
> +    if ( vm_event_check_ring(d->vm_event_monitor) &&
> +         d->arch.monitor.nested_pagefault_disabled &&
> +         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
> +    {
> +        v->arch.vm_event->emulate_flags = 0;
> +        p2m_set_ad_bits(v, gla);
> +
> +        return true;
> +    }
> +
>      *req_ptr = NULL;
>      req = xzalloc(vm_event_request_t);
>      if ( req )
> diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
> index f229e69..e35b619 100644
> --- a/xen/arch/x86/monitor.c
> +++ b/xen/arch/x86/monitor.c
> @@ -241,6 +241,19 @@ int arch_monitor_domctl_event(struct domain *d,
>          break;
>      }
>
> +    case XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT:
> +    {
> +        bool old_status = ad->monitor.nested_pagefault_disabled;
> +
> +        if ( unlikely(old_status == requested_status) )
> +            return -EEXIST;
> +
> +        domain_pause(d);
> +        ad->monitor.nested_pagefault_disabled = requested_status;
> +        domain_unpause(d);
> +        break;
> +    }
> +
>      case XEN_DOMCTL_MONITOR_EVENT_DESC_ACCESS:
>      {
>          bool old_status = ad->monitor.descriptor_access_enabled;
> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
> index 4679d54..099af7c 100644
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -412,6 +412,7 @@ struct arch_domain
>          unsigned int descriptor_access_enabled                             : 1;
>          unsigned int guest_request_userspace_enabled                       : 1;
>          unsigned int emul_unimplemented_enabled                            : 1;
> +        unsigned int nested_pagefault_disabled                             : 1;

All other options are "_enabled" here, so adding one that's flipped
just looks out of place. Any objections to making this match the rest?
Also, naming it "nested" just makes me think this would somehow be
related to nested virtualization, but that's not the case. These would
be just regular pagefaults in the guest, so naming the monitor option
simply "pagefault" would look better to me in general.

>          struct monitor_msr_bitmap *msr_bitmap;
>          uint64_t write_ctrlreg_mask[4];
>      } monitor;
> @@ -579,6 +580,11 @@ struct arch_vcpu
>      /* A secondary copy of the vcpu time info. */
>      XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
>
> +    struct {
> +        unsigned long eip;
> +        unsigned long gla;
> +    } pg_dirty;
> +
>      struct arch_vm_event *vm_event;
>
>      struct msr_vcpu_policy *msr;
> diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
> index a0444d1..a7a0b56 100644
> --- a/xen/include/asm-x86/monitor.h
> +++ b/xen/include/asm-x86/monitor.h
> @@ -84,7 +84,8 @@ static inline uint32_t arch_monitor_get_capabilities(struct domain *d)
>                     (1U << XEN_DOMCTL_MONITOR_EVENT_DEBUG_EXCEPTION) |
>                     (1U << XEN_DOMCTL_MONITOR_EVENT_CPUID) |
>                     (1U << XEN_DOMCTL_MONITOR_EVENT_INTERRUPT) |
> -                   (1U << XEN_DOMCTL_MONITOR_EVENT_EMUL_UNIMPLEMENTED);
> +                   (1U << XEN_DOMCTL_MONITOR_EVENT_EMUL_UNIMPLEMENTED) |
> +                   (1U << XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT);
>
>      /* Since we know this is on VMX, we can just call the hvm func */
>      if ( hvm_is_singlestep_supported() )
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> index 9ae72959..706c6ea 100644
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -1014,6 +1014,8 @@ struct xen_domctl_psr_cmt_op {
>  #define XEN_DOMCTL_MONITOR_EVENT_INTERRUPT             8
>  #define XEN_DOMCTL_MONITOR_EVENT_DESC_ACCESS           9
>  #define XEN_DOMCTL_MONITOR_EVENT_EMUL_UNIMPLEMENTED    10
> +/* Enabled by default */
> +#define XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT      11
>
>  struct xen_domctl_monitor_op {
>      uint32_t op; /* XEN_DOMCTL_MONITOR_OP_* */
> --
> 2.7.4

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
  2018-01-08 12:49 Alexandru Isaila
@ 2018-02-23 17:46 ` Wei Liu
  2018-02-26  8:15   ` Jan Beulich
  2018-02-23 22:06 ` Tamas K Lengyel
  1 sibling, 1 reply; 16+ messages in thread
From: Wei Liu @ 2018-02-23 17:46 UTC (permalink / raw)
  To: Alexandru Isaila
  Cc: tim, sstabellini, wei.liu2, rcojocaru, George.Dunlap,
	andrew.cooper3, ian.jackson, xen-devel, tamas, jbeulich

On Mon, Jan 08, 2018 at 02:49:44PM +0200, Alexandru Isaila wrote:
> This patch is adding a way to enable/disable nested pagefault
> events. It introduces the xc_monitor_nested_pagefault function
> and adds the nested_pagefault_disabled in the monitor structure.
> This is needed by the introspection so it will only get gla
> faults and not get spammed with other faults.
> In p2m_set_ad_bits the v->arch.sse_pg_dirty.eip and
> v->arch.sse_pg_dirty.gla are used to mark that this is the
> second time a fault occurs and the dirty bit is set.
> 
> Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
> 
> ---
> Changes since V1:
>         - Rb V1
> 	- Add comment in domctl.h
> ---
>  tools/libxc/include/xenctrl.h |  2 ++
>  tools/libxc/xc_monitor.c      | 14 ++++++++++++++

These changes look sensible to me.

>  xen/arch/x86/mm/mem_access.c  | 27 +++++++++++++++++++++++++++
>  xen/arch/x86/monitor.c        | 13 +++++++++++++
>  xen/include/asm-x86/domain.h  |  6 ++++++
>  xen/include/asm-x86/monitor.h |  3 ++-
>  xen/include/public/domctl.h   |  2 ++

You might want to bump the domctl version number if it has not been done
already in this release.

Wei.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks
@ 2018-01-08 12:49 Alexandru Isaila
  2018-02-23 17:46 ` Wei Liu
  2018-02-23 22:06 ` Tamas K Lengyel
  0 siblings, 2 replies; 16+ messages in thread
From: Alexandru Isaila @ 2018-01-08 12:49 UTC (permalink / raw)
  To: xen-devel
  Cc: sstabellini, wei.liu2, rcojocaru, george.dunlap, andrew.cooper3,
	ian.jackson, tim, tamas, jbeulich, Alexandru Isaila

This patch is adding a way to enable/disable nested pagefault
events. It introduces the xc_monitor_nested_pagefault function
and adds the nested_pagefault_disabled in the monitor structure.
This is needed by the introspection so it will only get gla
faults and not get spammed with other faults.
In p2m_set_ad_bits the v->arch.sse_pg_dirty.eip and
v->arch.sse_pg_dirty.gla are used to mark that this is the
second time a fault occurs and the dirty bit is set.

Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>

---
Changes since V1:
        - Rb V1
	- Add comment in domctl.h
---
 tools/libxc/include/xenctrl.h |  2 ++
 tools/libxc/xc_monitor.c      | 14 ++++++++++++++
 xen/arch/x86/mm/mem_access.c  | 27 +++++++++++++++++++++++++++
 xen/arch/x86/monitor.c        | 13 +++++++++++++
 xen/include/asm-x86/domain.h  |  6 ++++++
 xen/include/asm-x86/monitor.h |  3 ++-
 xen/include/public/domctl.h   |  2 ++
 7 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 09e1363..112c974 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2056,6 +2056,8 @@ int xc_monitor_descriptor_access(xc_interface *xch, uint32_t domain_id,
                                  bool enable);
 int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id,
                              bool enable, bool sync, bool allow_userspace);
+int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
+                                bool disable);
 int xc_monitor_debug_exceptions(xc_interface *xch, uint32_t domain_id,
                                 bool enable, bool sync);
 int xc_monitor_cpuid(xc_interface *xch, uint32_t domain_id, bool enable);
diff --git a/tools/libxc/xc_monitor.c b/tools/libxc/xc_monitor.c
index 0233b87..e96c56d 100644
--- a/tools/libxc/xc_monitor.c
+++ b/tools/libxc/xc_monitor.c
@@ -163,6 +163,20 @@ int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id, bool enable,
     return do_domctl(xch, &domctl);
 }
 
+int xc_monitor_nested_pagefault(xc_interface *xch, uint32_t domain_id,
+                                bool disable)
+{
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_monitor_op;
+    domctl.domain = domain_id;
+    domctl.u.monitor_op.op = disable ? XEN_DOMCTL_MONITOR_OP_ENABLE
+                                    : XEN_DOMCTL_MONITOR_OP_DISABLE;
+    domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT;
+
+    return do_domctl(xch, &domctl);
+}
+
 int xc_monitor_emulate_each_rep(xc_interface *xch, uint32_t domain_id,
                                 bool enable)
 {
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index c0cd017..07a334b 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
     return violation;
 }
 
+static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
+{
+    struct hvm_hw_cpu ctxt;
+    uint32_t pfec = 0;
+
+    hvm_funcs.save_cpu_ctxt(v, &ctxt);
+
+    if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
+         && ga == v->arch.pg_dirty.gla )
+        pfec = PFEC_write_access;
+
+    paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
+
+    v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
+    v->arch.pg_dirty.gla = ga;
+}
+
 bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
                           struct npfec npfec,
                           vm_event_request_t **req_ptr)
@@ -208,6 +225,16 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
         }
     }
 
+    if ( vm_event_check_ring(d->vm_event_monitor) &&
+         d->arch.monitor.nested_pagefault_disabled &&
+         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
+    {
+        v->arch.vm_event->emulate_flags = 0;
+        p2m_set_ad_bits(v, gla);
+
+        return true;
+    }
+
     *req_ptr = NULL;
     req = xzalloc(vm_event_request_t);
     if ( req )
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index f229e69..e35b619 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
@@ -241,6 +241,19 @@ int arch_monitor_domctl_event(struct domain *d,
         break;
     }
 
+    case XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT:
+    {
+        bool old_status = ad->monitor.nested_pagefault_disabled;
+
+        if ( unlikely(old_status == requested_status) )
+            return -EEXIST;
+
+        domain_pause(d);
+        ad->monitor.nested_pagefault_disabled = requested_status;
+        domain_unpause(d);
+        break;
+    }
+
     case XEN_DOMCTL_MONITOR_EVENT_DESC_ACCESS:
     {
         bool old_status = ad->monitor.descriptor_access_enabled;
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 4679d54..099af7c 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -412,6 +412,7 @@ struct arch_domain
         unsigned int descriptor_access_enabled                             : 1;
         unsigned int guest_request_userspace_enabled                       : 1;
         unsigned int emul_unimplemented_enabled                            : 1;
+        unsigned int nested_pagefault_disabled                             : 1;
         struct monitor_msr_bitmap *msr_bitmap;
         uint64_t write_ctrlreg_mask[4];
     } monitor;
@@ -579,6 +580,11 @@ struct arch_vcpu
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
 
+    struct {
+        unsigned long eip;
+        unsigned long gla;
+    } pg_dirty;
+
     struct arch_vm_event *vm_event;
 
     struct msr_vcpu_policy *msr;
diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
index a0444d1..a7a0b56 100644
--- a/xen/include/asm-x86/monitor.h
+++ b/xen/include/asm-x86/monitor.h
@@ -84,7 +84,8 @@ static inline uint32_t arch_monitor_get_capabilities(struct domain *d)
                    (1U << XEN_DOMCTL_MONITOR_EVENT_DEBUG_EXCEPTION) |
                    (1U << XEN_DOMCTL_MONITOR_EVENT_CPUID) |
                    (1U << XEN_DOMCTL_MONITOR_EVENT_INTERRUPT) |
-                   (1U << XEN_DOMCTL_MONITOR_EVENT_EMUL_UNIMPLEMENTED);
+                   (1U << XEN_DOMCTL_MONITOR_EVENT_EMUL_UNIMPLEMENTED) |
+                   (1U << XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT);
 
     /* Since we know this is on VMX, we can just call the hvm func */
     if ( hvm_is_singlestep_supported() )
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 9ae72959..706c6ea 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -1014,6 +1014,8 @@ struct xen_domctl_psr_cmt_op {
 #define XEN_DOMCTL_MONITOR_EVENT_INTERRUPT             8
 #define XEN_DOMCTL_MONITOR_EVENT_DESC_ACCESS           9
 #define XEN_DOMCTL_MONITOR_EVENT_EMUL_UNIMPLEMENTED    10
+/* Enabled by default */
+#define XEN_DOMCTL_MONITOR_EVENT_NESTED_PAGEFAULT      11
 
 struct xen_domctl_monitor_op {
     uint32_t op; /* XEN_DOMCTL_MONITOR_OP_* */
-- 
2.7.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2018-09-19 14:31 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-12  9:47 [PATCH v2] x86/mm: Suppresses vm_events caused by page-walks Alexandru Isaila
2018-09-13 14:17 ` Jan Beulich
2018-09-18  9:47   ` Isaila Alexandru
2018-09-18 10:17     ` Jan Beulich
2018-09-18 18:20       ` Andrew Cooper
2018-09-18 18:58         ` Razvan Cojocaru
2018-09-19  8:53         ` Jan Beulich
2018-09-19 13:41           ` Andrew Cooper
2018-09-19 14:31             ` Jan Beulich
  -- strict thread matches above, loose matches on Subject: below --
2018-01-08 12:49 Alexandru Isaila
2018-02-23 17:46 ` Wei Liu
2018-02-26  8:15   ` Jan Beulich
2018-02-23 22:06 ` Tamas K Lengyel
2018-02-23 22:25   ` Razvan Cojocaru
2018-02-23 22:31     ` Tamas K Lengyel
2018-02-23 22:36       ` Razvan Cojocaru

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.