All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
@ 2018-08-24 14:11 Alexandru Isaila
  2018-08-24 17:49 ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Alexandru Isaila @ 2018-08-24 14:11 UTC (permalink / raw)
  To: xen-devel
  Cc: tamas, rcojocaru, george.dunlap, andrew.cooper3, tim, jbeulich,
	Alexandru Isaila

The original version of the patch emulated the current instruction
(which, as a side-effect, emulated the page-walk as well), however we
need finer-grained control. We want to emulate the page-walk, but still
get an EPT violation event if the current instruction would trigger one.
This patch performs just the page-walk emulation.

Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
---
 xen/arch/x86/mm/guest_walk.c     |  7 ++++++-
 xen/arch/x86/mm/hap/guest_walk.c | 32 +++++++++++++++++++++++++++++++-
 xen/arch/x86/mm/hap/hap.c        | 12 ++++++++----
 xen/arch/x86/mm/hap/private.h    | 10 ++++++++++
 xen/arch/x86/mm/mem_access.c     | 15 ++++++++++++++-
 xen/arch/x86/mm/shadow/multi.c   |  6 +++---
 xen/include/asm-x86/guest_pt.h   |  3 ++-
 xen/include/asm-x86/paging.h     |  5 ++++-
 8 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
index f67aeda..54140b9 100644
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -82,7 +82,7 @@ static bool set_ad_bits(guest_intpte_t *guest_p, guest_intpte_t *walk_p,
 bool
 guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                   unsigned long va, walk_t *gw,
-                  uint32_t walk, mfn_t top_mfn, void *top_map)
+                  uint32_t walk, mfn_t top_mfn, void *top_map, uint32_t *gf)
 {
     struct domain *d = v->domain;
     p2m_type_t p2mt;
@@ -361,6 +361,11 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
      * see whether the access should succeed.
      */
     ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);
+    if ( gf )
+    {
+        *gf = ar;
+        goto out;
+    }
 
     /*
      * Sanity check.  If EFER.NX is disabled, _PAGE_NX_BIT is reserved and
diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c
index cb3f9ce..c916b67 100644
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -29,6 +29,9 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
 #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
 
+#define _hap_pte_flags(levels) hap_pte_flags_##levels##_levels
+#define hap_pte_flags(levels) _hap_pte_flags(levels)
+
 #define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
 #define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
 
@@ -39,6 +42,33 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <asm/guest_pt.h>
 #include <asm/p2m.h>
 
+bool hap_pte_flags(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, struct p2m_domain *p2m,
+    unsigned long va, uint32_t walk, unsigned long cr3,
+    uint32_t *gf)
+{
+    walk_t gw;
+    mfn_t top_mfn;
+    void *top_map;
+    gfn_t top_gfn;
+    struct page_info *top_page;
+    p2m_type_t p2mt;
+
+    top_gfn = _gfn(cr3 >> PAGE_SHIFT);
+    top_page = p2m_get_page_from_gfn(p2m, top_gfn, &p2mt, NULL,
+                                     P2M_ALLOC | P2M_UNSHARE);
+    top_mfn = page_to_mfn(top_page);
+
+    /* Map the top-level table and call the tree-walker */
+    ASSERT(mfn_valid(top_mfn));
+    top_map = map_domain_page(top_mfn);
+#if GUEST_PAGING_LEVELS == 3
+    top_map += (cr3 & ~(PAGE_MASK | 31));
+#endif
+
+    return guest_walk_tables(v, p2m, va, &gw, walk, top_mfn, top_map, gf);
+}
+
 unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
     struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
 {
@@ -91,7 +121,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map);
+    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map, NULL);
     unmap_domain_page(top_map);
     put_page(top_page);
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 812a840..2da7b63 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -767,7 +767,8 @@ static const struct paging_mode hap_paging_real_mode = {
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
-    .guest_levels           = 1
+    .guest_levels           = 1,
+    .pte_flags              = hap_pte_flags_2_levels
 };
 
 static const struct paging_mode hap_paging_protected_mode = {
@@ -778,7 +779,8 @@ static const struct paging_mode hap_paging_protected_mode = {
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
-    .guest_levels           = 2
+    .guest_levels           = 2,
+    .pte_flags              = hap_pte_flags_2_levels
 };
 
 static const struct paging_mode hap_paging_pae_mode = {
@@ -789,7 +791,8 @@ static const struct paging_mode hap_paging_pae_mode = {
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
-    .guest_levels           = 3
+    .guest_levels           = 3,
+    .pte_flags              = hap_pte_flags_3_levels
 };
 
 static const struct paging_mode hap_paging_long_mode = {
@@ -800,7 +803,8 @@ static const struct paging_mode hap_paging_long_mode = {
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
-    .guest_levels           = 4
+    .guest_levels           = 4,
+    .pte_flags              = hap_pte_flags_4_levels
 };
 
 /*
diff --git a/xen/arch/x86/mm/hap/private.h b/xen/arch/x86/mm/hap/private.h
index 973fbe8..615b02a 100644
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -47,4 +47,14 @@ unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
     struct p2m_domain *p2m, unsigned long cr3,
     paddr_t ga, uint32_t *pfec, unsigned int *page_order);
 
+bool hap_pte_flags_2_levels(struct vcpu *v, struct p2m_domain *p2m,
+                            unsigned long va, uint32_t walk, unsigned long cr3,
+                            uint32_t *gf);
+bool hap_pte_flags_3_levels(struct vcpu *v, struct p2m_domain *p2m,
+                            unsigned long va, uint32_t walk, unsigned long cr3,
+                            uint32_t *gf);
+bool hap_pte_flags_4_levels(struct vcpu *v, struct p2m_domain *p2m,
+                            unsigned long va,  uint32_t walk, unsigned long cr3,
+                            uint32_t *gf);
+
 #endif /* __HAP_PRIVATE_H__ */
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index 03a864156..b01194d 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -212,7 +212,20 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
          d->arch.monitor.inguest_pagefault_disabled &&
          npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
     {
-        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
+        struct hvm_hw_cpu ctxt;
+        uint32_t pfec = PFEC_page_present;
+        unsigned long gfn;
+        uint32_t gflags;
+
+        hvm_funcs.save_cpu_ctxt(v, &ctxt);
+        paging_get_hostmode(v)->pte_flags(v, p2m, gla, 0, ctxt.cr3, &gflags);
+        if ( gflags & _PAGE_RW )
+            pfec |= PFEC_write_access;
+
+        if ( gflags & _PAGE_USER )
+            pfec |= PFEC_user_mode;
+
+        gfn = paging_ga_to_gfn_cr3(v, ctxt.cr3, gla, &pfec, NULL);
 
         return true;
     }
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 021ae25..199873f3 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -178,12 +178,12 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
     return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
 #if GUEST_PAGING_LEVELS == 3 /* PAE */
                              INVALID_MFN,
-                             v->arch.paging.shadow.gl3e
+                             v->arch.paging.shadow.gl3e,
 #else /* 32 or 64 */
                              pagetable_get_mfn(v->arch.guest_table),
-                             v->arch.paging.shadow.guest_vtable
+                             v->arch.paging.shadow.guest_vtable,
 #endif
-                             );
+                            NULL );
 }
 
 /* This validation is called with lock held, and after write permission
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
index 08031c8..523ae34 100644
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -425,7 +425,8 @@ static inline unsigned int guest_walk_to_page_order(const walk_t *gw)
 
 bool
 guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va,
-                  walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map);
+                  walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map,
+                  uint32_t *gf);
 
 /* Pretty-print the contents of a guest-walk */
 static inline void print_gw(const walk_t *gw)
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index f008551..e5f21c9 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -122,7 +122,10 @@ struct paging_mode {
     void          (*write_p2m_entry       )(struct domain *d, unsigned long gfn,
                                             l1_pgentry_t *p, l1_pgentry_t new,
                                             unsigned int level);
-
+    bool          (*pte_flags             )(struct vcpu *v,
+                                            struct p2m_domain *p2m,
+                                            unsigned long va, uint32_t walk,
+                                            unsigned long cr3, uint32_t *gf);
     unsigned int guest_levels;
 
     /* paging support extension */
-- 
2.7.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-24 14:11 [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks Alexandru Isaila
@ 2018-08-24 17:49 ` Andrew Cooper
  2018-08-24 18:47   ` Razvan Cojocaru
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2018-08-24 17:49 UTC (permalink / raw)
  To: Alexandru Isaila, xen-devel
  Cc: george.dunlap, tamas, tim, rcojocaru, jbeulich

On 24/08/18 15:11, Alexandru Isaila wrote:
> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
> index 03a864156..b01194d 100644
> --- a/xen/arch/x86/mm/mem_access.c
> +++ b/xen/arch/x86/mm/mem_access.c
> @@ -212,7 +212,20 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>           d->arch.monitor.inguest_pagefault_disabled &&
>           npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>      {
> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
> +        struct hvm_hw_cpu ctxt;
> +        uint32_t pfec = PFEC_page_present;
> +        unsigned long gfn;
> +        uint32_t gflags;
> +
> +        hvm_funcs.save_cpu_ctxt(v, &ctxt);
> +        paging_get_hostmode(v)->pte_flags(v, p2m, gla, 0, ctxt.cr3, &gflags);
> +        if ( gflags & _PAGE_RW )
> +            pfec |= PFEC_write_access;
> +
> +        if ( gflags & _PAGE_USER )
> +            pfec |= PFEC_user_mode;

As I've tried to explain before, this is architecturally incorrect. 
Both need to be derived from the EPT violation, because they are
properties of the instruction that caused the fault, not the mapping which
faulted.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-24 17:49 ` Andrew Cooper
@ 2018-08-24 18:47   ` Razvan Cojocaru
  2018-08-27  9:11     ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Razvan Cojocaru @ 2018-08-24 18:47 UTC (permalink / raw)
  To: Andrew Cooper, Alexandru Isaila, xen-devel
  Cc: george.dunlap, Andrei LUTAS, tamas, tim, jbeulich

On 8/24/18 8:49 PM, Andrew Cooper wrote:
> On 24/08/18 15:11, Alexandru Isaila wrote:
>> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
>> index 03a864156..b01194d 100644
>> --- a/xen/arch/x86/mm/mem_access.c
>> +++ b/xen/arch/x86/mm/mem_access.c
>> @@ -212,7 +212,20 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>>           d->arch.monitor.inguest_pagefault_disabled &&
>>           npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>      {
>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>> +        struct hvm_hw_cpu ctxt;
>> +        uint32_t pfec = PFEC_page_present;
>> +        unsigned long gfn;
>> +        uint32_t gflags;
>> +
>> +        hvm_funcs.save_cpu_ctxt(v, &ctxt);
>> +        paging_get_hostmode(v)->pte_flags(v, p2m, gla, 0, ctxt.cr3, &gflags);
>> +        if ( gflags & _PAGE_RW )
>> +            pfec |= PFEC_write_access;
>> +
>> +        if ( gflags & _PAGE_USER )
>> +            pfec |= PFEC_user_mode;
> 
> As I've tried to explain before, this is architecturally incorrect. 
> Both need to be derived from the EPT violation, because they are
> properties of instruction which caused the fault, not the mapping which
> faulted.

Right, I assume you mean starting to use eff_read, eff_write, eff_exec,
and eff_user_exec from ept_qual_t:

619 /* EPT violation qualifications definitions */
620 typedef union ept_qual {
621     unsigned long raw;
622     struct {
623         bool read:1, write:1, fetch:1,
624             eff_read:1, eff_write:1, eff_exec:1, /* eff_user_exec */:1,
625             gla_valid:1,
626             gla_fault:1; /* Valid iff gla_valid. */
627         unsigned long /* pad */:55;
628     };
629 } __transparent__ ept_qual_t;

Unfortunately, I've been told that that's not the way to go since those
fields will be relevant only on newer architectures, and we'd like to be
able to support even older ones. And, of course, the other thing is that
they're VMX / EPT specific, and we were hoping to be able to get SVM
support for free like that.

Would we be able to properly emulate just the walk here (with correct
input) in any way that would support older architectures as well? If
not, is my understanding correct that eff_user_exec would then stand for
PFEC_user_mode, and eff_write for PFEC_write_access in the logic you've
quoted?


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-24 18:47   ` Razvan Cojocaru
@ 2018-08-27  9:11     ` Jan Beulich
  2018-08-27 11:24       ` Razvan Cojocaru
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2018-08-27  9:11 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Andrew Cooper,
	Tim Deegan, xen-devel, aisaila

>>> On 24.08.18 at 20:47, <rcojocaru@bitdefender.com> wrote:
> On 8/24/18 8:49 PM, Andrew Cooper wrote:
>> On 24/08/18 15:11, Alexandru Isaila wrote:
>>> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
>>> index 03a864156..b01194d 100644
>>> --- a/xen/arch/x86/mm/mem_access.c
>>> +++ b/xen/arch/x86/mm/mem_access.c
>>> @@ -212,7 +212,20 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>>>           d->arch.monitor.inguest_pagefault_disabled &&
>>>           npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>>      {
>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>>> +        struct hvm_hw_cpu ctxt;
>>> +        uint32_t pfec = PFEC_page_present;
>>> +        unsigned long gfn;
>>> +        uint32_t gflags;
>>> +
>>> +        hvm_funcs.save_cpu_ctxt(v, &ctxt);
>>> +        paging_get_hostmode(v)->pte_flags(v, p2m, gla, 0, ctxt.cr3, &gflags);
>>> +        if ( gflags & _PAGE_RW )
>>> +            pfec |= PFEC_write_access;
>>> +
>>> +        if ( gflags & _PAGE_USER )
>>> +            pfec |= PFEC_user_mode;
>> 
>> As I've tried to explain before, this is architecturally incorrect. 
>> Both need to be derived from the EPT violation, because they are
>> properties of instruction which caused the fault, not the mapping which
>> faulted.
> 
> Right, I assume you mean starting to use eff_read, eff_write, eff_exec,
> and eff_user_exec from ept_qual_t:
> 
> 619 /* EPT violation qualifications definitions */
> 620 typedef union ept_qual {
> 621     unsigned long raw;
> 622     struct {
> 623         bool read:1, write:1, fetch:1,
> 624             eff_read:1, eff_write:1, eff_exec:1, /* eff_user_exec */:1,
> 625             gla_valid:1,
> 626             gla_fault:1; /* Valid iff gla_valid. */
> 627         unsigned long /* pad */:55;
> 628     };
> 629 } __transparent__ ept_qual_t;
> 
> Unfortunately, I've been told that that's not the way to go since those
> fields will be relevant only on newer architectures, and we'd like to be
> able to support even older ones. And, of course, the other thing is that
> they're VMX / EPT specific, and we were hoping to be able to get SVM
> support for free like that.

If this was the case (I didn't check, and you didn't provide a
pointer along with the "I've been told"), at the very least on
newer hardware correct behavior should be implemented.

For NPT, isn't there an error code bit telling you whether the
request was a user or "system" one? If not, some cheating
would be needed (derive from CPL, accepting that e.g.
descriptor table accesses would get mis-attributed), but
that's still not going to involve looking at the PTE flags.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27  9:11     ` Jan Beulich
@ 2018-08-27 11:24       ` Razvan Cojocaru
  2018-08-27 12:12         ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Razvan Cojocaru @ 2018-08-27 11:24 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Andrew Cooper,
	Tim Deegan, xen-devel, aisaila

On 8/27/18 12:11 PM, Jan Beulich wrote:
>>>> On 24.08.18 at 20:47, <rcojocaru@bitdefender.com> wrote:
>> 619 /* EPT violation qualifications definitions */
>> 620 typedef union ept_qual {
>> 621     unsigned long raw;
>> 622     struct {
>> 623         bool read:1, write:1, fetch:1,
>> 624             eff_read:1, eff_write:1, eff_exec:1, /* eff_user_exec */:1,
>> 625             gla_valid:1,
>> 626             gla_fault:1; /* Valid iff gla_valid. */
>> 627         unsigned long /* pad */:55;
>> 628     };
>> 629 } __transparent__ ept_qual_t;
>>
>> Unfortunately, I've been told that that's not the way to go since those
>> fields will be relevant only on newer architectures, and we'd like to be
>> able to support even older ones. And, of course, the other thing is that
>> they're VMX / EPT specific, and we were hoping to be able to get SVM
>> support for free like that.
> 
> If this was the case (I didn't check, and you didn't provide a
> pointer along with the "I've been told"), at the very least on
> newer hardware correct behavior should be implemented.

Sorry for being unclear. If the question is who told me, it's been
decided internally that the product should support older processors. If
the question is what does "older" mean, we've checked a random Skylake
CPU with the "if bit 22 is read as 1, the processor reports advanced
VM-exit information for EPT violations (see Section 27.2.1). This
reporting is done only if this bit is read as 1" method, and the
respective bit was not set.

The information is in Table 27-6 of the Intel SDM.

> For NPT, isn't there an error code bit telling you whether the
> request was a user or "system" one? If not, some cheating
> would be needed (derive from CPL, accepting that e.g.
> descriptor table accesses would get mis-attributed), but
> that's still not going to involve looking at the PTE flags.

The alternative would be to simply walk (without enforcing any flags,
and so making the pfec walk parameter unnecessary) to the respective
address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.

If _PAGE_ACCESSED is not set, set it and exit.
If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.

The idea here being that (1) "no TLB entry or paging-structure cache
entry is created with information from a paging-structure entry in which
the accessed flag is 0" and (2) "a page-fault exception resulting from
an attempt to use a linear address will invalidate any TLB entries that
are for a page number corresponding to that linear address and that are
associated with the current PCID" mean that the only way we would end up
in the codepath is if we need to set the A/D bits - which we can, if we
use modified version of the code walker that simply does not care about
the pfec parameter.

This would presumably also work well for NPT, and also on CPUs that do
not support the newer EPT qualification bits.

Is this upstreamable?


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 11:24       ` Razvan Cojocaru
@ 2018-08-27 12:12         ` Jan Beulich
  2018-08-27 12:35           ` Razvan Cojocaru
  2018-08-27 12:37           ` Andrew Cooper
  0 siblings, 2 replies; 18+ messages in thread
From: Jan Beulich @ 2018-08-27 12:12 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Andrew Cooper,
	Tim Deegan, xen-devel, aisaila

>>> On 27.08.18 at 13:24, <rcojocaru@bitdefender.com> wrote:
> On 8/27/18 12:11 PM, Jan Beulich wrote:
>>>>> On 24.08.18 at 20:47, <rcojocaru@bitdefender.com> wrote:
>>> 619 /* EPT violation qualifications definitions */
>>> 620 typedef union ept_qual {
>>> 621     unsigned long raw;
>>> 622     struct {
>>> 623         bool read:1, write:1, fetch:1,
>>> 624             eff_read:1, eff_write:1, eff_exec:1, /* eff_user_exec */:1,
>>> 625             gla_valid:1,
>>> 626             gla_fault:1; /* Valid iff gla_valid. */
>>> 627         unsigned long /* pad */:55;
>>> 628     };
>>> 629 } __transparent__ ept_qual_t;
>>>
>>> Unfortunately, I've been told that that's not the way to go since those
>>> fields will be relevant only on newer architectures, and we'd like to be
>>> able to support even older ones. And, of course, the other thing is that
>>> they're VMX / EPT specific, and we were hoping to be able to get SVM
>>> support for free like that.
>> 
>> If this was the case (I didn't check, and you didn't provide a
>> pointer along with the "I've been told"), at the very least on
>> newer hardware correct behavior should be implemented.
> 
> Sorry for being unclear. If the question is who told me, it's been
> decided internally that the product should support older processors. If
> the question is what does "older" mean, we've checked a random Skylake
> CPU with the "if bit 22 is read as 1, the processor reports advanced
> VM-exit information for EPT violations (see Section 27.2.1). This
> reporting is done only if this bit is read as 1" method, and the
> respective bit was not set.

Thanks. If this information was indeed supplied by Intel only
after the original introduction of EPT, then - as said - when this
is available it should be used, but an alternative approach is
needed when the hardware doesn't supply the bits.

>> For NPT, isn't there an error code bit telling you whether the
>> request was a user or "system" one? If not, some cheating
>> would be needed (derive from CPL, accepting that e.g.
>> descriptor table accesses would get mis-attributed), but
>> that's still not going to involve looking at the PTE flags.
> 
> The alternative would be to simply walk (without enforcing any flags,
> and so making the pfec walk parameter unnecessary) to the respective
> address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.
> 
> If _PAGE_ACCESSED is not set, set it and exit.
> If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.

Since it's ambiguous in the NPT case - are you talking about
setting the flags in the guest or host page tables? The
former, I'm afraid, might not be acceptable (as not always
> being architecturally correct). It anyway feels as if we'd
been here before, in that this reminds me of you meaning
to imply from a second walk (with A already set) that it must
be a write access. I thought we had settled on such an
implication not being generally correct.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 12:12         ` Jan Beulich
@ 2018-08-27 12:35           ` Razvan Cojocaru
  2018-08-27 12:37           ` Andrew Cooper
  1 sibling, 0 replies; 18+ messages in thread
From: Razvan Cojocaru @ 2018-08-27 12:35 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Andrew Cooper,
	Tim Deegan, xen-devel, aisaila

On 8/27/18 3:12 PM, Jan Beulich wrote:
>>> For NPT, isn't there an error code bit telling you whether the
>>> request was a user or "system" one? If not, some cheating
>>> would be needed (derive from CPL, accepting that e.g.
>>> descriptor table accesses would get mis-attributed), but
>>> that's still not going to involve looking at the PTE flags.
>>
>> The alternative would be to simply walk (without enforcing any flags,
>> and so making the pfec walk parameter unnecessary) to the respective
>> address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.
>>
>> If _PAGE_ACCESSED is not set, set it and exit.
>> If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.
> 
> Since it's ambiguous in the NPT case - are you talking about
> setting the flags in the guest or host page tables? The
> former, I'm afraid, might not be acceptable (as not always
> being architecturally correct). In anyway feels as if we'd

The former (i.e. the guest) - and I agree that it might not be 100%
architecturally correct, however we can't think of a better solution
that will really work on most actual hardware.

> been here before, in that this reminds me of you meaning
> to imply from a second walk (with A already set) that it must
> be a write access. I thought we had settled on such an
> implication not being generally correct.

Right, but the previous attempt had a different problem: for each new
[RIP:GLA] pair we thought A was being set, and every time we had a
violation where the current [RIP:GLA] matched the previous we thought D
was being set. The main problem there was that an interrupt could have
broken RIP constancy (for lack of a better term).

This new proposal was to check if A is already set on the current
violation - and if it is, set D. While not 100% architecturally correct,
it should work with no ill-effects AFAIK.

Our use case for this is to simply clear that violation, which is not
interesting for the introspection agent - so primarily an optimization.
But also, we'd like to be able to get the violation caused by the actual
guest instruction (if it will cause one). The current patch solves the
first problem, but:

1. It doesn't solve the second problem (we lose a potentially
interesting violation).
2. It's slower to emulate the whole instruction.
3. Emulation can fail, and we end up needing to single-step the current
instruction.

I think what you are saying is that there are scenarios where A is set
already, and the fault is caused by _again_ trying to set A, in which
case we'd set D. I suppose that could happen, but I'm not sure what it
would break.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 12:12         ` Jan Beulich
  2018-08-27 12:35           ` Razvan Cojocaru
@ 2018-08-27 12:37           ` Andrew Cooper
  2018-08-27 12:53             ` Razvan Cojocaru
  1 sibling, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2018-08-27 12:37 UTC (permalink / raw)
  To: Jan Beulich, Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Tim Deegan,
	xen-devel, aisaila

On 27/08/18 13:12, Jan Beulich wrote:
>>> For NPT, isn't there an error code bit telling you whether the
>>> request was a user or "system" one? If not, some cheating
>>> would be needed (derive from CPL, accepting that e.g.
>>> descriptor table accesses would get mis-attributed), but
>>> that's still not going to involve looking at the PTE flags.
>> The alternative would be to simply walk (without enforcing any flags,
>> and so making the pfec walk parameter unnecessary) to the respective
>> address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.
>>
>> If _PAGE_ACCESSED is not set, set it and exit.
>> If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.
> Since it's ambiguous in the NPT case - are you talking about
> setting the flags in the guest or host page tables? The
> former, I'm afraid, might not be acceptable (as not always
> being architecturally correct). In anyway feels as if we'd
> been here before, in that this reminds me of you meaning
> to imply from a second walk (with A already set) that it must
> be a write access. I thought we had settled on such an
> implication not being generally correct.

The problem that is trying to be solved is that when operating in
non-root mode, the cpu pagewalk, when trying to set a guest A/D bit in a
write-protected EPT page, takes an EPT violation for a write to a
read-only page.

Manually setting the A/D bits (as appropriate) and restarting the
instruction is sufficient for it to complete correctly.

At the moment, every time this happens, a request is sent to the
introspection agent, and the agent calculates that it was due to
pagetable protection, and instructs Xen to emulate the instruction. 
This accounts for 97% (?) of the VMExits, and is unrelated to any of the
real protections which introspection is trying to achieve.

What Razvan is looking to do is to have Xen skip the "send to the
introspection agent" part as an optimisation, because hardware tells Xen
(as part of the VMExit) when this condition has occurred, and the
vm_event logic has already asked Xen to try and fix up this condition
automatically.

What can actually be done depends on how A/D bits behave in real hardware.

Setting access bits for non-leaf entries is definitely fine, and
speculatively setting the access bit is also explicitly permitted by the
spec.  However, I can't find any comment on speculative dirty bits (from
either Intel or AMD), and I've not encountered such a behaviour with the
pagetable work I've been doing.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 12:37           ` Andrew Cooper
@ 2018-08-27 12:53             ` Razvan Cojocaru
  2018-08-27 13:02               ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Razvan Cojocaru @ 2018-08-27 12:53 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Tim Deegan,
	xen-devel, aisaila

On 8/27/18 3:37 PM, Andrew Cooper wrote:
> On 27/08/18 13:12, Jan Beulich wrote:
>>>> For NPT, isn't there an error code bit telling you whether the
>>>> request was a user or "system" one? If not, some cheating
>>>> would be needed (derive from CPL, accepting that e.g.
>>>> descriptor table accesses would get mis-attributed), but
>>>> that's still not going to involve looking at the PTE flags.
>>> The alternative would be to simply walk (without enforcing any flags,
>>> and so making the pfec walk parameter unnecessary) to the respective
>>> address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.
>>>
>>> If _PAGE_ACCESSED is not set, set it and exit.
>>> If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.
>> Since it's ambiguous in the NPT case - are you talking about
>> setting the flags in the guest or host page tables? The
>> former, I'm afraid, might not be acceptable (as not always
>> being architecturally correct). In anyway feels as if we'd
>> been here before, in that this reminds me of you meaning
>> to imply from a second walk (with A already set) that it must
>> be a write access. I thought we had settled on such an
>> implication not being generally correct.
> 
> The problem that is trying to be solved is that when operating in
> non-root mode, the cpu pagewalk, when trying to set a guest A/D bit in a
> write-protected EPT page, takes an EPT violation for a write to a
> read-only page.
> 
> Manually setting the A/D bits (as appropriate) and restarting the
> instruction is sufficient for it to complete correctly.
> 
> At the moment, every time this happens, a request is sent to the
> introspection agent, and the agent calculates that it was due to
> pagetable protection, and instructs Xen to emulate the instruction. 
> This accounts for 97% (?) of the VMExits, and is unrelated to any of the
> real protections which introspection is trying to achieve.
> 
> What Razvan is looking to do is to have Xen skip the "send to the
> introspection agent" part as an optimisation, because hardware tells Xen
> (as part of the VMExit) when this condition has occurred, and the
> vm_event logic has already asked Xen to try and fix up this condition
> automatically.
> 
> What can actually be done depends on how A/D bits behave in real hardware.
> 
> Setting access bits for non-leaf entries is definitely fine, and
> speculatively setting the access bit is also explicitly permitted by the
> spec.  However, I can't find any comment on speculative dirty bits (from
> either Intel or AMD), and I've not encountered such a behaviour with the
> pagetable work I've been doing.

Thanks for the reply!

I've forgotten a piece of information that I really should have written
here: we would only set the D bit if A is already set and either the
page is writable (has _PAGE_RW set) or CR0.WP is 0 (the latter case is
admittedly more tricky).


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 12:53             ` Razvan Cojocaru
@ 2018-08-27 13:02               ` Andrew Cooper
  2018-08-27 13:17                 ` Jan Beulich
  2018-08-27 13:24                 ` Razvan Cojocaru
  0 siblings, 2 replies; 18+ messages in thread
From: Andrew Cooper @ 2018-08-27 13:02 UTC (permalink / raw)
  To: Razvan Cojocaru, Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Tim Deegan,
	xen-devel, aisaila

On 27/08/18 13:53, Razvan Cojocaru wrote:
> On 8/27/18 3:37 PM, Andrew Cooper wrote:
>> On 27/08/18 13:12, Jan Beulich wrote:
>>>>> For NPT, isn't there an error code bit telling you whether the
>>>>> request was a user or "system" one? If not, some cheating
>>>>> would be needed (derive from CPL, accepting that e.g.
>>>>> descriptor table accesses would get mis-attributed), but
>>>>> that's still not going to involve looking at the PTE flags.
>>>> The alternative would be to simply walk (without enforcing any flags,
>>>> and so making the pfec walk parameter unnecessary) to the respective
>>>> address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.
>>>>
>>>> If _PAGE_ACCESSED is not set, set it and exit.
>>>> If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.
>>> Since it's ambiguous in the NPT case - are you talking about
>>> setting the flags in the guest or host page tables? The
>>> former, I'm afraid, might not be acceptable (as not always
>>> being architecturally correct). In anyway feels as if we'd
>>> been here before, in that this reminds me of you meaning
>>> to imply from a second walk (with A already set) that it must
>>> be a write access. I thought we had settled on such an
>>> implication not being generally correct.
>> The problem that is trying to be solved is that when operating in
>> non-root mode, the cpu pagewalk, when trying to set a guest A/D bit in a
>> write-protected EPT page, takes an EPT violation for a write to a
>> read-only page.
>>
>> Manually setting the A/D bits (as appropriate) and restarting the
>> instruction is sufficient for it to complete correctly.
>>
>> At the moment, every time this happens, a request is sent to the
>> introspection agent, and the agent calculates that it was due to
>> pagetable protection, and instructs Xen to emulate the instruction. 
>> This accounts for 97% (?) of the VMExits, and is unrelated to any of the
>> real protections which introspection is trying to achieve.
>>
>> What Razvan is looking to do is to have Xen skip the "send to the
>> introspection agent" part as an optimisation, because hardware tells Xen
>> (as part of the VMExit) when this condition has occurred, and the
>> vm_event logic has already asked Xen to try and fix up this condition
>> automatically.
>>
>> What can actually be done depends on how A/D bits behave in real hardware.
>>
>> Setting access bits for non-leaf entries is definitely fine, and
>> speculatively setting the access bit is also explicitly permitted by the
>> spec.  However, I can't find any comment on speculative dirty bits (from
>> either Intel or AMD), and I've not encountered such a behaviour with the
>> pagetable work I've been doing.
> Thanks for the reply!
>
> I've forgotten a piece of information that I really should have written
> here: we would only set the D bit if A is already set and either the
> page is writable (has _PAGE_RW set) or CR0.WP is 0 (the latter case is
> admittedly more tricky).

How about a new function which works similarly to guest-walk-tables, but
only ever sets A/D bits.

Given information from hardware, we know the linear address, and that it
was a problem with the guest pagetables, from which we explicitly know
that it was from writing an A/D bit to a guest PTE.

While walking down the levels, set any missing A bits and remember if we
set any.  If we set A bits, consider ourselves complete and exit back to
the guest.  If no A bits were set, and the access was a write (which we
know from the EPT violation information), then set the leaf D bit.

This should be architecturally correct as it is exclusively derived from
information provided by the VMExit, and won't cause dirty bits to be
written in cases where the hardware wouldn't have written them
(speculative or otherwise).  It does mean that an instruction which
would need to set A and D bits in the walk will take two EPT violations
to achieve the end result, but it probably is still quicker than sending
the vm_event out.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 13:02               ` Andrew Cooper
@ 2018-08-27 13:17                 ` Jan Beulich
  2018-08-27 13:47                   ` Razvan Cojocaru
  2018-08-27 13:24                 ` Razvan Cojocaru
  1 sibling, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2018-08-27 13:17 UTC (permalink / raw)
  To: Razvan Cojocaru, Andrew Cooper
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Tim Deegan,
	xen-devel, aisaila

>>> On 27.08.18 at 15:02, <andrew.cooper3@citrix.com> wrote:
> On 27/08/18 13:53, Razvan Cojocaru wrote:
>> On 8/27/18 3:37 PM, Andrew Cooper wrote:
>>> On 27/08/18 13:12, Jan Beulich wrote:
>>>>>> For NPT, isn't there an error code bit telling you whether the
>>>>>> request was a user or "system" one? If not, some cheating
>>>>>> would be needed (derive from CPL, accepting that e.g.
>>>>>> descriptor table accesses would get mis-attributed), but
>>>>>> that's still not going to involve looking at the PTE flags.
>>>>> The alternative would be to simply walk (without enforcing any flags,
>>>>> and so making the pfec walk parameter unnecessary) to the respective
>>>>> address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.
>>>>>
>>>>> If _PAGE_ACCESSED is not set, set it and exit.
>>>>> If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.
>>>> Since it's ambiguous in the NPT case - are you talking about
>>>> setting the flags in the guest or host page tables? The
>>>> former, I'm afraid, might not be acceptable (as not always
>>>> being architecturally correct). In anyway feels as if we'd
>>>> been here before, in that this reminds me of you meaning
>>>> to imply from a second walk (with A already set) that it must
>>>> be a write access. I thought we had settled on such an
>>>> implication not being generally correct.
>>> The problem that is trying to be solved is that when operating in
>>> non-root mode, the cpu pagewalk, when trying to set a guest A/D bit in a
>>> write-protected EPT page, takes an EPT violation for a write to a
>>> read-only page.
>>>
>>> Manually setting the A/D bits (as appropriate) and restarting the
>>> instruction is sufficient for it to complete correctly.
>>>
>>> At the moment, every time this happens, a request is sent to the
>>> introspection agent, and the agent calculates that it was due to
>>> pagetable protection, and instructs Xen to emulate the instruction. 
>>> This accounts for 97% (?) of the VMExits, and is unrelated to any of the
>>> real protections which introspection is trying to achieve.
>>>
>>> What Razvan is looking to do is to have Xen skip the "send to the
>>> introspection agent" part as an optimisation, because hardware tells Xen
>>> (as part of the VMExit) when this condition has occurred, and the
>>> vm_event logic has already asked Xen to try and fix up this condition
>>> automatically.
>>>
>>> What can actually be done depends on how A/D bits behave in real hardware.
>>>
>>> Setting access bits for non-leaf entries is definitely fine, and
>>> speculatively setting the access bit is also explicitly permitted by the
>>> spec.  However, I can't find any comment on speculative dirty bits (from
>>> either Intel or AMD), and I've not encountered such a behaviour with the
>>> pagetable work I've been doing.

Yeah, a description of the problem to solve definitely helps.

>> I've forgotten a piece of information that I really should have written
>> here: we would only set the D bit if A is already set and either the
>> page is writable (has _PAGE_RW set) or CR0.WP is 0 (the latter case is
>> admittedly more tricky).
> 
> How about a new function which works similarly to guest-walk-tables, but
> only ever sets A/D bits.
> 
> Given information from hardware, we know the linear address, and that it
> was a problem with the guest pagetables, from which we explicitly know
> that it was from writing an A/D bit to a guest PTE.
> 
> While walking down the levels, set any missing A bits and remember if we
> set any.  If we set A bits, consider ourselves complete and exit back to
> the guest.  If no A bits were set, and the access was a write (which we
> know from the EPT violation information), then set the leaf D bit.

Plus taking into consideration CR0.WP and the entry's W bit, as
Razvan has said.

> This should be architecturally correct as it is exclusively derived from
> information provided by the VMExit, and won't cause dirty bits to be
> written in cases where the hardware wouldn't have written them
> (speculative or otherwise).  It does mean that an instruction which
> would need to set A and D bits in the walk will take two EPT violations
> to achieve the end result, but it probably is still quicker than sending
> the vm_event out.

I'm afraid this is going to be only mostly correct: Atomicity of the page
table write is going to be lost. This could become an actual problem if
the guest used racing PTE accesses. Such racing accesses might not
be a bug - simply consider the OS scanning for set A and/or D bits
(and clearing them when they're set). Or an entity temporarily clearing
(parts of) PTEs, with recovery logic in place to restore them when
needed for a synchronous access. At the very least there's then the
risk of a live lock within the guest.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 13:02               ` Andrew Cooper
  2018-08-27 13:17                 ` Jan Beulich
@ 2018-08-27 13:24                 ` Razvan Cojocaru
  2018-08-27 14:06                   ` Jan Beulich
  1 sibling, 1 reply; 18+ messages in thread
From: Razvan Cojocaru @ 2018-08-27 13:24 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Tim Deegan,
	xen-devel, aisaila

On 8/27/18 4:02 PM, Andrew Cooper wrote:
> On 27/08/18 13:53, Razvan Cojocaru wrote:
>> On 8/27/18 3:37 PM, Andrew Cooper wrote:
>>> On 27/08/18 13:12, Jan Beulich wrote:
>>>>>> For NPT, isn't there an error code bit telling you whether the
>>>>>> request was a user or "system" one? If not, some cheating
>>>>>> would be needed (derive from CPL, accepting that e.g.
>>>>>> descriptor table accesses would get mis-attributed), but
>>>>>> that's still not going to involve looking at the PTE flags.
>>>>> The alternative would be to simply walk (without enforcing any flags,
>>>>> and so making the pfec walk parameter unnecessary) to the respective
>>>>> address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.
>>>>>
>>>>> If _PAGE_ACCESSED is not set, set it and exit.
>>>>> If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.
>>>> Since it's ambiguous in the NPT case - are you talking about
>>>> setting the flags in the guest or host page tables? The
>>>> former, I'm afraid, might not be acceptable (as not always
>>>> being architecturally correct). In anyway feels as if we'd
>>>> been here before, in that this reminds me of you meaning
>>>> to imply from a second walk (with A already set) that it must
>>>> be a write access. I thought we had settled on such an
>>>> implication not being generally correct.
>>> The problem that is trying to be solved is that when operating in
>>> non-root mode, the cpu pagewalk, when trying to set a guest A/D bit in a
>>> write-protected EPT page, takes an EPT violation for a write to a
>>> read-only page.
>>>
>>> Manually setting the A/D bits (as appropriate) and restarting the
>>> instruction is sufficient for it to complete correctly.
>>>
>>> At the moment, every time this happens, a request is sent to the
>>> introspection agent, and the agent calculates that it was due to
>>> pagetable protection, and instructs Xen to emulate the instruction. 
>>> This accounts for 97% (?) of the VMExits, and is unrelated to any of the
>>> real protections which introspection is trying to achieve.
>>>
>>> What Razvan is looking to do is to have Xen skip the "send to the
>>> introspection agent" part as an optimisation, because hardware tells Xen
>>> (as part of the VMExit) when this condition has occurred, and the
>>> vm_event logic has already asked Xen to try and fix up this condition
>>> automatically.
>>>
>>> What can actually be done depends on how A/D bits behave in real hardware.
>>>
>>> Setting access bits for non-leaf entries is definitely fine, and
>>> speculatively setting the access bit is also explicitly permitted by the
>>> spec.  However, I can't find any comment on speculative dirty bits (from
>>> either Intel or AMD), and I've not encountered such a behaviour with the
>>> pagetable work I've been doing.
>> Thanks for the reply!
>>
>> I've forgotten a piece of information that I really should have written
>> here: we would only set the D bit if A is already set and either the
>> page is writable (has _PAGE_RW set) or CR0.WP is 0 (the latter case is
>> admittedly more tricky).
> 
> How about a new function which works similarly to guest-walk-tables, but
> only ever sets A/D bits.
> 
> Given information from hardware, we know the linear address, and that it
> was a problem with the guest pagetables, from which we explicitly know
> that it was from writing an A/D bit to a guest PTE.
> 
> While walking down the levels, set any missing A bits and remember if we
> set any.  If we set A bits, consider ourselves complete and exit back to
> the guest.  If no A bits were set, and the access was a write (which we
> know from the EPT violation information), then set the leaf D bit.
> 
> This should be architecturally correct as it is exclusively derived from
> information provided by the VMExit, and won't cause dirty bits to be
> written in cases where the hardware wouldn't have written them
> (speculative or otherwise).  It does mean that an instruction which
> would need to set A and D bits in the walk will take two EPT violations
> to achieve the end result, but it probably is still quicker than sending
> the vm_event out.

Right, that's pretty much what we were proposing, a basic algorithm of:

if ((pte & A) && (pte & RW)) pte |= D;
pte |= A;

where the if probably becomes:

if ((pte & A) && ((pte & RW) || CR0.WP == 0)) pte |= D;
pte |= A;

for the CR0.WP case.

As discussed privately, there's also the case where two VCPUs may try to
set A concurrently, which is what I assume is the case Jan has hinted at.

Another small issue is that we do need to ignore the EPT violation
information as it pertains to reads or writes: that will always be the
page-walk access type, rw.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 13:17                 ` Jan Beulich
@ 2018-08-27 13:47                   ` Razvan Cojocaru
  2018-08-27 14:08                     ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Razvan Cojocaru @ 2018-08-27 13:47 UTC (permalink / raw)
  To: Jan Beulich, Andrew Cooper
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Tim Deegan,
	xen-devel, aisaila

On 8/27/18 4:17 PM, Jan Beulich wrote:
>>>> On 27.08.18 at 15:02, <andrew.cooper3@citrix.com> wrote:
>> This should be architecturally correct as it is exclusively derived from
>> information provided by the VMExit, and won't cause dirty bits to be
>> written in cases where the hardware wouldn't have written them
>> (speculative or otherwise).  It does mean that an instruction which
>> would need to set A and D bits in the walk will take two EPT violations
>> to achieve the end result, but it probably is still quicker than sending
>> the vm_event out.
> 
> I'm afraid this is going to be only mostly correct: Atomicity of the page
> table write is going to be lost. This could become an actual problem if
> the guest used racing PTE accesses. Such racing accesses might not
> be a bug - simply consider the OS scanning for set A and/or D bits
> (and clearing them when they're set). Or an entity temporarily clearing
> (parts of) PTEs, with recovery logic in place to restore them when
> needed for a synchronous access. At the very least there's then the
> risk of a live lock within the guest.

But it's not clear to me why that can't already happen when just
emulating the current instruction (as we do now), if emulating said
instruction would set A or D?


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 13:24                 ` Razvan Cojocaru
@ 2018-08-27 14:06                   ` Jan Beulich
  0 siblings, 0 replies; 18+ messages in thread
From: Jan Beulich @ 2018-08-27 14:06 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Andrew Cooper,
	Tim Deegan, xen-devel, aisaila

>>> On 27.08.18 at 15:24, <rcojocaru@bitdefender.com> wrote:
> On 8/27/18 4:02 PM, Andrew Cooper wrote:
>> On 27/08/18 13:53, Razvan Cojocaru wrote:
>>> On 8/27/18 3:37 PM, Andrew Cooper wrote:
>>>> On 27/08/18 13:12, Jan Beulich wrote:
>>>>>>> For NPT, isn't there an error code bit telling you whether the
>>>>>>> request was a user or "system" one? If not, some cheating
>>>>>>> would be needed (derive from CPL, accepting that e.g.
>>>>>>> descriptor table accesses would get mis-attributed), but
>>>>>>> that's still not going to involve looking at the PTE flags.
>>>>>> The alternative would be to simply walk (without enforcing any flags,
>>>>>> and so making the pfec walk parameter unnecessary) to the respective
>>>>>> address, and query for _PAGE_ACCESSED and _PAGE_DIRTY only.
>>>>>>
>>>>>> If _PAGE_ACCESSED is not set, set it and exit.
>>>>>> If _PAGE_ACCESSED is set, set _PAGE_DIRTY also and exit.
>>>>> Since it's ambiguous in the NPT case - are you talking about
>>>>> setting the flags in the guest or host page tables? The
>>>>> former, I'm afraid, might not be acceptable (as not always
>>>>> being architecturally correct). In anyway feels as if we'd
>>>>> been here before, in that this reminds me of you meaning
>>>>> to imply from a second walk (with A already set) that it must
>>>>> be a write access. I thought we had settled on such an
>>>>> implication not being generally correct.
>>>> The problem that is trying to be solved is that when operating in
>>>> non-root mode, the cpu pagewalk, when trying to set a guest A/D bit in a
>>>> write-protected EPT page, takes an EPT violation for a write to a
>>>> read-only page.
>>>>
>>>> Manually setting the A/D bits (as appropriate) and restarting the
>>>> instruction is sufficient for it to complete correctly.
>>>>
>>>> At the moment, every time this happens, a request is sent to the
>>>> introspection agent, and the agent calculates that it was due to
>>>> pagetable protection, and instructs Xen to emulate the instruction. 
>>>> This accounts for 97% (?) of the VMExits, and is unrelated to any of the
>>>> real protections which introspection is trying to achieve.
>>>>
>>>> What Razvan is looking to do is to have Xen skip the "send to the
>>>> introspection agent" part as an optimisation, because hardware tells Xen
>>>> (as part of the VMExit) when this condition has occurred, and the
>>>> vm_event logic has already asked Xen to try and fix up this condition
>>>> automatically.
>>>>
>>>> What can actually be done depends on how A/D bits behave in real hardware.
>>>>
>>>> Setting access bits for non-leaf entries is definitely fine, and
>>>> speculatively setting the access bit is also explicitly permitted by the
>>>> spec.  However, I can't find any comment on speculative dirty bits (from
>>>> either Intel or AMD), and I've not encountered such a behaviour with the
>>>> pagetable work I've been doing.
>>> Thanks for the reply!
>>>
>>> I've forgotten a piece of information that I really should have written
>>> here: we would only set the D bit if A is already set and either the
>>> page is writable (has _PAGE_RW set) or CR0.WP is 0 (the latter case is
>>> admittedly more tricky).
>> 
>> How about a new function which works similarly to guest-walk-tables, but
>> only ever sets A/D bits.
>> 
>> Given information from hardware, we know the linear address, and that it
>> was a problem with the guest pagetables, from which we explicitly know
>> that it was from writing an A/D bit to a guest PTE.
>> 
>> While walking down the levels, set any missing A bits and remember if we
>> set any.  If we set A bits, consider ourselves complete and exit back to
>> the guest.  If no A bits were set, and the access was a write (which we
>> know from the EPT violation information), then set the leaf D bit.
>> 
>> This should be architecturally correct as it is exclusively derived from
>> information provided by the VMExit, and won't cause dirty bits to be
>> written in cases where the hardware wouldn't have written them
>> (speculative or otherwise).  It does mean that an instruction which
>> would need to set A and D bits in the walk will take two EPT violations
>> to achieve the end result, but it probably is still quicker than sending
>> the vm_event out.
> 
> Right, that's pretty much what we were proposing, a basic algoritm of:
> 
> if ((pte & A) && (pte & RW)) pte |= D;
> pte |= A;
> 
> where the if probably becomes:
> 
> if ((pte & A) && ((pte & RW) || CR0.WP == 0)) pte |= D;
> pte |= A
> 
> for the CR0.WP case.

Except that it's not quite this simple, as Andrew has already explained:
You need to check all levels' A bits, and only if they've all been set
you'd want to set the leaf level D bit. Then again, looking at the A bits
at all levels should not be necessary - you know the guest physical
address for which the exit occurred, and hence you can tell at least
which page the bit of interest lives in. For A bits this may be any
page table touched during the walk, while for D bits this should only
ever be the leaf one.

> As discussed privately, there's also the case where two VCPUs may try to
> set A concurrently, which is what I assume is the case Jan has hinted at.

Not exactly - I was rather hinting at a situation where some "monitor"
process inside the OS could scan for _set_ A and/or D bits, _clearing_
them asynchronously to CPU side page walks. My understanding of
how real hardware behaves here is that a PTE would be read, and if
the need was found to set A and/or D, it would (sort of) LOCK
CMPXCHG the entry with the one with the bit(s) set. Whether the lock
would be asserted already for the read (and then perhaps dropped
without a write having occurred), or whether CMPXCHG failure would
be recovered from by retrying I can't tell. In any event you can't do
the same in software, as you can neither assert (and keep) LOCK
for as long as you need to, nor do you know the original PTE that
was read by the hardware walk.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 13:47                   ` Razvan Cojocaru
@ 2018-08-27 14:08                     ` Jan Beulich
  2018-08-28  8:04                       ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2018-08-27 14:08 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Andrew Cooper,
	Tim Deegan, xen-devel, aisaila

>>> On 27.08.18 at 15:47, <rcojocaru@bitdefender.com> wrote:
> On 8/27/18 4:17 PM, Jan Beulich wrote:
>>>>> On 27.08.18 at 15:02, <andrew.cooper3@citrix.com> wrote:
>>> This should be architecturally correct as it is exclusively derived from
>>> information provided by the VMExit, and won't cause dirty bits to be
>>> written in cases where the hardware wouldn't have written them
>>> (speculative or otherwise).  It does mean that an instruction which
>>> would need to set A and D bits in the walk will take two EPT violations
>>> to achieve the end result, but it probably is still quicker than sending
>>> the vm_event out.
>> 
>> I'm afraid this is going to be only mostly correct: Atomicity of the page
>> table write is going to be lost. This could become an actual problem if
>> the guest used racing PTE accesses. Such racing accesses might not
>> be a bug - simply consider the OS scanning for set A and/or D bits
>> (and clearing them when they're set). Or an entity temporarily clearing
>> (parts of) PTEs, with recovery logic in place to restore them when
>> needed for a synchronous access. At the very least there's then the
>> risk of a live lock within the guest.
> 
> But it's not clear to me why that can't already happen when just
> emulating the current instruction (as we do now), if emulating said
> instruction would set A or D?

Yes, good point - this is a problem not just to the new handling you
propose.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-27 14:08                     ` Jan Beulich
@ 2018-08-28  8:04                       ` Andrew Cooper
  2018-08-28  8:13                         ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2018-08-28  8:04 UTC (permalink / raw)
  To: Jan Beulich, Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Tim Deegan,
	xen-devel, aisaila

On 27/08/18 15:08, Jan Beulich wrote:
>>>> On 27.08.18 at 15:47, <rcojocaru@bitdefender.com> wrote:
>> On 8/27/18 4:17 PM, Jan Beulich wrote:
>>>>>> On 27.08.18 at 15:02, <andrew.cooper3@citrix.com> wrote:
>>>> This should be architecturally correct as it is exclusively derived from
>>>> information provided by the VMExit, and won't cause dirty bits to be
>>>> written in cases where the hardware wouldn't have written them
>>>> (speculative or otherwise).  It does mean that an instruction which
>>>> would need to set A and D bits in the walk will take two EPT violations
>>>> to achieve the end result, but it probably is still quicker than sending
>>>> the vm_event out.
>>> I'm afraid this is going to be only mostly correct: Atomicity of the page
>>> table write is going to be lost. This could become an actual problem if
>>> the guest used racing PTE accesses. Such racing accesses might not
>>> be a bug - simply consider the OS scanning for set A and/or D bits
>>> (and clearing them when they're set). Or an entity temporarily clearing
>>> (parts of) PTEs, with recovery logic in place to restore them when
>>> needed for a synchronous access. At the very least there's then the
>>> risk of a live lock within the guest.
>> But it's not clear to me why that can't already happen when just
>> emulating the current instruction (as we do now), if emulating said
>> instruction would set A or D?
> Yes, good point - this is a problem not just to the new handling you
> propose.

There is no risk of livelock.  The A/D bits we get an EPT-violation on
are those which are write protected, so any modification at all will
trap.  In particular, an attempt from software to play in weird ways
with the pagetable will cause real vm_events which will be sent for
auditing.

Legitimate updates (e.g. the guest paging-out algorithm refreshing its
A/D metadata) will be permitted, whereas illegitimate ones will trigger
remediation.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-28  8:04                       ` Andrew Cooper
@ 2018-08-28  8:13                         ` Jan Beulich
  2018-08-28  8:34                           ` Razvan Cojocaru
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2018-08-28  8:13 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Andrei LUTAS, Tamas K Lengyel, Razvan Cojocaru, George Dunlap,
	Tim Deegan, xen-devel, aisaila

>>> On 28.08.18 at 10:04, <andrew.cooper3@citrix.com> wrote:
> On 27/08/18 15:08, Jan Beulich wrote:
>>>>> On 27.08.18 at 15:47, <rcojocaru@bitdefender.com> wrote:
>>> On 8/27/18 4:17 PM, Jan Beulich wrote:
>>>>>>> On 27.08.18 at 15:02, <andrew.cooper3@citrix.com> wrote:
>>>>> This should be architecturally correct as it is exclusively derived from
>>>>> information provided by the VMExit, and won't cause dirty bits to be
>>>>> written in cases where the hardware wouldn't have written them
>>>>> (speculative or otherwise).  It does mean that an instruction which
>>>>> would need to set A and D bits in the walk will take two EPT violations
>>>>> to achieve the end result, but it probably is still quicker than sending
>>>>> the vm_event out.
>>>> I'm afraid this is going to be only mostly correct: Atomicity of the page
>>>> table write is going to be lost. This could become an actual problem if
>>>> the guest used racing PTE accesses. Such racing accesses might not
>>>> be a bug - simply consider the OS scanning for set A and/or D bits
>>>> (and clearing them when they're set). Or an entity temporarily clearing
>>>> (parts of) PTEs, with recovery logic in place to restore them when
>>>> needed for a synchronous access. At the very least there's then the
>>>> risk of a live lock within the guest.
>>> But it's not clear to me why that can't already happen when just
>>> emulating the current instruction (as we do now), if emulating said
>>> instruction would set A or D?
>> Yes, good point - this is a problem not just with the new handling you
>> propose.
> 
> There is no risk of livelock.  The A/D bits we get an EPT-violation on
> are those which are write protected, so any modification at all will
> trap.  In particular, an attempt from software to play in weird ways
> with the pagetable will cause real vm_events which will be sent for
> auditing.

Even with multiple views, where only some write-protect the page?

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks
  2018-08-28  8:13                         ` Jan Beulich
@ 2018-08-28  8:34                           ` Razvan Cojocaru
  0 siblings, 0 replies; 18+ messages in thread
From: Razvan Cojocaru @ 2018-08-28  8:34 UTC (permalink / raw)
  To: Jan Beulich, Andrew Cooper
  Cc: Andrei LUTAS, Tamas K Lengyel, George Dunlap, Tim Deegan,
	xen-devel, aisaila

On 8/28/18 11:13 AM, Jan Beulich wrote:
>>>> On 28.08.18 at 10:04, <andrew.cooper3@citrix.com> wrote:
>> On 27/08/18 15:08, Jan Beulich wrote:
>>>>>> On 27.08.18 at 15:47, <rcojocaru@bitdefender.com> wrote:
>>>> On 8/27/18 4:17 PM, Jan Beulich wrote:
>>>>>>>> On 27.08.18 at 15:02, <andrew.cooper3@citrix.com> wrote:
>>>>>> This should be architecturally correct as it is exclusively derived from
>>>>>> information provided by the VMExit, and won't cause dirty bits to be
>>>>>> written in cases where the hardware wouldn't have written them
>>>>>> (speculative or otherwise).  It does mean that an instruction which
>>>>>> would need to set A and D bits in the walk will take two EPT violations
>>>>>> to achieve the end result, but it probably is still quicker than sending
>>>>>> the vm_event out.
>>>>> I'm afraid this is going to be only mostly correct: Atomicity of the page
>>>>> table write is going to be lost. This could become an actual problem if
>>>>> the guest used racing PTE accesses. Such racing accesses might not
>>>>> be a bug - simply consider the OS scanning for set A and/or D bits
>>>>> (and clearing them when they're set). Or an entity temporarily clearing
>>>>> (parts of) PTEs, with recovery logic in place to restore them when
>>>>> needed for a synchronous access. At the very least there's then the
>>>>> risk of a live lock within the guest.
>>>> But it's not clear to me why that can't already happen when just
>>>> emulating the current instruction (as we do now), if emulating said
>>>> instruction would set A or D?
>>> Yes, good point - this is a problem not just with the new handling you
>>> propose.
>>
>> There is no risk of livelock.  The A/D bits we get an EPT-violation on
>> are those which are write protected, so any modification at all will
>> trap.  In particular, an attempt from software to play in weird ways
>> with the pagetable will cause real vm_events which will be sent for
>> auditing.
> 
> Even with multiple views, where only some write-protect the page?

In that scenario it could be problematic, but the feature, even now, is
disabled by default. We could add a comment to the effect of "don't use
this with combinations of restricted + unrestricted views" above the
libxc enabler function, so that the caller assumes responsibility for
calling it in the proper context only.

FWIW, our introspection agent uses just one EPT view (even with altp2m,
our main scenario is #VE - so a guest-level optimization). It does
unfortunately need to be view 1 (because whatever we do to view 0
propagates to all other views by design), but that's about the extent of it.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2018-08-28  8:34 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-24 14:11 [PATCH v1] x86/mm: Suppresses vm_events caused by page-walks Alexandru Isaila
2018-08-24 17:49 ` Andrew Cooper
2018-08-24 18:47   ` Razvan Cojocaru
2018-08-27  9:11     ` Jan Beulich
2018-08-27 11:24       ` Razvan Cojocaru
2018-08-27 12:12         ` Jan Beulich
2018-08-27 12:35           ` Razvan Cojocaru
2018-08-27 12:37           ` Andrew Cooper
2018-08-27 12:53             ` Razvan Cojocaru
2018-08-27 13:02               ` Andrew Cooper
2018-08-27 13:17                 ` Jan Beulich
2018-08-27 13:47                   ` Razvan Cojocaru
2018-08-27 14:08                     ` Jan Beulich
2018-08-28  8:04                       ` Andrew Cooper
2018-08-28  8:13                         ` Jan Beulich
2018-08-28  8:34                           ` Razvan Cojocaru
2018-08-27 13:24                 ` Razvan Cojocaru
2018-08-27 14:06                   ` Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.