From mboxrd@z Thu Jan  1 00:00:00 1970
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: Re: [PATCH RFC V2 5/6] xen,
 libxc: Request page fault injection via libxc
Date: Fri, 11 Jul 2014 19:06:55 +0100
Message-ID: <53C027BF.1090902@citrix.com>
References: <1405093418-23481-1-git-send-email-rcojocaru@bitdefender.com>
	<1405093418-23481-5-git-send-email-rcojocaru@bitdefender.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Return-path: <xen-devel-bounces@lists.xen.org>
In-Reply-To: <1405093418-23481-5-git-send-email-rcojocaru@bitdefender.com>
List-Unsubscribe: <http://lists.xen.org/cgi-bin/mailman/options/xen-devel>,
	<mailto:xen-devel-request@lists.xen.org?subject=unsubscribe>
List-Post: <mailto:xen-devel@lists.xen.org>
List-Help: <mailto:xen-devel-request@lists.xen.org?subject=help>
List-Subscribe: <http://lists.xen.org/cgi-bin/mailman/listinfo/xen-devel>,
	<mailto:xen-devel-request@lists.xen.org?subject=subscribe>
Sender: xen-devel-bounces@lists.xen.org
Errors-To: xen-devel-bounces@lists.xen.org
To: Razvan Cojocaru <rcojocaru@bitdefender.com>, xen-devel@lists.xen.org
Cc: mdontu@bitdefender.com, tim@xen.org, JBeulich@suse.com
List-Id: xen-devel@lists.xenproject.org

On 11/07/14 16:43, Razvan Cojocaru wrote:
> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
> new xc_domain_set_pagefault_info() function to set per-domain page
> fault injection information. All a call does is set per-domain info,
> and nothing actually happens until VMENTRY time, and then only if
> all conditions are met (the guest is in user mode, the set value
> matches CR3, and there are no other pending traps).
> This mechanism allows bringing in swapped-out pages for inspection.
>
> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>

For the record, I still think it is a bad idea to be working against a
guest OS paging algorithm, and I am uneasy about whether it is sensible
to introduce abilities like that into the Xen API.

With that problem aside, I have some review comments anyway.

> ---
>  tools/libxc/xc_domain.c     |   17 +++++++++++++++++
>  tools/libxc/xenctrl.h       |    4 ++++
>  xen/arch/x86/hvm/vmx/vmx.c  |   39 +++++++++++++++++++++++++++++++++++++++
>  xen/common/domain.c         |    5 +++++
>  xen/common/domctl.c         |   21 +++++++++++++++++++++
>  xen/include/public/domctl.h |   14 ++++++++++++++
>  xen/include/xen/sched.h     |    7 +++++++
>  7 files changed, 107 insertions(+)
>
> diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
> index 0230c6c..7437c51 100644
> --- a/tools/libxc/xc_domain.c
> +++ b/tools/libxc/xc_domain.c
> @@ -506,6 +506,23 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
>      return ret;
>  }
>  
> +int xc_domain_set_pagefault_info(xc_interface *xch,
> +                                 uint32_t domid,
> +                                 xen_domctl_set_pagefault_info_t *info)
> +{
> +    DECLARE_DOMCTL;
> +
> +    if (info == NULL)
> +        return -1;
> +
> +    domctl.cmd = XEN_DOMCTL_set_pagefault_info;
> +    domctl.domain = (domid_t)domid;
> +    domctl.u.set_pagefault_info.address_space = info->address_space;
> +    domctl.u.set_pagefault_info.virtual_address = info->virtual_address;
> +    domctl.u.set_pagefault_info.write_access = info->write_access;
> +    return do_domctl(xch, &domctl);
> +}
> +
>  int xc_vcpu_getcontext(xc_interface *xch,
>                         uint32_t domid,
>                         uint32_t vcpu,
> diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
> index 3578b09..302ef0e 100644
> --- a/tools/libxc/xenctrl.h
> +++ b/tools/libxc/xenctrl.h
> @@ -793,6 +793,10 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
>  const char *xc_domain_get_native_protocol(xc_interface *xch,
>                                            uint32_t domid);
>  
> +int xc_domain_set_pagefault_info(xc_interface *xch,
> +                                 uint32_t domid,
> +                                 xen_domctl_set_pagefault_info_t *info);
> +
>  /**
>   * This function returns information about the execution context of a
>   * particular vcpu of a domain.
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 6c63225..835621f 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -416,6 +416,7 @@ static void vmx_restore_dr(struct vcpu *v)
>  static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>  {
>      unsigned long ev;
> +    unsigned long cs_arbytes;
>  
>      vmx_vmcs_enter(v);
>  
> @@ -429,6 +430,9 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>      __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
>      __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
>      __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
> +    __vmread(GUEST_CS_AR_BYTES, &cs_arbytes);
> +
> +    c->cs_arbytes = (uint32_t)cs_arbytes;

I am fairly sure this will break migration, as you are changing the bit
representation of c->cs_arbytes in the architectural state.

However, given a comment below, I can't see why you even need it.

>  
>      c->pending_event = 0;
>      c->error_code = 0;
> @@ -3113,6 +3117,39 @@ out:
>          nvmx_idtv_handling();
>  }
>  
> +static void check_pf_injection(void)
> +{
> +    struct vcpu *curr = current;
> +    struct domain *d = curr->domain;
> +    struct hvm_hw_cpu ctxt;
> +    uint32_t ss_dpl;
> +
> +    if ( !is_hvm_domain(d) || d->fault_info.virtual_address == 0 )
> +        return;
> +
> +    memset(&ctxt, 0, sizeof(struct hvm_hw_cpu));
> +    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
> +
> +    ss_dpl = (ctxt.cs_arbytes >> 5) & 3;
> +
> +    if ( ss_dpl == 3 /* Guest is in user mode */

cs dpl is not cpl.  It will be incorrect at various points, e.g.
conforming code segments.

The real ss dpl is always the correct cpl.

There are notes to this effect in both the Intel and AMD architecture
manuals, but I can't for the life of me actually find either of the notes.

> +         && !ctxt.pending_event
> +         && ctxt.cr3 == d->fault_info.address_space )
> +    {
> +        /* Cache */
> +        uint64_t virtual_address = d->fault_info.virtual_address;
> +        uint32_t write_access = d->fault_info.write_access;
> +
> +        /* Reset */
> +        d->fault_info.address_space = 0;
> +        d->fault_info.virtual_address = 0;
> +        d->fault_info.write_access = 0;
> +
> +        hvm_inject_page_fault((write_access << 1) | PFEC_user_mode,

Is it sensible in the slightest to have write_access shifted by 1 with
respect to a real pagefault error code?

> +            virtual_address);
> +    }
> +}
> +
>  void vmx_vmenter_helper(const struct cpu_user_regs *regs)
>  {
>      struct vcpu *curr = current;
> @@ -3153,6 +3190,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
>      if ( unlikely(need_flush) )
>          vpid_sync_all();
>  
> +    check_pf_injection();
> +
>   out:
>      HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
>  
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index cd64aea..e7bd734 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -255,6 +255,11 @@ struct domain *domain_create(
>  
>      d->domain_id = domid;
>  
> +    /* Memory introspection page fault variables set-up. */
> +    d->fault_info.address_space = 0;
> +    d->fault_info.virtual_address = 0;
> +    d->fault_info.write_access = 0;
> +

alloc_domain_struct() zeroes the memory beforehand, so you don't need to
repeat it here.

>      lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain");
>  
>      if ( (err = xsm_alloc_security_domain(d)) != 0 )
> diff --git a/xen/common/domctl.c b/xen/common/domctl.c
> index c326aba..4321ca1 100644
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -967,6 +967,27 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>      }
>      break;
>  
> +    case XEN_DOMCTL_set_pagefault_info:
> +    {
> +        struct domain *d;
> +
> +        ret = -ESRCH;
> +        d = rcu_lock_domain_by_id(op->domain);
> +        if ( d != NULL )
> +        {

This should fail with -EINVAL if !has_hvm_container(d)

> +            d->fault_info.address_space =
> +                op->u.set_pagefault_info.address_space;
> +            d->fault_info.virtual_address =
> +                op->u.set_pagefault_info.virtual_address;
> +            d->fault_info.write_access =
> +                op->u.set_pagefault_info.write_access;
> +
> +            rcu_unlock_domain(d);
> +            ret = 0;
> +        }
> +    }
> +    break;
> +
>      default:
>          ret = arch_do_domctl(op, d, u_domctl);
>          break;
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> index 5b11bbf..c8bf3f8 100644
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -936,6 +936,18 @@ typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
>  #endif
>  
> +/* XEN_DOMCTL_set_pagefault_info requests that a page fault occur at
> + * the next VMENTRY.
> + *  */
> +struct xen_domctl_set_pagefault_info {
> +    uint64_t address_space;
> +    uint64_t virtual_address;
> +    uint32_t write_access;
> +};
> +typedef struct xen_domctl_set_pagefault_info xen_domctl_set_pagefault_info_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_pagefault_info_t);
> +
> +
>  struct xen_domctl {
>      uint32_t cmd;
>  #define XEN_DOMCTL_createdomain                   1
> @@ -1012,6 +1024,7 @@ struct xen_domctl {
>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
>  #define XEN_DOMCTL_gdbsx_domstatus             1003
> +#define XEN_DOMCTL_set_pagefault_info          1004
>      uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
>      domid_t  domain;
>      union {
> @@ -1068,6 +1081,7 @@ struct xen_domctl {
>          struct xen_domctl_cacheflush        cacheflush;
>          struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
>          struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
> +        struct xen_domctl_set_pagefault_info set_pagefault_info;
>          uint8_t                             pad[128];
>      } u;
>  };
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index d5bc461..754e070 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -447,6 +447,13 @@ struct domain
>      nodemask_t node_affinity;
>      unsigned int last_alloc_node;
>      spinlock_t node_affinity_lock;
> +
> +    /* Memory introspection page fault injection data. */
> +    struct {
> +        uint64_t address_space;
> +        uint64_t virtual_address;
> +        uint32_t write_access;
> +    } fault_info;

As currently designed, this is only possible for hvm domains.  It should
live inside struct hvm_domain.

~Andrew

>  };
>  
>  struct domain_setup_info