From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Cooper Subject: Re: [PATCH RFC V2 5/6] xen, libxc: Request page fault injection via libxc Date: Fri, 11 Jul 2014 19:06:55 +0100 Message-ID: <53C027BF.1090902@citrix.com> References: <1405093418-23481-1-git-send-email-rcojocaru@bitdefender.com> <1405093418-23481-5-git-send-email-rcojocaru@bitdefender.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <1405093418-23481-5-git-send-email-rcojocaru@bitdefender.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Razvan Cojocaru , xen-devel@lists.xen.org Cc: mdontu@bitdefender.com, tim@xen.org, JBeulich@suse.com List-Id: xen-devel@lists.xenproject.org On 11/07/14 16:43, Razvan Cojocaru wrote: > Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's > new xc_domain_set_pagefault_info() function to set per-domain page > fault injection information. All a call does is set per-domain info, > and nothing actually happens until VMENTRY time, and then only if > all conditions are met (the guest is in user mode, the set value > matches CR3, and there are no other pending traps). > This mechanism allows bringing in swapped-out pages for inspection. > > Signed-off-by: Razvan Cojocaru For the record, I still think this is a bad idea to be working against a guest OS paging algorithm, and I am uneasy about whether it is sensible to introduce abilities like that into the Xen API. With that problem aside, I have some review anyway. 
> --- > tools/libxc/xc_domain.c | 17 +++++++++++++++++ > tools/libxc/xenctrl.h | 4 ++++ > xen/arch/x86/hvm/vmx/vmx.c | 39 +++++++++++++++++++++++++++++++++++++++ > xen/common/domain.c | 5 +++++ > xen/common/domctl.c | 21 +++++++++++++++++++++ > xen/include/public/domctl.h | 14 ++++++++++++++ > xen/include/xen/sched.h | 7 +++++++ > 7 files changed, 107 insertions(+) > > diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c > index 0230c6c..7437c51 100644 > --- a/tools/libxc/xc_domain.c > +++ b/tools/libxc/xc_domain.c > @@ -506,6 +506,23 @@ int xc_domain_hvm_setcontext(xc_interface *xch, > return ret; > } > > +int xc_domain_set_pagefault_info(xc_interface *xch, > + uint32_t domid, > + xen_domctl_set_pagefault_info_t *info) > +{ > + DECLARE_DOMCTL; > + > + if (info == NULL) > + return -1; > + > + domctl.cmd = XEN_DOMCTL_set_pagefault_info; > + domctl.domain = (domid_t)domid; > + domctl.u.set_pagefault_info.address_space = info->address_space; > + domctl.u.set_pagefault_info.virtual_address = info->virtual_address; > + domctl.u.set_pagefault_info.write_access = info->write_access; > + return do_domctl(xch, &domctl); > +} > + > int xc_vcpu_getcontext(xc_interface *xch, > uint32_t domid, > uint32_t vcpu, > diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h > index 3578b09..302ef0e 100644 > --- a/tools/libxc/xenctrl.h > +++ b/tools/libxc/xenctrl.h > @@ -793,6 +793,10 @@ int xc_domain_hvm_setcontext(xc_interface *xch, > const char *xc_domain_get_native_protocol(xc_interface *xch, > uint32_t domid); > > +int xc_domain_set_pagefault_info(xc_interface *xch, > + uint32_t domid, > + xen_domctl_set_pagefault_info_t *info); > + > /** > * This function returns information about the execution context of a > * particular vcpu of a domain. 
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c > index 6c63225..835621f 100644 > --- a/xen/arch/x86/hvm/vmx/vmx.c > +++ b/xen/arch/x86/hvm/vmx/vmx.c > @@ -416,6 +416,7 @@ static void vmx_restore_dr(struct vcpu *v) > static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c) > { > unsigned long ev; > + unsigned long cs_arbytes; > > vmx_vmcs_enter(v); > > @@ -429,6 +430,9 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c) > __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs); > __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp); > __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip); > + __vmread(GUEST_CS_AR_BYTES, &cs_arbytes); > + > + c->cs_arbytes = (uint32_t)cs_arbytes; I am fairly sure this will break migration, as you are changing the bit representation of c->cs_arbytes in the architectural state. However, given a comment below, I can't see why you even need it. > > c->pending_event = 0; > c->error_code = 0; > @@ -3113,6 +3117,39 @@ out: > nvmx_idtv_handling(); > } > > +static void check_pf_injection(void) > +{ > + struct vcpu *curr = current; > + struct domain *d = curr->domain; > + struct hvm_hw_cpu ctxt; > + uint32_t ss_dpl; > + > + if ( !is_hvm_domain(d) || d->fault_info.virtual_address == 0 ) > + return; > + > + memset(&ctxt, 0, sizeof(struct hvm_hw_cpu)); > + hvm_funcs.save_cpu_ctxt(curr, &ctxt); > + > + ss_dpl = (ctxt.cs_arbytes >> 5) & 3; > + > + if ( ss_dpl == 3 /* Guest is in user mode */ cs dpl is not cpl. It will be incorrect at various points, e.g. conforming code segments. The real ss dpl is always the correct cpl. There are notes to this effect in both the Intel and AMD architecture manuals, but I can't for the life of me actually find either of the notes. 
> + && !ctxt.pending_event > + && ctxt.cr3 == d->fault_info.address_space ) > + { > + /* Cache */ > + uint64_t virtual_address = d->fault_info.virtual_address; > + uint32_t write_access = d->fault_info.write_access; > + > + /* Reset */ > + d->fault_info.address_space = 0; > + d->fault_info.virtual_address = 0; > + d->fault_info.write_access = 0; > + > + hvm_inject_page_fault((write_access << 1) | PFEC_user_mode, Is it sensible in the slightest to have write_access shifted by 1 with respect to a real pagefault error code? > + virtual_address); > + } > +} > + > void vmx_vmenter_helper(const struct cpu_user_regs *regs) > { > struct vcpu *curr = current; > @@ -3153,6 +3190,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs) > if ( unlikely(need_flush) ) > vpid_sync_all(); > > + check_pf_injection(); > + > out: > HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0); > > diff --git a/xen/common/domain.c b/xen/common/domain.c > index cd64aea..e7bd734 100644 > --- a/xen/common/domain.c > +++ b/xen/common/domain.c > @@ -255,6 +255,11 @@ struct domain *domain_create( > > d->domain_id = domid; > > + /* Memory introspection page fault variables set-up. */ > + d->fault_info.address_space = 0; > + d->fault_info.virtual_address = 0; > + d->fault_info.write_access = 0; > + alloc_domain_struct() zeroes the memory beforehand, so you don't need to repeat it here. 
> lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain"); > > if ( (err = xsm_alloc_security_domain(d)) != 0 ) > diff --git a/xen/common/domctl.c b/xen/common/domctl.c > index c326aba..4321ca1 100644 > --- a/xen/common/domctl.c > +++ b/xen/common/domctl.c > @@ -967,6 +967,27 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) > } > break; > > + case XEN_DOMCTL_set_pagefault_info: > + { > + struct domain *d; > + > + ret = -ESRCH; > + d = rcu_lock_domain_by_id(op->domain); > + if ( d != NULL ) > + { This should fail with -EINVAL if !has_hvm_container(d) > + d->fault_info.address_space = > + op->u.set_pagefault_info.address_space; > + d->fault_info.virtual_address = > + op->u.set_pagefault_info.virtual_address; > + d->fault_info.write_access = > + op->u.set_pagefault_info.write_access; > + > + rcu_unlock_domain(d); > + ret = 0; > + } > + } > + break; > + > default: > ret = arch_do_domctl(op, d, u_domctl); > break; > diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h > index 5b11bbf..c8bf3f8 100644 > --- a/xen/include/public/domctl.h > +++ b/xen/include/public/domctl.h > @@ -936,6 +936,18 @@ typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t; > DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t); > #endif > > +/* XEN_DOMCTL_set_pagefault_info requests that a page fault occur at > + * the next VMENTRY. 
> + * */ > +struct xen_domctl_set_pagefault_info { > + uint64_t address_space; > + uint64_t virtual_address; > + uint32_t write_access; > +}; > +typedef struct xen_domctl_set_pagefault_info xen_domctl_set_pagefault_info_t; > +DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_pagefault_info_t); > + > + > struct xen_domctl { > uint32_t cmd; > #define XEN_DOMCTL_createdomain 1 > @@ -1012,6 +1024,7 @@ struct xen_domctl { > #define XEN_DOMCTL_gdbsx_pausevcpu 1001 > #define XEN_DOMCTL_gdbsx_unpausevcpu 1002 > #define XEN_DOMCTL_gdbsx_domstatus 1003 > +#define XEN_DOMCTL_set_pagefault_info 1004 > uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ > domid_t domain; > union { > @@ -1068,6 +1081,7 @@ struct xen_domctl { > struct xen_domctl_cacheflush cacheflush; > struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu; > struct xen_domctl_gdbsx_domstatus gdbsx_domstatus; > + struct xen_domctl_set_pagefault_info set_pagefault_info; > uint8_t pad[128]; > } u; > }; > diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h > index d5bc461..754e070 100644 > --- a/xen/include/xen/sched.h > +++ b/xen/include/xen/sched.h > @@ -447,6 +447,13 @@ struct domain > nodemask_t node_affinity; > unsigned int last_alloc_node; > spinlock_t node_affinity_lock; > + > + /* Memory introspection page fault injection data. */ > + struct { > + uint64_t address_space; > + uint64_t virtual_address; > + uint32_t write_access; > + } fault_info; As currently designed, this is only possible for hvm domains. It should live inside struct hvm_domain. ~Andrew > }; > > struct domain_setup_info