From: "Jan Beulich"
Subject: [PATCH v2] x86/HVM: use available linear->phys translations in REP MOVS/STOS handling
Date: Mon, 20 Jun 2016 05:29:10 -0600
Message-ID: <5767EFA602000078000F6A2E@prv-mh.provo.novell.com>
To: xen-devel
Cc: Paul Durrant
List-Id: xen-devel@lists.xenproject.org

If we have the translation result available already, we should also use
it here. In my tests with Linux guests this eliminates all calls to
hvmemul_linear_to_phys() from the STOS path and most from the MOVS one.

Also record the translation for re-use at least during response
processing.

Signed-off-by: Jan Beulich
---
v2: Restrict to the single-iteration or address-incrementing variant upon
    initial invocation, and don't bound *reps upon response processing.
    Latch the translation only for the actual MMIO part of the accesses (to
    match the purpose of the per-vCPU fields; any further translation
    caching should probably go into hvmemul_linear_to_phys() itself).
    Also tested on AMD this time (though it still escapes me why behavior
    here would be CPU vendor dependent).

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1156,6 +1156,7 @@ static int hvmemul_rep_movs(
 {
     struct hvm_emulate_ctxt *hvmemul_ctxt =
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
     unsigned long saddr, daddr, bytes;
     paddr_t sgpa, dgpa;
     uint32_t pfec = PFEC_page_present;
@@ -1178,16 +1179,41 @@ static int hvmemul_rep_movs(
     if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
         pfec |= PFEC_user_mode;
 
-    rc = hvmemul_linear_to_phys(
-        saddr, &sgpa, bytes_per_rep, reps, pfec, hvmemul_ctxt);
-    if ( rc != X86EMUL_OKAY )
-        return rc;
+    if ( vio->mmio_access.read_access &&
+         (vio->mmio_gla == (saddr & PAGE_MASK)) &&
+         /*
+          * Upon initial invocation don't truncate large batches just because
+          * of a hit for the translation: Doing the guest page table walk is
+          * cheaper than multiple round trips through the device model. Yet
+          * when processing a response we can always re-use the translation.
+          */
+         (vio->io_req.state == STATE_IORESP_READY ||
+          ((!df || *reps == 1) &&
+           PAGE_SIZE - (saddr & ~PAGE_MASK) >= *reps * bytes_per_rep)) )
+        sgpa = pfn_to_paddr(vio->mmio_gpfn) | (saddr & ~PAGE_MASK);
+    else
+    {
+        rc = hvmemul_linear_to_phys(saddr, &sgpa, bytes_per_rep, reps, pfec,
+                                    hvmemul_ctxt);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+    }
 
-    rc = hvmemul_linear_to_phys(
-        daddr, &dgpa, bytes_per_rep, reps,
-        pfec | PFEC_write_access, hvmemul_ctxt);
-    if ( rc != X86EMUL_OKAY )
-        return rc;
+    bytes = PAGE_SIZE - (daddr & ~PAGE_MASK);
+    if ( vio->mmio_access.write_access &&
+         (vio->mmio_gla == (daddr & PAGE_MASK)) &&
+         /* See comment above. */
+         (vio->io_req.state == STATE_IORESP_READY ||
+          ((!df || *reps == 1) &&
+           PAGE_SIZE - (daddr & ~PAGE_MASK) >= *reps * bytes_per_rep)) )
+        dgpa = pfn_to_paddr(vio->mmio_gpfn) | (daddr & ~PAGE_MASK);
+    else
+    {
+        rc = hvmemul_linear_to_phys(daddr, &dgpa, bytes_per_rep, reps,
+                                    pfec | PFEC_write_access, hvmemul_ctxt);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+    }
 
     /* Check for MMIO ops */
     (void) get_gfn_query_unlocked(current->domain, sgpa >> PAGE_SHIFT, &sp2mt);
@@ -1198,12 +1224,18 @@ static int hvmemul_rep_movs(
         return X86EMUL_UNHANDLEABLE;
 
     if ( sp2mt == p2m_mmio_dm )
+    {
+        latch_linear_to_phys(vio, saddr, sgpa, 0);
         return hvmemul_do_mmio_addr(
             sgpa, reps, bytes_per_rep, IOREQ_READ, df, dgpa);
+    }
 
     if ( dp2mt == p2m_mmio_dm )
+    {
+        latch_linear_to_phys(vio, daddr, dgpa, 1);
         return hvmemul_do_mmio_addr(
             dgpa, reps, bytes_per_rep, IOREQ_WRITE, df, sgpa);
+    }
 
     /* RAM-to-RAM copy: emulate as equivalent of memmove(dgpa, sgpa, bytes). */
     bytes = *reps * bytes_per_rep;
@@ -1279,25 +1311,37 @@ static int hvmemul_rep_stos(
 {
     struct hvm_emulate_ctxt *hvmemul_ctxt =
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
-    unsigned long addr;
+    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
+    unsigned long addr, bytes;
     paddr_t gpa;
     p2m_type_t p2mt;
     bool_t df = !!(ctxt->regs->eflags & X86_EFLAGS_DF);
     int rc = hvmemul_virtual_to_linear(seg, offset, bytes_per_rep, reps,
                                        hvm_access_write, hvmemul_ctxt, &addr);
 
-    if ( rc == X86EMUL_OKAY )
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    bytes = PAGE_SIZE - (addr & ~PAGE_MASK);
+    if ( vio->mmio_access.write_access &&
+         (vio->mmio_gla == (addr & PAGE_MASK)) &&
+         /* See respective comment in MOVS processing. */
+         (vio->io_req.state == STATE_IORESP_READY ||
+          ((!df || *reps == 1) &&
+           PAGE_SIZE - (addr & ~PAGE_MASK) >= *reps * bytes_per_rep)) )
+        gpa = pfn_to_paddr(vio->mmio_gpfn) | (addr & ~PAGE_MASK);
+    else
     {
         uint32_t pfec = PFEC_page_present | PFEC_write_access;
 
         if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
             pfec |= PFEC_user_mode;
 
-        rc = hvmemul_linear_to_phys(
-            addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt);
+        rc = hvmemul_linear_to_phys(addr, &gpa, bytes_per_rep, reps, pfec,
+                                    hvmemul_ctxt);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
     }
-    if ( rc != X86EMUL_OKAY )
-        return rc;
 
     /* Check for MMIO op */
     (void)get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt);
@@ -1371,6 +1415,7 @@ static int hvmemul_rep_stos(
         return X86EMUL_UNHANDLEABLE;
 
     case p2m_mmio_dm:
+        latch_linear_to_phys(vio, addr, gpa, 1);
         return hvmemul_do_mmio_buffer(gpa, reps, bytes_per_rep, IOREQ_WRITE,
                                       df, p_data);
     }
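
[Editor's note] For readers unfamiliar with the per-vCPU MMIO fields, below is
a minimal stand-alone sketch of the re-use condition the hunks above add. The
names used here (struct cached_xlat, can_reuse_translation, resp_ready) are
illustrative stand-ins for vio->mmio_access / mmio_gla / mmio_gpfn and
STATE_IORESP_READY, not the Xen definitions; this is not the actual patch code.

/*
 * Illustrative sketch only: models when a previously latched
 * linear->phys translation may be re-used for a REP MOVS/STOS access.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  ((uint64_t)1 << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

struct cached_xlat {
    bool     valid;       /* stands in for vio->mmio_access.{read,write}_access */
    bool     resp_ready;  /* stands in for vio->io_req.state == STATE_IORESP_READY */
    uint64_t gla;         /* stands in for vio->mmio_gla (page-aligned linear addr) */
    uint64_t gpfn;        /* stands in for vio->mmio_gpfn */
};

/*
 * Re-use the cached translation for an access at linear address 'addr'
 * of 'reps' elements of 'bytes_per_rep' bytes (direction flag 'df') when:
 * - the cached page-aligned linear address matches, and
 * - either a response is being processed (the batch was already clipped),
 *   or the initial invocation is a single iteration / an incrementing
 *   access that stays within the remainder of the page, so re-using the
 *   translation does not truncate a large batch.
 */
static bool can_reuse_translation(const struct cached_xlat *x, uint64_t addr,
                                  uint64_t reps, unsigned int bytes_per_rep,
                                  bool df)
{
    if ( !x->valid || x->gla != (addr & PAGE_MASK) )
        return false;

    return x->resp_ready ||
           ((!df || reps == 1) &&
            PAGE_SIZE - (addr & ~PAGE_MASK) >= reps * bytes_per_rep);
}

int main(void)
{
    /* Hypothetical example values, purely for demonstration. */
    struct cached_xlat x = { .valid = true, .resp_ready = false,
                             .gla = 0x7f0000123000ULL, .gpfn = 0xfebc1 };
    uint64_t addr = 0x7f0000123800ULL;

    if ( can_reuse_translation(&x, addr, 64, 8, false) )
        printf("reuse: gpa = %#llx\n",
               (unsigned long long)((x.gpfn << PAGE_SHIFT) |
                                    (addr & ~PAGE_MASK)));
    else
        printf("fall back to a fresh guest page table walk\n");
    return 0;
}

When the condition does not hold, the real code falls back to
hvmemul_linear_to_phys(), exactly as in the else branches of the patch.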