From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Jan Beulich" Subject: [PATCH v3] x86: also allow REP STOS emulation acceleration Date: Mon, 12 Jan 2015 08:01:08 +0000 Message-ID: <54B38D540200007800053702@mail.emea.novell.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="=__PartA4914E54.2__=" Return-path: Received: from mail6.bemta14.messagelabs.com ([193.109.254.103]) by lists.xen.org with esmtp (Exim 4.72) (envelope-from ) id 1YAZw2-0002mY-VW for xen-devel@lists.xenproject.org; Mon, 12 Jan 2015 08:01:15 +0000 List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: xen-devel Cc: Ian Campbell , Andrew Cooper , Keir Fraser , Ian Jackson , Tim Deegan List-Id: xen-devel@lists.xenproject.org This is a MIME message. If you are reading this text, you may want to consider changing to a mail reader or gateway that understands how to properly handle MIME multipart messages. --=__PartA4914E54.2__= Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: quoted-printable Content-Disposition: inline While the REP MOVS acceleration appears to have helped qemu-traditional based guests, qemu-upstream (or really the respective video BIOSes) doesn't appear to benefit from that. Instead the acceleration added here provides a visible performance improvement during very early HVM guest boot. Signed-off-by: Jan Beulich --- v3: Drop now pointless memory clobber from asm() in hvmemul_rep_stos() Introduce and use ASSERT_UNREACHABLE(), as suggested by Andrew. v2: Fix asm() constraints in hvmemul_rep_stos(), as pointed out by Andrew. Add output operand telling the compiler that "buf" is being written. 
--- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -731,6 +731,17 @@ static int hvmemul_rep_movs_discard( return X86EMUL_OKAY; } =20 +static int hvmemul_rep_stos_discard( + void *p_data, + enum x86_segment seg, + unsigned long offset, + unsigned int bytes_per_rep, + unsigned long *reps, + struct x86_emulate_ctxt *ctxt) +{ + return X86EMUL_OKAY; +} + static int hvmemul_rep_outs_discard( enum x86_segment src_seg, unsigned long src_offset, @@ -982,6 +993,113 @@ static int hvmemul_rep_movs( return X86EMUL_OKAY; } =20 +static int hvmemul_rep_stos( + void *p_data, + enum x86_segment seg, + unsigned long offset, + unsigned int bytes_per_rep, + unsigned long *reps, + struct x86_emulate_ctxt *ctxt) +{ + struct hvm_emulate_ctxt *hvmemul_ctxt =3D + container_of(ctxt, struct hvm_emulate_ctxt, ctxt); + unsigned long addr; + paddr_t gpa; + p2m_type_t p2mt; + bool_t df =3D !!(ctxt->regs->eflags & X86_EFLAGS_DF); + int rc =3D hvmemul_virtual_to_linear(seg, offset, bytes_per_rep, = reps, + hvm_access_write, hvmemul_ctxt, = &addr); + + if ( rc =3D=3D X86EMUL_OKAY ) + { + uint32_t pfec =3D PFEC_page_present | PFEC_write_access; + + if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl =3D=3D 3 ) + pfec |=3D PFEC_user_mode; + + rc =3D hvmemul_linear_to_phys( + addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt); + } + if ( rc !=3D X86EMUL_OKAY ) + return rc; + + /* Check for MMIO op */ + (void)get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, = &p2mt); + + switch ( p2mt ) + { + unsigned long bytes; + void *buf; + + default: + /* Allocate temporary buffer. 
*/ + for ( ; ; ) + { + bytes =3D *reps * bytes_per_rep; + buf =3D xmalloc_bytes(bytes); + if ( buf || *reps <=3D 1 ) + break; + *reps >>=3D 1; + } + + if ( !buf ) + buf =3D p_data; + else + switch ( bytes_per_rep ) + { + unsigned long dummy; + +#define CASE(bits, suffix) \ + case (bits) / 8: \ + asm ( "rep stos" #suffix \ + : "=3Dm" (*(char (*)[bytes])buf), \ + "=3DD" (dummy), "=3Dc" (dummy) \ + : "a" (*(const uint##bits##_t *)p_data), \ + "1" (buf), "2" (*reps) ); \ + break + CASE(8, b); + CASE(16, w); + CASE(32, l); + CASE(64, q); +#undef CASE + + default: + ASSERT_UNREACHABLE(); + xfree(buf); + return X86EMUL_UNHANDLEABLE; + } + + /* Adjust address for reverse store. */ + if ( df ) + gpa -=3D bytes - bytes_per_rep; + + rc =3D hvm_copy_to_guest_phys(gpa, buf, bytes); + + if ( buf !=3D p_data ) + xfree(buf); + + switch ( rc ) + { + case HVMCOPY_gfn_paged_out: + case HVMCOPY_gfn_shared: + return X86EMUL_RETRY; + case HVMCOPY_okay: + return X86EMUL_OKAY; + } + + gdprintk(XENLOG_WARNING, + "Failed REP STOS: gpa=3D%"PRIpaddr" reps=3D%lu bytes_per_= rep=3D%u\n", + gpa, *reps, bytes_per_rep); + /* fall through */ + case p2m_mmio_direct: + return X86EMUL_UNHANDLEABLE; + + case p2m_mmio_dm: + return hvmemul_do_mmio(gpa, reps, bytes_per_rep, 0, IOREQ_WRITE, = df, + p_data); + } +} + static int hvmemul_read_segment( enum x86_segment seg, struct segment_register *reg, @@ -1239,6 +1357,7 @@ static const struct x86_emulate_ops hvm_ .rep_ins =3D hvmemul_rep_ins, .rep_outs =3D hvmemul_rep_outs, .rep_movs =3D hvmemul_rep_movs, + .rep_stos =3D hvmemul_rep_stos, .read_segment =3D hvmemul_read_segment, .write_segment =3D hvmemul_write_segment, .read_io =3D hvmemul_read_io, @@ -1264,6 +1383,7 @@ static const struct x86_emulate_ops hvm_ .rep_ins =3D hvmemul_rep_ins_discard, .rep_outs =3D hvmemul_rep_outs_discard, .rep_movs =3D hvmemul_rep_movs_discard, + .rep_stos =3D hvmemul_rep_stos_discard, .read_segment =3D hvmemul_read_segment, .write_segment =3D hvmemul_write_segment, .read_io 
=3D hvmemul_read_io_discard, --- a/xen/arch/x86/hvm/stdvga.c +++ b/xen/arch/x86/hvm/stdvga.c @@ -470,11 +470,11 @@ static int mmio_move(struct hvm_hw_stdvg uint64_t addr =3D p->addr; p2m_type_t p2mt; struct domain *d =3D current->domain; + int step =3D p->df ? -p->size : p->size; =20 if ( p->data_is_ptr ) { uint64_t data =3D p->data, tmp; - int step =3D p->df ? -p->size : p->size; =20 if ( p->dir =3D=3D IOREQ_READ ) { @@ -529,13 +529,18 @@ static int mmio_move(struct hvm_hw_stdvg } } } + else if ( p->dir =3D=3D IOREQ_WRITE ) + { + for ( i =3D 0; i < p->count; i++ ) + { + stdvga_mem_write(addr, p->data, p->size); + addr +=3D step; + } + } else { ASSERT(p->count =3D=3D 1); - if ( p->dir =3D=3D IOREQ_READ ) - p->data =3D stdvga_mem_read(addr, p->size); - else - stdvga_mem_write(addr, p->data, p->size); + p->data =3D stdvga_mem_read(addr, p->size); } =20 read_data =3D p->data; --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -2568,15 +2568,25 @@ x86_emulate( } =20 case 0xaa ... 0xab: /* stos */ { - /* unsigned long max_reps =3D */get_rep_prefix(); - dst.type =3D OP_MEM; + unsigned long nr_reps =3D get_rep_prefix(); dst.bytes =3D (d & ByteOp) ? 1 : op_bytes; dst.mem.seg =3D x86_seg_es; dst.mem.off =3D truncate_ea(_regs.edi); - dst.val =3D _regs.eax; + if ( (nr_reps =3D=3D 1) || !ops->rep_stos || + ((rc =3D ops->rep_stos(&_regs.eax, + dst.mem.seg, dst.mem.off, dst.bytes, + &nr_reps, ctxt)) =3D=3D X86EMUL_UNHANDLE= ABLE) ) + { + dst.val =3D _regs.eax; + dst.type =3D OP_MEM; + nr_reps =3D 1; + } + else if ( rc !=3D X86EMUL_OKAY ) + goto done; register_address_increment( - _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);= - put_rep_prefix(1); + _regs.edi, + nr_reps * ((_regs.eflags & EFLG_DF) ? 
-dst.bytes : dst.bytes))= ; + put_rep_prefix(nr_reps); break; } =20 --- a/xen/arch/x86/x86_emulate/x86_emulate.h +++ b/xen/arch/x86/x86_emulate/x86_emulate.h @@ -241,6 +241,20 @@ struct x86_emulate_ops struct x86_emulate_ctxt *ctxt); =20 /* + * rep_stos: Emulate STOS: <*p_data> -> <seg:offset>. + * @bytes_per_rep: [IN ] Bytes transferred per repetition. + * @reps: [IN ] Maximum repetitions to be emulated. + * [OUT] Number of repetitions actually emulated. + */ + int (*rep_stos)( + void *p_data, + enum x86_segment seg, + unsigned long offset, + unsigned int bytes_per_rep, + unsigned long *reps, + struct x86_emulate_ctxt *ctxt); + + /* * read_segment: Emulate a read of full context of a segment = register. * @reg: [OUT] Contents of segment register (visible and hidden = state). */ --- a/xen/include/xen/lib.h +++ b/xen/include/xen/lib.h @@ -41,9 +41,11 @@ do { =20 #ifndef NDEBUG #define ASSERT(p) \ do { if ( unlikely(!(p)) ) assert_failed(#p); } while (0) +#define ASSERT_UNREACHABLE() assert_failed("unreachable") #define debug_build() 1 #else #define ASSERT(p) do { if ( 0 && (p) ); } while (0) +#define ASSERT_UNREACHABLE() do { } while (0) #define debug_build() 0 #endif =20 --=__PartA4914E54.2__= Content-Type: text/plain; name="x86emul-rep-stos.patch" Content-Transfer-Encoding: quoted-printable Content-Disposition: attachment; filename="x86emul-rep-stos.patch" x86: also allow REP STOS emulation acceleration=0A=0AWhile the REP MOVS = acceleration appears to have helped qemu-traditional=0Abased guests, = qemu-upstream (or really the respective video BIOSes)=0Adoesn't appear to = benefit from that. Instead the acceleration added=0Ahere provides a = visible performance improvement during very early HVM=0Aguest boot.=0A=0ASi= gned-off-by: Jan Beulich =0A---=0Av3: Drop now = pointless memory clobber from asm() in hvmemul_rep_stos()=0A Introduce = and use ASSERT_UNREACHABLE(), as suggested by Andrew.=0Av2: Fix asm() = constraints in hvmemul_rep_stos(), as pointed out by=0A Andrew. 
Add = output operand telling the compiler that "buf" is being=0A written.=0A= =0A--- a/xen/arch/x86/hvm/emulate.c=0A+++ b/xen/arch/x86/hvm/emulate.c=0A@@= -731,6 +731,17 @@ static int hvmemul_rep_movs_discard(=0A return = X86EMUL_OKAY;=0A }=0A =0A+static int hvmemul_rep_stos_discard(=0A+ void = *p_data,=0A+ enum x86_segment seg,=0A+ unsigned long offset,=0A+ = unsigned int bytes_per_rep,=0A+ unsigned long *reps,=0A+ struct = x86_emulate_ctxt *ctxt)=0A+{=0A+ return X86EMUL_OKAY;=0A+}=0A+=0A = static int hvmemul_rep_outs_discard(=0A enum x86_segment src_seg,=0A = unsigned long src_offset,=0A@@ -982,6 +993,113 @@ static int hvmemul_rep_= movs(=0A return X86EMUL_OKAY;=0A }=0A =0A+static int hvmemul_rep_stos(= =0A+ void *p_data,=0A+ enum x86_segment seg,=0A+ unsigned long = offset,=0A+ unsigned int bytes_per_rep,=0A+ unsigned long *reps,=0A+ = struct x86_emulate_ctxt *ctxt)=0A+{=0A+ struct hvm_emulate_ctxt = *hvmemul_ctxt =3D=0A+ container_of(ctxt, struct hvm_emulate_ctxt, = ctxt);=0A+ unsigned long addr;=0A+ paddr_t gpa;=0A+ p2m_type_t = p2mt;=0A+ bool_t df =3D !!(ctxt->regs->eflags & X86_EFLAGS_DF);=0A+ = int rc =3D hvmemul_virtual_to_linear(seg, offset, bytes_per_rep, reps,=0A+ = hvm_access_write, hvmemul_ctxt, = &addr);=0A+=0A+ if ( rc =3D=3D X86EMUL_OKAY )=0A+ {=0A+ = uint32_t pfec =3D PFEC_page_present | PFEC_write_access;=0A+=0A+ if = ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl =3D=3D 3 )=0A+ = pfec |=3D PFEC_user_mode;=0A+=0A+ rc =3D hvmemul_linear_to_phys(= =0A+ addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt);=0A+ = }=0A+ if ( rc !=3D X86EMUL_OKAY )=0A+ return rc;=0A+=0A+ /* = Check for MMIO op */=0A+ (void)get_gfn_query_unlocked(current->domain, = gpa >> PAGE_SHIFT, &p2mt);=0A+=0A+ switch ( p2mt )=0A+ {=0A+ = unsigned long bytes;=0A+ void *buf;=0A+=0A+ default:=0A+ = /* Allocate temporary buffer. 
*/=0A+ for ( ; ; )=0A+ {=0A+ = bytes =3D *reps * bytes_per_rep;=0A+ buf =3D xmalloc_by= tes(bytes);=0A+ if ( buf || *reps <=3D 1 )=0A+ = break;=0A+ *reps >>=3D 1;=0A+ }=0A+=0A+ if ( !buf = )=0A+ buf =3D p_data;=0A+ else=0A+ switch ( = bytes_per_rep )=0A+ {=0A+ unsigned long = dummy;=0A+=0A+#define CASE(bits, suffix) = \=0A+ case (bits) / 8: = \=0A+ asm ( "rep stos" #suffix \=0A+ = : "=3Dm" (*(char (*)[bytes])buf), \=0A+ = "=3DD" (dummy), "=3Dc" (dummy) \=0A+ = : "a" (*(const uint##bits##_t *)p_data), \=0A+ = "1" (buf), "2" (*reps) ); \=0A+ = break=0A+ CASE(8, b);=0A+ CASE(16, w);=0A+ = CASE(32, l);=0A+ CASE(64, q);=0A+#undef CASE=0A+=0A+ = default:=0A+ ASSERT_UNREACHABLE();=0A+ = xfree(buf);=0A+ return X86EMUL_UNHANDLEABLE;=0A+ = }=0A+=0A+ /* Adjust address for reverse store. */=0A+ if ( = df )=0A+ gpa -=3D bytes - bytes_per_rep;=0A+=0A+ rc =3D = hvm_copy_to_guest_phys(gpa, buf, bytes);=0A+=0A+ if ( buf !=3D = p_data )=0A+ xfree(buf);=0A+=0A+ switch ( rc )=0A+ = {=0A+ case HVMCOPY_gfn_paged_out:=0A+ case HVMCOPY_gfn_share= d:=0A+ return X86EMUL_RETRY;=0A+ case HVMCOPY_okay:=0A+ = return X86EMUL_OKAY;=0A+ }=0A+=0A+ gdprintk(XENLOG_= WARNING,=0A+ "Failed REP STOS: gpa=3D%"PRIpaddr" reps=3D%lu= bytes_per_rep=3D%u\n",=0A+ gpa, *reps, bytes_per_rep);=0A+= /* fall through */=0A+ case p2m_mmio_direct:=0A+ return = X86EMUL_UNHANDLEABLE;=0A+=0A+ case p2m_mmio_dm:=0A+ return = hvmemul_do_mmio(gpa, reps, bytes_per_rep, 0, IOREQ_WRITE, df,=0A+ = p_data);=0A+ }=0A+}=0A+=0A static int hvmemul_read_= segment(=0A enum x86_segment seg,=0A struct segment_register = *reg,=0A@@ -1239,6 +1357,7 @@ static const struct x86_emulate_ops hvm_=0A = .rep_ins =3D hvmemul_rep_ins,=0A .rep_outs =3D hvmemul_re= p_outs,=0A .rep_movs =3D hvmemul_rep_movs,=0A+ .rep_stos = =3D hvmemul_rep_stos,=0A .read_segment =3D hvmemul_read_segment,=0A = .write_segment =3D hvmemul_write_segment,=0A .read_io =3D = hvmemul_read_io,=0A@@ -1264,6 +1383,7 @@ static const struct x86_emulate_op= s hvm_=0A .rep_ins =3D 
hvmemul_rep_ins_discard,=0A .rep_outs = =3D hvmemul_rep_outs_discard,=0A .rep_movs =3D hvmemul_rep_mo= vs_discard,=0A+ .rep_stos =3D hvmemul_rep_stos_discard,=0A = .read_segment =3D hvmemul_read_segment,=0A .write_segment =3D = hvmemul_write_segment,=0A .read_io =3D hvmemul_read_io_discard,= =0A--- a/xen/arch/x86/hvm/stdvga.c=0A+++ b/xen/arch/x86/hvm/stdvga.c=0A@@ = -470,11 +470,11 @@ static int mmio_move(struct hvm_hw_stdvg=0A = uint64_t addr =3D p->addr;=0A p2m_type_t p2mt;=0A struct domain *d = =3D current->domain;=0A+ int step =3D p->df ? -p->size : p->size;=0A = =0A if ( p->data_is_ptr )=0A {=0A uint64_t data =3D = p->data, tmp;=0A- int step =3D p->df ? -p->size : p->size;=0A =0A = if ( p->dir =3D=3D IOREQ_READ )=0A {=0A@@ -529,13 +529,18 @@ = static int mmio_move(struct hvm_hw_stdvg=0A }=0A }=0A = }=0A+ else if ( p->dir =3D=3D IOREQ_WRITE )=0A+ {=0A+ for = ( i =3D 0; i < p->count; i++ )=0A+ {=0A+ stdvga_mem_write= (addr, p->data, p->size);=0A+ addr +=3D step;=0A+ }=0A+ = }=0A else=0A {=0A ASSERT(p->count =3D=3D 1);=0A- = if ( p->dir =3D=3D IOREQ_READ )=0A- p->data =3D stdvga_mem_read(= addr, p->size);=0A- else=0A- stdvga_mem_write(addr, = p->data, p->size);=0A+ p->data =3D stdvga_mem_read(addr, = p->size);=0A }=0A =0A read_data =3D p->data;=0A--- a/xen/arch/x86/x= 86_emulate/x86_emulate.c=0A+++ b/xen/arch/x86/x86_emulate/x86_emulate.c=0A@= @ -2568,15 +2568,25 @@ x86_emulate(=0A }=0A =0A case 0xaa ... = 0xab: /* stos */ {=0A- /* unsigned long max_reps =3D */get_rep_prefi= x();=0A- dst.type =3D OP_MEM;=0A+ unsigned long nr_reps =3D = get_rep_prefix();=0A dst.bytes =3D (d & ByteOp) ? 
1 : op_bytes;=0A = dst.mem.seg =3D x86_seg_es;=0A dst.mem.off =3D truncate_ea(= _regs.edi);=0A- dst.val =3D _regs.eax;=0A+ if ( (nr_reps = =3D=3D 1) || !ops->rep_stos ||=0A+ ((rc =3D ops->rep_stos(&_reg= s.eax,=0A+ dst.mem.seg, dst.mem.off, = dst.bytes,=0A+ &nr_reps, ctxt)) =3D=3D = X86EMUL_UNHANDLEABLE) )=0A+ {=0A+ dst.val =3D _regs.eax;= =0A+ dst.type =3D OP_MEM;=0A+ nr_reps =3D 1;=0A+ = }=0A+ else if ( rc !=3D X86EMUL_OKAY )=0A+ goto = done;=0A register_address_increment(=0A- _regs.edi, = (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);=0A- put_rep_pref= ix(1);=0A+ _regs.edi,=0A+ nr_reps * ((_regs.eflags & = EFLG_DF) ? -dst.bytes : dst.bytes));=0A+ put_rep_prefix(nr_reps);=0A= break;=0A }=0A =0A--- a/xen/arch/x86/x86_emulate/x86_emulate.h= =0A+++ b/xen/arch/x86/x86_emulate/x86_emulate.h=0A@@ -241,6 +241,20 @@ = struct x86_emulate_ops=0A struct x86_emulate_ctxt *ctxt);=0A =0A = /*=0A+ * rep_stos: Emulate STOS: <*p_data> -> <seg:offset>.=0A+ = * @bytes_per_rep: [IN ] Bytes transferred per repetition.=0A+ * = @reps: [IN ] Maximum repetitions to be emulated.=0A+ * [OUT] = Number of repetitions actually emulated.=0A+ */=0A+ int (*rep_stos)(= =0A+ void *p_data,=0A+ enum x86_segment seg,=0A+ = unsigned long offset,=0A+ unsigned int bytes_per_rep,=0A+ = unsigned long *reps,=0A+ struct x86_emulate_ctxt *ctxt);=0A+=0A+ = /*=0A * read_segment: Emulate a read of full context of a segment = register.=0A * @reg: [OUT] Contents of segment register (visible = and hidden state).=0A */=0A--- a/xen/include/xen/lib.h=0A+++ = b/xen/include/xen/lib.h=0A@@ -41,9 +41,11 @@ do { = =0A #ifndef NDEBUG=0A #define ASSERT(p) \=0A do { if ( = unlikely(!(p)) ) assert_failed(#p); } while (0)=0A+#define ASSERT_UNREACHAB= LE() assert_failed("unreachable")=0A #define debug_build() 1=0A #else=0A = #define ASSERT(p) do { if ( 0 && (p) ); } while (0)=0A+#define ASSERT_UNREA= CHABLE() do { } while (0)=0A #define debug_build() 0=0A #endif=0A =0A --=__PartA4914E54.2__= Content-Type: text/plain; 
charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Disposition: inline _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel --=__PartA4914E54.2__=--