* [PATCH 0/5] x86/HVM: XSA-63 follow-ups
@ 2013-09-30 12:51 Jan Beulich
  2013-09-30 12:57 ` [PATCH 1/5] x86/HVM: properly handle backward string instruction emulation Jan Beulich
                   ` (6 more replies)
  0 siblings, 7 replies; 27+ messages in thread
From: Jan Beulich @ 2013-09-30 12:51 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser

1: properly handle backward string instruction emulation
2: fix direct PCI port I/O emulation retry and error handling
3: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept
4: properly deal with hvm_copy_*_guest_phys() errors
5: cache emulated instruction for retry processing

Signed-off-by: Jan Beulich <jbeulich@suse.com>


* [PATCH 1/5] x86/HVM: properly handle backward string instruction emulation
  2013-09-30 12:51 [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
@ 2013-09-30 12:57 ` Jan Beulich
  2013-10-08 16:36   ` Andrew Cooper
  2013-09-30 12:57 ` [PATCH 2/5] x86/HVM: fix direct PCI port I/O emulation retry and error handling Jan Beulich
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2013-09-30 12:57 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 7398 bytes --]

Multiplying a signed 32-bit quantity by an unsigned 32-bit quantity
produces an unsigned 32-bit result, yet for the emulation of backward
string instructions the result needs to be sign-extended before it is
added to the base address.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
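
For illustration only (not part of the patch), here is a minimal,
standalone C sketch of the arithmetic issue, using made-up values in
place of the ioreq_t fields: the int * uint32_t product is evaluated as
unsigned int, so the intended negative offset gets zero- rather than
sign-extended when it is added to the 64-bit base address.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t addr = 0x10000;  /* stand-in for p->addr */
    uint32_t size = 4;        /* stand-in for p->size */
    int sign = -1, i = 1;     /* backward string op, second iteration */

    /* Old form: sign * i * size is computed as unsigned int (0xfffffffc)
     * and then zero-extended, moving the base forward by almost 4GiB. */
    uint64_t bad = addr + sign * i * size;

    /* New form: a pre-scaled signed step stays negative and is sign
     * extended when added to the 64-bit base. */
    int step = sign * (int)size;
    uint64_t good = addr + step * i;

    printf("bad:  %#llx\n", (unsigned long long)bad);  /* 0x10000fffc */
    printf("good: %#llx\n", (unsigned long long)good); /* 0xfffc */
    return 0;
}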

--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -48,7 +48,7 @@ static int hvm_mmio_access(struct vcpu *
                            hvm_mmio_write_t write_handler)
 {
     unsigned long data;
-    int rc = X86EMUL_OKAY, i, sign = p->df ? -1 : 1;
+    int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
 
     if ( !p->data_is_ptr )
     {
@@ -68,13 +68,10 @@ static int hvm_mmio_access(struct vcpu *
         {
             int ret;
 
-            rc = read_handler(v, p->addr + (sign * i * p->size), p->size,
-                              &data);
+            rc = read_handler(v, p->addr + step * i, p->size, &data);
             if ( rc != X86EMUL_OKAY )
                 break;
-            ret = hvm_copy_to_guest_phys(p->data + (sign * i * p->size),
-                                         &data,
-                                         p->size);
+            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
             if ( (ret == HVMCOPY_gfn_paged_out) || 
                  (ret == HVMCOPY_gfn_shared) )
             {
@@ -87,8 +84,7 @@ static int hvm_mmio_access(struct vcpu *
     {
         for ( i = 0; i < p->count; i++ )
         {
-            switch ( hvm_copy_from_guest_phys(&data,
-                                              p->data + sign * i * p->size,
+            switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
                                               p->size) )
             {
             case HVMCOPY_okay:
@@ -109,8 +105,7 @@ static int hvm_mmio_access(struct vcpu *
             }
             if ( rc != X86EMUL_OKAY )
                 break;
-            rc = write_handler(v, p->addr + (sign * i * p->size), p->size,
-                               data);
+            rc = write_handler(v, p->addr + step * i, p->size, data);
             if ( rc != X86EMUL_OKAY )
                 break;
         }
@@ -142,7 +137,7 @@ int hvm_mmio_intercept(ioreq_t *p)
 
 static int process_portio_intercept(portio_action_t action, ioreq_t *p)
 {
-    int rc = X86EMUL_OKAY, i, sign = p->df ? -1 : 1;
+    int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
     uint32_t data;
 
     if ( !p->data_is_ptr )
@@ -167,8 +162,7 @@ static int process_portio_intercept(port
             rc = action(IOREQ_READ, p->addr, p->size, &data);
             if ( rc != X86EMUL_OKAY )
                 break;
-            (void)hvm_copy_to_guest_phys(p->data + sign*i*p->size,
-                                         &data, p->size);
+            (void)hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
         }
     }
     else /* p->dir == IOREQ_WRITE */
@@ -176,8 +170,7 @@ static int process_portio_intercept(port
         for ( i = 0; i < p->count; i++ )
         {
             data = 0;
-            switch ( hvm_copy_from_guest_phys(&data,
-                                              p->data + sign * i * p->size,
+            switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
                                               p->size) )
             {
             case HVMCOPY_okay:
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -294,7 +294,7 @@ void hvm_io_assist(void)
 
 static int dpci_ioport_read(uint32_t mport, ioreq_t *p)
 {
-    int i, sign = p->df ? -1 : 1;
+    int i, step = p->df ? -p->size : p->size;
     uint32_t data = 0;
 
     for ( i = 0; i < p->count; i++ )
@@ -317,8 +317,7 @@ static int dpci_ioport_read(uint32_t mpo
         if ( p->data_is_ptr )
         {
             int ret;
-            ret = hvm_copy_to_guest_phys(p->data + (sign * i * p->size), &data,
-                                         p->size);
+            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
             if ( (ret == HVMCOPY_gfn_paged_out) ||
                  (ret == HVMCOPY_gfn_shared) )
                 return X86EMUL_RETRY;
@@ -332,7 +331,7 @@ static int dpci_ioport_read(uint32_t mpo
 
 static int dpci_ioport_write(uint32_t mport, ioreq_t *p)
 {
-    int i, sign = p->df ? -1 : 1;
+    int i, step = p->df ? -p->size : p->size;
     uint32_t data;
 
     for ( i = 0; i < p->count; i++ )
@@ -340,8 +339,7 @@ static int dpci_ioport_write(uint32_t mp
         data = p->data;
         if ( p->data_is_ptr )
         {
-            switch ( hvm_copy_from_guest_phys(&data,
-                                              p->data + sign * i * p->size,
+            switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
                                               p->size) )
             {
             case HVMCOPY_okay:
--- a/xen/arch/x86/hvm/stdvga.c
+++ b/xen/arch/x86/hvm/stdvga.c
@@ -467,15 +467,17 @@ static uint32_t read_data;
 static int mmio_move(struct hvm_hw_stdvga *s, ioreq_t *p)
 {
     int i;
-    int sign = p->df ? -1 : 1;
+    uint64_t addr = p->addr;
     p2m_type_t p2mt;
     struct domain *d = current->domain;
 
     if ( p->data_is_ptr )
     {
+        uint64_t data = p->data, tmp;
+        int step = p->df ? -p->size : p->size;
+
         if ( p->dir == IOREQ_READ )
         {
-            uint64_t addr = p->addr, data = p->data, tmp;
             for ( i = 0; i < p->count; i++ ) 
             {
                 tmp = stdvga_mem_read(addr, p->size);
@@ -498,13 +500,12 @@ static int mmio_move(struct hvm_hw_stdvg
                     ASSERT(!dp);
                     stdvga_mem_write(data, tmp, p->size);
                 }
-                data += sign * p->size;
-                addr += sign * p->size;
+                data += step;
+                addr += step;
             }
         }
         else
         {
-            uint32_t addr = p->addr, data = p->data, tmp;
             for ( i = 0; i < p->count; i++ )
             {
                 if ( hvm_copy_from_guest_phys(&tmp, data, p->size) !=
@@ -523,31 +524,18 @@ static int mmio_move(struct hvm_hw_stdvg
                     tmp = stdvga_mem_read(data, p->size);
                 }
                 stdvga_mem_write(addr, tmp, p->size);
-                data += sign * p->size;
-                addr += sign * p->size;
+                data += step;
+                addr += step;
             }
         }
     }
     else
     {
+        ASSERT(p->count == 1);
         if ( p->dir == IOREQ_READ )
-        {
-            uint32_t addr = p->addr;
-            for ( i = 0; i < p->count; i++ )
-            {
-                p->data = stdvga_mem_read(addr, p->size);
-                addr += sign * p->size;
-            }
-        }
+            p->data = stdvga_mem_read(addr, p->size);
         else
-        {
-            uint32_t addr = p->addr;
-            for ( i = 0; i < p->count; i++ )
-            {
-                stdvga_mem_write(addr, p->data, p->size);
-                addr += sign * p->size;
-            }
-        }
+            stdvga_mem_write(addr, p->data, p->size);
     }
 
     read_data = p->data;




* [PATCH 2/5] x86/HVM: fix direct PCI port I/O emulation retry and error handling
  2013-09-30 12:51 [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
  2013-09-30 12:57 ` [PATCH 1/5] x86/HVM: properly handle backward string instruction emulation Jan Beulich
@ 2013-09-30 12:57 ` Jan Beulich
  2013-10-08 18:13   ` Andrew Cooper
  2013-09-30 12:58 ` [PATCH 3/5] x86/HVM: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept Jan Beulich
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2013-09-30 12:57 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 7645 bytes --]

dpci_ioport_{read,write}()'s handling of guest memory access failures
should be modelled after process_portio_intercept()'s (and others'):
upon encountering an error on any iteration other than the first, the
count of successfully handled iterations needs to be stored and
X86EMUL_OKAY returned, so that the generic instruction emulator can
update register state correctly before reporting failure or retrying
(both of which would only happen after re-invoking emulation).

Further, we leverage (and slightly extend, due to the above mentioned
need to return X86EMUL_OKAY) the "large MMIO" retry model.

Note that there is still a special case not explicitly taken care of
here: while the first retry on the last iteration of a "rep ins"
correctly recovers the already read data, any subsequent retry is
handled by the pre-existing mmio-large logic (through hvmemul_do_io()
storing the [recovered] data [again], also taking into consideration
that the emulator converts a single-iteration "ins" into ->read_io()
plus ->write()).

Also fix an off-by-one in the mmio-large-read logic, and slightly
simplify the copying of the data.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
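
As an aside (not part of the patch), a minimal sketch of the
partial-completion pattern being adopted, with a hypothetical
per-iteration helper and stand-in return codes in place of the real
X86EMUL_* values:

#define EMUL_OKAY  0                      /* stand-in for X86EMUL_OKAY */
#define EMUL_RETRY 1                      /* stand-in for X86EMUL_RETRY */

struct rep_op { unsigned int count; };    /* stand-in for ioreq_t */

int do_one_iteration(struct rep_op *p, unsigned int i);  /* hypothetical */

int emulate_rep_op(struct rep_op *p)
{
    int rc = EMUL_OKAY;
    unsigned int i;

    for ( i = 0; i < p->count; i++ )
    {
        rc = do_one_iteration(p, i);
        if ( rc != EMUL_OKAY )
            break;
    }

    if ( i != 0 )         /* at least one iteration completed */
    {
        p->count = i;     /* report how many were actually done */
        rc = EMUL_OKAY;   /* let the emulator commit them first; the
                             error or retry then surfaces when the
                             emulation is re-invoked */
    }

    return rc;
}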

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -173,6 +173,13 @@ static int hvmemul_do_io(
         (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
     vio->io_size = size;
 
+    /*
+     * When retrying a repeated string instruction, force exit to guest after
+     * completion of the retried iteration to allow handling of interrupts.
+     */
+    if ( vio->mmio_retrying )
+        *reps = 1;
+
     p->dir = dir;
     p->data_is_ptr = value_is_ptr;
     p->type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO;
@@ -202,8 +209,14 @@ static int hvmemul_do_io(
     case X86EMUL_RETRY:
         *reps = p->count;
         p->state = STATE_IORESP_READY;
-        hvm_io_assist();
-        vio->io_state = HVMIO_none;
+        if ( !vio->mmio_retry )
+        {
+            hvm_io_assist();
+            vio->io_state = HVMIO_none;
+        }
+        else
+            /* Defer hvm_io_assist() invocation to hvm_do_resume(). */
+            vio->io_state = HVMIO_handle_mmio_awaiting_completion;
         break;
     case X86EMUL_UNHANDLEABLE:
         rc = X86EMUL_RETRY;
@@ -249,10 +262,9 @@ static int hvmemul_do_io(
             if ( bytes == 0 )
                 pa = vio->mmio_large_read_pa = addr;
             if ( (addr == (pa + bytes)) &&
-                 ((bytes + size) <
-                  sizeof(vio->mmio_large_read)) )
+                 ((bytes + size) <= sizeof(vio->mmio_large_read)) )
             {
-                memcpy(&vio->mmio_large_read[addr - pa], p_data, size);
+                memcpy(&vio->mmio_large_read[bytes], p_data, size);
                 vio->mmio_large_read_bytes += size;
             }
         }
@@ -1151,9 +1163,13 @@ int hvm_emulate_one(
         ? sizeof(hvmemul_ctxt->insn_buf) : 0;
 
     hvmemul_ctxt->exn_pending = 0;
+    vio->mmio_retrying = vio->mmio_retry;
+    vio->mmio_retry = 0;
 
     rc = x86_emulate(&hvmemul_ctxt->ctxt, &hvm_emulate_ops);
 
+    if ( rc == X86EMUL_OKAY && vio->mmio_retry )
+        rc = X86EMUL_RETRY;
     if ( rc != X86EMUL_RETRY )
         vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
 
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -294,12 +294,21 @@ void hvm_io_assist(void)
 
 static int dpci_ioport_read(uint32_t mport, ioreq_t *p)
 {
-    int i, step = p->df ? -p->size : p->size;
+    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
+    int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
     uint32_t data = 0;
 
     for ( i = 0; i < p->count; i++ )
     {
-        switch ( p->size )
+        if ( vio->mmio_retrying )
+        {
+            if ( vio->mmio_large_read_bytes != p->size )
+                return X86EMUL_UNHANDLEABLE;
+            memcpy(&data, vio->mmio_large_read, p->size);
+            vio->mmio_large_read_bytes = 0;
+            vio->mmio_retrying = 0;
+        }
+        else switch ( p->size )
         {
         case 1:
             data = inb(mport);
@@ -316,22 +325,51 @@ static int dpci_ioport_read(uint32_t mpo
 
         if ( p->data_is_ptr )
         {
-            int ret;
-            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
-            if ( (ret == HVMCOPY_gfn_paged_out) ||
-                 (ret == HVMCOPY_gfn_shared) )
-                return X86EMUL_RETRY;
+            switch ( hvm_copy_to_guest_phys(p->data + step * i,
+                                            &data, p->size) )
+            {
+            case HVMCOPY_okay:
+                break;
+            case HVMCOPY_gfn_paged_out:
+            case HVMCOPY_gfn_shared:
+                rc = X86EMUL_RETRY;
+                break;
+            case HVMCOPY_bad_gfn_to_mfn:
+                /* Drop the write as real hardware would. */
+                continue;
+            case HVMCOPY_bad_gva_to_gfn:
+                ASSERT(0);
+                /* fall through */
+            default:
+                rc = X86EMUL_UNHANDLEABLE;
+                break;
+            }
+            if ( rc != X86EMUL_OKAY)
+                break;
         }
         else
             p->data = data;
     }
     
-    return X86EMUL_OKAY;
+    if ( rc == X86EMUL_RETRY )
+    {
+        vio->mmio_retry = 1;
+        vio->mmio_large_read_bytes = p->size;
+        memcpy(vio->mmio_large_read, &data, p->size);
+    }
+
+    if ( i != 0 )
+    {
+        p->count = i;
+        rc = X86EMUL_OKAY;
+    }
+
+    return rc;
 }
 
 static int dpci_ioport_write(uint32_t mport, ioreq_t *p)
 {
-    int i, step = p->df ? -p->size : p->size;
+    int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
     uint32_t data;
 
     for ( i = 0; i < p->count; i++ )
@@ -346,7 +384,8 @@ static int dpci_ioport_write(uint32_t mp
                 break;
             case HVMCOPY_gfn_paged_out:
             case HVMCOPY_gfn_shared:
-                return X86EMUL_RETRY;
+                rc = X86EMUL_RETRY;
+                break;
             case HVMCOPY_bad_gfn_to_mfn:
                 data = ~0;
                 break;
@@ -354,8 +393,11 @@ static int dpci_ioport_write(uint32_t mp
                 ASSERT(0);
                 /* fall through */
             default:
-                return X86EMUL_UNHANDLEABLE;
+                rc = X86EMUL_UNHANDLEABLE;
+                break;
             }
+            if ( rc != X86EMUL_OKAY)
+                break;
         }
 
         switch ( p->size )
@@ -374,7 +416,16 @@ static int dpci_ioport_write(uint32_t mp
         }
     }
 
-    return X86EMUL_OKAY;
+    if ( rc == X86EMUL_RETRY )
+        current->arch.hvm_vcpu.hvm_io.mmio_retry = 1;
+
+    if ( i != 0 )
+    {
+        p->count = i;
+        rc = X86EMUL_OKAY;
+    }
+
+    return rc;
 }
 
 int dpci_ioport_intercept(ioreq_t *p)
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -66,6 +66,11 @@ struct hvm_vcpu_io {
     /* We may write up to m256 as a number of device-model transactions. */
     unsigned int mmio_large_write_bytes;
     paddr_t mmio_large_write_pa;
+    /*
+     * For string instruction emulation we need to be able to signal a
+     * necessary retry through other than function return codes.
+     */
+    bool_t mmio_retry, mmio_retrying;
 };
 
 #define VMCX_EADDR    (~0ULL)




* [PATCH 3/5] x86/HVM: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept
  2013-09-30 12:51 [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
  2013-09-30 12:57 ` [PATCH 1/5] x86/HVM: properly handle backward string instruction emulation Jan Beulich
  2013-09-30 12:57 ` [PATCH 2/5] x86/HVM: fix direct PCI port I/O emulation retry and error handling Jan Beulich
@ 2013-09-30 12:58 ` Jan Beulich
  2013-10-08 18:20   ` Andrew Cooper
  2013-09-30 12:58 ` [PATCH 4/5] x86/HVM: properly deal with hvm_copy_*_guest_phys() errors Jan Beulich
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2013-09-30 12:58 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 6652 bytes --]

Building upon the extended retry logic, we can now also make sure not
to ignore errors resulting from writing data back to guest memory.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -47,6 +47,7 @@ static int hvm_mmio_access(struct vcpu *
                            hvm_mmio_read_t read_handler,
                            hvm_mmio_write_t write_handler)
 {
+    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
     unsigned long data;
     int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
 
@@ -54,7 +55,16 @@ static int hvm_mmio_access(struct vcpu *
     {
         if ( p->dir == IOREQ_READ )
         {
-            rc = read_handler(v, p->addr, p->size, &data);
+            if ( vio->mmio_retrying )
+            {
+                if ( vio->mmio_large_read_bytes != p->size )
+                    return X86EMUL_UNHANDLEABLE;
+                memcpy(&data, vio->mmio_large_read, p->size);
+                vio->mmio_large_read_bytes = 0;
+                vio->mmio_retrying = 0;
+            }
+            else
+                rc = read_handler(v, p->addr, p->size, &data);
             p->data = data;
         }
         else /* p->dir == IOREQ_WRITE */
@@ -66,18 +76,48 @@ static int hvm_mmio_access(struct vcpu *
     {
         for ( i = 0; i < p->count; i++ )
         {
-            int ret;
-
-            rc = read_handler(v, p->addr + step * i, p->size, &data);
-            if ( rc != X86EMUL_OKAY )
-                break;
-            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
-            if ( (ret == HVMCOPY_gfn_paged_out) || 
-                 (ret == HVMCOPY_gfn_shared) )
+            if ( vio->mmio_retrying )
+            {
+                if ( vio->mmio_large_read_bytes != p->size )
+                    return X86EMUL_UNHANDLEABLE;
+                memcpy(&data, vio->mmio_large_read, p->size);
+                vio->mmio_large_read_bytes = 0;
+                vio->mmio_retrying = 0;
+            }
+            else
             {
+                rc = read_handler(v, p->addr + step * i, p->size, &data);
+                if ( rc != X86EMUL_OKAY )
+                    break;
+            }
+            switch ( hvm_copy_to_guest_phys(p->data + step * i,
+                                            &data, p->size) )
+            {
+            case HVMCOPY_okay:
+                break;
+            case HVMCOPY_gfn_paged_out:
+            case HVMCOPY_gfn_shared:
                 rc = X86EMUL_RETRY;
                 break;
+            case HVMCOPY_bad_gfn_to_mfn:
+                /* Drop the write as real hardware would. */
+                continue;
+            case HVMCOPY_bad_gva_to_gfn:
+                ASSERT(0);
+                /* fall through */
+            default:
+                rc = X86EMUL_UNHANDLEABLE;
+                break;
             }
+            if ( rc != X86EMUL_OKAY)
+                break;
+        }
+
+        if ( rc == X86EMUL_RETRY )
+        {
+            vio->mmio_retry = 1;
+            vio->mmio_large_read_bytes = p->size;
+            memcpy(vio->mmio_large_read, &data, p->size);
         }
     }
     else
@@ -109,6 +149,9 @@ static int hvm_mmio_access(struct vcpu *
             if ( rc != X86EMUL_OKAY )
                 break;
         }
+
+        if ( rc == X86EMUL_RETRY )
+            vio->mmio_retry = 1;
     }
 
     if ( i != 0 )
@@ -137,6 +180,7 @@ int hvm_mmio_intercept(ioreq_t *p)
 
 static int process_portio_intercept(portio_action_t action, ioreq_t *p)
 {
+    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
     int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
     uint32_t data;
 
@@ -144,7 +188,16 @@ static int process_portio_intercept(port
     {
         if ( p->dir == IOREQ_READ )
         {
-            rc = action(IOREQ_READ, p->addr, p->size, &data);
+            if ( vio->mmio_retrying )
+            {
+                if ( vio->mmio_large_read_bytes != p->size )
+                    return X86EMUL_UNHANDLEABLE;
+                memcpy(&data, vio->mmio_large_read, p->size);
+                vio->mmio_large_read_bytes = 0;
+                vio->mmio_retrying = 0;
+            }
+            else
+                rc = action(IOREQ_READ, p->addr, p->size, &data);
             p->data = data;
         }
         else
@@ -159,10 +212,48 @@ static int process_portio_intercept(port
     {
         for ( i = 0; i < p->count; i++ )
         {
-            rc = action(IOREQ_READ, p->addr, p->size, &data);
-            if ( rc != X86EMUL_OKAY )
+            if ( vio->mmio_retrying )
+            {
+                if ( vio->mmio_large_read_bytes != p->size )
+                    return X86EMUL_UNHANDLEABLE;
+                memcpy(&data, vio->mmio_large_read, p->size);
+                vio->mmio_large_read_bytes = 0;
+                vio->mmio_retrying = 0;
+            }
+            else
+            {
+                rc = action(IOREQ_READ, p->addr, p->size, &data);
+                if ( rc != X86EMUL_OKAY )
+                    break;
+            }
+            switch ( hvm_copy_to_guest_phys(p->data + step * i,
+                                            &data, p->size) )
+            {
+            case HVMCOPY_okay:
+                break;
+            case HVMCOPY_gfn_paged_out:
+            case HVMCOPY_gfn_shared:
+                rc = X86EMUL_RETRY;
                 break;
-            (void)hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
+            case HVMCOPY_bad_gfn_to_mfn:
+                /* Drop the write as real hardware would. */
+                continue;
+            case HVMCOPY_bad_gva_to_gfn:
+                ASSERT(0);
+                /* fall through */
+            default:
+                rc = X86EMUL_UNHANDLEABLE;
+                break;
+            }
+            if ( rc != X86EMUL_OKAY)
+                break;
+        }
+
+        if ( rc == X86EMUL_RETRY )
+        {
+            vio->mmio_retry = 1;
+            vio->mmio_large_read_bytes = p->size;
+            memcpy(vio->mmio_large_read, &data, p->size);
         }
     }
     else /* p->dir == IOREQ_WRITE */
@@ -195,6 +286,9 @@ static int process_portio_intercept(port
             if ( rc != X86EMUL_OKAY )
                 break;
         }
+
+        if ( rc == X86EMUL_RETRY )
+            vio->mmio_retry = 1;
     }
 
     if ( i != 0 )




* [PATCH 4/5] x86/HVM: properly deal with hvm_copy_*_guest_phys() errors
  2013-09-30 12:51 [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
                   ` (2 preceding siblings ...)
  2013-09-30 12:58 ` [PATCH 3/5] x86/HVM: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept Jan Beulich
@ 2013-09-30 12:58 ` Jan Beulich
  2013-09-30 13:07   ` Andrew Cooper
  2013-09-30 12:59 ` [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing Jan Beulich
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2013-09-30 12:58 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 2768 bytes --]

In memory read/write handling, the default case should tell the caller
that the operation cannot be handled, rather than that it succeeded, so
that if new HVMCOPY_* states get added, failing to handle them
explicitly will not result in errors being silently ignored.

In the task switch emulation code, stop handling only some errors while
letting others go unnoticed.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
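
As a side note (not part of the patch), a minimal sketch of the
defensive-default idea, using stand-in enumerators and return codes
rather than the real Xen definitions (the real code also handles
HVMCOPY_bad_gva_to_gfn and HVMCOPY_bad_gfn_to_mfn explicitly):

enum copy_result { COPY_okay, COPY_gfn_paged_out, COPY_gfn_shared };

#define EMUL_OKAY          0   /* stand-in for X86EMUL_OKAY */
#define EMUL_RETRY         1   /* stand-in for X86EMUL_RETRY */
#define EMUL_UNHANDLEABLE  2   /* stand-in for X86EMUL_UNHANDLEABLE */

int map_copy_result(enum copy_result rc)
{
    switch ( rc )
    {
    case COPY_okay:
        return EMUL_OKAY;
    case COPY_gfn_paged_out:
    case COPY_gfn_shared:
        return EMUL_RETRY;
    default:
        /* Any enumerator added later but not handled above is refused
         * here, instead of being silently treated as success. */
        return EMUL_UNHANDLEABLE;
    }
}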

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -504,10 +504,10 @@ static int __hvmemul_read(
 
     switch ( rc )
     {
+    case HVMCOPY_okay:
+        break;
     case HVMCOPY_bad_gva_to_gfn:
         return X86EMUL_EXCEPTION;
-    case HVMCOPY_unhandleable:
-        return X86EMUL_UNHANDLEABLE;
     case HVMCOPY_bad_gfn_to_mfn:
         if ( access_type == hvm_access_insn_fetch )
             return X86EMUL_UNHANDLEABLE;
@@ -535,11 +535,10 @@ static int __hvmemul_read(
         }
         return rc;
     case HVMCOPY_gfn_paged_out:
-        return X86EMUL_RETRY;
     case HVMCOPY_gfn_shared:
         return X86EMUL_RETRY;
     default:
-        break;
+        return X86EMUL_UNHANDLEABLE;
     }
 
     return X86EMUL_OKAY;
@@ -634,10 +633,10 @@ static int hvmemul_write(
 
     switch ( rc )
     {
+    case HVMCOPY_okay:
+        break;
     case HVMCOPY_bad_gva_to_gfn:
         return X86EMUL_EXCEPTION;
-    case HVMCOPY_unhandleable:
-        return X86EMUL_UNHANDLEABLE;
     case HVMCOPY_bad_gfn_to_mfn:
         rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
                                     hvmemul_ctxt);
@@ -663,11 +662,10 @@ static int hvmemul_write(
         }
         return rc;
     case HVMCOPY_gfn_paged_out:
-        return X86EMUL_RETRY;
     case HVMCOPY_gfn_shared:
         return X86EMUL_RETRY;
     default:
-        break;
+        return X86EMUL_UNHANDLEABLE;
     }
 
     return X86EMUL_OKAY;
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2345,11 +2345,7 @@ void hvm_task_switch(
 
     rc = hvm_copy_to_guest_virt(
         prev_tr.base, &tss, sizeof(tss), PFEC_page_present);
-    if ( rc == HVMCOPY_bad_gva_to_gfn )
-        goto out;
-    if ( rc == HVMCOPY_gfn_paged_out )
-        goto out;
-    if ( rc == HVMCOPY_gfn_shared )
+    if ( rc != HVMCOPY_okay )
         goto out;
 
     rc = hvm_copy_from_guest_virt(
@@ -2396,9 +2392,7 @@ void hvm_task_switch(
         tr.base, &tss, sizeof(tss), PFEC_page_present);
     if ( rc == HVMCOPY_bad_gva_to_gfn )
         exn_raised = 1;
-    if ( rc == HVMCOPY_gfn_paged_out )
-        goto out;
-    if ( rc == HVMCOPY_gfn_shared )
+    else if ( rc != HVMCOPY_okay )
         goto out;
 
     if ( (tss.trace & 1) && !exn_raised )





* [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2013-09-30 12:51 [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
                   ` (3 preceding siblings ...)
  2013-09-30 12:58 ` [PATCH 4/5] x86/HVM: properly deal with hvm_copy_*_guest_phys() errors Jan Beulich
@ 2013-09-30 12:59 ` Jan Beulich
  2013-10-10 11:35   ` Andrew Cooper
  2013-12-18  8:36   ` Zhang, Yang Z
  2013-10-08 15:10 ` Ping: [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
  2013-10-14  7:29 ` Keir Fraser
  6 siblings, 2 replies; 27+ messages in thread
From: Jan Beulich @ 2013-09-30 12:59 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 4349 bytes --]

Rather than re-reading the instruction bytes upon retry processing,
stash away and re-use what we already read. That way we can be certain
that the retry won't do something different from what originally
requested the retry, getting once again closer to real hardware
behavior (where what we use retries for is simply a bus operation, not
involving redundant decoding of instructions).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
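
As an illustration (not part of the patch), a simplified sketch of the
stash-and-replay idea with hypothetical names: when emulation ends in a
retry, the already-fetched instruction bytes are saved, and the next
emulation attempt re-uses them instead of re-fetching from guest
memory, so the retried emulation decodes exactly the same instruction.

#include <string.h>

#define INSN_BUF_SIZE 16

struct insn_cache {
    unsigned int bytes;               /* 0 => nothing cached */
    unsigned char buf[INSN_BUF_SIZE];
};

/* At the start of emulation: returns the number of cached bytes used,
 * or 0 if the instruction needs to be fetched from guest memory. */
unsigned int insn_cache_load(const struct insn_cache *c, unsigned char *dst)
{
    if ( c->bytes )
        memcpy(dst, c->buf, c->bytes);
    return c->bytes;
}

/* At the end of emulation: keep the bytes only if a retry is pending.
 * The caller guarantees n <= INSN_BUF_SIZE. */
void insn_cache_update(struct insn_cache *c, const unsigned char *src,
                       unsigned int n, int retrying)
{
    if ( retrying )
    {
        c->bytes = n;
        memcpy(c->buf, src, n);
    }
    else
        c->bytes = 0;
}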

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -569,9 +569,19 @@ static int hvmemul_insn_fetch(
 
     /* Fall back if requested bytes are not in the prefetch cache. */
     if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) )
-        return __hvmemul_read(
-            seg, offset, p_data, bytes,
-            hvm_access_insn_fetch, hvmemul_ctxt);
+    {
+        int rc = __hvmemul_read(seg, offset, p_data, bytes,
+                                hvm_access_insn_fetch, hvmemul_ctxt);
+
+        if ( rc == X86EMUL_OKAY )
+        {
+            ASSERT(insn_off + bytes <= sizeof(hvmemul_ctxt->insn_buf));
+            memcpy(&hvmemul_ctxt->insn_buf[insn_off], p_data, bytes);
+            hvmemul_ctxt->insn_buf_bytes = insn_off + bytes;
+        }
+
+        return rc;
+    }
 
     /* Hit the cache. Simple memcpy. */
     memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes);
@@ -1148,17 +1158,27 @@ int hvm_emulate_one(
         pfec |= PFEC_user_mode;
 
     hvmemul_ctxt->insn_buf_eip = regs->eip;
-    hvmemul_ctxt->insn_buf_bytes =
-        hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf)
-        ? :
-        (hvm_virtual_to_linear_addr(
-            x86_seg_cs, &hvmemul_ctxt->seg_reg[x86_seg_cs],
-            regs->eip, sizeof(hvmemul_ctxt->insn_buf),
-            hvm_access_insn_fetch, hvmemul_ctxt->ctxt.addr_size, &addr) &&
-         !hvm_fetch_from_guest_virt_nofault(
-             hvmemul_ctxt->insn_buf, addr,
-             sizeof(hvmemul_ctxt->insn_buf), pfec))
-        ? sizeof(hvmemul_ctxt->insn_buf) : 0;
+    if ( !vio->mmio_insn_bytes )
+    {
+        hvmemul_ctxt->insn_buf_bytes =
+            hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
+            (hvm_virtual_to_linear_addr(x86_seg_cs,
+                                        &hvmemul_ctxt->seg_reg[x86_seg_cs],
+                                        regs->eip,
+                                        sizeof(hvmemul_ctxt->insn_buf),
+                                        hvm_access_insn_fetch,
+                                        hvmemul_ctxt->ctxt.addr_size,
+                                        &addr) &&
+             hvm_fetch_from_guest_virt_nofault(hvmemul_ctxt->insn_buf, addr,
+                                               sizeof(hvmemul_ctxt->insn_buf),
+                                               pfec) == HVMCOPY_okay) ?
+            sizeof(hvmemul_ctxt->insn_buf) : 0;
+    }
+    else
+    {
+        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
+        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
+    }
 
     hvmemul_ctxt->exn_pending = 0;
     vio->mmio_retrying = vio->mmio_retry;
@@ -1169,7 +1189,16 @@ int hvm_emulate_one(
     if ( rc == X86EMUL_OKAY && vio->mmio_retry )
         rc = X86EMUL_RETRY;
     if ( rc != X86EMUL_RETRY )
+    {
         vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
+        vio->mmio_insn_bytes = 0;
+    }
+    else
+    {
+        BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
+        vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
+        memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
+    }
 
     if ( rc != X86EMUL_OKAY )
         return rc;
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -66,6 +66,9 @@ struct hvm_vcpu_io {
     /* We may write up to m256 as a number of device-model transactions. */
     unsigned int mmio_large_write_bytes;
     paddr_t mmio_large_write_pa;
+    /* For retries we shouldn't re-fetch the instruction. */
+    unsigned int mmio_insn_bytes;
+    unsigned char mmio_insn[16];
     /*
      * For string instruction emulation we need to be able to signal a
      * necessary retry through other than function return codes.



[-- Attachment #2: x86-HVM-cache-emulated-insn.patch --]
[-- Type: text/plain, Size: 4405 bytes --]

x86/HVM: cache emulated instruction for retry processing

Rather than re-reading the instruction bytes upon retry processing,
stash away and re-use what we already read. That way we can be
certain that the retry won't do something different from what requested
the retry, getting once again closer to real hardware behavior (where
what we use retries for is simply a bus operation, not involving
redundant decoding of instructions).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -569,9 +569,19 @@ static int hvmemul_insn_fetch(
 
     /* Fall back if requested bytes are not in the prefetch cache. */
     if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) )
-        return __hvmemul_read(
-            seg, offset, p_data, bytes,
-            hvm_access_insn_fetch, hvmemul_ctxt);
+    {
+        int rc = __hvmemul_read(seg, offset, p_data, bytes,
+                                hvm_access_insn_fetch, hvmemul_ctxt);
+
+        if ( rc == X86EMUL_OKAY )
+        {
+            ASSERT(insn_off + bytes <= sizeof(hvmemul_ctxt->insn_buf));
+            memcpy(&hvmemul_ctxt->insn_buf[insn_off], p_data, bytes);
+            hvmemul_ctxt->insn_buf_bytes = insn_off + bytes;
+        }
+
+        return rc;
+    }
 
     /* Hit the cache. Simple memcpy. */
     memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes);
@@ -1148,17 +1158,27 @@ int hvm_emulate_one(
         pfec |= PFEC_user_mode;
 
     hvmemul_ctxt->insn_buf_eip = regs->eip;
-    hvmemul_ctxt->insn_buf_bytes =
-        hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf)
-        ? :
-        (hvm_virtual_to_linear_addr(
-            x86_seg_cs, &hvmemul_ctxt->seg_reg[x86_seg_cs],
-            regs->eip, sizeof(hvmemul_ctxt->insn_buf),
-            hvm_access_insn_fetch, hvmemul_ctxt->ctxt.addr_size, &addr) &&
-         !hvm_fetch_from_guest_virt_nofault(
-             hvmemul_ctxt->insn_buf, addr,
-             sizeof(hvmemul_ctxt->insn_buf), pfec))
-        ? sizeof(hvmemul_ctxt->insn_buf) : 0;
+    if ( !vio->mmio_insn_bytes )
+    {
+        hvmemul_ctxt->insn_buf_bytes =
+            hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
+            (hvm_virtual_to_linear_addr(x86_seg_cs,
+                                        &hvmemul_ctxt->seg_reg[x86_seg_cs],
+                                        regs->eip,
+                                        sizeof(hvmemul_ctxt->insn_buf),
+                                        hvm_access_insn_fetch,
+                                        hvmemul_ctxt->ctxt.addr_size,
+                                        &addr) &&
+             hvm_fetch_from_guest_virt_nofault(hvmemul_ctxt->insn_buf, addr,
+                                               sizeof(hvmemul_ctxt->insn_buf),
+                                               pfec) == HVMCOPY_okay) ?
+            sizeof(hvmemul_ctxt->insn_buf) : 0;
+    }
+    else
+    {
+        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
+        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
+    }
 
     hvmemul_ctxt->exn_pending = 0;
     vio->mmio_retrying = vio->mmio_retry;
@@ -1169,7 +1189,16 @@ int hvm_emulate_one(
     if ( rc == X86EMUL_OKAY && vio->mmio_retry )
         rc = X86EMUL_RETRY;
     if ( rc != X86EMUL_RETRY )
+    {
         vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
+        vio->mmio_insn_bytes = 0;
+    }
+    else
+    {
+        BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
+        vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
+        memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
+    }
 
     if ( rc != X86EMUL_OKAY )
         return rc;
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -66,6 +66,9 @@ struct hvm_vcpu_io {
     /* We may write up to m256 as a number of device-model transactions. */
     unsigned int mmio_large_write_bytes;
     paddr_t mmio_large_write_pa;
+    /* For retries we shouldn't re-fetch the instruction. */
+    unsigned int mmio_insn_bytes;
+    unsigned char mmio_insn[16];
     /*
      * For string instruction emulation we need to be able to signal a
      * necessary retry through other than function return codes.

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] x86/HVM: properly deal with hvm_copy_*_guest_phys() errors
  2013-09-30 12:58 ` [PATCH 4/5] x86/HVM: properly deal with hvm_copy_*_guest_phys() errors Jan Beulich
@ 2013-09-30 13:07   ` Andrew Cooper
  0 siblings, 0 replies; 27+ messages in thread
From: Andrew Cooper @ 2013-09-30 13:07 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser


[-- Attachment #1.1: Type: text/plain, Size: 3089 bytes --]

On 30/09/13 13:58, Jan Beulich wrote:
> In memory read/write handling the default case should tell the caller
> that the operation cannot be handled rather than the operation having
> succeeded, so that when new HVMCOPY_* states get added not handling
> them explicitly will not result in errors being ignored.
>
> In task switch emulation code stop handling some errors, but not
> others.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

>
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -504,10 +504,10 @@ static int __hvmemul_read(
>  
>      switch ( rc )
>      {
> +    case HVMCOPY_okay:
> +        break;
>      case HVMCOPY_bad_gva_to_gfn:
>          return X86EMUL_EXCEPTION;
> -    case HVMCOPY_unhandleable:
> -        return X86EMUL_UNHANDLEABLE;
>      case HVMCOPY_bad_gfn_to_mfn:
>          if ( access_type == hvm_access_insn_fetch )
>              return X86EMUL_UNHANDLEABLE;
> @@ -535,11 +535,10 @@ static int __hvmemul_read(
>          }
>          return rc;
>      case HVMCOPY_gfn_paged_out:
> -        return X86EMUL_RETRY;
>      case HVMCOPY_gfn_shared:
>          return X86EMUL_RETRY;
>      default:
> -        break;
> +        return X86EMUL_UNHANDLEABLE;
>      }
>  
>      return X86EMUL_OKAY;
> @@ -634,10 +633,10 @@ static int hvmemul_write(
>  
>      switch ( rc )
>      {
> +    case HVMCOPY_okay:
> +        break;
>      case HVMCOPY_bad_gva_to_gfn:
>          return X86EMUL_EXCEPTION;
> -    case HVMCOPY_unhandleable:
> -        return X86EMUL_UNHANDLEABLE;
>      case HVMCOPY_bad_gfn_to_mfn:
>          rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
>                                      hvmemul_ctxt);
> @@ -663,11 +662,10 @@ static int hvmemul_write(
>          }
>          return rc;
>      case HVMCOPY_gfn_paged_out:
> -        return X86EMUL_RETRY;
>      case HVMCOPY_gfn_shared:
>          return X86EMUL_RETRY;
>      default:
> -        break;
> +        return X86EMUL_UNHANDLEABLE;
>      }
>  
>      return X86EMUL_OKAY;
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -2345,11 +2345,7 @@ void hvm_task_switch(
>  
>      rc = hvm_copy_to_guest_virt(
>          prev_tr.base, &tss, sizeof(tss), PFEC_page_present);
> -    if ( rc == HVMCOPY_bad_gva_to_gfn )
> -        goto out;
> -    if ( rc == HVMCOPY_gfn_paged_out )
> -        goto out;
> -    if ( rc == HVMCOPY_gfn_shared )
> +    if ( rc != HVMCOPY_okay )
>          goto out;
>  
>      rc = hvm_copy_from_guest_virt(
> @@ -2396,9 +2392,7 @@ void hvm_task_switch(
>          tr.base, &tss, sizeof(tss), PFEC_page_present);
>      if ( rc == HVMCOPY_bad_gva_to_gfn )
>          exn_raised = 1;
> -    if ( rc == HVMCOPY_gfn_paged_out )
> -        goto out;
> -    if ( rc == HVMCOPY_gfn_shared )
> +    else if ( rc != HVMCOPY_okay )
>          goto out;
>  
>      if ( (tss.trace & 1) && !exn_raised )
>
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


[-- Attachment #1.2: Type: text/html, Size: 3862 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Ping: [PATCH 0/5] x86/HVM: XSA-63 follow-ups
  2013-09-30 12:51 [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
                   ` (4 preceding siblings ...)
  2013-09-30 12:59 ` [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing Jan Beulich
@ 2013-10-08 15:10 ` Jan Beulich
  2013-10-14  7:29 ` Keir Fraser
  6 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2013-10-08 15:10 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Andrew Cooper, Tim Deegan, xen-devel

>>> On 30.09.13 at 14:51, "Jan Beulich" <JBeulich@suse.com> wrote:
> 1: properly handle backward string instruction emulation
> 2: fix direct PCI port I/O emulation retry and error handling
> 3: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept
> 4: properly deal with hvm_copy_*_guest_phys() errors
> 5: cache emulated instruction for retry processing
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

The only response so far was a review of patch 4 by Andrew...

Jan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/5] x86/HVM: properly handle backward string instruction emulation
  2013-09-30 12:57 ` [PATCH 1/5] x86/HVM: properly handle backward string instruction emulation Jan Beulich
@ 2013-10-08 16:36   ` Andrew Cooper
  0 siblings, 0 replies; 27+ messages in thread
From: Andrew Cooper @ 2013-10-08 16:36 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser


[-- Attachment #1.1: Type: text/plain, Size: 7828 bytes --]

On 30/09/13 13:57, Jan Beulich wrote:
> Multiplying a signed 32-bit quantity with an unsigned 32-bit quantity
> produces an unsigned 32-bit result, yet for emulation of backward
> string instructions we need the result sign extended before getting
> added to the base address.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

>
> --- a/xen/arch/x86/hvm/intercept.c
> +++ b/xen/arch/x86/hvm/intercept.c
> @@ -48,7 +48,7 @@ static int hvm_mmio_access(struct vcpu *
>                             hvm_mmio_write_t write_handler)
>  {
>      unsigned long data;
> -    int rc = X86EMUL_OKAY, i, sign = p->df ? -1 : 1;
> +    int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
>  
>      if ( !p->data_is_ptr )
>      {
> @@ -68,13 +68,10 @@ static int hvm_mmio_access(struct vcpu *
>          {
>              int ret;
>  
> -            rc = read_handler(v, p->addr + (sign * i * p->size), p->size,
> -                              &data);
> +            rc = read_handler(v, p->addr + step * i, p->size, &data);
>              if ( rc != X86EMUL_OKAY )
>                  break;
> -            ret = hvm_copy_to_guest_phys(p->data + (sign * i * p->size),
> -                                         &data,
> -                                         p->size);
> +            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
>              if ( (ret == HVMCOPY_gfn_paged_out) || 
>                   (ret == HVMCOPY_gfn_shared) )
>              {
> @@ -87,8 +84,7 @@ static int hvm_mmio_access(struct vcpu *
>      {
>          for ( i = 0; i < p->count; i++ )
>          {
> -            switch ( hvm_copy_from_guest_phys(&data,
> -                                              p->data + sign * i * p->size,
> +            switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
>                                                p->size) )
>              {
>              case HVMCOPY_okay:
> @@ -109,8 +105,7 @@ static int hvm_mmio_access(struct vcpu *
>              }
>              if ( rc != X86EMUL_OKAY )
>                  break;
> -            rc = write_handler(v, p->addr + (sign * i * p->size), p->size,
> -                               data);
> +            rc = write_handler(v, p->addr + step * i, p->size, data);
>              if ( rc != X86EMUL_OKAY )
>                  break;
>          }
> @@ -142,7 +137,7 @@ int hvm_mmio_intercept(ioreq_t *p)
>  
>  static int process_portio_intercept(portio_action_t action, ioreq_t *p)
>  {
> -    int rc = X86EMUL_OKAY, i, sign = p->df ? -1 : 1;
> +    int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
>      uint32_t data;
>  
>      if ( !p->data_is_ptr )
> @@ -167,8 +162,7 @@ static int process_portio_intercept(port
>              rc = action(IOREQ_READ, p->addr, p->size, &data);
>              if ( rc != X86EMUL_OKAY )
>                  break;
> -            (void)hvm_copy_to_guest_phys(p->data + sign*i*p->size,
> -                                         &data, p->size);
> +            (void)hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
>          }
>      }
>      else /* p->dir == IOREQ_WRITE */
> @@ -176,8 +170,7 @@ static int process_portio_intercept(port
>          for ( i = 0; i < p->count; i++ )
>          {
>              data = 0;
> -            switch ( hvm_copy_from_guest_phys(&data,
> -                                              p->data + sign * i * p->size,
> +            switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
>                                                p->size) )
>              {
>              case HVMCOPY_okay:
> --- a/xen/arch/x86/hvm/io.c
> +++ b/xen/arch/x86/hvm/io.c
> @@ -294,7 +294,7 @@ void hvm_io_assist(void)
>  
>  static int dpci_ioport_read(uint32_t mport, ioreq_t *p)
>  {
> -    int i, sign = p->df ? -1 : 1;
> +    int i, step = p->df ? -p->size : p->size;
>      uint32_t data = 0;
>  
>      for ( i = 0; i < p->count; i++ )
> @@ -317,8 +317,7 @@ static int dpci_ioport_read(uint32_t mpo
>          if ( p->data_is_ptr )
>          {
>              int ret;
> -            ret = hvm_copy_to_guest_phys(p->data + (sign * i * p->size), &data,
> -                                         p->size);
> +            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
>              if ( (ret == HVMCOPY_gfn_paged_out) ||
>                   (ret == HVMCOPY_gfn_shared) )
>                  return X86EMUL_RETRY;
> @@ -332,7 +331,7 @@ static int dpci_ioport_read(uint32_t mpo
>  
>  static int dpci_ioport_write(uint32_t mport, ioreq_t *p)
>  {
> -    int i, sign = p->df ? -1 : 1;
> +    int i, step = p->df ? -p->size : p->size;
>      uint32_t data;
>  
>      for ( i = 0; i < p->count; i++ )
> @@ -340,8 +339,7 @@ static int dpci_ioport_write(uint32_t mp
>          data = p->data;
>          if ( p->data_is_ptr )
>          {
> -            switch ( hvm_copy_from_guest_phys(&data,
> -                                              p->data + sign * i * p->size,
> +            switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
>                                                p->size) )
>              {
>              case HVMCOPY_okay:
> --- a/xen/arch/x86/hvm/stdvga.c
> +++ b/xen/arch/x86/hvm/stdvga.c
> @@ -467,15 +467,17 @@ static uint32_t read_data;
>  static int mmio_move(struct hvm_hw_stdvga *s, ioreq_t *p)
>  {
>      int i;
> -    int sign = p->df ? -1 : 1;
> +    uint64_t addr = p->addr;
>      p2m_type_t p2mt;
>      struct domain *d = current->domain;
>  
>      if ( p->data_is_ptr )
>      {
> +        uint64_t data = p->data, tmp;
> +        int step = p->df ? -p->size : p->size;
> +
>          if ( p->dir == IOREQ_READ )
>          {
> -            uint64_t addr = p->addr, data = p->data, tmp;
>              for ( i = 0; i < p->count; i++ ) 
>              {
>                  tmp = stdvga_mem_read(addr, p->size);
> @@ -498,13 +500,12 @@ static int mmio_move(struct hvm_hw_stdvg
>                      ASSERT(!dp);
>                      stdvga_mem_write(data, tmp, p->size);
>                  }
> -                data += sign * p->size;
> -                addr += sign * p->size;
> +                data += step;
> +                addr += step;
>              }
>          }
>          else
>          {
> -            uint32_t addr = p->addr, data = p->data, tmp;
>              for ( i = 0; i < p->count; i++ )
>              {
>                  if ( hvm_copy_from_guest_phys(&tmp, data, p->size) !=
> @@ -523,31 +524,18 @@ static int mmio_move(struct hvm_hw_stdvg
>                      tmp = stdvga_mem_read(data, p->size);
>                  }
>                  stdvga_mem_write(addr, tmp, p->size);
> -                data += sign * p->size;
> -                addr += sign * p->size;
> +                data += step;
> +                addr += step;
>              }
>          }
>      }
>      else
>      {
> +        ASSERT(p->count == 1);
>          if ( p->dir == IOREQ_READ )
> -        {
> -            uint32_t addr = p->addr;
> -            for ( i = 0; i < p->count; i++ )
> -            {
> -                p->data = stdvga_mem_read(addr, p->size);
> -                addr += sign * p->size;
> -            }
> -        }
> +            p->data = stdvga_mem_read(addr, p->size);
>          else
> -        {
> -            uint32_t addr = p->addr;
> -            for ( i = 0; i < p->count; i++ )
> -            {
> -                stdvga_mem_write(addr, p->data, p->size);
> -                addr += sign * p->size;
> -            }
> -        }
> +            stdvga_mem_write(addr, p->data, p->size);
>      }
>  
>      read_data = p->data;
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


[-- Attachment #1.2: Type: text/html, Size: 8763 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] x86/HVM: fix direct PCI port I/O emulation retry and error handling
  2013-09-30 12:57 ` [PATCH 2/5] x86/HVM: fix direct PCI port I/O emulation retry and error handling Jan Beulich
@ 2013-10-08 18:13   ` Andrew Cooper
  0 siblings, 0 replies; 27+ messages in thread
From: Andrew Cooper @ 2013-10-08 18:13 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

On 30/09/13 13:57, Jan Beulich wrote:
> dpci_ioport_{read,write}() guest memory access failure handling should
> be modelled after process_portio_intercept()'s (and others): Upon
> encountering an error on other than the first iteration, the count
> successfully handled needs to be stored and X86EMUL_OKAY returned, in
> order for the generic instruction emulator to update register state
> correctly before reporting failure or retrying (both of which would
> only happen after re-invoking emulation).
>
> Further we leverage (and slightly extend, due to the above mentioned
> need to return X86EMUL_OKAY) the "large MMIO" retry model.
>
> Note that there is still a special case not explicitly taken care of
> here: While the first retry on the last iteration of a "rep ins"
> correctly recovers the already read data, an eventual subsequent retry
> is being handled by the pre-existing mmio-large logic (through
> hvmemul_do_io() storing the [recovered] data [again], also taking into
> consideration that the emulator converts a single iteration "ins" to
> ->read_io() plus ->write()).
>
> Also fix an off-by-one in the mmio-large-read logic, and slightly
> simplify the copying of the data.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

One trivial thought comes to mind which you could easily do when
committing the patch...

> @@ -316,22 +325,51 @@ static int dpci_ioport_read(uint32_t mpo
>  
>          if ( p->data_is_ptr )
>          {
> -            int ret;
> -            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
> -            if ( (ret == HVMCOPY_gfn_paged_out) ||
> -                 (ret == HVMCOPY_gfn_shared) )
> -                return X86EMUL_RETRY;
> +            switch ( hvm_copy_to_guest_phys(p->data + step * i,
> +                                            &data, p->size) )
> +            {
> +            case HVMCOPY_okay:
> +                break;
> +            case HVMCOPY_gfn_paged_out:
> +            case HVMCOPY_gfn_shared:
> +                rc = X86EMUL_RETRY;
> +                break;
> +            case HVMCOPY_bad_gfn_to_mfn:
> +                /* Drop the write as real hardware would. */
> +                continue;
> +            case HVMCOPY_bad_gva_to_gfn:
> +                ASSERT(0);
> +                /* fall through */
> +            default:
> +                rc = X86EMUL_UNHANDLEABLE;
> +                break;
> +            }
> +            if ( rc != X86EMUL_OKAY)
> +                break;
>          }
>          else
>              p->data = data;
>      }
>      

Nuke the trailing whitespace on the line above here, which will
fractionally increase the size of the hunk below.

~Andrew

> -    return X86EMUL_OKAY;
> +    if ( rc == X86EMUL_RETRY )
> +    {
> +        vio->mmio_retry = 1;
> +        vio->mmio_large_read_bytes = p->size;
> +        memcpy(vio->mmio_large_read, &data, p->size);
> +    }
> +
> +    if ( i != 0 )
> +    {
> +        p->count = i;
> +        rc = X86EMUL_OKAY;
> +    }
> +
> +    return rc;
>  }
>  

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/5] x86/HVM: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept
  2013-09-30 12:58 ` [PATCH 3/5] x86/HVM: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept Jan Beulich
@ 2013-10-08 18:20   ` Andrew Cooper
  2013-10-09  7:48     ` Jan Beulich
  0 siblings, 1 reply; 27+ messages in thread
From: Andrew Cooper @ 2013-10-08 18:20 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser


[-- Attachment #1.1: Type: text/plain, Size: 7254 bytes --]

On 30/09/13 13:58, Jan Beulich wrote:
> Building upon the extended retry logic we can now also make sure to
> not ignore errors resulting from writing data back to guest memory.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/hvm/intercept.c
> +++ b/xen/arch/x86/hvm/intercept.c
> @@ -47,6 +47,7 @@ static int hvm_mmio_access(struct vcpu *
>                             hvm_mmio_read_t read_handler,
>                             hvm_mmio_write_t write_handler)
>  {
> +    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;

A struct vcpu is passed in to this function and can be used in
preference to current (and is possibly fractionally faster, unless the
compiler can prove that v is always current).
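
A minimal sketch of what I mean (untested and purely illustrative; only
the vio initialisation changes, the rest of the function stays as is):

static int hvm_mmio_access(struct vcpu *v,
                           ioreq_t *p,
                           hvm_mmio_read_t read_handler,
                           hvm_mmio_write_t write_handler)
{
    /* Use the vcpu we were handed instead of going through current. */
    struct hvm_vcpu_io *vio = &v->arch.hvm_vcpu.hvm_io;

    /* ... remainder of the function unchanged ... */
}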

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

>      unsigned long data;
>      int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
>  
> @@ -54,7 +55,16 @@ static int hvm_mmio_access(struct vcpu *
>      {
>          if ( p->dir == IOREQ_READ )
>          {
> -            rc = read_handler(v, p->addr, p->size, &data);
> +            if ( vio->mmio_retrying )
> +            {
> +                if ( vio->mmio_large_read_bytes != p->size )
> +                    return X86EMUL_UNHANDLEABLE;
> +                memcpy(&data, vio->mmio_large_read, p->size);
> +                vio->mmio_large_read_bytes = 0;
> +                vio->mmio_retrying = 0;
> +            }
> +            else
> +                rc = read_handler(v, p->addr, p->size, &data);
>              p->data = data;
>          }
>          else /* p->dir == IOREQ_WRITE */
> @@ -66,18 +76,48 @@ static int hvm_mmio_access(struct vcpu *
>      {
>          for ( i = 0; i < p->count; i++ )
>          {
> -            int ret;
> -
> -            rc = read_handler(v, p->addr + step * i, p->size, &data);
> -            if ( rc != X86EMUL_OKAY )
> -                break;
> -            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
> -            if ( (ret == HVMCOPY_gfn_paged_out) || 
> -                 (ret == HVMCOPY_gfn_shared) )
> +            if ( vio->mmio_retrying )
> +            {
> +                if ( vio->mmio_large_read_bytes != p->size )
> +                    return X86EMUL_UNHANDLEABLE;
> +                memcpy(&data, vio->mmio_large_read, p->size);
> +                vio->mmio_large_read_bytes = 0;
> +                vio->mmio_retrying = 0;
> +            }
> +            else
>              {
> +                rc = read_handler(v, p->addr + step * i, p->size, &data);
> +                if ( rc != X86EMUL_OKAY )
> +                    break;
> +            }
> +            switch ( hvm_copy_to_guest_phys(p->data + step * i,
> +                                            &data, p->size) )
> +            {
> +            case HVMCOPY_okay:
> +                break;
> +            case HVMCOPY_gfn_paged_out:
> +            case HVMCOPY_gfn_shared:
>                  rc = X86EMUL_RETRY;
>                  break;
> +            case HVMCOPY_bad_gfn_to_mfn:
> +                /* Drop the write as real hardware would. */
> +                continue;
> +            case HVMCOPY_bad_gva_to_gfn:
> +                ASSERT(0);
> +                /* fall through */
> +            default:
> +                rc = X86EMUL_UNHANDLEABLE;
> +                break;
>              }
> +            if ( rc != X86EMUL_OKAY)
> +                break;
> +        }
> +
> +        if ( rc == X86EMUL_RETRY )
> +        {
> +            vio->mmio_retry = 1;
> +            vio->mmio_large_read_bytes = p->size;
> +            memcpy(vio->mmio_large_read, &data, p->size);
>          }
>      }
>      else
> @@ -109,6 +149,9 @@ static int hvm_mmio_access(struct vcpu *
>              if ( rc != X86EMUL_OKAY )
>                  break;
>          }
> +
> +        if ( rc == X86EMUL_RETRY )
> +            vio->mmio_retry = 1;
>      }
>  
>      if ( i != 0 )
> @@ -137,6 +180,7 @@ int hvm_mmio_intercept(ioreq_t *p)
>  
>  static int process_portio_intercept(portio_action_t action, ioreq_t *p)
>  {
> +    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
>      int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
>      uint32_t data;
>  
> @@ -144,7 +188,16 @@ static int process_portio_intercept(port
>      {
>          if ( p->dir == IOREQ_READ )
>          {
> -            rc = action(IOREQ_READ, p->addr, p->size, &data);
> +            if ( vio->mmio_retrying )
> +            {
> +                if ( vio->mmio_large_read_bytes != p->size )
> +                    return X86EMUL_UNHANDLEABLE;
> +                memcpy(&data, vio->mmio_large_read, p->size);
> +                vio->mmio_large_read_bytes = 0;
> +                vio->mmio_retrying = 0;
> +            }
> +            else
> +                rc = action(IOREQ_READ, p->addr, p->size, &data);
>              p->data = data;
>          }
>          else
> @@ -159,10 +212,48 @@ static int process_portio_intercept(port
>      {
>          for ( i = 0; i < p->count; i++ )
>          {
> -            rc = action(IOREQ_READ, p->addr, p->size, &data);
> -            if ( rc != X86EMUL_OKAY )
> +            if ( vio->mmio_retrying )
> +            {
> +                if ( vio->mmio_large_read_bytes != p->size )
> +                    return X86EMUL_UNHANDLEABLE;
> +                memcpy(&data, vio->mmio_large_read, p->size);
> +                vio->mmio_large_read_bytes = 0;
> +                vio->mmio_retrying = 0;
> +            }
> +            else
> +            {
> +                rc = action(IOREQ_READ, p->addr, p->size, &data);
> +                if ( rc != X86EMUL_OKAY )
> +                    break;
> +            }
> +            switch ( hvm_copy_to_guest_phys(p->data + step * i,
> +                                            &data, p->size) )
> +            {
> +            case HVMCOPY_okay:
> +                break;
> +            case HVMCOPY_gfn_paged_out:
> +            case HVMCOPY_gfn_shared:
> +                rc = X86EMUL_RETRY;
>                  break;
> -            (void)hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
> +            case HVMCOPY_bad_gfn_to_mfn:
> +                /* Drop the write as real hardware would. */
> +                continue;
> +            case HVMCOPY_bad_gva_to_gfn:
> +                ASSERT(0);
> +                /* fall through */
> +            default:
> +                rc = X86EMUL_UNHANDLEABLE;
> +                break;
> +            }
> +            if ( rc != X86EMUL_OKAY)
> +                break;
> +        }
> +
> +        if ( rc == X86EMUL_RETRY )
> +        {
> +            vio->mmio_retry = 1;
> +            vio->mmio_large_read_bytes = p->size;
> +            memcpy(vio->mmio_large_read, &data, p->size);
>          }
>      }
>      else /* p->dir == IOREQ_WRITE */
> @@ -195,6 +286,9 @@ static int process_portio_intercept(port
>              if ( rc != X86EMUL_OKAY )
>                  break;
>          }
> +
> +        if ( rc == X86EMUL_RETRY )
> +            vio->mmio_retry = 1;
>      }
>  
>      if ( i != 0 )
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


[-- Attachment #1.2: Type: text/html, Size: 8158 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/5] x86/HVM: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept
  2013-10-08 18:20   ` Andrew Cooper
@ 2013-10-09  7:48     ` Jan Beulich
  0 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2013-10-09  7:48 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 08.10.13 at 20:20, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 30/09/13 13:58, Jan Beulich wrote:
>> Building upon the extended retry logic we can now also make sure to
>> not ignore errors resulting from writing data back to guest memory.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/xen/arch/x86/hvm/intercept.c
>> +++ b/xen/arch/x86/hvm/intercept.c
>> @@ -47,6 +47,7 @@ static int hvm_mmio_access(struct vcpu *
>>                             hvm_mmio_read_t read_handler,
>>                             hvm_mmio_write_t write_handler)
>>  {
>> +    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
> 
> struct vcpu is passed in to this function, which can be used in
> preference to current.  (And is possibly fractionally faster unless the
> compiler can prove that v is always current)

Right, I simply overlooked this. Adjusted.

> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks, Jan

>>      unsigned long data;
>>      int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
>>  
>> @@ -54,7 +55,16 @@ static int hvm_mmio_access(struct vcpu *
>>      {
>>          if ( p->dir == IOREQ_READ )
>>          {
>> -            rc = read_handler(v, p->addr, p->size, &data);
>> +            if ( vio->mmio_retrying )
>> +            {
>> +                if ( vio->mmio_large_read_bytes != p->size )
>> +                    return X86EMUL_UNHANDLEABLE;
>> +                memcpy(&data, vio->mmio_large_read, p->size);
>> +                vio->mmio_large_read_bytes = 0;
>> +                vio->mmio_retrying = 0;
>> +            }
>> +            else
>> +                rc = read_handler(v, p->addr, p->size, &data);
>>              p->data = data;
>>          }
>>          else /* p->dir == IOREQ_WRITE */
>> @@ -66,18 +76,48 @@ static int hvm_mmio_access(struct vcpu *
>>      {
>>          for ( i = 0; i < p->count; i++ )
>>          {
>> -            int ret;
>> -
>> -            rc = read_handler(v, p->addr + step * i, p->size, &data);
>> -            if ( rc != X86EMUL_OKAY )
>> -                break;
>> -            ret = hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
>> -            if ( (ret == HVMCOPY_gfn_paged_out) || 
>> -                 (ret == HVMCOPY_gfn_shared) )
>> +            if ( vio->mmio_retrying )
>> +            {
>> +                if ( vio->mmio_large_read_bytes != p->size )
>> +                    return X86EMUL_UNHANDLEABLE;
>> +                memcpy(&data, vio->mmio_large_read, p->size);
>> +                vio->mmio_large_read_bytes = 0;
>> +                vio->mmio_retrying = 0;
>> +            }
>> +            else
>>              {
>> +                rc = read_handler(v, p->addr + step * i, p->size, &data);
>> +                if ( rc != X86EMUL_OKAY )
>> +                    break;
>> +            }
>> +            switch ( hvm_copy_to_guest_phys(p->data + step * i,
>> +                                            &data, p->size) )
>> +            {
>> +            case HVMCOPY_okay:
>> +                break;
>> +            case HVMCOPY_gfn_paged_out:
>> +            case HVMCOPY_gfn_shared:
>>                  rc = X86EMUL_RETRY;
>>                  break;
>> +            case HVMCOPY_bad_gfn_to_mfn:
>> +                /* Drop the write as real hardware would. */
>> +                continue;
>> +            case HVMCOPY_bad_gva_to_gfn:
>> +                ASSERT(0);
>> +                /* fall through */
>> +            default:
>> +                rc = X86EMUL_UNHANDLEABLE;
>> +                break;
>>              }
>> +            if ( rc != X86EMUL_OKAY)
>> +                break;
>> +        }
>> +
>> +        if ( rc == X86EMUL_RETRY )
>> +        {
>> +            vio->mmio_retry = 1;
>> +            vio->mmio_large_read_bytes = p->size;
>> +            memcpy(vio->mmio_large_read, &data, p->size);
>>          }
>>      }
>>      else
>> @@ -109,6 +149,9 @@ static int hvm_mmio_access(struct vcpu *
>>              if ( rc != X86EMUL_OKAY )
>>                  break;
>>          }
>> +
>> +        if ( rc == X86EMUL_RETRY )
>> +            vio->mmio_retry = 1;
>>      }
>>  
>>      if ( i != 0 )
>> @@ -137,6 +180,7 @@ int hvm_mmio_intercept(ioreq_t *p)
>>  
>>  static int process_portio_intercept(portio_action_t action, ioreq_t *p)
>>  {
>> +    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
>>      int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
>>      uint32_t data;
>>  
>> @@ -144,7 +188,16 @@ static int process_portio_intercept(port
>>      {
>>          if ( p->dir == IOREQ_READ )
>>          {
>> -            rc = action(IOREQ_READ, p->addr, p->size, &data);
>> +            if ( vio->mmio_retrying )
>> +            {
>> +                if ( vio->mmio_large_read_bytes != p->size )
>> +                    return X86EMUL_UNHANDLEABLE;
>> +                memcpy(&data, vio->mmio_large_read, p->size);
>> +                vio->mmio_large_read_bytes = 0;
>> +                vio->mmio_retrying = 0;
>> +            }
>> +            else
>> +                rc = action(IOREQ_READ, p->addr, p->size, &data);
>>              p->data = data;
>>          }
>>          else
>> @@ -159,10 +212,48 @@ static int process_portio_intercept(port
>>      {
>>          for ( i = 0; i < p->count; i++ )
>>          {
>> -            rc = action(IOREQ_READ, p->addr, p->size, &data);
>> -            if ( rc != X86EMUL_OKAY )
>> +            if ( vio->mmio_retrying )
>> +            {
>> +                if ( vio->mmio_large_read_bytes != p->size )
>> +                    return X86EMUL_UNHANDLEABLE;
>> +                memcpy(&data, vio->mmio_large_read, p->size);
>> +                vio->mmio_large_read_bytes = 0;
>> +                vio->mmio_retrying = 0;
>> +            }
>> +            else
>> +            {
>> +                rc = action(IOREQ_READ, p->addr, p->size, &data);
>> +                if ( rc != X86EMUL_OKAY )
>> +                    break;
>> +            }
>> +            switch ( hvm_copy_to_guest_phys(p->data + step * i,
>> +                                            &data, p->size) )
>> +            {
>> +            case HVMCOPY_okay:
>> +                break;
>> +            case HVMCOPY_gfn_paged_out:
>> +            case HVMCOPY_gfn_shared:
>> +                rc = X86EMUL_RETRY;
>>                  break;
>> -            (void)hvm_copy_to_guest_phys(p->data + step * i, &data, p->size);
>> +            case HVMCOPY_bad_gfn_to_mfn:
>> +                /* Drop the write as real hardware would. */
>> +                continue;
>> +            case HVMCOPY_bad_gva_to_gfn:
>> +                ASSERT(0);
>> +                /* fall through */
>> +            default:
>> +                rc = X86EMUL_UNHANDLEABLE;
>> +                break;
>> +            }
>> +            if ( rc != X86EMUL_OKAY)
>> +                break;
>> +        }
>> +
>> +        if ( rc == X86EMUL_RETRY )
>> +        {
>> +            vio->mmio_retry = 1;
>> +            vio->mmio_large_read_bytes = p->size;
>> +            memcpy(vio->mmio_large_read, &data, p->size);
>>          }
>>      }
>>      else /* p->dir == IOREQ_WRITE */
>> @@ -195,6 +286,9 @@ static int process_portio_intercept(port
>>              if ( rc != X86EMUL_OKAY )
>>                  break;
>>          }
>> +
>> +        if ( rc == X86EMUL_RETRY )
>> +            vio->mmio_retry = 1;
>>      }
>>  
>>      if ( i != 0 )
>>
>>
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org 
>> http://lists.xen.org/xen-devel 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2013-09-30 12:59 ` [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing Jan Beulich
@ 2013-10-10 11:35   ` Andrew Cooper
  2013-12-18  8:36   ` Zhang, Yang Z
  1 sibling, 0 replies; 27+ messages in thread
From: Andrew Cooper @ 2013-10-10 11:35 UTC (permalink / raw)
  To: xen-devel, Jan Beulich


[-- Attachment #1.1: Type: text/plain, Size: 4850 bytes --]

On 30/09/13 13:59, Jan Beulich wrote:
> Rather than re-reading the instruction bytes upon retry processing,
> stash away and re-use what we already read. That way we can be
> certain that the retry won't do something different from what requested
> the retry, getting once again closer to real hardware behavior (where
> what we use retries for is simply a bus operation, not involving
> redundant decoding of instructions).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

It occurs to me that this caching is also needed in the shadow emulation
path, but that is probably better done in a separate patch/series than by
polluting this HVM series.

>
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -569,9 +569,19 @@ static int hvmemul_insn_fetch(
>  
>      /* Fall back if requested bytes are not in the prefetch cache. */
>      if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) )
> -        return __hvmemul_read(
> -            seg, offset, p_data, bytes,
> -            hvm_access_insn_fetch, hvmemul_ctxt);
> +    {
> +        int rc = __hvmemul_read(seg, offset, p_data, bytes,
> +                                hvm_access_insn_fetch, hvmemul_ctxt);
> +
> +        if ( rc == X86EMUL_OKAY )
> +        {
> +            ASSERT(insn_off + bytes <= sizeof(hvmemul_ctxt->insn_buf));
> +            memcpy(&hvmemul_ctxt->insn_buf[insn_off], p_data, bytes);
> +            hvmemul_ctxt->insn_buf_bytes = insn_off + bytes;
> +        }
> +
> +        return rc;
> +    }
>  
>      /* Hit the cache. Simple memcpy. */
>      memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes);
> @@ -1148,17 +1158,27 @@ int hvm_emulate_one(
>          pfec |= PFEC_user_mode;
>  
>      hvmemul_ctxt->insn_buf_eip = regs->eip;
> -    hvmemul_ctxt->insn_buf_bytes =
> -        hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf)
> -        ? :
> -        (hvm_virtual_to_linear_addr(
> -            x86_seg_cs, &hvmemul_ctxt->seg_reg[x86_seg_cs],
> -            regs->eip, sizeof(hvmemul_ctxt->insn_buf),
> -            hvm_access_insn_fetch, hvmemul_ctxt->ctxt.addr_size, &addr) &&
> -         !hvm_fetch_from_guest_virt_nofault(
> -             hvmemul_ctxt->insn_buf, addr,
> -             sizeof(hvmemul_ctxt->insn_buf), pfec))
> -        ? sizeof(hvmemul_ctxt->insn_buf) : 0;
> +    if ( !vio->mmio_insn_bytes )
> +    {
> +        hvmemul_ctxt->insn_buf_bytes =
> +            hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
> +            (hvm_virtual_to_linear_addr(x86_seg_cs,
> +                                        &hvmemul_ctxt->seg_reg[x86_seg_cs],
> +                                        regs->eip,
> +                                        sizeof(hvmemul_ctxt->insn_buf),
> +                                        hvm_access_insn_fetch,
> +                                        hvmemul_ctxt->ctxt.addr_size,
> +                                        &addr) &&
> +             hvm_fetch_from_guest_virt_nofault(hvmemul_ctxt->insn_buf, addr,
> +                                               sizeof(hvmemul_ctxt->insn_buf),
> +                                               pfec) == HVMCOPY_okay) ?
> +            sizeof(hvmemul_ctxt->insn_buf) : 0;
> +    }
> +    else
> +    {
> +        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
> +        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
> +    }
>  
>      hvmemul_ctxt->exn_pending = 0;
>      vio->mmio_retrying = vio->mmio_retry;
> @@ -1169,7 +1189,16 @@ int hvm_emulate_one(
>      if ( rc == X86EMUL_OKAY && vio->mmio_retry )
>          rc = X86EMUL_RETRY;
>      if ( rc != X86EMUL_RETRY )
> +    {
>          vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
> +        vio->mmio_insn_bytes = 0;
> +    }
> +    else
> +    {
> +        BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
> +        vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
> +        memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
> +    }
>  
>      if ( rc != X86EMUL_OKAY )
>          return rc;
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -66,6 +66,9 @@ struct hvm_vcpu_io {
>      /* We may write up to m256 as a number of device-model transactions. */
>      unsigned int mmio_large_write_bytes;
>      paddr_t mmio_large_write_pa;
> +    /* For retries we shouldn't re-fetch the instruction. */
> +    unsigned int mmio_insn_bytes;
> +    unsigned char mmio_insn[16];
>      /*
>       * For string instruction emulation we need to be able to signal a
>       * necessary retry through other than function return codes.
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


[-- Attachment #1.2: Type: text/html, Size: 5787 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/5] x86/HVM: XSA-63 follow-ups
  2013-09-30 12:51 [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
                   ` (5 preceding siblings ...)
  2013-10-08 15:10 ` Ping: [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
@ 2013-10-14  7:29 ` Keir Fraser
  6 siblings, 0 replies; 27+ messages in thread
From: Keir Fraser @ 2013-10-14  7:29 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 30/09/2013 13:51, "Jan Beulich" <JBeulich@suse.com> wrote:

> 1: properly handle backward string instruction emulation
> 2: fix direct PCI port I/O emulation retry and error handling
> 3: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept
> 4: properly deal with hvm_copy_*_guest_phys() errors
> 5: cache emulated instruction for retry processing
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Keir Fraser <keir@xen.org>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2013-09-30 12:59 ` [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing Jan Beulich
  2013-10-10 11:35   ` Andrew Cooper
@ 2013-12-18  8:36   ` Zhang, Yang Z
  2013-12-18  8:48     ` Jan Beulich
  1 sibling, 1 reply; 27+ messages in thread
From: Zhang, Yang Z @ 2013-12-18  8:36 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Keir Fraser

Jan Beulich wrote on 2013-09-30:
> Rather than re-reading the instruction bytes upon retry processing, 
> stash away and re-use what we already read. That way we can be
> certain that the retry won't do something different from what 
> requested the retry, getting once again closer to real hardware 
> behavior (where what we use retries for is simply a bus operation, not involving redundant decoding of instructions).
> 

This patch doesn't consider the nested case.
For example, if the buffer has saved an L2 instruction and a vmexit to L1 then occurs, L1 may end up using the wrong instruction.

There are two ways to fix it, but I am not sure which one is better:

One is to record the instruction EIP and check whether it is the same when reading from the buffer:

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index f39c173..738eaca 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1158,7 +1158,8 @@ int hvm_emulate_one(
         pfec |= PFEC_user_mode;
 
     hvmemul_ctxt->insn_buf_eip = regs->eip;
-    if ( !vio->mmio_insn_bytes )
+    if ( !vio->mmio_insn_bytes || 
+         (vio->mmio_insn_eip != hvmemul_ctxt->insn_buf_eip) )
     {
         hvmemul_ctxt->insn_buf_bytes =
             hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
@@ -1192,12 +1193,14 @@ int hvm_emulate_one(
     {
         vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
         vio->mmio_insn_bytes = 0;
+        vio->mmio_insn_eip = 0;
     }
     else
     {
         BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
         vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
         memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
+        vio->mmio_insn_eip = hvmemul_ctxt->insn_buf_eip;
     }
 
     if ( rc != X86EMUL_OKAY )
diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index 258e5d0..01c5dc4 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -69,6 +69,7 @@ struct hvm_vcpu_io {
     /* For retries we shouldn't re-fetch the instruction. */
     unsigned int mmio_insn_bytes;
     unsigned char mmio_insn[16];
+    unsigned long mmio_insn_eip;
     /*
      * For string instruction emulation we need to be able to signal a
      * necessary retry through other than function return codes.


Another is to clear the buffer when a virtual vmentry or virtual vmexit happens:

diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index 8617a7b..822b92f 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1136,12 +1136,14 @@ static void virtual_vmentry(struct cpu_user_regs *regs)
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
     void *vvmcs = nvcpu->nv_vvmcx;
     unsigned long lm_l1, lm_l2;
+    struct hvm_vcpu_io *vio = &v->arch.hvm_vcpu.hvm_io;
 
     vmx_vmcs_switch(v->arch.hvm_vmx.vmcs, nvcpu->nv_n2vmcx);
 
     nestedhvm_vcpu_enter_guestmode(v);
     nvcpu->nv_vmentry_pending = 0;
     nvcpu->nv_vmswitch_in_progress = 1;
+    vio->mmio_insn_bytes = 0;
 
     /*
      * EFER handling:
@@ -1331,6 +1333,7 @@ static void virtual_vmexit(struct cpu_user_regs *regs)
     struct vcpu *v = current;
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
     unsigned long lm_l1, lm_l2;
+    struct hvm_vcpu_io *vio = &v->arch.hvm_vcpu.hvm_io;
 
     sync_vvmcs_ro(v);
     sync_vvmcs_guest_state(v, regs);
@@ -1346,6 +1349,8 @@ static void virtual_vmexit(struct cpu_user_regs *regs)
     nvcpu->nv_vmexit_pending = 0;
     nvcpu->nv_vmswitch_in_progress = 1;
 
+    vio->mmio_insn_bytes = 0;
+
     lm_l2 = !!hvm_long_mode_enabled(v);
     lm_l1 = !!(__get_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_CONTROLS) &
                            VM_EXIT_IA32E_MODE);

> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -569,9 +569,19 @@ static int hvmemul_insn_fetch(
> 
>      /* Fall back if requested bytes are not in the prefetch cache. */
>      if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) )
> -        return __hvmemul_read(
> -            seg, offset, p_data, bytes,
> -            hvm_access_insn_fetch, hvmemul_ctxt);
> +    {
> +        int rc = __hvmemul_read(seg, offset, p_data, bytes,
> +                                hvm_access_insn_fetch, hvmemul_ctxt);
> +
> +        if ( rc == X86EMUL_OKAY )
> +        {
> +            ASSERT(insn_off + bytes <= sizeof(hvmemul_ctxt->insn_buf));
> +            memcpy(&hvmemul_ctxt->insn_buf[insn_off], p_data, bytes);
> +            hvmemul_ctxt->insn_buf_bytes = insn_off + bytes;
> +        }
> +
> +        return rc;
> +    }
>  
>      /* Hit the cache. Simple memcpy. */
>      memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes);
> @@ -1148,17 +1158,27 @@ int hvm_emulate_one(
>          pfec |= PFEC_user_mode;
>  
>      hvmemul_ctxt->insn_buf_eip = regs->eip;
> -    hvmemul_ctxt->insn_buf_bytes =
> -        hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf)
> -        ? :
> -        (hvm_virtual_to_linear_addr(
> -            x86_seg_cs, &hvmemul_ctxt->seg_reg[x86_seg_cs],
> -            regs->eip, sizeof(hvmemul_ctxt->insn_buf),
> -            hvm_access_insn_fetch, hvmemul_ctxt->ctxt.addr_size, &addr) &&
> -         !hvm_fetch_from_guest_virt_nofault(
> -             hvmemul_ctxt->insn_buf, addr,
> -             sizeof(hvmemul_ctxt->insn_buf), pfec))
> -        ? sizeof(hvmemul_ctxt->insn_buf) : 0;
> +    if ( !vio->mmio_insn_bytes )
> +    {
> +        hvmemul_ctxt->insn_buf_bytes =
> +            hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
> +            (hvm_virtual_to_linear_addr(x86_seg_cs,
> +                                        &hvmemul_ctxt->seg_reg[x86_seg_cs],
> +                                        regs->eip,
> +                                        sizeof(hvmemul_ctxt->insn_buf),
> +                                        hvm_access_insn_fetch,
> +                                        hvmemul_ctxt->ctxt.addr_size,
> +                                        &addr) &&
> +             hvm_fetch_from_guest_virt_nofault(hvmemul_ctxt->insn_buf, addr,
> +                                               sizeof(hvmemul_ctxt->insn_buf),
> +                                               pfec) == HVMCOPY_okay) ?
> +            sizeof(hvmemul_ctxt->insn_buf) : 0;
> +    }
> +    else
> +    {
> +        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
> +        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
> +    }
>  
>      hvmemul_ctxt->exn_pending = 0;
>      vio->mmio_retrying = vio->mmio_retry;
> @@ -1169,7 +1189,16 @@ int hvm_emulate_one(
>      if ( rc == X86EMUL_OKAY && vio->mmio_retry )
>          rc = X86EMUL_RETRY;
>      if ( rc != X86EMUL_RETRY )
> +    {
>          vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
> +        vio->mmio_insn_bytes = 0;
> +    }
> +    else
> +    {
> +        BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
> +        vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
> +        memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
> +    }
>  
>      if ( rc != X86EMUL_OKAY )
>          return rc;
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -66,6 +66,9 @@ struct hvm_vcpu_io {
>      /* We may write up to m256 as a number of device-model transactions. */
>      unsigned int mmio_large_write_bytes;
>      paddr_t mmio_large_write_pa;
> +    /* For retries we shouldn't re-fetch the instruction. */
> +    unsigned int mmio_insn_bytes;
> +    unsigned char mmio_insn[16];
>      /*
>       * For string instruction emulation we need to be able to signal a
>       * necessary retry through other than function return codes.


Best regards,
Yang

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2013-12-18  8:36   ` Zhang, Yang Z
@ 2013-12-18  8:48     ` Jan Beulich
  2013-12-18  9:40       ` Zhang, Yang Z
  0 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2013-12-18  8:48 UTC (permalink / raw)
  To: Yang Z Zhang; +Cc: Andrew Cooper, Keir Fraser, xen-devel

>>> On 18.12.13 at 09:36, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
> Jan Beulich wrote on 2013-09-30:
>> Rather than re-reading the instruction bytes upon retry processing, 
>> stash away and re-use what we already read. That way we can be 
>> certain that the retry won't do something different from what 
>> requested the retry, getting once again closer to real hardware 
>> behavior (where what we use retries for is simply a bus operation, not 
> involving redundant decoding of instructions).
>> 
> 
> This patch doesn't consider the nested case. 
> For example, if the buffer saved the L2's instruction, then vmexit to L1 and 
> L1 may use the wrong instruction.

I'm having difficulty seeing how the two could get intermixed: There
should be, at any given point in time, at most one instruction being
emulated. Can you please give a more elaborate explanation of the
situation where you see a (theoretical? practical?) problem?

> There are two ways to fix it, but I am not sure one is better:
> 
> one is record instruction eip and check whether the eip is same when reading 
> from buffer:
>...
> Another one is to clear buffer when virtual vmentry and virtual vmexit 
> happens:

The former is unsuitable (what if both L1's and L2's instructions
happen to be at the same address?).

Jan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2013-12-18  8:48     ` Jan Beulich
@ 2013-12-18  9:40       ` Zhang, Yang Z
  2013-12-18 10:53         ` Jan Beulich
  0 siblings, 1 reply; 27+ messages in thread
From: Zhang, Yang Z @ 2013-12-18  9:40 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Keir Fraser, xen-devel

Jan Beulich wrote on 2013-12-18:
>>>> On 18.12.13 at 09:36, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>> Jan Beulich wrote on 2013-09-30:
>>> Rather than re-reading the instruction bytes upon retry processing,
>>> stash away and re-use what we already read. That way we can be
>>> certain that the retry won't do something different from what
>>> requested the retry, getting once again closer to real hardware
>>> behavior (where what we use retries for is simply a bus operation, not
>>> involving redundant decoding of instructions).
>>> 
>> 
>> This patch doesn't consider the nested case.
>> For example, if the buffer saved the L2's instruction, then vmexit
>> to
>> L1 and
>> L1 may use the wrong instruction.
> 
> I'm having difficulty seeing how the two could get intermixed: There
> should be, at any given point in time, at most one instruction being
> emulated. Can you please give a more elaborate explanation of the
> situation where you see a (theoretical? practical?) problem?

I saw this issue when booting L1 Hyper-V. I added some debug info and saw this strange phenomenon:

(XEN) write to buffer: eip 0xfffff8800430bc80, size 16, content:f7420c1f608488b 44000000011442c7
(XEN) read from buffer: eip 0xfffff800002f6138, size 16, content:f7420c1f608488b 44000000011442c7

From the log, we can see different EIPs but the same buffer being used. Since I don't know how Hyper-V works, I cannot give more information on why this happens. And I only saw it with L1 Hyper-V (Xen on Xen and KVM on Xen don't have this issue).
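
Purely as a hypothetical sketch (the actual debug patch is not part of
this thread), output of this shape could come from printk()s at the two
points in hvm_emulate_one() where the cached bytes are consumed and
stashed:

/* Hypothetical instrumentation, not the debug patch used for the log
 * above.  Where the stashed bytes get re-used: */
gdprintk(XENLOG_DEBUG, "read from buffer: eip 0x%lx, size %u\n",
         (unsigned long)regs->eip, vio->mmio_insn_bytes);

/* ... and where they get stashed for the retry: */
gdprintk(XENLOG_DEBUG, "write to buffer: eip 0x%lx, size %u\n",
         (unsigned long)regs->eip, vio->mmio_insn_bytes);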

> 
>> There are two ways to fix it, but I am not sure one is better:
>> 
>> one is record instruction eip and check whether the eip is same when
>> reading  from buffer:
>> ...
>> Another one is to clear buffer when virtual vmentry and virtual
>> vmexit
>> happens:
> 
> The former is unsuitable (what if both L1's and L2's instructions
> happen to be on the same address?
> 
> Jan


Best regards,
Yang

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2013-12-18  9:40       ` Zhang, Yang Z
@ 2013-12-18 10:53         ` Jan Beulich
  2013-12-24 11:29           ` Zhang, Yang Z
  0 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2013-12-18 10:53 UTC (permalink / raw)
  To: Yang Z Zhang; +Cc: Andrew Cooper, Keir Fraser, xen-devel

>>> On 18.12.13 at 10:40, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
> Jan Beulich wrote on 2013-12-18:
>>>>> On 18.12.13 at 09:36, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>>> Jan Beulich wrote on 2013-09-30:
>>>> Rather than re-reading the instruction bytes upon retry processing,
>>>> stash away and re-use tha what we already read. That way we can be
>>>> certain that the retry won't do something different from what
>>>> requested the retry, getting once again closer to real hardware
>>>> behavior (where what we use retries for is simply a bus operation, not
>>>> involving redundant decoding of instructions).
>>>> 
>>> 
>>> This patch doesn't consider the nested case.
>>> For example, if the buffer saved the L2's instruction, then vmexit
>>> to
>>> L1 and
>>> L1 may use the wrong instruction.
>> 
>> I'm having difficulty seeing how the two could get intermixed: There
>> should be, at any given point in time, at most one instruction being
>> emulated. Can you please give a more elaborate explanation of the
>> situation where you see a (theoretical? practical?) problem?
> 
> I saw this issue when booting L1 hyper-v. I added some debug info and saw the 
> strange phenomenon: 
> 
> (XEN) write to buffer: eip 0xfffff8800430bc80, size 16, 
> content:f7420c1f608488b 44000000011442c7
> (XEN) read from buffer: eip 0xfffff800002f6138, size 16, 
> content:f7420c1f608488b 44000000011442c7
> 
> From the log, we can see different eip but using the same buffer. Since I 
> don't know how hyper-v working, so I cannot give more information why this 
> happens. And I only saw it with L1 hyper-v (Xen on Xen and KVM on Xen don't 
> have this issue) .

But in order to validate that the fix is (a) needed and (b) correct, a
proper understanding of the issue is necessary as the very first step.
That doesn't require knowing the internals of Hyper-V; all you need is
to track the emulator state changes in Xen (which I realize is easier
said than done, since you want to make sure you don't generate huge
amounts of output before actually hitting the issue, which would make
it close to impossible to analyze).

Jan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2013-12-18 10:53         ` Jan Beulich
@ 2013-12-24 11:29           ` Zhang, Yang Z
  2014-01-07  8:28             ` Jan Beulich
  0 siblings, 1 reply; 27+ messages in thread
From: Zhang, Yang Z @ 2013-12-24 11:29 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Keir Fraser, xen-devel

Jan Beulich wrote on 2013-12-18:
>>>> On 18.12.13 at 10:40, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>> Jan Beulich wrote on 2013-12-18:
>>>>>> On 18.12.13 at 09:36, "Zhang, Yang Z" <yang.z.zhang@intel.com>
> wrote:
>>>> Jan Beulich wrote on 2013-09-30:
>>>>> Rather than re-reading the instruction bytes upon retry
>>>>> processing, stash away and re-use tha what we already read. That
>>>>> way we can be certain that the retry won't do something different
>>>>> from what requested the retry, getting once again closer to real
>>>>> hardware behavior (where what we use retries for is simply a bus
>>>>> operation, not involving redundant decoding of instructions).
>>>>> 
>>>> 
>>>> This patch doesn't consider the nested case.
>>>> For example, if the buffer saved the L2's instruction, then vmexit
>>>> to
>>>> L1 and
>>>> L1 may use the wrong instruction.
>>> 
>>> I'm having difficulty seeing how the two could get intermixed:
>>> There should be, at any given point in time, at most one
>>> instruction being emulated. Can you please give a more elaborate
>>> explanation of the situation where you see a (theoretical? practical?) problem?
>> 
>> I saw this issue when booting L1 hyper-v. I added some debug info
>> and saw the strange phenomenon:
>> 
>> (XEN) write to buffer: eip 0xfffff8800430bc80, size 16,
>> content:f7420c1f608488b 44000000011442c7
>> (XEN) read from buffer: eip 0xfffff800002f6138, size 16,
>> content:f7420c1f608488b 44000000011442c7
>> 
>> From the log, we can see different eip but using the same buffer.
>> Since I don't know how hyper-v working, so I cannot give more
>> information why this happens. And I only saw it with L1 hyper-v (Xen
>> on Xen and KVM on Xen don't have this issue) .
> 
> But in order to validate the fix is (a) needed and (b) correct, a
> proper understanding of the issue is necessary as the very first step.
> Which doesn't require knowing internals of Hyper-V, all you need is
> tracking of emulator state changes in Xen (which I realize is said
> easier than done, since you want to make sure you don't generate huge
> amounts of output before actually hitting the issue, making it close to impossible to analyze).

Ok, it should be an old issue that is merely exposed by your patch:
Sometimes L0 needs to decode an L2 instruction to handle an IO access directly, for example if L1 passes a device through (without VT-d) to L2. L0 may get X86EMUL_RETRY when handling this IO request. If at the same time there is a virtual vmexit pending (for example, an interrupt to be injected into L1), the hypervisor will switch the VCPU context from L2 to L1. We are then already in L1's context, but because we just got X86EMUL_RETRY the hypervisor will retry handling the IO request later, and unfortunately that retry happens in L1's context.
Without your patch, L0 simply emulates an L1 instruction and everything goes on. With your patch, L0 fetches the instruction from the buffer, which belongs to L2, and the problem arises.

So the fix is: if there is a pending IO request, no virtual vmexit/vmentry is allowed, which follows hardware's behavior.

diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index 41db52b..c5446a9 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1394,7 +1394,10 @@ void nvmx_switch_guest(void)
     struct vcpu *v = current;
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
     struct cpu_user_regs *regs = guest_cpu_user_regs();
+    ioreq_t *p = get_ioreq(v);
 
+    if ( p->state != STATE_IOREQ_NONE )
+        return;
     /*
      * a softirq may interrupt us between a virtual vmentry is
      * just handled and the true vmentry. If during this window,
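
(Purely as a sketch of the same idea, and not a tested patch: whether
get_ioreq() can return NULL on this path is an assumption on my side,
but a slightly more defensive form of the check would be:)

    void nvmx_switch_guest(void)
    {
        struct vcpu *v = current;
        ioreq_t *p = get_ioreq(v);

        /* An in-flight emulated I/O must complete in the guest context
         * that issued it; don't emulate a virtual vmentry/vmexit until
         * the request has left the ioreq state machine. */
        if ( p != NULL && p->state != STATE_IOREQ_NONE )
            return;

        /* ... remainder of the function unchanged ... */
    }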

> 
> Jan


Best regards,
Yang

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2013-12-24 11:29           ` Zhang, Yang Z
@ 2014-01-07  8:28             ` Jan Beulich
  2014-01-07  8:54               ` Zhang, Yang Z
  0 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2014-01-07  8:28 UTC (permalink / raw)
  To: Yang Z Zhang; +Cc: Andrew Cooper, Keir Fraser, xen-devel

>>> On 24.12.13 at 12:29, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
> Jan Beulich wrote on 2013-12-18:
>>>>> On 18.12.13 at 10:40, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>>> Jan Beulich wrote on 2013-12-18:
>>>>>>> On 18.12.13 at 09:36, "Zhang, Yang Z" <yang.z.zhang@intel.com>
>> wrote:
>>>>> Jan Beulich wrote on 2013-09-30:
>>>>>> Rather than re-reading the instruction bytes upon retry
>>>>>> processing, stash away and re-use tha what we already read. That
>>>>>> way we can be certain that the retry won't do something different
>>>>>> from what requested the retry, getting once again closer to real
>>>>>> hardware behavior (where what we use retries for is simply a bus
>>>>>> operation, not involving redundant decoding of instructions).
>>>>>> 
>>>>> 
>>>>> This patch doesn't consider the nested case.
>>>>> For example, if the buffer saved the L2's instruction, then vmexit
>>>>> to
>>>>> L1 and
>>>>> L1 may use the wrong instruction.
>>>> 
>>>> I'm having difficulty seeing how the two could get intermixed:
>>>> There should be, at any given point in time, at most one
>>>> instruction being emulated. Can you please give a more elaborate
>>>> explanation of the situation where you see a (theoretical? practical?) 
> problem?
>>> 
>>> I saw this issue when booting L1 hyper-v. I added some debug info
>>> and saw the strange phenomenon:
>>> 
>>> (XEN) write to buffer: eip 0xfffff8800430bc80, size 16,
>>> content:f7420c1f608488b 44000000011442c7
>>> (XEN) read from buffer: eip 0xfffff800002f6138, size 16,
>>> content:f7420c1f608488b 44000000011442c7
>>> 
>>> From the log, we can see different eip but using the same buffer.
>>> Since I don't know how hyper-v working, so I cannot give more
>>> information why this happens. And I only saw it with L1 hyper-v (Xen
>>> on Xen and KVM on Xen don't have this issue) .
>> 
>> But in order to validate the fix is (a) needed and (b) correct, a
>> proper understanding of the issue is necessary as the very first step.
>> Which doesn't require knowing internals of Hyper-V, all you need is
>> tracking of emulator state changes in Xen (which I realize is said
>> easier than done, since you want to make sure you don't generate huge
>> amounts of output before actually hitting the issue, making it close to 
> impossible to analyze).
> 
> Ok. It should be an old issue and just exposed by your patch only:
> Sometimes, L0 need to decode the L2's instruction to handle IO access 
> directly. For example, if L1 pass through (not VT-d) a device to L2 directly. 
> And L0 may get X86EMUL_RETRY when handling this IO request. At same time, if 
> there is a virtual vmexit pending (for example, an interrupt pending to 
> inject to L1) and hypervisor will switch the VCPU context from L2 to L1. Now 
> we already in L1's context, but since we got a X86EMUL_RETRY just now and 
> this means hyprevisor will retry to handle the IO request later and 
> unfortunately, the retry will happen in L1's context.
> Without your patch, L0 just emulate an L1's instruction and everything goes 
> on. With your patch, L0 will get the instruction from the buffer which is 
> belonging to L2, and problem arises.
> 
> So the fixing is that if there is a pending IO request, no virtual 
> vmexit/vmentry is allowed which following hardware's behavior.
> 
> diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
> index 41db52b..c5446a9 100644
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -1394,7 +1394,10 @@ void nvmx_switch_guest(void)
>      struct vcpu *v = current;
>      struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
>      struct cpu_user_regs *regs = guest_cpu_user_regs();
> +    ioreq_t *p = get_ioreq(v);
>  
> +    if ( p->state != STATE_IOREQ_NONE )
> +        return;
>      /*
>       * a softirq may interrupt us between a virtual vmentry is
>       * just handled and the true vmentry. If during this window,

This change looks much more sensible; the question is whether simply
ignoring the switch request is acceptable (and I don't know the
nested HVM code well enough to tell). Furthermore, I wonder
whether that's really a VMX-only issue.

Another alternative might be to flush the cached instruction when
a guest mode switch is being done. Again, I don't know the
nested code well enough to tell what the most suitable behavior
here would be.

Jan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2014-01-07  8:28             ` Jan Beulich
@ 2014-01-07  8:54               ` Zhang, Yang Z
  2014-01-07  9:43                 ` Egger, Christoph
  0 siblings, 1 reply; 27+ messages in thread
From: Zhang, Yang Z @ 2014-01-07  8:54 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Keir Fraser, Andrew Cooper, Egger, Christoph (chegger@amazon.de),
	Dong, Eddie, Nakajima, Jun, xen-devel

Jan Beulich wrote on 2014-01-07:
>>>> On 24.12.13 at 12:29, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>> Jan Beulich wrote on 2013-12-18:
>>>>>> On 18.12.13 at 10:40, "Zhang, Yang Z" <yang.z.zhang@intel.com>
> wrote:
>>>> Jan Beulich wrote on 2013-12-18:
>>>>>>>> On 18.12.13 at 09:36, "Zhang, Yang Z" <yang.z.zhang@intel.com>
>>> wrote:
>>>>>> Jan Beulich wrote on 2013-09-30:
>>>>>>> Rather than re-reading the instruction bytes upon retry
>>>>>>> processing, stash away and re-use tha what we already read.
>>>>>>> That way we can be certain that the retry won't do something
>>>>>>> different from what requested the retry, getting once again
>>>>>>> closer to real hardware behavior (where what we use retries for
>>>>>>> is simply a bus operation, not involving redundant decoding of instructions).
>>>>>>> 
>>>>>> 
>>>>>> This patch doesn't consider the nested case.
>>>>>> For example, if the buffer saved the L2's instruction, then
>>>>>> vmexit to
>>>>>> L1 and
>>>>>> L1 may use the wrong instruction.
>>>>> 
>>>>> I'm having difficulty seeing how the two could get intermixed:
>>>>> There should be, at any given point in time, at most one
>>>>> instruction being emulated. Can you please give a more elaborate
>>>>> explanation of the situation where you see a (theoretical?
>>>>> practical?)
>> problem?
>>>> 
>>>> I saw this issue when booting L1 hyper-v. I added some debug info
>>>> and saw the strange phenomenon:
>>>> 
>>>> (XEN) write to buffer: eip 0xfffff8800430bc80, size 16,
>>>> content:f7420c1f608488b 44000000011442c7
>>>> (XEN) read from buffer: eip 0xfffff800002f6138, size 16,
>>>> content:f7420c1f608488b 44000000011442c7
>>>> 
>>>> From the log, we can see different eip but using the same buffer.
>>>> Since I don't know how hyper-v working, so I cannot give more
>>>> information why this happens. And I only saw it with L1 hyper-v
>>>> (Xen on Xen and KVM on Xen don't have this issue) .
>>> 
>>> But in order to validate the fix is (a) needed and (b) correct, a
>>> proper understanding of the issue is necessary as the very first step.
>>> Which doesn't require knowing internals of Hyper-V, all you need is
>>> tracking of emulator state changes in Xen (which I realize is said
>>> easier than done, since you want to make sure you don't generate
>>> huge amounts of output before actually hitting the issue, making it
>>> close to
>> impossible to analyze).
>> 
>> Ok. It should be an old issue and just exposed by your patch only:
>> Sometimes, L0 need to decode the L2's instruction to handle IO
>> access directly. For example, if L1 pass through (not VT-d) a device to L2 directly.
>> And L0 may get X86EMUL_RETRY when handling this IO request. At same
>> time, if there is a virtual vmexit pending (for example, an
>> interrupt pending to inject to L1) and hypervisor will switch the
>> VCPU context from L2 to L1. Now we already in L1's context, but
>> since we got a X86EMUL_RETRY just now and this means hyprevisor will
>> retry to handle the IO request later and unfortunately, the retry will happen in L1's context.
>> Without your patch, L0 just emulate an L1's instruction and
>> everything goes on. With your patch, L0 will get the instruction
>> from the buffer which is belonging to L2, and problem arises.
>> 
>> So the fixing is that if there is a pending IO request, no virtual
>> vmexit/vmentry is allowed which following hardware's behavior.
>> 
>> diff --git a/xen/arch/x86/hvm/vmx/vvmx.c
>> b/xen/arch/x86/hvm/vmx/vvmx.c index 41db52b..c5446a9 100644
>> --- a/xen/arch/x86/hvm/vmx/vvmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
>> @@ -1394,7 +1394,10 @@ void nvmx_switch_guest(void)
>>      struct vcpu *v = current;
>>      struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
>>      struct cpu_user_regs *regs = guest_cpu_user_regs();
>> +    ioreq_t *p = get_ioreq(v);
>> 
>> +    if ( p->state != STATE_IOREQ_NONE )
>> +        return;
>>      /*
>>       * a softirq may interrupt us between a virtual vmentry is
>>       * just handled and the true vmentry. If during this window,
> 
> This change looks much more sensible; question is whether simply
> ignoring the switch request is acceptable (and I don't know the nested
> HVM code well enough to tell). Furthermore I wonder whether that's really a VMX-only issue.

From hardware's point of view, an IO operation is handled atomically, so the problem can never happen there. But in Xen, an IO operation is divided into several steps. Without nested virtualization, the VCPU is paused or loops until the IO emulation is finished. In a nested environment, however, the VCPU will continue running even if the IO emulation is not finished. So my patch checks this and allows the VCPU to continue running only if there is no pending IO request. This matches hardware's behavior.

I guess SVM also has this problem, but since I don't know nested SVM well, perhaps Christoph can help to double-check.

> 
> Another alternative might be to flush the cached instruction when a
> guest mode switch is being done. Again, I don't know the nested code
> well enough to tell what the most suitable behavior here would be.
> 

Your patch to use the cached instruction merely exposed the issue; its essence is the wrong vmswitch during the IO emulation. So even if we flushed the cached instruction during the vmswitch, the problem would still exist, just perhaps harder to trigger.
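
(To make the comparison concrete, the flush-on-switch alternative would
amount to something like the sketch below. The helper name and the
field names are my guesses for where patch 5/5 stashes the bytes, not
the actual ones; the point is only that dropping the cache still leaves
the pending retry running in the wrong context:)

    /* Hypothetical sketch of the "flush on guest mode switch" idea. */
    static void nhvm_flush_cached_insn(struct vcpu *v)
    {
        struct hvm_vcpu_io *vio = &v->arch.hvm_vcpu.hvm_io;

        /* Drop the instruction bytes cached by the emulator ... */
        vio->mmio_insn_bytes = 0;
        /* ... but the not-yet-completed ioreq is still retried, now in
         * L1's context rather than L2's. */
    }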

> Jan


Best regards,
Yang

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2014-01-07  8:54               ` Zhang, Yang Z
@ 2014-01-07  9:43                 ` Egger, Christoph
  2014-01-08  5:50                   ` Zhang, Yang Z
  0 siblings, 1 reply; 27+ messages in thread
From: Egger, Christoph @ 2014-01-07  9:43 UTC (permalink / raw)
  To: Zhang, Yang Z, Jan Beulich
  Cc: Andrew Cooper, Keir Fraser, Dong, Eddie, Nakajima, Jun, xen-devel

On 07.01.14 09:54, Zhang, Yang Z wrote:
> Jan Beulich wrote on 2014-01-07:
>>>>> On 24.12.13 at 12:29, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>>> Jan Beulich wrote on 2013-12-18:
>>>>>>> On 18.12.13 at 10:40, "Zhang, Yang Z" <yang.z.zhang@intel.com>
>> wrote:
>>>>> Jan Beulich wrote on 2013-12-18:
>>>>>>>>> On 18.12.13 at 09:36, "Zhang, Yang Z" <yang.z.zhang@intel.com>
>>>> wrote:
>>>>>>> Jan Beulich wrote on 2013-09-30:
>>>>>>>> Rather than re-reading the instruction bytes upon retry
>>>>>>>> processing, stash away and re-use tha what we already read.
>>>>>>>> That way we can be certain that the retry won't do something
>>>>>>>> different from what requested the retry, getting once again
>>>>>>>> closer to real hardware behavior (where what we use retries for
>>>>>>>> is simply a bus operation, not involving redundant decoding of instructions).
>>>>>>>>
>>>>>>>
>>>>>>> This patch doesn't consider the nested case.
>>>>>>> For example, if the buffer saved the L2's instruction, then
>>>>>>> vmexit to
>>>>>>> L1 and
>>>>>>> L1 may use the wrong instruction.
>>>>>>
>>>>>> I'm having difficulty seeing how the two could get intermixed:
>>>>>> There should be, at any given point in time, at most one
>>>>>> instruction being emulated. Can you please give a more elaborate
>>>>>> explanation of the situation where you see a (theoretical?
>>>>>> practical?)
>>> problem?
>>>>>
>>>>> I saw this issue when booting L1 hyper-v. I added some debug info
>>>>> and saw the strange phenomenon:
>>>>>
>>>>> (XEN) write to buffer: eip 0xfffff8800430bc80, size 16,
>>>>> content:f7420c1f608488b 44000000011442c7
>>>>> (XEN) read from buffer: eip 0xfffff800002f6138, size 16,
>>>>> content:f7420c1f608488b 44000000011442c7
>>>>>
>>>>> From the log, we can see different eip but using the same buffer.
>>>>> Since I don't know how hyper-v working, so I cannot give more
>>>>> information why this happens. And I only saw it with L1 hyper-v
>>>>> (Xen on Xen and KVM on Xen don't have this issue) .
>>>>
>>>> But in order to validate the fix is (a) needed and (b) correct, a
>>>> proper understanding of the issue is necessary as the very first step.
>>>> Which doesn't require knowing internals of Hyper-V, all you need is
>>>> tracking of emulator state changes in Xen (which I realize is said
>>>> easier than done, since you want to make sure you don't generate
>>>> huge amounts of output before actually hitting the issue, making it
>>>> close to
>>> impossible to analyze).
>>>
>>> Ok. It should be an old issue and just exposed by your patch only:
>>> Sometimes, L0 need to decode the L2's instruction to handle IO
>>> access directly. For example, if L1 pass through (not VT-d) a device to L2 directly.
>>> And L0 may get X86EMUL_RETRY when handling this IO request. At same
>>> time, if there is a virtual vmexit pending (for example, an
>>> interrupt pending to inject to L1) and hypervisor will switch the
>>> VCPU context from L2 to L1. Now we already in L1's context, but
>>> since we got a X86EMUL_RETRY just now and this means hyprevisor will
>>> retry to handle the IO request later and unfortunately, the retry will happen in L1's context.
>>> Without your patch, L0 just emulate an L1's instruction and
>>> everything goes on. With your patch, L0 will get the instruction
>>> from the buffer which is belonging to L2, and problem arises.
>>>
>>> So the fixing is that if there is a pending IO request, no virtual
>>> vmexit/vmentry is allowed which following hardware's behavior.
>>>
>>> diff --git a/xen/arch/x86/hvm/vmx/vvmx.c
>>> b/xen/arch/x86/hvm/vmx/vvmx.c index 41db52b..c5446a9 100644
>>> --- a/xen/arch/x86/hvm/vmx/vvmx.c
>>> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
>>> @@ -1394,7 +1394,10 @@ void nvmx_switch_guest(void)
>>>      struct vcpu *v = current;
>>>      struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
>>>      struct cpu_user_regs *regs = guest_cpu_user_regs();
>>> +    ioreq_t *p = get_ioreq(v);
>>>
>>> +    if ( p->state != STATE_IOREQ_NONE )
>>> +        return;
>>>      /*
>>>       * a softirq may interrupt us between a virtual vmentry is
>>>       * just handled and the true vmentry. If during this window,
>>
>> This change looks much more sensible; question is whether simply
>> ignoring the switch request is acceptable (and I don't know the nested
>> HVM code well enough to tell). Furthermore I wonder whether that's really a VMX-only issue.
> 
> From hardware's point, an IO operation is handled atomically. So the
> problem will never happen. But in Xen, an IO operation is divided into
> several steps. Without nested virtualization, the VCPU is paused or
> looped until the IO emulation is finished. But in nested environment,
> the VCPU will continue running even the IO emulation is not finished.
> So my patch will check this and allow the VCPU to continue running only
> there is no pending IO request. This is matching hardware's behavior.
> 
> I guess SVM also has this problem. But since I don't know nested SVM well,
> so perhaps Christoph can help to have a double check.

For SVM this issue was fixed with commit
d740d811925385c09553cbe6dee8e77c1d43b198

And there is a follow-up cleanup commit
ac97fa6a21ccd395cca43890bbd0bf32e3255ebb

The change in nestedsvm.c in commit d740d811925385c09553cbe6dee8e77c1d43b198
is actually not SVM specific.

Move that into nhvm_interrupt_blocked()
in hvm.c right before

    return hvm_funcs.nhvm_intr_blocked(v);

and the fix applies for both SVM and VMX.
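
Roughly along these lines (only a sketch, reusing the ioreq check from
the VMX patch above; hvm_intblk_shadow is just a stand-in for whatever
blocking reason is deemed most appropriate):

    enum hvm_intblk nhvm_interrupt_blocked(struct vcpu *v)
    {
        ioreq_t *p = get_ioreq(v);

        /* Don't deliver a virtual vmexit while an emulated I/O request
         * is still in flight. */
        if ( p != NULL && p->state != STATE_IOREQ_NONE )
            return hvm_intblk_shadow;  /* stand-in blocking reason */

        return hvm_funcs.nhvm_intr_blocked(v);
    }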

Christoph

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2014-01-07  9:43                 ` Egger, Christoph
@ 2014-01-08  5:50                   ` Zhang, Yang Z
  2014-01-09 12:19                     ` Egger, Christoph
  0 siblings, 1 reply; 27+ messages in thread
From: Zhang, Yang Z @ 2014-01-08  5:50 UTC (permalink / raw)
  To: Egger, Christoph, Jan Beulich
  Cc: Andrew Cooper, Keir Fraser, Dong, Eddie, Nakajima, Jun, xen-devel

Egger, Christoph wrote on 2014-01-07:
> On 07.01.14 09:54, Zhang, Yang Z wrote:
>> Jan Beulich wrote on 2014-01-07:
>>>>>> On 24.12.13 at 12:29, "Zhang, Yang Z" <yang.z.zhang@intel.com>
> wrote:
>>>> Jan Beulich wrote on 2013-12-18:
>>>>>>>> On 18.12.13 at 10:40, "Zhang, Yang Z" <yang.z.zhang@intel.com>
>>> wrote:
>>>>>> Jan Beulich wrote on 2013-12-18:
>>>>>>>>>> On 18.12.13 at 09:36, "Zhang, Yang Z"
>>>>>>>>>> <yang.z.zhang@intel.com>
>>>>> wrote:
>>>>>>>> Jan Beulich wrote on 2013-09-30:
>>>>>>>>> Rather than re-reading the instruction bytes upon retry
>>>>>>>>> processing, stash away and re-use tha what we already read. That
>>>>>>>>> way we can be certain that the retry won't do something
>>>>>>>>> different from what requested the retry, getting once again
>>>>>>>>> closer to real hardware behavior (where what we use retries for
>>>>>>>>> is simply a bus operation, not involving redundant decoding of
>>>>>>>>> instructions).
>>>>>>>>> 
>>>>>>>> 
>>>>>>>> This patch doesn't consider the nested case.
>>>>>>>> For example, if the buffer saved the L2's instruction, then
>>>>>>>> vmexit to
>>>>>>>> L1 and
>>>>>>>> L1 may use the wrong instruction.
>>>>>>> 
>>>>>>> I'm having difficulty seeing how the two could get intermixed:
>>>>>>> There should be, at any given point in time, at most one
>>>>>>> instruction being emulated. Can you please give a more
>>>>>>> elaborate explanation of the situation where you see a (theoretical?
>>>>>>> practical?)
>>>> problem?
>>>>>> 
>>>>>> I saw this issue when booting L1 hyper-v. I added some debug
>>>>>> info and saw the strange phenomenon:
>>>>>> 
>>>>>> (XEN) write to buffer: eip 0xfffff8800430bc80, size 16,
>>>>>> content:f7420c1f608488b 44000000011442c7
>>>>>> (XEN) read from buffer: eip 0xfffff800002f6138, size 16,
>>>>>> content:f7420c1f608488b 44000000011442c7
>>>>>> 
>>>>>> From the log, we can see different eip but using the same buffer.
>>>>>> Since I don't know how hyper-v working, so I cannot give more
>>>>>> information why this happens. And I only saw it with L1 hyper-v
>>>>>> (Xen on Xen and KVM on Xen don't have this issue) .
>>>>> 
>>>>> But in order to validate the fix is (a) needed and (b) correct, a
>>>>> proper understanding of the issue is necessary as the very first step.
>>>>> Which doesn't require knowing internals of Hyper-V, all you need
>>>>> is tracking of emulator state changes in Xen (which I realize is
>>>>> said easier than done, since you want to make sure you don't
>>>>> generate huge amounts of output before actually hitting the
>>>>> issue, making it close to
>>>> impossible to analyze).
>>>> 
>>>> Ok. It should be an old issue and just exposed by your patch only:
>>>> Sometimes, L0 need to decode the L2's instruction to handle IO access
>>>> directly. For example, if L1 pass through (not VT-d) a device to L2
>>>> directly. And L0 may get X86EMUL_RETRY when handling this IO request.
>>>> At same time, if there is a virtual vmexit pending (for example, an
>>>> interrupt pending to inject to L1) and hypervisor will switch the
>>>> VCPU context from L2 to L1. Now we already in L1's context, but since
>>>> we got a X86EMUL_RETRY just now and this means hyprevisor will retry
>>>> to handle the IO request later and unfortunately, the retry will
>>>> happen in L1's context. Without your patch, L0 just emulate an L1's
>>>> instruction and everything goes on. With your patch, L0 will get the
>>>> instruction from the buffer which is belonging to L2, and problem
>>>> arises.
>>>> 
>>>> So the fixing is that if there is a pending IO request, no virtual
>>>> vmexit/vmentry is allowed which following hardware's behavior.
>>>> 
>>>> diff --git a/xen/arch/x86/hvm/vmx/vvmx.c
>>>> b/xen/arch/x86/hvm/vmx/vvmx.c index 41db52b..c5446a9 100644
>>>> --- a/xen/arch/x86/hvm/vmx/vvmx.c
>>>> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
>>>> @@ -1394,7 +1394,10 @@ void nvmx_switch_guest(void)
>>>>      struct vcpu *v = current;
>>>>      struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
>>>>      struct cpu_user_regs *regs = guest_cpu_user_regs();
>>>> +    ioreq_t *p = get_ioreq(v);
>>>> 
>>>> +    if ( p->state != STATE_IOREQ_NONE )
>>>> +        return;
>>>>      /*
>>>>       * a softirq may interrupt us between a virtual vmentry is
>>>>       * just handled and the true vmentry. If during this window,
>>> 
>>> This change looks much more sensible; question is whether simply
>>> ignoring the switch request is acceptable (and I don't know the
>>> nested HVM code well enough to tell). Furthermore I wonder whether
>>> that's
> really a VMX-only issue.
>> 
>> From hardware's point, an IO operation is handled atomically. So the
>> problem will never happen. But in Xen, an IO operation is divided into
>> several steps. Without nested virtualization, the VCPU is paused or
>> looped until the IO emulation is finished. But in nested environment,
>> the VCPU will continue running even the IO emulation is not finished.
>> So my patch will check this and allow the VCPU to continue running only
>> there is no pending IO request. This is matching hardware's behavior.
>> 
>> I guess SVM also has this problem. But since I don't know nested SVM
>> well, so perhaps Christoph can help to have a double check.
> 
> For SVM this issue was fixed with commit
> d740d811925385c09553cbe6dee8e77c1d43b198
> 
> And there is a followup cleanup commit
> ac97fa6a21ccd395cca43890bbd0bf32e3255ebb
> 
> The change in nestedsvm.c in commit
> d740d811925385c09553cbe6dee8e77c1d43b198 is actually not SVM specific.
> 
> Move that into nhvm_interrupt_blocked() in hvm.c right before
> 
>     return hvm_funcs.nhvm_intr_blocked(v);
> and the fix applies for both SVM and VMX.
> 

I guess this is not enough. L2 may vmexit to L1 during IO emulation not only due to an interrupt. I cannot give an example right now, but Hyper-V cannot boot with your suggestion, so I guess considering only external interrupts is not enough. We should prohibit the vmswitch if there is a pending IO emulation from either L1 or L2 (this may never happen for L1, but to match hardware's behavior we should add the check).

Best regards,
Yang

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2014-01-08  5:50                   ` Zhang, Yang Z
@ 2014-01-09 12:19                     ` Egger, Christoph
  2014-01-16  4:42                       ` Zhang, Yang Z
  0 siblings, 1 reply; 27+ messages in thread
From: Egger, Christoph @ 2014-01-09 12:19 UTC (permalink / raw)
  To: Zhang, Yang Z, Jan Beulich
  Cc: Andrew Cooper, Keir Fraser, Dong, Eddie, Nakajima, Jun, xen-devel

On 08.01.14 06:50, Zhang, Yang Z wrote:
> Egger, Christoph wrote on 2014-01-07:
>> On 07.01.14 09:54, Zhang, Yang Z wrote:
>>> Jan Beulich wrote on 2014-01-07:
>>>>>>> On 24.12.13 at 12:29, "Zhang, Yang Z" <yang.z.zhang@intel.com>
>> wrote:
>>>>> Jan Beulich wrote on 2013-12-18:
>>>>>>>>> On 18.12.13 at 10:40, "Zhang, Yang Z" <yang.z.zhang@intel.com>
>>>> wrote:
>>>>>>> Jan Beulich wrote on 2013-12-18:
>>>>>>>>>>> On 18.12.13 at 09:36, "Zhang, Yang Z"
>>>>>>>>>>> <yang.z.zhang@intel.com>
>>>>>> wrote:
>>>>>>>>> Jan Beulich wrote on 2013-09-30:
>>>>>>>>>> Rather than re-reading the instruction bytes upon retry
>>>>>>>>>> processing, stash away and re-use tha what we already read. That
>>>>>>>>>> way we can be certain that the retry won't do something
>>>>>>>>>> different from what requested the retry, getting once again
>>>>>>>>>> closer to real hardware behavior (where what we use retries for
>>>>>>>>>> is simply a bus operation, not involving redundant decoding of
>>>>>>>>>> instructions).
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> This patch doesn't consider the nested case.
>>>>>>>>> For example, if the buffer saved the L2's instruction, then
>>>>>>>>> vmexit to
>>>>>>>>> L1 and
>>>>>>>>> L1 may use the wrong instruction.
>>>>>>>>
>>>>>>>> I'm having difficulty seeing how the two could get intermixed:
>>>>>>>> There should be, at any given point in time, at most one
>>>>>>>> instruction being emulated. Can you please give a more
>>>>>>>> elaborate explanation of the situation where you see a (theoretical?
>>>>>>>> practical?)
>>>>> problem?
>>>>>>>
>>>>>>> I saw this issue when booting L1 hyper-v. I added some debug
>>>>>>> info and saw the strange phenomenon:
>>>>>>>
>>>>>>> (XEN) write to buffer: eip 0xfffff8800430bc80, size 16,
>>>>>>> content:f7420c1f608488b 44000000011442c7
>>>>>>> (XEN) read from buffer: eip 0xfffff800002f6138, size 16,
>>>>>>> content:f7420c1f608488b 44000000011442c7
>>>>>>>
>>>>>>> From the log, we can see different eip but using the same buffer.
>>>>>>> Since I don't know how hyper-v working, so I cannot give more
>>>>>>> information why this happens. And I only saw it with L1 hyper-v
>>>>>>> (Xen on Xen and KVM on Xen don't have this issue) .
>>>>>>
>>>>>> But in order to validate the fix is (a) needed and (b) correct, a
>>>>>> proper understanding of the issue is necessary as the very first step.
>>>>>> Which doesn't require knowing internals of Hyper-V, all you need
>>>>>> is tracking of emulator state changes in Xen (which I realize is
>>>>>> said easier than done, since you want to make sure you don't
>>>>>> generate huge amounts of output before actually hitting the
>>>>>> issue, making it close to
>>>>> impossible to analyze).
>>>>>
>>>>> Ok. It should be an old issue and just exposed by your patch only:
>>>>> Sometimes, L0 need to decode the L2's instruction to handle IO access
>>>>> directly. For example, if L1 pass through (not VT-d) a device to L2
>>>>> directly. And L0 may get X86EMUL_RETRY when handling this IO request.
>>>>> At same time, if there is a virtual vmexit pending (for example, an
>>>>> interrupt pending to inject to L1) and hypervisor will switch the
>>>>> VCPU context from L2 to L1. Now we already in L1's context, but since
>>>>> we got a X86EMUL_RETRY just now and this means hyprevisor will retry
>>>>> to handle the IO request later and unfortunately, the retry will
>>>>> happen in L1's context. Without your patch, L0 just emulate an L1's
>>>>> instruction and everything goes on. With your patch, L0 will get the
>>>>> instruction from the buffer which is belonging to L2, and problem
>>>>> arises.
>>>>>
>>>>> So the fixing is that if there is a pending IO request, no virtual
>>>>> vmexit/vmentry is allowed which following hardware's behavior.
>>>>>
>>>>> diff --git a/xen/arch/x86/hvm/vmx/vvmx.c
>>>>> b/xen/arch/x86/hvm/vmx/vvmx.c index 41db52b..c5446a9 100644
>>>>> --- a/xen/arch/x86/hvm/vmx/vvmx.c
>>>>> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
>>>>> @@ -1394,7 +1394,10 @@ void nvmx_switch_guest(void)
>>>>>      struct vcpu *v = current;
>>>>>      struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
>>>>>      struct cpu_user_regs *regs = guest_cpu_user_regs();
>>>>> +    ioreq_t *p = get_ioreq(v);
>>>>>
>>>>> +    if ( p->state != STATE_IOREQ_NONE )
>>>>> +        return;
>>>>>      /*
>>>>>       * a softirq may interrupt us between a virtual vmentry is
>>>>>       * just handled and the true vmentry. If during this window,
>>>>
>>>> This change looks much more sensible; question is whether simply
>>>> ignoring the switch request is acceptable (and I don't know the
>>>> nested HVM code well enough to tell). Furthermore I wonder whether
>>>> that's
>> really a VMX-only issue.
>>>
>>> From hardware's point, an IO operation is handled atomically. So the
>>> problem will never happen. But in Xen, an IO operation is divided into
>>> several steps. Without nested virtualization, the VCPU is paused or
>>> looped until the IO emulation is finished. But in nested environment,
>>> the VCPU will continue running even the IO emulation is not finished.
>>> So my patch will check this and allow the VCPU to continue running only
>>> there is no pending IO request. This is matching hardware's behavior.
>>>
>>> I guess SVM also has this problem. But since I don't know nested SVM
>>> well, so perhaps Christoph can help to have a double check.
>>
>> For SVM this issue was fixed with commit
>> d740d811925385c09553cbe6dee8e77c1d43b198
>>
>> And there is a followup cleanup commit
>> ac97fa6a21ccd395cca43890bbd0bf32e3255ebb
>>
>> The change in nestedsvm.c in commit
>> d740d811925385c09553cbe6dee8e77c1d43b198 is actually not SVM specific.
>>
>> Move that into nhvm_interrupt_blocked() in hvm.c right before
>>
>>     return hvm_funcs.nhvm_intr_blocked(v);
>> and the fix applies for both SVM and VMX.
>>
> 
> I guess this is no enough. L2 may vmexit to L1 during IO emulation
> not only due to interrupt. Now I cannot give an example, but the
> hyper-v cannot boot up with your suggestion. So I guess only consider
> external interrupt is not enough. We should prohibit vmswith if there
> is a pending IO emulation both from L1 or L2(This may never happen to L1,
> but from hardware's behavior, we should add this check).

I compared nsvm_vcpu_switch() with nvmx_switch_guest() (both are
called from entry.S) and came up with one question:

How do you handle the case of a virtual TLB flush (which shoots down
EPT tables) from another virtual CPU while you are in the middle
of a vmentry/vmexit emulation? This happens when a vCPU sends
an IPI to another vCPU.

If you do not handle this case you launch an L2 guest with
an empty EPT table (you set it up correctly, but another CPU shot it
down right after you set it up), and that lets you end up
in an endless EPT-fault loop.

Christoph

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2014-01-09 12:19                     ` Egger, Christoph
@ 2014-01-16  4:42                       ` Zhang, Yang Z
  2014-01-16  8:23                         ` Jan Beulich
  0 siblings, 1 reply; 27+ messages in thread
From: Zhang, Yang Z @ 2014-01-16  4:42 UTC (permalink / raw)
  To: Egger, Christoph, Jan Beulich
  Cc: Andrew Cooper, Keir Fraser, Dong, Eddie, Nakajima, Jun, xen-devel

Egger, Christoph wrote on 2014-01-09:
> On 08.01.14 06:50, Zhang, Yang Z wrote:
>> Egger, Christoph wrote on 2014-01-07:
>>> On 07.01.14 09:54, Zhang, Yang Z wrote:
>>>> Jan Beulich wrote on 2014-01-07:
>>>>>>>> On 24.12.13 at 12:29, "Zhang, Yang Z" <yang.z.zhang@intel.com>
>>> wrote:
>>>>>> Jan Beulich wrote on 2013-12-18:
>>>>>>>>>> On 18.12.13 at 10:40, "Zhang, Yang Z"
>>>>>>>>>> <yang.z.zhang@intel.com>
>>>>> wrote:
>>>>>>>> Jan Beulich wrote on 2013-12-18:
>>>>>>>>>>>> On 18.12.13 at 09:36, "Zhang, Yang Z"
>>>>>>>>>>>> <yang.z.zhang@intel.com>
>>>>>>> wrote:
>>>>>>>>>> Jan Beulich wrote on 2013-09-30:
>>>>>>>>>>> Rather than re-reading the instruction bytes upon retry
>>>>>>>>>>> processing, stash away and re-use tha what we already read.
>>>>>>>>>>> That way we can be certain that the retry won't do
>>>>>>>>>>> something different from what requested the retry, getting
>>>>>>>>>>> once again closer to real hardware behavior (where what we
>>>>>>>>>>> use retries for is simply a bus operation, not involving
>>>>>>>>>>> redundant decoding of instructions).
>>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> This patch doesn't consider the nested case.
>>>>>>>>>> For example, if the buffer saved the L2's instruction, then
>>>>>>>>>> vmexit to
>>>>>>>>>> L1 and
>>>>>>>>>> L1 may use the wrong instruction.
>>>>>>>>> 
>>>>>>>>> I'm having difficulty seeing how the two could get intermixed:
>>>>>>>>> There should be, at any given point in time, at most one
>>>>>>>>> instruction being emulated. Can you please give a more
>>>>>>>>> elaborate explanation of the situation where you see a (theoretical?
>>>>>>>>> practical?)
>>>>>> problem?
>>>>>>>> 
>>>>>>>> I saw this issue when booting L1 hyper-v. I added some debug
>>>>>>>> info and saw the strange phenomenon:
>>>>>>>> 
>>>>>>>> (XEN) write to buffer: eip 0xfffff8800430bc80, size 16,
>>>>>>>> content:f7420c1f608488b 44000000011442c7
>>>>>>>> (XEN) read from buffer: eip 0xfffff800002f6138, size 16,
>>>>>>>> content:f7420c1f608488b 44000000011442c7
>>>>>>>> 
>>>>>>>> From the log, we can see different eip but using the same buffer.
>>>>>>>> Since I don't know how hyper-v working, so I cannot give more
>>>>>>>> information why this happens. And I only saw it with L1
>>>>>>>> hyper-v (Xen on Xen and KVM on Xen don't have this issue) .
>>>>>>> 
>>>>>>> But in order to validate the fix is (a) needed and (b) correct,
>>>>>>> a proper understanding of the issue is necessary as the very first step.
>>>>>>> Which doesn't require knowing internals of Hyper-V, all you
>>>>>>> need is tracking of emulator state changes in Xen (which I
>>>>>>> realize is said easier than done, since you want to make sure
>>>>>>> you don't generate huge amounts of output before actually
>>>>>>> hitting the issue, making it close to
>>>>>> impossible to analyze).
>>>>>> 
>>>>>> Ok. It should be an old issue and just exposed by your patch only:
>>>>>> Sometimes, L0 need to decode the L2's instruction to handle IO
>>>>>> access directly. For example, if L1 pass through (not VT-d) a
>>>>>> device to L2 directly. And L0 may get X86EMUL_RETRY when handling
>>>>>> this IO request. At same time, if there is a virtual vmexit pending
>>>>>> (for example, an interrupt pending to inject to L1) and hypervisor
>>>>>> will switch the VCPU context from L2 to L1. Now we already in L1's
>>>>>> context, but since we got a X86EMUL_RETRY just now and this means
>>>>>> hyprevisor will retry to handle the IO request later and
>>>>>> unfortunately, the retry will happen in L1's context. Without your
>>>>>> patch, L0 just emulate an L1's instruction and everything goes on.
>>>>>> With your patch, L0 will get the instruction from the buffer which
>>>>>> is belonging to L2, and problem arises.
>>>>>> 
>>>>>> So the fixing is that if there is a pending IO request, no
>>>>>> virtual vmexit/vmentry is allowed which following hardware's behavior.
>>>>>> 
>>>>>> diff --git a/xen/arch/x86/hvm/vmx/vvmx.c
>>>>>> b/xen/arch/x86/hvm/vmx/vvmx.c index 41db52b..c5446a9 100644
>>>>>> --- a/xen/arch/x86/hvm/vmx/vvmx.c
>>>>>> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
>>>>>> @@ -1394,7 +1394,10 @@ void nvmx_switch_guest(void)
>>>>>>      struct vcpu *v = current;
>>>>>>      struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
>>>>>>      struct cpu_user_regs *regs = guest_cpu_user_regs();
>>>>>> +    ioreq_t *p = get_ioreq(v);
>>>>>> 
>>>>>> +    if ( p->state != STATE_IOREQ_NONE )
>>>>>> +        return;
>>>>>>      /*
>>>>>>       * a softirq may interrupt us between a virtual vmentry is
>>>>>>       * just handled and the true vmentry. If during this
>>>>>> window,
>>>>> 
>>>>> This change looks much more sensible; question is whether simply
>>>>> ignoring the switch request is acceptable (and I don't know the
>>>>> nested HVM code well enough to tell). Furthermore I wonder
>>>>> whether that's
>>> really a VMX-only issue.
>>>> 
>>>> From hardware's point, an IO operation is handled atomically. So the
>>>> problem will never happen. But in Xen, an IO operation is divided
>>>> into several steps. Without nested virtualization, the VCPU is paused
>>>> or looped until the IO emulation is finished. But in nested
>>>> environment, the VCPU will continue running even the IO emulation is
>>>> not finished. So my patch will check this and allow the VCPU to
>>>> continue running only there is no pending IO request. This is
>>>> matching hardware's behavior.
>>>> 
>>>> I guess SVM also has this problem. But since I don't know nested
>>>> SVM well, so perhaps Christoph can help to have a double check.
>>> 
>>> For SVM this issue was fixed with commit
>>> d740d811925385c09553cbe6dee8e77c1d43b198
>>> 
>>> And there is a followup cleanup commit
>>> ac97fa6a21ccd395cca43890bbd0bf32e3255ebb
>>> 
>>> The change in nestedsvm.c in commit
>>> d740d811925385c09553cbe6dee8e77c1d43b198 is actually not SVM specific.
>>> 
>>> Move that into nhvm_interrupt_blocked() in hvm.c right before
>>> 
>>>     return hvm_funcs.nhvm_intr_blocked(v); and the fix applies for
>>> both SVM and VMX.
>>> 
>> 
>> I guess this is no enough. L2 may vmexit to L1 during IO emulation
>> not only due to interrupt. Now I cannot give an example, but the
>> hyper-v cannot boot up with your suggestion. So I guess only
>> consider external interrupt is not enough. We should prohibit
>> vmswith if there is a pending IO emulation both from L1 or L2(This
>> may never happen to L1, but from hardware's behavior, we should add this check).
> 
> I compared nsvm_vcpu_switch() with nvmx_switch_guest() (both are
> called from entry.S) and come up with one question:
> 
> How do you handle the case of a virtual TLB flush (which shoots down EPT
> tables) from another virtual CPU while you are in the middle of a
> vmentry/vmexit emulation? This happens when a vCPU sends an IPI to an
> other vCPU.
> 
> If you do not handle this case you launch a l2 guest with an empty EPT
> table (you set it up correctly but an other CPU shot it down right
> after you set it up) and that lets you end up in an EPT fault endless loop.
> 

Yes, that should be a separate issue. We may need another patch to solve it.

Back to the IO emulation issue: your suggestion didn't solve the problem, so how about my solution of not allowing a virtual vmentry/vmexit while there is a pending IO request that has not finished?

> Christoph


Best regards,
Yang

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2014-01-16  4:42                       ` Zhang, Yang Z
@ 2014-01-16  8:23                         ` Jan Beulich
  2014-01-17  2:53                           ` Zhang, Yang Z
  0 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2014-01-16  8:23 UTC (permalink / raw)
  To: Christoph Egger, Yang Z Zhang
  Cc: Andrew Cooper, Keir Fraser, Eddie Dong, Jun Nakajima, xen-devel

>>> On 16.01.14 at 05:42, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
> Back to the IO emulation issue, your suggestion didn't solve the problem. So 
> how about my solution that not allow doing virtual vmenty/vmexit when there 
> is pending IO request not finished.

Sounds pretty reasonable to me (i.e. in line with striving towards
emulation being as close to real hardware behavior as possible).

Jan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing
  2014-01-16  8:23                         ` Jan Beulich
@ 2014-01-17  2:53                           ` Zhang, Yang Z
  0 siblings, 0 replies; 27+ messages in thread
From: Zhang, Yang Z @ 2014-01-17  2:53 UTC (permalink / raw)
  To: Jan Beulich, Christoph Egger
  Cc: Andrew Cooper, Keir Fraser, Dong, Eddie, Nakajima, Jun, xen-devel

Jan Beulich wrote on 2014-01-16:
>>>> On 16.01.14 at 05:42, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>> Back to the IO emulation issue, your suggestion didn't solve the
>> problem. So how about my solution that not allow doing virtual
>> vmenty/vmexit when there is pending IO request not finished.
> 
> Sounds pretty reasonable to me (i.e. in line with striving towards
> emulation being as close to real hardware behavior as possible).
> 

Ok, I will send it out later.

> Jan


Best regards,
Yang

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2014-01-17  2:53 UTC | newest]

Thread overview: 27+ messages
2013-09-30 12:51 [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
2013-09-30 12:57 ` [PATCH 1/5] x86/HVM: properly handle backward string instruction emulation Jan Beulich
2013-10-08 16:36   ` Andrew Cooper
2013-09-30 12:57 ` [PATCH 2/5] x86/HVM: fix direct PCI port I/O emulation retry and error handling Jan Beulich
2013-10-08 18:13   ` Andrew Cooper
2013-09-30 12:58 ` [PATCH 3/5] x86/HVM: don't ignore hvm_copy_to_guest_phys() errors during I/O intercept Jan Beulich
2013-10-08 18:20   ` Andrew Cooper
2013-10-09  7:48     ` Jan Beulich
2013-09-30 12:58 ` [PATCH 4/5] x86/HVM: properly deal with hvm_copy_*_guest_phys() errors Jan Beulich
2013-09-30 13:07   ` Andrew Cooper
2013-09-30 12:59 ` [PATCH 5/5] x86/HVM: cache emulated instruction for retry processing Jan Beulich
2013-10-10 11:35   ` Andrew Cooper
2013-12-18  8:36   ` Zhang, Yang Z
2013-12-18  8:48     ` Jan Beulich
2013-12-18  9:40       ` Zhang, Yang Z
2013-12-18 10:53         ` Jan Beulich
2013-12-24 11:29           ` Zhang, Yang Z
2014-01-07  8:28             ` Jan Beulich
2014-01-07  8:54               ` Zhang, Yang Z
2014-01-07  9:43                 ` Egger, Christoph
2014-01-08  5:50                   ` Zhang, Yang Z
2014-01-09 12:19                     ` Egger, Christoph
2014-01-16  4:42                       ` Zhang, Yang Z
2014-01-16  8:23                         ` Jan Beulich
2014-01-17  2:53                           ` Zhang, Yang Z
2013-10-08 15:10 ` Ping: [PATCH 0/5] x86/HVM: XSA-63 follow-ups Jan Beulich
2013-10-14  7:29 ` Keir Fraser
