[PATCH v2 3/3] x86/ioreq server: Add HVMOP to map guest ram with p2m_ioreq_server to an ioreq server

From: Yu Zhang <yu.c.zhang@linux.intel.com>
To: xen-devel@lists.xen.org
Cc: Kevin Tian <kevin.tian@intel.com>, Keir Fraser <keir@xen.org>,
	Jun Nakajima <jun.nakajima@intel.com>,
	George Dunlap <george.dunlap@eu.citrix.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Tim Deegan <tim@xen.org>, Paul Durrant <paul.durrant@citrix.com>,
	zhiyuan.lv@intel.com, Jan Beulich <jbeulich@suse.com>
Subject: [PATCH v2 3/3] x86/ioreq server: Add HVMOP to map guest ram with p2m_ioreq_server to an ioreq server
Date: Thu, 31 Mar 2016 18:53:38 +0800	[thread overview]
Message-ID: <1459421618-5991-4-git-send-email-yu.c.zhang@linux.intel.com> (raw)
In-Reply-To: <1459421618-5991-1-git-send-email-yu.c.zhang@linux.intel.com>

A new HVMOP - HVMOP_map_mem_type_to_ioreq_server, is added to
let one ioreq server claim/disclaim its responsibility for the
handling of guest pages with p2m type p2m_ioreq_server. Users
of this HVMOP can specify whether the p2m_ioreq_server is supposed
to handle write accesses or read ones or both in a parameter named
flags. For now, we only support one ioreq server for this p2m type,
so once an ioreq server has claimed its ownership, subsequent calls
of the HVMOP_map_mem_type_to_ioreq_server will fail. Users can also
disclaim the ownership of guest ram pages with this p2m type, by
triggering this new HVMOP, with ioreq server id set to the current
owner's and flags parameter set to 0.

For now, both HVMOP_map_mem_type_to_ioreq_server and p2m_ioreq_server
are only supported for HVMs with HAP enabled.

Note that flags parameter(if not 0) of this HVMOP only indicates
which kind of memory accesses are to be forwarded to an ioreq server,
it has impact on the access rights of guest ram pages, but are not
the same. Due to hardware limitations, if only write operations are
to be forwarded, read ones will be performed at full speed, with
no hypervisor intervention. But if read ones are to be forwarded to
an ioreq server, writes will inevitably be trapped into hypervisor,
which means significant performance impact.

Also note that this HVMOP_map_mem_type_to_ioreq_server will not
change the p2m type of any guest ram page, until HVMOP_set_mem_type
is triggered. So normally the steps should be the backend driver
first claims its ownership of guest ram pages with p2m_ioreq_server
type, and then sets the memory type to p2m_ioreq_server for specified
guest ram pages.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Cc: Keir Fraser <keir@xen.org>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Tim Deegan <tim@xen.org>
---
 xen/arch/x86/hvm/emulate.c       | 125 +++++++++++++++++++++++++++++++++++++--
 xen/arch/x86/hvm/hvm.c           |  95 +++++++++++++++++++++++++++--
 xen/arch/x86/mm/hap/nested_hap.c |   2 +-
 xen/arch/x86/mm/p2m-ept.c        |  14 ++++-
 xen/arch/x86/mm/p2m-pt.c         |  25 +++++---
 xen/arch/x86/mm/p2m.c            |  82 +++++++++++++++++++++++++
 xen/arch/x86/mm/shadow/multi.c   |   3 +-
 xen/include/asm-x86/p2m.h        |  36 +++++++++--
 xen/include/public/hvm/hvm_op.h  |  37 ++++++++++++
 9 files changed, 395 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index ddc8007..77a4793 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -94,11 +94,69 @@ static const struct hvm_io_handler null_handler = {
     .ops = &null_ops
 };
 
+static int mem_read(const struct hvm_io_handler *io_handler,
+                    uint64_t addr,
+                    uint32_t size,
+                    uint64_t *data)
+{
+    struct domain *currd = current->domain;
+    unsigned long gmfn = paddr_to_pfn(addr);
+    unsigned long offset = addr & ~PAGE_MASK;
+    struct page_info *page = get_page_from_gfn(currd, gmfn, NULL, P2M_UNSHARE);
+    uint8_t *p;
+
+    if ( !page )
+        return X86EMUL_UNHANDLEABLE;
+
+    p = __map_domain_page(page);
+    p += offset;
+    memcpy(data, p, size);
+
+    unmap_domain_page(p);
+    put_page(page);
+
+    return X86EMUL_OKAY;
+}
+
+static int mem_write(const struct hvm_io_handler *handler,
+                     uint64_t addr,
+                     uint32_t size,
+                     uint64_t data)
+{
+    struct domain *currd = current->domain;
+    unsigned long gmfn = paddr_to_pfn(addr);
+    unsigned long offset = addr & ~PAGE_MASK;
+    struct page_info *page = get_page_from_gfn(currd, gmfn, NULL, P2M_UNSHARE);
+    uint8_t *p;
+
+    if ( !page )
+        return X86EMUL_UNHANDLEABLE;
+
+    p = __map_domain_page(page);
+    p += offset;
+    memcpy(p, &data, size);
+
+    unmap_domain_page(p);
+    put_page(page);
+
+    return X86EMUL_OKAY;
+}
+
+static const struct hvm_io_ops mem_ops = {
+    .read = mem_read,
+    .write = mem_write
+};
+
+static const struct hvm_io_handler mem_handler = {
+    .ops = &mem_ops
+};
+
 static int hvmemul_do_io(
     bool_t is_mmio, paddr_t addr, unsigned long reps, unsigned int size,
     uint8_t dir, bool_t df, bool_t data_is_addr, uintptr_t data)
 {
     struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
     struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
     ioreq_t p = {
         .type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO,
@@ -140,7 +198,7 @@ static int hvmemul_do_io(
              (p.dir != dir) ||
              (p.df != df) ||
              (p.data_is_ptr != data_is_addr) )
-            domain_crash(curr->domain);
+            domain_crash(currd);
 
         if ( data_is_addr )
             return X86EMUL_UNHANDLEABLE;
@@ -168,13 +226,72 @@ static int hvmemul_do_io(
         break;
     case X86EMUL_UNHANDLEABLE:
     {
-        struct hvm_ioreq_server *s =
-            hvm_select_ioreq_server(curr->domain, &p);
+        struct hvm_ioreq_server *s;
+        p2m_type_t p2mt;
+
+        if ( is_mmio )
+        {
+            unsigned long gmfn = paddr_to_pfn(addr);
+
+            (void) get_gfn_query_unlocked(currd, gmfn, &p2mt);
+
+            switch ( p2mt )
+            {
+                case p2m_ioreq_server:
+                {
+                    unsigned long flags;
+
+                    p2m_get_ioreq_server(currd, &flags, &s);
+
+                    if ( !s )
+                        break;
+
+                    if ( (dir == IOREQ_READ &&
+                          !(flags & P2M_IOREQ_HANDLE_READ_ACCESS)) ||
+                         (dir == IOREQ_WRITE &&
+                          !(flags & P2M_IOREQ_HANDLE_WRITE_ACCESS)) )
+                        s = NULL;
+
+                    break;
+                }
+                case p2m_ram_rw:
+                    s = NULL;
+                    break;
+
+                default:
+                    s = hvm_select_ioreq_server(currd, &p);
+                    break;
+            }
+        }
+        else
+        {
+            p2mt = p2m_invalid;
+
+            s = hvm_select_ioreq_server(currd, &p);
+        }
 
         /* If there is no suitable backing DM, just ignore accesses */
         if ( !s )
         {
-            rc = hvm_process_io_intercept(&null_handler, &p);
+            switch ( p2mt )
+            {
+            case p2m_ioreq_server:
+            /*
+             * Race conditions may exist when access to a gfn with
+             * p2m_ioreq_server is intercepted by hypervisor, during
+             * which time p2m type of this gfn is recalculated back
+             * to p2m_ram_rw. mem_handler is used to handle this
+             * corner case.
+             */
+            case p2m_ram_rw:
+                rc = hvm_process_io_intercept(&mem_handler, &p);
+                break;
+
+            default:
+                rc = hvm_process_io_intercept(&null_handler, &p);
+                break;
+            }
+
             vio->io_req.state = STATE_IOREQ_NONE;
         }
         else
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index bec6a8a..ba1de00 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1252,6 +1252,8 @@ static int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id)
 
         domain_pause(d);
 
+        p2m_destroy_ioreq_server(d, s);
+
         hvm_ioreq_server_disable(s, 0);
 
         list_del(&s->list_entry);
@@ -1411,6 +1413,47 @@ static int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
     return rc;
 }
 
+static int hvm_map_mem_type_to_ioreq_server(struct domain *d,
+                                            ioservid_t id,
+                                            hvmmem_type_t type,
+                                            uint32_t flags)
+{
+    struct hvm_ioreq_server *s;
+    int rc;
+
+    /* For now, only HVMMEM_ioreq_server is supported */
+    if ( type != HVMMEM_ioreq_server )
+        return -EINVAL;
+
+    if ( flags & ~(HVMOP_IOREQ_MEM_ACCESS_READ |
+                   HVMOP_IOREQ_MEM_ACCESS_WRITE) )
+        return -EINVAL;
+
+    spin_lock(&d->arch.hvm_domain.ioreq_server.lock);
+
+    rc = -ENOENT;
+    list_for_each_entry ( s,
+                          &d->arch.hvm_domain.ioreq_server.list,
+                          list_entry )
+    {
+        if ( s == d->arch.hvm_domain.default_ioreq_server )
+            continue;
+
+        if ( s->id == id )
+        {
+            rc = p2m_set_ioreq_server(d, flags, s);
+            if ( rc == 0 )
+                gdprintk(XENLOG_DEBUG, "%u %s type HVMMEM_ioreq_server.\n",
+                         s->id, (flags != 0) ? "mapped to" : "unmapped from");
+
+            break;
+        }
+    }
+
+    spin_unlock(&d->arch.hvm_domain.ioreq_server.lock);
+    return rc;
+}
+
 static int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
                                       bool_t enabled)
 {
@@ -3164,9 +3207,9 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
      * If this GFN is emulated MMIO or marked as read-only, pass the fault
      * to the mmio handler.
      */
-    if ( (p2mt == p2m_mmio_dm) || 
-         (npfec.write_access &&
-          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
+    if ( (p2mt == p2m_mmio_dm) ||
+         (p2mt == p2m_ioreq_server) ||
+         (npfec.write_access && p2m_is_discard_write(p2mt)) )
     {
         __put_gfn(p2m, gfn);
         if ( ap2m_active )
@@ -5979,6 +6022,40 @@ static int hvmop_unmap_io_range_from_ioreq_server(
     return rc;
 }
 
+static int hvmop_map_mem_type_to_ioreq_server(
+    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop)
+{
+    xen_hvm_map_mem_type_to_ioreq_server_t op;
+    struct domain *d;
+    int rc;
+
+    if ( copy_from_guest(&op, uop, 1) )
+        return -EFAULT;
+
+    rc = rcu_lock_remote_domain_by_id(op.domid, &d);
+    if ( rc != 0 )
+        return rc;
+
+    rc = -EINVAL;
+    if ( !is_hvm_domain(d) )
+        goto out;
+
+    /* For now, only support for HAP enabled hvm */
+    if ( !hap_enabled(d) )
+        goto out;
+
+    rc = xsm_hvm_ioreq_server(XSM_DM_PRIV, d,
+                              HVMOP_map_mem_type_to_ioreq_server);
+    if ( rc != 0 )
+        goto out;
+
+    rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
+
+ out:
+    rcu_unlock_domain(d);
+    return rc;
+}
+
 static int hvmop_set_ioreq_server_state(
     XEN_GUEST_HANDLE_PARAM(xen_hvm_set_ioreq_server_state_t) uop)
 {
@@ -6613,8 +6690,7 @@ static int hvmop_get_mem_type(
 static bool_t hvm_allow_p2m_type_change(p2m_type_t old, p2m_type_t new)
 {
     if ( p2m_is_ram(old) ||
-         (p2m_is_hole(old) && new == p2m_mmio_dm) ||
-         (old == p2m_ioreq_server && new == p2m_ram_rw) )
+         (p2m_is_hole(old) && new == p2m_mmio_dm) )
         return 1;
 
     return 0;
@@ -6648,6 +6724,10 @@ static int hvmop_set_mem_type(
     if ( !is_hvm_domain(d) )
         goto out;
 
+    /* For now, HVMMEM_ioreq_server is only supported for HAP enabled hvm. */
+    if ( a.hvmmem_type == HVMMEM_ioreq_server && !hap_enabled(d) )
+        goto out;
+
     rc = xsm_hvm_control(XSM_DM_PRIV, d, HVMOP_set_mem_type);
     if ( rc )
         goto out;
@@ -6748,6 +6828,11 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
             guest_handle_cast(arg, xen_hvm_io_range_t));
         break;
 
+    case HVMOP_map_mem_type_to_ioreq_server:
+        rc = hvmop_map_mem_type_to_ioreq_server(
+            guest_handle_cast(arg, xen_hvm_map_mem_type_to_ioreq_server_t));
+        break;
+
     case HVMOP_set_ioreq_server_state:
         rc = hvmop_set_ioreq_server_state(
             guest_handle_cast(arg, xen_hvm_set_ioreq_server_state_t));
diff --git a/xen/arch/x86/mm/hap/nested_hap.c b/xen/arch/x86/mm/hap/nested_hap.c
index 9cee5a0..bbb6d85 100644
--- a/xen/arch/x86/mm/hap/nested_hap.c
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -174,7 +174,7 @@ nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa,
     if ( *p2mt == p2m_mmio_direct )
         goto direct_mmio_out;
     rc = NESTEDHVM_PAGEFAULT_MMIO;
-    if ( *p2mt == p2m_mmio_dm )
+    if ( *p2mt == p2m_mmio_dm || *p2mt == p2m_ioreq_server )
         goto out;
 
     rc = NESTEDHVM_PAGEFAULT_L0_ERROR;
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 380ec25..854e158 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -132,6 +132,19 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
             entry->r = entry->w = entry->x = 1;
             entry->a = entry->d = !!cpu_has_vmx_ept_ad;
             break;
+        case p2m_ioreq_server:
+            entry->r = !(p2m->ioreq.flags & P2M_IOREQ_HANDLE_READ_ACCESS);
+	    /*
+	     * write access right is disabled when entry->r is 0, but whether
+	     * write accesses are emulated by hypervisor or forwarded to an
+	     * ioreq server depends on the setting of p2m->ioreq.flags.
+	     */
+            entry->w = (entry->r &&
+                        !(p2m->ioreq.flags & P2M_IOREQ_HANDLE_WRITE_ACCESS));
+            entry->x = entry->r;
+            entry->a = !!cpu_has_vmx_ept_ad;
+            entry->d = entry->w && cpu_has_vmx_ept_ad;
+            break;
         case p2m_mmio_direct:
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
@@ -171,7 +184,6 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
             entry->a = entry->d = !!cpu_has_vmx_ept_ad;
             break;
         case p2m_grant_map_ro:
-        case p2m_ioreq_server:
             entry->r = 1;
             entry->w = entry->x = 0;
             entry->a = !!cpu_has_vmx_ept_ad;
diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
index eabd2e3..7a0ddb8 100644
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,8 +72,8 @@ static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
-                                       unsigned int level)
+static unsigned long p2m_type_to_flags(struct p2m_domain *p2m, p2m_type_t t,
+                                       mfn_t mfn, unsigned int level)
 {
     unsigned long flags;
     /*
@@ -94,8 +94,18 @@ static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
     default:
         return flags | _PAGE_NX_BIT;
     case p2m_grant_map_ro:
-    case p2m_ioreq_server:
         return flags | P2M_BASE_FLAGS | _PAGE_NX_BIT;
+    case p2m_ioreq_server:
+    {
+        flags |= P2M_BASE_FLAGS | _PAGE_RW;
+
+        if ( p2m->ioreq.flags & P2M_IOREQ_HANDLE_READ_ACCESS )
+            return flags & ~(_PAGE_PRESENT | _PAGE_RW);
+        else if ( p2m->ioreq.flags & P2M_IOREQ_HANDLE_WRITE_ACCESS )
+            return flags & ~_PAGE_RW;
+        else
+            return flags;
+    }
     case p2m_ram_ro:
     case p2m_ram_logdirty:
     case p2m_ram_shared:
@@ -442,7 +452,8 @@ static int do_recalc(struct p2m_domain *p2m, unsigned long gfn)
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
+            unsigned long flags = p2m_type_to_flags(p2m, p2mt,
+                                                    _mfn(mfn), level);
 
             if ( level )
             {
@@ -579,7 +590,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
+                           p2m_type_to_flags(p2m, p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -615,7 +626,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn, 0));
+                                         p2m_type_to_flags(p2m, p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -651,7 +662,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn, 1) |
+                                       p2m_type_to_flags(p2m, p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index b3fce1b..f7d2f60 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -83,6 +83,8 @@ static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
     else
         p2m_pt_init(p2m);
 
+    spin_lock_init(&p2m->ioreq.lock);
+
     return ret;
 }
 
@@ -289,6 +291,86 @@ void p2m_memory_type_changed(struct domain *d)
     }
 }
 
+int p2m_set_ioreq_server(struct domain *d,
+                         unsigned long flags,
+                         struct hvm_ioreq_server *s)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int rc;
+
+    spin_lock(&p2m->ioreq.lock);
+
+    rc = -EBUSY;
+    if ( (flags != 0) && (p2m->ioreq.server != NULL) )
+        goto out;
+
+    rc = -EINVAL;
+    /* unmap ioreq server from p2m type by passing flags with 0 */
+    if ( (flags == 0) && (p2m->ioreq.server != s) )
+        goto out;
+
+    if ( flags == 0 )
+    {
+        p2m->ioreq.server = NULL;
+        p2m->ioreq.flags = 0;
+    }
+    else
+    {
+        p2m->ioreq.server = s;
+        p2m->ioreq.flags = flags;
+    }
+
+    /*
+     * Each time we map/unmap an ioreq server to/from p2m_ioreq_server,
+     * we mark the p2m table to be recalculated, so that gfns which were
+     * previously marked with p2m_ioreq_server can be resynced.
+     */
+    p2m_change_entry_type_global(d, p2m_ioreq_server, p2m_ram_rw);
+
+    rc = 0;
+
+out:
+    spin_unlock(&p2m->ioreq.lock);
+
+    return rc;
+}
+
+void p2m_get_ioreq_server(struct domain *d,
+                          unsigned long *flags,
+                          struct hvm_ioreq_server **s)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    spin_lock(&p2m->ioreq.lock);
+
+    *s = p2m->ioreq.server;
+    *flags = p2m->ioreq.flags;
+
+    spin_unlock(&p2m->ioreq.lock);
+}
+
+void p2m_destroy_ioreq_server(struct domain *d,
+                              struct hvm_ioreq_server *s)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    spin_lock(&p2m->ioreq.lock);
+
+    if ( p2m->ioreq.server == s )
+    {
+        p2m->ioreq.server = NULL;
+        p2m->ioreq.flags = 0;
+
+        /*
+         * Mark p2m table to be recalculated, so that gfns which were
+         * previously marked with p2m_ioreq_server can be resynced.
+         */
+        p2m_change_entry_type_global(d, p2m_ioreq_server, p2m_ram_rw);
+    }
+
+    spin_unlock(&p2m->ioreq.lock);
+}
+
 void p2m_enable_hardware_log_dirty(struct domain *d)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index c81302a..2e0d258 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3224,8 +3224,7 @@ static int sh_page_fault(struct vcpu *v,
     }
 
     /* Need to hand off device-model MMIO to the device model */
-    if ( p2mt == p2m_mmio_dm
-         || (p2mt == p2m_ioreq_server && ft == ft_demand_write) )
+    if ( p2mt == p2m_mmio_dm )
     {
         gpa = guest_walk_to_gpa(&gw);
         goto mmio;
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index ee2ea9c..8f925ac 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -89,7 +89,8 @@ typedef unsigned int p2m_query_t;
                        | p2m_to_mask(p2m_ram_paging_out)      \
                        | p2m_to_mask(p2m_ram_paged)           \
                        | p2m_to_mask(p2m_ram_paging_in)       \
-                       | p2m_to_mask(p2m_ram_shared))
+                       | p2m_to_mask(p2m_ram_shared)          \
+                       | p2m_to_mask(p2m_ioreq_server))
 
 /* Types that represent a physmap hole that is ok to replace with a shared
  * entry */
@@ -111,8 +112,7 @@ typedef unsigned int p2m_query_t;
 #define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty)     \
                       | p2m_to_mask(p2m_ram_ro)         \
                       | p2m_to_mask(p2m_grant_map_ro)   \
-                      | p2m_to_mask(p2m_ram_shared)     \
-                      | p2m_to_mask(p2m_ioreq_server))
+                      | p2m_to_mask(p2m_ram_shared))
 
 /* Write-discard types, which should discard the write operations */
 #define P2M_DISCARD_WRITE_TYPES (p2m_to_mask(p2m_ram_ro)     \
@@ -120,7 +120,8 @@ typedef unsigned int p2m_query_t;
 
 /* Types that can be subject to bulk transitions. */
 #define P2M_CHANGEABLE_TYPES (p2m_to_mask(p2m_ram_rw) \
-                              | p2m_to_mask(p2m_ram_logdirty) )
+                              | p2m_to_mask(p2m_ram_logdirty) \
+                              | p2m_to_mask(p2m_ioreq_server) )
 
 #define P2M_POD_TYPES (p2m_to_mask(p2m_populate_on_demand))
 
@@ -320,6 +321,27 @@ struct p2m_domain {
         struct ept_data ept;
         /* NPT-equivalent structure could be added here. */
     };
+
+    struct {
+        spinlock_t lock;
+        /*
+         * ioreq server who's responsible for the emulation of
+         * gfns with specific p2m type(for now, p2m_ioreq_server).
+         * Behaviors of gfns with p2m_ioreq_server set but no
+         * ioreq server mapped in advance should be the same as
+         * p2m_ram_rw.
+         */
+        struct hvm_ioreq_server *server;
+        /*
+         * flags specifies whether read, write or both operations
+         * are to be emulated by an ioreq server.
+         */
+        unsigned long flags;
+
+#define P2M_IOREQ_HANDLE_WRITE_ACCESS HVMOP_IOREQ_MEM_ACCESS_WRITE
+#define P2M_IOREQ_HANDLE_READ_ACCESS  HVMOP_IOREQ_MEM_ACCESS_READ
+
+    } ioreq;
 };
 
 /* get host p2m table */
@@ -821,6 +843,12 @@ static inline unsigned int p2m_get_iommu_flags(p2m_type_t p2mt)
     return flags;
 }
 
+int p2m_set_ioreq_server(struct domain *d, unsigned long flags,
+                         struct hvm_ioreq_server *s);
+void p2m_get_ioreq_server(struct domain *d, unsigned long *flags,
+                         struct hvm_ioreq_server **s);
+void p2m_destroy_ioreq_server(struct domain *d, struct hvm_ioreq_server *s);
+
 #endif /* _XEN_P2M_H */
 
 /*
diff --git a/xen/include/public/hvm/hvm_op.h b/xen/include/public/hvm/hvm_op.h
index a1eae52..d46f186 100644
--- a/xen/include/public/hvm/hvm_op.h
+++ b/xen/include/public/hvm/hvm_op.h
@@ -489,6 +489,43 @@ struct xen_hvm_altp2m_op {
 typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
 
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+/*
+ * HVMOP_map_mem_type_to_ioreq_server : map or unmap the IOREQ Server <id>
+ *                                      to specific memroy type <type>
+ *                                      for specific accesses <flags>
+ *
+ * Note that if only write operations are to be forwarded to an ioreq server,
+ * read operations will be performed with no hypervisor intervention. But if
+ * flags indicates that read operations are to be forwarded to an ioreq server,
+ * write operations will inevitably be trapped into hypervisor, whether they
+ * are emulated by hypervisor or forwarded to ioreq server depends on the flags
+ * setting. This situation means significant performance impact.
+ */
+#define HVMOP_map_mem_type_to_ioreq_server 26
+struct xen_hvm_map_mem_type_to_ioreq_server {
+    domid_t domid;      /* IN - domain to be serviced */
+    ioservid_t id;      /* IN - ioreq server id */
+    hvmmem_type_t type; /* IN - memory type */
+    uint32_t flags;     /* IN - types of accesses to be forwarded to the
+                           ioreq server. flags with 0 means to unmap the
+                           ioreq server */
+#define _HVMOP_IOREQ_MEM_ACCESS_READ 0
+#define HVMOP_IOREQ_MEM_ACCESS_READ \
+    (1u << _HVMOP_IOREQ_MEM_ACCESS_READ)
+
+#define _HVMOP_IOREQ_MEM_ACCESS_WRITE 1
+#define HVMOP_IOREQ_MEM_ACCESS_WRITE \
+    (1u << _HVMOP_IOREQ_MEM_ACCESS_WRITE)
+};
+typedef struct xen_hvm_map_mem_type_to_ioreq_server
+    xen_hvm_map_mem_type_to_ioreq_server_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_map_mem_type_to_ioreq_server_t);
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+
+
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
 
 /*
-- 
1.9.1


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel