From: Tim Deegan <tim@xen.org>
To: Jan Beulich <JBeulich@suse.com>
Cc: xen-devel <xen-devel@lists.xenproject.org>
Subject: Re: open-coded page list manipulation in shadow code
Date: Thu, 29 Jan 2015 18:30:46 +0100
Message-ID: <20150129173046.GG30353@deinos.phlegethon.org>
In-Reply-To: <20150127105026.GA38811@deinos.phlegethon.org>

Hi, 

At 11:50 +0100 on 27 Jan (1422355826), Tim Deegan wrote:
> Let me have a think about it on Thursday (assuming I don't have too
> many patch series to review by then!)

OK, here's what I have.  It keeps the mechanism the same, threading on
the main page list entry, but tidies it up to use the page_list operations
rather than open-coding.  I've tested it (lightly - basically just
boot testing) with 32bit, 32pae and 64bit Windows VMs (all SMP), with
bigmem=y and bigmem=n.
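
To show the shape of the change: where the old code chased pdx links
by hand, the fragment walks now go through the page_list accessors.
A minimal sketch (illustrative only -- walk_shadow_fragment is a
made-up name, not something in the patch):

    /* Illustrative only: step along the tail pages of a multi-page
     * shadow via the page_list interface rather than dereferencing
     * sp->list.next directly.  'pin_list' is only consulted by the
     * compact-pdx form to recognise the ends of the list. */
    static void walk_shadow_fragment(struct page_info *sp,
                                     struct page_list_head *pin_list,
                                     unsigned int pages)
    {
        unsigned int i;

        for ( i = 1; i < pages; i++ )
        {
            sp = page_list_next(sp, pin_list);
            ASSERT(sp && !sp->u.sh.head);
        }
    }

The same source then compiles whether 'list' is the compact pdx form
or a plain list_entry.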

It was developed on top of your bigmem series, but it should apply OK
before patch 3/4, removing the need to nobble shadow-paging in that
patch.

Cheers,

Tim.

From 57d211e47ef42012791252b4897231b392b7aff7 Mon Sep 17 00:00:00 2001
From: Tim Deegan <tim@xen.org>
Date: Thu, 29 Jan 2015 17:09:58 +0000
Subject: [PATCH] x86/shadow: Tidy up fragmentary page lists in multi-page
 shadows.

Multi-page shadows are linked together using the 'list' field.  When
those shadows are in the pinned list, the list fragments are spliced
into the pinned list; otherwise they have no associated list head.

Rework the code that handles these fragments to use the page_list
interface rather than manipulating the fields directly.  This makes
the code cleaner, and allows the 'list' field to be either the
compact pdx form or a normal list_entry.
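
For example, in prev_pinned_shadow() the open-coded step

    p = pdx_to_page(page->list.prev);

becomes

    p = page_list_prev(page, pin_list);

which reads correctly for both list representations.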

Signed-off-by: Tim Deegan <tim@xen.org>
---
 xen/arch/x86/mm/shadow/common.c  |  42 +++--------
 xen/arch/x86/mm/shadow/multi.c   |  14 ++--
 xen/arch/x86/mm/shadow/private.h | 151 ++++++++++++++++++++++-----------------
 xen/include/xen/mm.h             |   9 +++
 4 files changed, 114 insertions(+), 102 deletions(-)

diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 61110bb..12a86c2 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1220,33 +1220,6 @@ static unsigned int shadow_min_acceptable_pages(struct domain *d)
     return (vcpu_count * 128);
 } 
 
-/* Figure out the size (in pages) of a given shadow type */
-static inline u32
-shadow_size(unsigned int shadow_type) 
-{
-    static const u32 type_to_size[SH_type_unused] = {
-        1, /* SH_type_none           */
-        2, /* SH_type_l1_32_shadow   */
-        2, /* SH_type_fl1_32_shadow  */
-        4, /* SH_type_l2_32_shadow   */
-        1, /* SH_type_l1_pae_shadow  */
-        1, /* SH_type_fl1_pae_shadow */
-        1, /* SH_type_l2_pae_shadow  */
-        1, /* SH_type_l2h_pae_shadow */
-        1, /* SH_type_l1_64_shadow   */
-        1, /* SH_type_fl1_64_shadow  */
-        1, /* SH_type_l2_64_shadow   */
-        1, /* SH_type_l2h_64_shadow  */
-        1, /* SH_type_l3_64_shadow   */
-        1, /* SH_type_l4_64_shadow   */
-        1, /* SH_type_p2m_table      */
-        1, /* SH_type_monitor_table  */
-        1  /* SH_type_oos_snapshot   */
-        };
-    ASSERT(shadow_type < SH_type_unused);
-    return type_to_size[shadow_type];
-}
-
 /* Dispatcher function: call the per-mode function that will unhook the
  * non-Xen mappings in this top-level shadow mfn.  With user_only == 1,
  * unhooks only the user-mode mappings. */
@@ -1489,9 +1462,6 @@ mfn_t shadow_alloc(struct domain *d,
         break;
     }
 
-    /* Page lists don't have pointers back to the head structure, so
-     * it's safe to use a head structure on the stack to link the pages
-     * together. */
     INIT_PAGE_LIST_HEAD(&tmp_list);
 
     /* Init page info fields and clear the pages */
@@ -1525,6 +1495,14 @@ mfn_t shadow_alloc(struct domain *d,
     if ( shadow_type >= SH_type_min_shadow 
          && shadow_type <= SH_type_max_shadow )
         sp->u.sh.head = 1;
+
+#ifndef PAGE_LIST_NULL
+    /* The temporary list-head is on our stack.  Blank out the
+     * pointers to it in the shadows, just to get a clean failure if
+     * we accidentally follow them. */
+    tmp_list.next->prev = tmp_list.prev->next = NULL;
+#endif
+
     return page_to_mfn(sp);
 }
 
@@ -1533,6 +1511,7 @@ mfn_t shadow_alloc(struct domain *d,
 void shadow_free(struct domain *d, mfn_t smfn)
 {
     struct page_info *next = NULL, *sp = mfn_to_page(smfn); 
+    struct page_list_head *pin_list;
     unsigned int pages;
     u32 shadow_type;
     int i;
@@ -1544,6 +1523,7 @@ void shadow_free(struct domain *d, mfn_t smfn)
     ASSERT(shadow_type != SH_type_none);
     ASSERT(sp->u.sh.head || (shadow_type > SH_type_max_shadow));
     pages = shadow_size(shadow_type);
+    pin_list = &d->arch.paging.shadow.pinned_shadows;
 
     for ( i = 0; i < pages; i++ ) 
     {
@@ -1564,7 +1544,7 @@ void shadow_free(struct domain *d, mfn_t smfn)
 #endif
         /* Get the next page before we overwrite the list header */
         if ( i < pages - 1 )
-            next = pdx_to_page(sp->list.next);
+            next = page_list_next(sp, pin_list);
         /* Strip out the type: this is now a free shadow page */
         sp->u.sh.type = sp->u.sh.head = 0;
         /* Remember the TLB timestamp so we will know whether to flush 
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 65815bb..5fc10c9 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -428,20 +428,20 @@ sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
 /* From one page of a multi-page shadow, find the next one */
 static inline mfn_t sh_next_page(mfn_t smfn)
 {
-    mfn_t next;
-    struct page_info *pg = mfn_to_page(smfn);
+    struct page_info *pg = mfn_to_page(smfn), *next;
+    struct page_list_head h = PAGE_LIST_HEAD_INIT(h);
 
     ASSERT(pg->u.sh.type == SH_type_l1_32_shadow
            || pg->u.sh.type == SH_type_fl1_32_shadow
            || pg->u.sh.type == SH_type_l2_32_shadow);
     ASSERT(pg->u.sh.type == SH_type_l2_32_shadow || pg->u.sh.head);
-    ASSERT(pg->list.next != PAGE_LIST_NULL);
 
-    next = _mfn(pdx_to_pfn(pg->list.next));
+    next = page_list_next(pg, &h);
 
-    ASSERT(mfn_to_page(next)->u.sh.type == pg->u.sh.type);
-    ASSERT(!mfn_to_page(next)->u.sh.head);
-    return next;
+    ASSERT(next);
+    ASSERT(next->u.sh.type == pg->u.sh.type);
+    ASSERT(!next->u.sh.head);
+    return page_to_mfn(next);
 }
 
 static inline u32
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index b778fcf..b145cee 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -318,6 +318,33 @@ static inline int mfn_oos_may_write(mfn_t gmfn)
 }
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
+/* Figure out the size (in pages) of a given shadow type */
+static inline u32
+shadow_size(unsigned int shadow_type) 
+{
+    static const u32 type_to_size[SH_type_unused] = {
+        1, /* SH_type_none           */
+        2, /* SH_type_l1_32_shadow   */
+        2, /* SH_type_fl1_32_shadow  */
+        4, /* SH_type_l2_32_shadow   */
+        1, /* SH_type_l1_pae_shadow  */
+        1, /* SH_type_fl1_pae_shadow */
+        1, /* SH_type_l2_pae_shadow  */
+        1, /* SH_type_l2h_pae_shadow */
+        1, /* SH_type_l1_64_shadow   */
+        1, /* SH_type_fl1_64_shadow  */
+        1, /* SH_type_l2_64_shadow   */
+        1, /* SH_type_l2h_64_shadow  */
+        1, /* SH_type_l3_64_shadow   */
+        1, /* SH_type_l4_64_shadow   */
+        1, /* SH_type_p2m_table      */
+        1, /* SH_type_monitor_table  */
+        1  /* SH_type_oos_snapshot   */
+        };
+    ASSERT(shadow_type < SH_type_unused);
+    return type_to_size[shadow_type];
+}
+
 /******************************************************************************
  * Various function declarations 
  */
@@ -586,22 +613,25 @@ prev_pinned_shadow(const struct page_info *page,
                    const struct domain *d)
 {
     struct page_info *p;
+    const struct page_list_head *pin_list;
+
+    pin_list = &d->arch.paging.shadow.pinned_shadows;
 
-    if ( page == d->arch.paging.shadow.pinned_shadows.next ) 
+    if ( page_list_empty(pin_list) || page == page_list_first(pin_list) )
         return NULL;
-    
+
     if ( page == NULL ) /* If no current place, start at the tail */
-        p = d->arch.paging.shadow.pinned_shadows.tail;
+        p = page_list_last(pin_list);
     else
-        p = pdx_to_page(page->list.prev);
+        p = page_list_prev(page, pin_list);
     /* Skip over the non-tail parts of multi-page shadows */
     if ( p && p->u.sh.type == SH_type_l2_32_shadow )
     {
-        p = pdx_to_page(p->list.prev);
+        p = page_list_prev(p, pin_list);
         ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow);
-        p = pdx_to_page(p->list.prev);
+        p = page_list_prev(p, pin_list);
         ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow);
-        p = pdx_to_page(p->list.prev);
+        p = page_list_prev(p, pin_list);
         ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow);
     }
     ASSERT(!p || p->u.sh.head);
@@ -618,49 +648,48 @@ prev_pinned_shadow(const struct page_info *page,
  * Returns 0 for failure, 1 for success. */
 static inline int sh_pin(struct vcpu *v, mfn_t smfn)
 {
-    struct page_info *sp;
-    struct page_list_head h, *pin_list;
-    
+    struct page_info *sp[4];
+    struct page_list_head *pin_list;
+    unsigned int i, pages;
+    bool_t already_pinned;
+
     ASSERT(mfn_valid(smfn));
-    sp = mfn_to_page(smfn);
-    ASSERT(sh_type_is_pinnable(v, sp->u.sh.type));
-    ASSERT(sp->u.sh.head);
+    sp[0] = mfn_to_page(smfn);
+    pages = shadow_size(sp[0]->u.sh.type);
+    already_pinned = sp[0]->u.sh.pinned;
+    ASSERT(sh_type_is_pinnable(v, sp[0]->u.sh.type));
+    ASSERT(sp[0]->u.sh.head);
+
+    pin_list = &v->domain->arch.paging.shadow.pinned_shadows;
+    if ( already_pinned && sp[0] == page_list_first(pin_list) )
+        return 1;
 
     /* Treat the up-to-four pages of the shadow as a unit in the list ops */
-    h.next = h.tail = sp; 
-    if ( sp->u.sh.type == SH_type_l2_32_shadow ) 
+    for ( i = 1; i < pages; i++ )
     {
-        h.tail = pdx_to_page(h.tail->list.next);
-        h.tail = pdx_to_page(h.tail->list.next);
-        h.tail = pdx_to_page(h.tail->list.next);
-        ASSERT(h.tail->u.sh.type == SH_type_l2_32_shadow); 
+        sp[i] = page_list_next(sp[i - 1], pin_list);
+        ASSERT(sp[i]->u.sh.type == sp[0]->u.sh.type);
+        ASSERT(!sp[i]->u.sh.head);
     }
-    pin_list = &v->domain->arch.paging.shadow.pinned_shadows;
 
-    if ( sp->u.sh.pinned )
+    if ( already_pinned )
     {
-        /* Already pinned: take it out of the pinned-list so it can go 
-         * at the front */
-        if ( pin_list->next == h.next )
-            return 1;
-        page_list_prev(h.next, pin_list)->list.next = h.tail->list.next;
-        if ( pin_list->tail == h.tail )
-            pin_list->tail = page_list_prev(h.next, pin_list);
-        else
-            page_list_next(h.tail, pin_list)->list.prev = h.next->list.prev;
-        h.tail->list.next = h.next->list.prev = PAGE_LIST_NULL;
+        /* Take it out of the pinned-list so it can go at the front */
+        for ( i = 0; i < pages; i++ )
+            page_list_del(sp[i], pin_list);
     }
     else
     {
         /* Not pinned: pin it! */
         if ( !sh_get_ref(v, smfn, 0) )
             return 0;
-        sp->u.sh.pinned = 1;
-        ASSERT(h.next->list.prev == PAGE_LIST_NULL);
-        ASSERT(h.tail->list.next == PAGE_LIST_NULL);
+        sp[0]->u.sh.pinned = 1;
     }
+
     /* Put it at the head of the list of pinned shadows */
-    page_list_splice(&h, pin_list);
+    for ( i = pages; i > 0; i-- )
+        page_list_add(sp[i - 1], pin_list);
+
     return 1;
 }
 
@@ -668,46 +697,40 @@ static inline int sh_pin(struct vcpu *v, mfn_t smfn)
  * of pinned shadows, and release the extra ref. */
 static inline void sh_unpin(struct vcpu *v, mfn_t smfn)
 {
-    struct page_list_head h, *pin_list;
-    struct page_info *sp;
-    
+    struct page_list_head tmp_list, *pin_list;
+    struct page_info *sp, *next;
+    unsigned int i, head_type;
+
     ASSERT(mfn_valid(smfn));
     sp = mfn_to_page(smfn);
+    head_type = sp->u.sh.type;
     ASSERT(sh_type_is_pinnable(v, sp->u.sh.type));
     ASSERT(sp->u.sh.head);
 
-    /* Treat the up-to-four pages of the shadow as a unit in the list ops */
-    h.next = h.tail = sp; 
-    if ( sp->u.sh.type == SH_type_l2_32_shadow ) 
-    {
-        h.tail = pdx_to_page(h.tail->list.next);
-        h.tail = pdx_to_page(h.tail->list.next);
-        h.tail = pdx_to_page(h.tail->list.next);
-        ASSERT(h.tail->u.sh.type == SH_type_l2_32_shadow); 
-    }
-    pin_list = &v->domain->arch.paging.shadow.pinned_shadows;
-
     if ( !sp->u.sh.pinned )
         return;
-
     sp->u.sh.pinned = 0;
 
-    /* Cut the sub-list out of the list of pinned shadows */
-    if ( pin_list->next == h.next && pin_list->tail == h.tail )
-        pin_list->next = pin_list->tail = NULL;
-    else 
+    /* Cut the sub-list out of the list of pinned shadows,
+     * stitching it back into a list fragment of its own. */
+    pin_list = &v->domain->arch.paging.shadow.pinned_shadows;
+    INIT_PAGE_LIST_HEAD(&tmp_list);
+    for ( i = 0; i < shadow_size(head_type); i++ )
     {
-        if ( pin_list->next == h.next )
-            pin_list->next = page_list_next(h.tail, pin_list);
-        else
-            page_list_prev(h.next, pin_list)->list.next = h.tail->list.next;
-        if ( pin_list->tail == h.tail )
-            pin_list->tail = page_list_prev(h.next, pin_list);
-        else
-            page_list_next(h.tail, pin_list)->list.prev = h.next->list.prev;
+        ASSERT(sp->u.sh.type == head_type);
+        ASSERT(!i || !sp->u.sh.head);
+        next = page_list_next(sp, pin_list);
+        page_list_del(sp, pin_list);
+        page_list_add_tail(sp, &tmp_list);
+        sp = next;
     }
-    h.tail->list.next = h.next->list.prev = PAGE_LIST_NULL;
-    
+#ifndef PAGE_LIST_NULL
+    /* The temporary list-head is on our stack.  Blank out the
+     * pointers to it in the shadows, just to get a clean failure if
+     * we accidentally follow them. */
+    tmp_list.next->prev = tmp_list.prev->next = NULL;
+#endif
+
     sh_put_ref(v, smfn, 0);
 }
 
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 74a65a6..a62ee1e 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -172,6 +172,11 @@ page_list_first(const struct page_list_head *head)
     return head->next;
 }
 static inline struct page_info *
+page_list_last(const struct page_list_head *head)
+{
+    return head->tail;
+}
+static inline struct page_info *
 page_list_next(const struct page_info *page,
                const struct page_list_head *head)
 {
@@ -331,8 +336,12 @@ page_list_splice(struct page_list_head *list, struct page_list_head *head)
 # define page_list_empty                 list_empty
 # define page_list_first(hd)             list_entry((hd)->next, \
                                                     struct page_info, list)
+# define page_list_last(hd)              list_entry((hd)->prev, \
+                                                    struct page_info, list)
 # define page_list_next(pg, hd)          list_entry((pg)->list.next, \
                                                     struct page_info, list)
+# define page_list_prev(pg, hd)          list_entry((pg)->list.prev, \
+                                                    struct page_info, list)
 # define page_list_add(pg, hd)           list_add(&(pg)->list, hd)
 # define page_list_add_tail(pg, hd)      list_add_tail(&(pg)->list, hd)
 # define page_list_del(pg, hd)           list_del(&(pg)->list)
-- 
2.1.4
