From: Andres Lagar-Cavilla
Subject: [PATCH 3 of 3] x86/mem_sharing: For shared pages with many references, use a hash table instead of a list
Date: Thu, 12 Apr 2012 10:16:14 -0400
Message-ID: <7b606c043208d98d218b.1334240174@xdev.gridcentric.ca>
To: xen-devel@lists.xen.org
Cc: andres@gridcentric.ca, keir.xen@gmail.com, tim@xen.org, adin@gridcentric.ca

 xen/arch/x86/mm/mem_sharing.c     | 170 +++++++++++++++++++++++++++++++++++--
 xen/include/asm-x86/mem_sharing.h |  13 ++-
 2 files changed, 169 insertions(+), 14 deletions(-)

For shared frames that have many references, the doubly-linked list used to
store the rmap results in costly scans during unshare operations. To alleviate
the overhead, replace the linked list with a hash table. However, hash tables
are space-intensive, so only use them for pages that have "many" (arbitrary
threshold) references.

Unsharing is heavily exercised during domain destroy. In experimental testing,
for a domain that points over 100,000 pages at the same shared frame, domain
destruction dropped from over 7 minutes(!) to less than two seconds.

Signed-off-by: Andres Lagar-Cavilla

(An illustrative sketch of the resulting bucket count and conversion
thresholds follows the diff.)

diff -r 1730bff8fccf -r 7b606c043208 xen/arch/x86/mm/mem_sharing.c
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -49,6 +49,17 @@ DEFINE_PER_CPU(pg_lock_data_t, __pld);
 #define MEM_SHARING_DEBUG(_f, _a...) \
     debugtrace_printk("mem_sharing_debug: %s(): " _f, __func__, ##_a)
 
+/* Reverse map defines */
+#define RMAP_HASHTAB_ORDER   0
+#define RMAP_HASHTAB_SIZE   \
+        ((PAGE_SIZE << RMAP_HASHTAB_ORDER) / sizeof(struct list_head))
+#define RMAP_USES_HASHTAB(page) \
+        ((page)->sharing->hash_table.flag == NULL)
+#define RMAP_HEAVY_SHARED_PAGE   RMAP_HASHTAB_SIZE
+/* A bit of hysteresis. We don't want to be mutating between list and hash
+ * table constantly. */
+#define RMAP_LIGHT_SHARED_PAGE   (RMAP_HEAVY_SHARED_PAGE >> 2)
+
 #if MEM_SHARING_AUDIT
 
 static struct list_head shr_audit_list;
@@ -72,6 +83,11 @@ static inline void audit_add_list(struct
 /* Removes from the audit list and cleans up the page sharing metadata. */
 static inline void page_sharing_dispose(struct page_info *page)
 {
+    /* Unlikely given our thresholds, but we should be careful. */
+    if ( unlikely(RMAP_USES_HASHTAB(page)) )
+        free_xenheap_pages(page->sharing->hash_table.bucket,
+                           RMAP_HASHTAB_ORDER);
+
     spin_lock(&shr_audit_lock);
     list_del_rcu(&page->sharing->entry);
     spin_unlock(&shr_audit_lock);
@@ -89,6 +105,10 @@ int mem_sharing_audit(void)
 #define audit_add_list(p)  ((void)0)
 static inline void page_sharing_dispose(struct page_info *page)
 {
+    /* Unlikely given our thresholds, but we should be careful. */
+    if ( unlikely(RMAP_USES_HASHTAB(page)) )
+        free_xenheap_pages(page->sharing->hash_table.bucket,
+                           RMAP_HASHTAB_ORDER);
     xfree(page->sharing);
 }
 
@@ -145,6 +165,11 @@ static atomic_t nr_saved_mfns   = ATOMIC
 static atomic_t nr_shared_mfns  = ATOMIC_INIT(0);
 
 /** Reverse map **/
+/* Every shared frame keeps a reverse map (rmap) of <domain, gfn> tuples that
+ * this shared frame backs. For pages with a low degree of sharing, a O(n)
+ * search linked list is good enough. For pages with higher degree of sharing,
+ * we use a hash table instead. */
+
 typedef struct gfn_info
 {
     unsigned long gfn;
@@ -155,20 +180,109 @@ typedef struct gfn_info
 
 static inline void rmap_init(struct page_info *page)
 {
+    /* We always start off as a doubly linked list. */
     INIT_LIST_HEAD(&page->sharing->gfns);
 }
 
+/* Exceedingly simple "hash function" */
+#define HASH(domain, gfn)       \
+    (((gfn) + (domain)) % RMAP_HASHTAB_SIZE)
+
+/* Conversions. Tuned by the thresholds. Should only happen twice
+ * (once each) during the lifetime of a shared page */
+static inline int
+rmap_list_to_hash_table(struct page_info *page)
+{
+    unsigned int i;
+    struct list_head *pos, *tmp, *b =
+        alloc_xenheap_pages(RMAP_HASHTAB_ORDER, 0);
+
+    if ( b == NULL )
+        return -ENOMEM;
+
+    for (i = 0; i < RMAP_HASHTAB_SIZE; i++)
+        INIT_LIST_HEAD(b + i);
+
+    list_for_each_safe(pos, tmp, &page->sharing->gfns)
+    {
+        gfn_info_t *gfn_info = list_entry(pos, gfn_info_t, list);
+        struct list_head *bucket = b + HASH(gfn_info->domain, gfn_info->gfn);
+        list_del(pos);
+        list_add(pos, bucket);
+    }
+
+    page->sharing->hash_table.bucket = b;
+    page->sharing->hash_table.flag   = NULL;
+
+    return 0;
+}
+
 static inline void
-rmap_del(gfn_info_t *gfn_info, struct page_info *page)
+rmap_hash_table_to_list(struct page_info *page)
 {
+    unsigned int i;
+    struct list_head *bucket = page->sharing->hash_table.bucket;
+
+    INIT_LIST_HEAD(&page->sharing->gfns);
+
+    for (i = 0; i < RMAP_HASHTAB_SIZE; i++)
+    {
+        struct list_head *pos, *tmp, *head = bucket + i;
+        list_for_each_safe(pos, tmp, head)
+        {
+            list_del(pos);
+            list_add(pos, &page->sharing->gfns);
+        }
+    }
+
+    free_xenheap_pages(bucket, RMAP_HASHTAB_ORDER);
+}
+
+/* Generic accessors to the rmap */
+static inline unsigned long
+rmap_count(struct page_info *pg)
+{
+    unsigned long count;
+    unsigned long t = read_atomic(&pg->u.inuse.type_info);
+    count = t & PGT_count_mask;
+    if ( t & PGT_locked )
+        count--;
+    return count;
+}
+
+/* The page type count is always decreased after removing from the rmap.
+ * Use a convert flag to avoid mutating the rmap if in the middle of an
+ * iterator, or if the page will be soon destroyed anyways. */
+static inline void
+rmap_del(gfn_info_t *gfn_info, struct page_info *page, int convert)
+{
+    if ( RMAP_USES_HASHTAB(page) && convert &&
+         (rmap_count(page) <= RMAP_LIGHT_SHARED_PAGE) )
+        rmap_hash_table_to_list(page);
+
+    /* Regardless of rmap type, same removal operation */
     list_del(&gfn_info->list);
 }
 
+/* The page type count is always increased before adding to the rmap. */
 static inline void
 rmap_add(gfn_info_t *gfn_info, struct page_info *page)
 {
+    struct list_head *head;
+
+    if ( !RMAP_USES_HASHTAB(page) &&
+         (rmap_count(page) >= RMAP_HEAVY_SHARED_PAGE) )
+        /* The conversion may fail with ENOMEM. We'll be less efficient,
+         * but no reason to panic. */
+        (void)rmap_list_to_hash_table(page);
+
+    head = (RMAP_USES_HASHTAB(page)) ?
+        page->sharing->hash_table.bucket +
+                        HASH(gfn_info->domain, gfn_info->gfn) :
+        &page->sharing->gfns;
+
     INIT_LIST_HEAD(&gfn_info->list);
-    list_add(&gfn_info->list, &page->sharing->gfns);
+    list_add(&gfn_info->list, head);
 }
 
 static inline gfn_info_t *
@@ -176,27 +290,33 @@ rmap_retrieve(uint16_t domain_id, unsign
                             struct page_info *page)
 {
     gfn_info_t *gfn_info;
-    struct list_head *le;
-    list_for_each(le, &page->sharing->gfns)
+    struct list_head *le, *head;
+
+    head = (RMAP_USES_HASHTAB(page)) ?
+            page->sharing->hash_table.bucket + HASH(domain_id, gfn) :
+            &page->sharing->gfns;
+
+    list_for_each(le, head)
     {
         gfn_info = list_entry(le, gfn_info_t, list);
         if ( (gfn_info->gfn == gfn) && (gfn_info->domain == domain_id) )
             return gfn_info;
     }
+
+    /* Nothing was found */
     return NULL;
 }
 
 /* Returns true if the rmap has only one entry. O(1) complexity. */
 static inline int rmap_has_one_entry(struct page_info *page)
 {
-    struct list_head *head = &page->sharing->gfns;
-    return (head->next != head) && (head->next->next == head);
+    return (rmap_count(page) == 1);
 }
 
 /* Returns true if the rmap has any entries. O(1) complexity. */
 static inline int rmap_has_entries(struct page_info *page)
 {
-    return (!list_empty(&page->sharing->gfns));
+    return (rmap_count(page) != 0);
 }
 
 /* The iterator hides the details of how the rmap is implemented. This
@@ -204,20 +324,43 @@ static inline int rmap_has_entries(struc
 struct rmap_iterator {
     struct list_head *curr;
     struct list_head *next;
+    unsigned int bucket;
 };
 
 static inline void
 rmap_seed_iterator(struct page_info *page, struct rmap_iterator *ri)
 {
-    ri->curr = &page->sharing->gfns;
+    ri->curr = (RMAP_USES_HASHTAB(page)) ?
+                page->sharing->hash_table.bucket :
+                &page->sharing->gfns;
     ri->next = ri->curr->next;
+    ri->bucket = 0;
 }
 
 static inline gfn_info_t *
 rmap_iterate(struct page_info *page, struct rmap_iterator *ri)
 {
-    if ( ri->next == &page->sharing->gfns)
-        return NULL;
+    struct list_head *head = (RMAP_USES_HASHTAB(page)) ?
+                page->sharing->hash_table.bucket + ri->bucket :
+                &page->sharing->gfns;
+
+retry:
+    if ( ri->next == head)
+    {
+        if ( RMAP_USES_HASHTAB(page) )
+        {
+            ri->bucket++;
+            if ( ri->bucket >= RMAP_HASHTAB_SIZE )
+                /* No more hash table buckets */
+                return NULL;
+            head = page->sharing->hash_table.bucket + ri->bucket;
+            ri->curr = head;
+            ri->next = ri->curr->next;
+            goto retry;
+        } else
+            /* List exhausted */
+            return NULL;
+    }
 
     ri->curr = ri->next;
     ri->next = ri->curr->next;
@@ -253,7 +396,7 @@ static inline void mem_sharing_gfn_destr
     atomic_dec(&d->shr_pages);
 
     /* Free the gfn_info structure. */
-    rmap_del(gfn_info, page);
+    rmap_del(gfn_info, page, 1);
     xfree(gfn_info);
 }
 
@@ -870,8 +1013,9 @@ int mem_sharing_share_pages(struct domai
     /* Get the source page and type, this should never fail:
      * we are under shr lock, and got a successful lookup */
     BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page));
-    /* Move the gfn_info from client list to source list */
-    rmap_del(gfn, cpage);
+    /* Move the gfn_info from client list to source list.
+     * Don't change the type of rmap for the client page. */
+    rmap_del(gfn, cpage, 0);
     rmap_add(gfn, spage);
     put_page_and_type(cpage);
     d = get_domain_by_id(gfn->domain);
diff -r 1730bff8fccf -r 7b606c043208 xen/include/asm-x86/mem_sharing.h
--- a/xen/include/asm-x86/mem_sharing.h
+++ b/xen/include/asm-x86/mem_sharing.h
@@ -30,6 +30,13 @@
 
 typedef uint64_t shr_handle_t;
 
+typedef struct rmap_hashtab {
+    struct list_head *bucket;
+    /* Overlaps with prev pointer of list_head in union below.
+     * Unlike the prev pointer, this can be NULL. */
+    void *flag;
+} rmap_hashtab_t;
+
 struct page_sharing_info
 {
     struct page_info *pg;   /* Back pointer to the page. */
@@ -38,7 +45,11 @@ struct page_sharing_info
     struct list_head entry; /* List of all shared pages (entry). */
     struct rcu_head rcu_head; /* List of all shared pages (entry). */
 #endif
-    struct list_head gfns;  /* List of domains and gfns for this page (head). */
+    /* Reverse map of <domain, gfn> tuples for this shared frame. */
+    union {
+        struct list_head gfns;
+        rmap_hashtab_t hash_table;
+    };
 };
 
 #ifdef __x86_64__
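
Note (illustrative, not part of the patch): the stand-alone user-space sketch
below works through the bucket sizing and conversion thresholds implied by the
defines above, assuming x86 4KiB pages and a 16-byte struct list_head (two
8-byte pointers on a 64-bit build). The program, its PAGE_SIZE and
LIST_HEAD_SIZE macros, and the printf output are assumptions for illustration
only; only the RMAP_* and HASH() definitions mirror the patch.

/* Illustrative sketch only -- NOT part of the patch or of Xen.
 * Assumptions: 4096-byte pages and a struct list_head of two pointers. */
#include <stdio.h>

#define PAGE_SIZE               4096UL                  /* assumed */
#define LIST_HEAD_SIZE          (2 * sizeof(void *))    /* assumed */

#define RMAP_HASHTAB_ORDER      0
#define RMAP_HASHTAB_SIZE       \
        ((PAGE_SIZE << RMAP_HASHTAB_ORDER) / LIST_HEAD_SIZE)
#define RMAP_HEAVY_SHARED_PAGE  RMAP_HASHTAB_SIZE
#define RMAP_LIGHT_SHARED_PAGE  (RMAP_HEAVY_SHARED_PAGE >> 2)

/* Same bucket selection as the patch's HASH() macro. */
#define HASH(domain, gfn)       (((gfn) + (domain)) % RMAP_HASHTAB_SIZE)

int main(void)
{
    /* One order-0 page of buckets: 4096 / 16 = 256. */
    printf("buckets per rmap hash table: %lu\n",
           (unsigned long)RMAP_HASHTAB_SIZE);
    /* rmap_add() converts list -> hash table at this many entries... */
    printf("list -> hash table at >= %lu entries\n",
           (unsigned long)RMAP_HEAVY_SHARED_PAGE);
    /* ...and rmap_del() converts back only once the count drops here. */
    printf("hash table -> list at <= %lu entries\n",
           (unsigned long)RMAP_LIGHT_SHARED_PAGE);
    /* Bucket chosen for an example <domain 1, gfn 0x1234> tuple. */
    printf("HASH(1, 0x1234) = %lu\n",
           (unsigned long)HASH(1UL, 0x1234UL));
    return 0;
}

The >> 2 leaves a 4x gap between the two thresholds (256 up, 64 down under the
assumed sizes), which is the hysteresis the defines mention: a page hovering
around the conversion point does not bounce between the list and hash-table
representations on every add/del.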