From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: Andrew Morton <akpm@linux-foundation.org>,
Andrea Arcangeli <aarcange@redhat.com>,
linux-mm@kvack.org
Cc: Andi Kleen <ak@linux.intel.com>,
"H. Peter Anvin" <hpa@linux.intel.com>,
linux-kernel@vger.kernel.org,
"Kirill A. Shutemov" <kirill@shutemov.name>,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCH v4 10/10] thp: implement refcounting for huge zero page
Date: Mon, 15 Oct 2012 09:00:59 +0300 [thread overview]
Message-ID: <1350280859-18801-11-git-send-email-kirill.shutemov@linux.intel.com> (raw)
In-Reply-To: <1350280859-18801-1-git-send-email-kirill.shutemov@linux.intel.com>
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
H. Peter Anvin doesn't like huge zero page which sticks in memory forever
after the first allocation. Here's implementation of lockless refcounting
for huge zero page.
We have two basic primitives: {get,put}_huge_zero_page(). They
manipulate reference counter.
If counter is 0, get_huge_zero_page() allocates a new huge page and
takes two references: one for caller and one for shrinker. We free the
page only in shrinker callback if counter is 1 (only shrinker has the
reference).
put_huge_zero_page() only decrements counter. Counter is never zero
in put_huge_zero_page() since shrinker holds on reference.
Freeing huge zero page in shrinker callback helps to avoid frequent
allocate-free.
Refcounting has cost. On 4 socket machine I observe ~1% slowdown on
parallel (40 processes) read page faulting comparing to lazy huge page
allocation. I think it's pretty reasonable for synthetic benchmark.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
mm/huge_memory.c | 111 ++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 87 insertions(+), 24 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8fae26a..a4f2110 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -17,6 +17,7 @@
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
+#include <linux/shrinker.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -46,7 +47,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -159,31 +159,74 @@ static int start_khugepaged(void)
return err;
}
-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
{
- struct page *hpage;
- unsigned long pfn;
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return ACCESS_ONCE(huge_zero_pfn);
- hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
- if (!hpage)
- return -ENOMEM;
- pfn = page_to_pfn(hpage);
- if (cmpxchg(&huge_zero_pfn, 0, pfn))
- __free_page(hpage);
- return 0;
+ if (!zero_page)
+ return 0;
+ preempt_disable();
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ preempt_enable();
+ __free_page(zero_page);
+ goto retry;
+ }
+
+ /* We take additional reference here. It will be put back by shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return ACCESS_ONCE(huge_zero_pfn);
}
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
{
- return huge_zero_pfn && pfn == huge_zero_pfn;
+ /*
+ * Counter should never go to zero here. Only shrinker can put
+ * last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- return is_huge_zero_pfn(pmd_pfn(pmd));
+ if (!sc->nr_to_scan)
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
+ }
+
+ return 0;
}
+static struct shrinker huge_zero_page_shrinker = {
+ .shrink = shrink_huge_zero_page,
+ .seeks = DEFAULT_SEEKS,
+};
+
#ifdef CONFIG_SYSFS
static ssize_t double_flag_show(struct kobject *kobj,
@@ -575,6 +618,8 @@ static int __init hugepage_init(void)
goto out;
}
+ register_shrinker(&huge_zero_page_shrinker);
+
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
@@ -697,10 +742,11 @@ static inline struct page *alloc_hugepage(int defrag)
#endif
static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ unsigned long zero_pfn)
{
pmd_t entry;
- entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
set_pmd_at(mm, haddr, pmd, entry);
@@ -723,15 +769,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE)) {
pgtable_t pgtable;
- if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
+ unsigned long zero_pfn;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
goto out;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
spin_lock(&mm->page_table_lock);
- set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+ set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -800,7 +850,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
if (is_huge_zero_pmd(pmd)) {
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+ unsigned long zero_pfn;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_pfn = get_huge_zero_page();
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_pfn);
ret = 0;
goto out_unlock;
}
@@ -907,6 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1108,8 +1167,10 @@ alloc:
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- if (is_huge_zero_pmd(orig_pmd))
+ if (is_huge_zero_pmd(orig_pmd)) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ put_huge_zero_page();
+ }
if (page) {
VM_BUG_ON(!PageHead(page));
page_remove_rmap(page);
@@ -1187,6 +1248,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
+ put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
page_remove_rmap(page);
@@ -2543,6 +2605,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(vma->vm_mm, pmd, pgtable);
+ put_huge_zero_page();
}
void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
--
1.7.7.6
next prev parent reply other threads:[~2012-10-15 6:00 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-10-15 6:00 [PATCH v4 00/10, REBASED] Introduce huge zero page Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 01/10] thp: huge zero page: basic preparation Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 02/10] thp: zap_huge_pmd(): zap huge zero pmd Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 03/10] thp: copy_huge_pmd(): copy huge zero page Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 04/10] thp: do_huge_pmd_wp_page(): handle " Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 05/10] thp: change_huge_pmd(): keep huge zero page write-protected Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 06/10] thp: change split_huge_page_pmd() interface Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 07/10] thp: implement splitting pmd for huge zero page Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 08/10] thp: setup huge zero page on non-write page fault Kirill A. Shutemov
2012-10-15 6:00 ` [PATCH v4 09/10] thp: lazy huge zero page allocation Kirill A. Shutemov
2012-10-15 6:00 ` Kirill A. Shutemov [this message]
2012-10-18 23:45 ` [PATCH v4 10/10] thp: implement refcounting for huge zero page Andrew Morton
2012-10-18 23:59 ` Kirill A. Shutemov
2012-10-23 6:35 ` Kirill A. Shutemov
2012-10-23 6:43 ` Andrew Morton
2012-10-23 7:00 ` Kirill A. Shutemov
2012-10-23 22:59 ` Andrew Morton
2012-10-23 23:38 ` Kirill A. Shutemov
2012-10-24 19:22 ` Andrew Morton
2012-10-24 19:45 ` Kirill A. Shutemov
2012-10-24 20:25 ` Andrew Morton
2012-10-24 20:33 ` Kirill A. Shutemov
2012-10-24 20:44 ` Andi Kleen
2012-10-25 20:49 ` Kirill A. Shutemov
2012-10-25 21:05 ` Andrew Morton
2012-10-25 21:22 ` Kirill A. Shutemov
2012-10-25 21:37 ` Andrew Morton
2012-10-25 22:10 ` Kirill A. Shutemov
2012-10-16 9:53 ` [PATCH v4 00/10, REBASED] Introduce " Ni zhan Chen
2012-10-16 10:54 ` Kirill A. Shutemov
2012-10-16 11:13 ` Ni zhan Chen
2012-10-16 11:28 ` Kirill A. Shutemov
2012-10-16 11:37 ` Ni zhan Chen
2012-10-26 15:14 ` [PATCH] thp, vmstat: implement HZP_ALLOC and HZP_ALLOC_FAILED events Kirill A. Shutemov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1350280859-18801-11-git-send-email-kirill.shutemov@linux.intel.com \
--to=kirill.shutemov@linux.intel.com \
--cc=aarcange@redhat.com \
--cc=ak@linux.intel.com \
--cc=akpm@linux-foundation.org \
--cc=hpa@linux.intel.com \
--cc=kirill@shutemov.name \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).