From: Andrea Arcangeli <andrea@qumranet.com>
To: Avi Kivity <avi@qumranet.com>
Cc: Izik Eidus <izike@qumranet.com>, Andrew Morton <akpm@osdl.org>,
Nick Piggin <npiggin@suse.de>,
kvm-devel@lists.sourceforge.net,
Benjamin Herrenschmidt <benh@kernel.crashing.org>,
steiner@sgi.com, linux-kernel@vger.kernel.org,
linux-mm@kvack.org, daniel.blueman@quadrics.com, holt@sgi.com,
Hugh Dickins <hugh@veritas.com>,
clameter@sgi.com
Subject: Re: [kvm-devel] [PATCH] mmu notifiers #v4
Date: Tue, 22 Jan 2008 21:08:58 +0100 [thread overview]
Message-ID: <20080122200858.GB15848@v2.random> (raw)
In-Reply-To: <20080122144332.GE7331@v2.random>
This last update avoids the need to refresh the young bit in the linux
pte through follow_page and it allows tracking the accessed bits set
by the hardware in the sptes without requiring vmexits in certain
implementations.
KVM side is here:
http://marc.info/?l=kvm-devel&m=120103225508669&w=2
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -46,6 +46,7 @@
__young = ptep_test_and_clear_young(__vma, __address, __ptep); \
if (__young) \
flush_tlb_page(__vma, __address); \
+ __young |= mmu_notifier_age_page((__vma)->vm_mm, __address); \
__young; \
})
#endif
@@ -86,6 +87,7 @@ do { \
pte_t __pte; \
__pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \
flush_tlb_page(__vma, __address); \
+ mmu_notifier(invalidate_page, (__vma)->vm_mm, __address); \
__pte; \
})
#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
+#include <linux/mmu_notifier.h>
#include <asm/page.h>
#include <asm/mmu.h>
@@ -219,6 +220,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+#ifdef CONFIG_MMU_NOTIFIER
+ struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
+#endif
};
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,82 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier;
+
+struct mmu_notifier_ops {
+ void (*release)(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+ int (*age_page)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+ void (*invalidate_page)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+ void (*invalidate_range)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+};
+
+struct mmu_notifier_head {
+ struct hlist_head head;
+ rwlock_t lock;
+};
+
+struct mmu_notifier {
+ struct hlist_node hlist;
+ const struct mmu_notifier_ops *ops;
+};
+
+#include <linux/mm_types.h>
+
+extern void mmu_notifier_register(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+extern void mmu_notifier_unregister(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+extern void mmu_notifier_release(struct mm_struct *mm);
+extern int mmu_notifier_age_page(struct mm_struct *mm,
+ unsigned long address);
+
+static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
+{
+ INIT_HLIST_HEAD(&mnh->head);
+ rwlock_init(&mnh->lock);
+}
+
+#define mmu_notifier(function, mm, args...) \
+ do { \
+ struct mmu_notifier *__mn; \
+ struct hlist_node *__n; \
+ \
+ if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
+ read_lock(&(mm)->mmu_notifier.lock); \
+ hlist_for_each_entry(__mn, __n, \
+ &(mm)->mmu_notifier.head, \
+ hlist) \
+ if (__mn->ops->function) \
+ __mn->ops->function(__mn, \
+ mm, \
+ args); \
+ read_unlock(&(mm)->mmu_notifier.lock); \
+ } \
+ } while (0)
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+#define mmu_notifier_register(mn, mm) do {} while(0)
+#define mmu_notifier_unregister(mn, mm) do {} while (0)
+#define mmu_notifier_release(mm) do {} while (0)
+#define mmu_notifier_age_page(mm, address) ({ 0; })
+#define mmu_notifier_head_init(mmh) do {} while (0)
+
+#define mmu_notifier(function, mm, args...) \
+ do { } while (0)
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -359,6 +359,7 @@ static struct mm_struct * mm_init(struct
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
+ mmu_notifier_head_init(&mm->mmu_notifier);
return mm;
}
free_mm(mm);
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,3 +193,7 @@ config VIRT_TO_BUS
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+ def_bool y
+ bool "MMU notifier, for paging KVM/RDMA"
diff --git a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,4 +30,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -753,6 +753,7 @@ void __unmap_hugepage_range(struct vm_ar
}
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
+ mmu_notifier(invalidate_range, mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
list_del(&page->lru);
put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -889,6 +889,7 @@ unsigned long zap_page_range(struct vm_a
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
if (tlb)
tlb_finish_mmu(tlb, address, end);
+ mmu_notifier(invalidate_range, mm, address, end);
return end;
}
@@ -1317,7 +1318,7 @@ int remap_pfn_range(struct vm_area_struc
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + PAGE_ALIGN(size);
+ unsigned long start = addr, end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
int err;
@@ -1358,6 +1359,7 @@ int remap_pfn_range(struct vm_area_struc
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier(invalidate_range, mm, start, end);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1441,7 +1443,7 @@ int apply_to_page_range(struct mm_struct
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + size;
+ unsigned long start = addr, end = addr + size;
int err;
BUG_ON(addr >= end);
@@ -1452,6 +1454,7 @@ int apply_to_page_range(struct mm_struct
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier(invalidate_range, mm, start, end);
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1747,6 +1747,7 @@ static void unmap_region(struct mm_struc
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
+ mmu_notifier(invalidate_range, mm, start, end);
}
/*
@@ -2043,6 +2044,7 @@ void exit_mmap(struct mm_struct *mm)
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
+ mmu_notifier_release(mm);
/*
* Walk the list again, actually closing and freeing it,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,68 @@
+/*
+ * linux/mm/mmu_notifier.c
+ *
+ * Copyright (C) 2008 Qumranet, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+
+/*
+ * Detach every notifier still registered on @mm, invoking its ->release
+ * callback when one is provided.
+ *
+ * The list is modified here, so the lock is taken for writing: with only
+ * the read lock held, other CPUs walking the list under read_lock()
+ * (the mmu_notifier() macro, mmu_notifier_age_page()) could traverse
+ * nodes while they are being unlinked.
+ *
+ * Each notifier is unlinked *before* ->release is called so that this
+ * function never touches @mn after the callback returns, and
+ * hlist_del_init() is used so the node is left unhashed rather than
+ * poisoned.
+ *
+ * ->release runs with the list lock held, so it must not call
+ * mmu_notifier_register()/mmu_notifier_unregister() on the same mm.
+ */
+void mmu_notifier_release(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n, *tmp;
+
+	/* Common case: nobody registered, avoid touching the lock. */
+	if (likely(hlist_empty(&mm->mmu_notifier.head)))
+		return;
+
+	write_lock(&mm->mmu_notifier.lock);
+	hlist_for_each_entry_safe(mn, n, tmp,
+				  &mm->mmu_notifier.head, hlist) {
+		hlist_del_init(&mn->hlist);
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+	}
+	write_unlock(&mm->mmu_notifier.lock);
+}
+
+/*
+ * Ask every notifier registered on @mm to age the mapping at @address and
+ * return the OR of their answers (0 when no notifier is registered or
+ * none implements ->age_page).
+ *
+ * If no young bitflag is supported by the hardware, ->age_page can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ *
+ * The list is only read here, so the plain (non-_safe) iterator is used,
+ * the same way the mmu_notifier() macro walks the list.
+ */
+int mmu_notifier_age_page(struct mm_struct *mm, unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int young = 0;
+
+	/* Common case: nobody registered, avoid touching the lock. */
+	if (likely(hlist_empty(&mm->mmu_notifier.head)))
+		return 0;
+
+	read_lock(&mm->mmu_notifier.lock);
+	hlist_for_each_entry(mn, n, &mm->mmu_notifier.head, hlist) {
+		if (mn->ops->age_page)
+			young |= mn->ops->age_page(mn, mm, address);
+	}
+	read_unlock(&mm->mmu_notifier.lock);
+
+	return young;
+}
+
+/*
+ * Insert @mn at the head of @mm's notifier list. The list lock is held
+ * for writing around the insertion.
+ */
+void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ write_lock(&mm->mmu_notifier.lock);
+ hlist_add_head(&mn->hlist, &mm->mmu_notifier.head);
+ write_unlock(&mm->mmu_notifier.lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+/*
+ * Remove @mn from @mm's notifier list, holding the list lock for writing.
+ *
+ * The node is unlinked with hlist_del_init() and only when it is still
+ * hashed, so unregistering the same notifier a second time is a no-op
+ * instead of dereferencing the poison pointers a plain hlist_del()
+ * leaves behind.
+ */
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	write_lock(&mm->mmu_notifier.lock);
+	if (!hlist_unhashed(&mn->hlist))
+		hlist_del_init(&mn->hlist);
+	write_unlock(&mm->mmu_notifier.lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
next prev parent reply other threads:[~2008-01-22 20:09 UTC|newest]
Thread overview: 67+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-01-13 16:24 [PATCH] mmu notifiers #v2 Andrea Arcangeli
2008-01-13 21:11 ` Benjamin Herrenschmidt
2008-01-14 20:02 ` Christoph Lameter
2008-01-15 4:28 ` Benjamin Herrenschmidt
2008-01-15 12:44 ` Andrea Arcangeli
2008-01-15 20:18 ` Benjamin Herrenschmidt
2008-01-16 1:06 ` Andrea Arcangeli
2008-01-16 9:01 ` Brice Goglin
2008-01-16 10:19 ` Andrea Arcangeli
2008-01-16 17:42 ` Rik van Riel
2008-01-16 17:48 ` Izik Eidus
2008-01-17 16:23 ` Andrea Arcangeli
2008-01-17 18:21 ` Izik Eidus
2008-01-17 19:32 ` Andrea Arcangeli
2008-01-21 12:52 ` [PATCH] mmu notifiers #v3 Andrea Arcangeli
2008-01-22 2:21 ` Rik van Riel
2008-01-22 14:12 ` [kvm-devel] " Avi Kivity
2008-01-22 14:43 ` Andrea Arcangeli
2008-01-22 20:08 ` Andrea Arcangeli [this message]
2008-01-22 20:34 ` [kvm-devel] [PATCH] export notifier #1 Christoph Lameter
2008-01-22 22:31 ` Andrea Arcangeli
2008-01-22 22:53 ` Christoph Lameter
2008-01-23 10:27 ` Avi Kivity
2008-01-23 10:52 ` Robin Holt
2008-01-23 12:04 ` Andrea Arcangeli
2008-01-23 12:34 ` Robin Holt
2008-01-23 19:48 ` Christoph Lameter
2008-01-23 19:58 ` Robin Holt
2008-01-23 19:47 ` Christoph Lameter
2008-01-24 5:56 ` Avi Kivity
2008-01-24 12:26 ` Andrea Arcangeli
2008-01-24 12:34 ` Avi Kivity
2008-01-23 11:41 ` Andrea Arcangeli
2008-01-23 12:32 ` Robin Holt
2008-01-23 17:33 ` Andrea Arcangeli
2008-01-23 20:27 ` Christoph Lameter
2008-01-24 15:42 ` Andrea Arcangeli
2008-01-24 20:07 ` Christoph Lameter
2008-01-25 6:35 ` Avi Kivity
2008-01-23 20:18 ` Christoph Lameter
2008-01-24 14:34 ` Andrea Arcangeli
2008-01-24 14:41 ` Andrea Arcangeli
2008-01-24 15:15 ` Avi Kivity
2008-01-24 15:18 ` Avi Kivity
2008-01-24 20:01 ` Christoph Lameter
2008-01-22 23:36 ` Benjamin Herrenschmidt
2008-01-23 0:40 ` Christoph Lameter
2008-01-23 1:21 ` Robin Holt
2008-01-23 12:51 ` Gerd Hoffmann
2008-01-23 13:19 ` Robin Holt
2008-01-23 14:12 ` Gerd Hoffmann
2008-01-23 14:18 ` Robin Holt
2008-01-23 14:35 ` Gerd Hoffmann
2008-01-23 15:48 ` Robin Holt
2008-01-23 14:17 ` Avi Kivity
2008-01-24 4:03 ` Benjamin Herrenschmidt
2008-01-23 15:41 ` Andrea Arcangeli
2008-01-23 17:47 ` Gerd Hoffmann
2008-01-24 6:01 ` Avi Kivity
2008-01-24 6:45 ` Jeremy Fitzhardinge
2008-01-23 20:40 ` Christoph Lameter
2008-01-24 2:00 ` Enhance mmu notifiers to accomplish a lockless implementation (incomplete) Robin Holt
2008-01-24 4:05 ` Robin Holt
2008-01-22 19:28 ` [PATCH] mmu notifiers #v3 Peter Zijlstra
2008-01-22 20:31 ` Christoph Lameter
2008-01-22 20:31 ` Andrea Arcangeli
2008-01-22 22:10 ` Hugh Dickins
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080122200858.GB15848@v2.random \
--to=andrea@qumranet.com \
--cc=akpm@osdl.org \
--cc=avi@qumranet.com \
--cc=benh@kernel.crashing.org \
--cc=clameter@sgi.com \
--cc=daniel.blueman@quadrics.com \
--cc=holt@sgi.com \
--cc=hugh@veritas.com \
--cc=izike@qumranet.com \
--cc=kvm-devel@lists.sourceforge.net \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=npiggin@suse.de \
--cc=steiner@sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).