linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Chao Peng <chao.p.peng@linux.intel.com>
To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org, linux-doc@vger.kernel.org,
	qemu-devel@nongnu.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
	Jonathan Corbet <corbet@lwn.net>,
	Sean Christopherson <seanjc@google.com>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	x86@kernel.org, "H . Peter Anvin" <hpa@zytor.com>,
	Hugh Dickins <hughd@google.com>, Jeff Layton <jlayton@kernel.org>,
	"J . Bruce Fields" <bfields@fieldses.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mike Rapoport <rppt@kernel.org>,
	Steven Price <steven.price@arm.com>,
	"Maciej S . Szmigiero" <mail@maciej.szmigiero.name>,
	Vlastimil Babka <vbabka@suse.cz>,
	Vishal Annapurve <vannapurve@google.com>,
	Yu Zhang <yu.c.zhang@linux.intel.com>,
	Chao Peng <chao.p.peng@linux.intel.com>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	luto@kernel.org, jun.nakajima@intel.com, dave.hansen@intel.com,
	ak@linux.intel.com, david@redhat.com, aarcange@redhat.com,
	ddutile@redhat.com, dhildenb@redhat.com,
	Quentin Perret <qperret@google.com>,
	Michael Roth <michael.roth@amd.com>,
	mhocko@suse.com
Subject: [PATCH v6 1/8] mm: Introduce memfile_notifier
Date: Thu, 19 May 2022 23:37:06 +0800	[thread overview]
Message-ID: <20220519153713.819591-2-chao.p.peng@linux.intel.com> (raw)
In-Reply-To: <20220519153713.819591-1-chao.p.peng@linux.intel.com>

This patch introduces the memfile_notifier facility so that existing
memory file subsystems (e.g. tmpfs/hugetlbfs) can provide memory pages
to a third kernel component, which can then make use of the memory
bookmarked in the memory file and get notified when pages in the memory
file are allocated/invalidated.

It will be used by KVM, which can then use a file descriptor as the
guest memory backing store and interact with the memory file subsystems
through this memfile_notifier interface. In the future there might be
other consumers (e.g. VFIO with encrypted device memory).

It consists of the following components:
 - memfile_backing_store: Each supported memory file subsystem can be
   implemented as a memory backing store which bookmarks memory and
   provides callbacks for other kernel systems (memfile_notifier
   consumers) to interact with.
 - memfile_notifier: memfile_notifier consumers define callbacks and
   associate them with a file using memfile_register_notifier().
 - memfile_node: A memfile_node is associated with the file (inode) from
   the backing store and includes feature flags and a list of registered
   memfile_notifier for notifying.

Userspace is in charge of the guest memory lifecycle: it first allocates
pages in the memory backing store, then passes the fd to KVM and lets KVM
register the memory slot with the memory backing store via
memfile_register_notifier().

Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/linux/memfile_notifier.h |  99 ++++++++++++++++++++++
 mm/Kconfig                       |   4 +
 mm/Makefile                      |   1 +
 mm/memfile_notifier.c            | 137 +++++++++++++++++++++++++++++++
 4 files changed, 241 insertions(+)
 create mode 100644 include/linux/memfile_notifier.h
 create mode 100644 mm/memfile_notifier.c

diff --git a/include/linux/memfile_notifier.h b/include/linux/memfile_notifier.h
new file mode 100644
index 000000000000..dcb3ee6ed626
--- /dev/null
+++ b/include/linux/memfile_notifier.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMFILE_NOTIFIER_H
+#define _LINUX_MEMFILE_NOTIFIER_H
+
+#include <linux/pfn_t.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+#include <linux/fs.h>
+
+
+#define MEMFILE_F_USER_INACCESSIBLE	BIT(0)	/* memory allocated in the file is inaccessible from userspace (e.g. read/write/mmap) */
+#define MEMFILE_F_UNMOVABLE		BIT(1)	/* memory allocated in the file is unmovable (e.g. via page migration) */
+#define MEMFILE_F_UNRECLAIMABLE		BIT(2)	/* memory allocated in the file is unreclaimable (e.g. via kswapd) */
+
+#define MEMFILE_F_ALLOWED_MASK		(MEMFILE_F_USER_INACCESSIBLE | \
+					MEMFILE_F_UNMOVABLE | \
+					MEMFILE_F_UNRECLAIMABLE)
+
+struct memfile_node {
+	struct list_head	notifiers;	/* registered memfile_notifier list on the file */
+	unsigned long		flags;		/* MEMFILE_F_* flags */
+};
+
+struct memfile_backing_store {
+	struct list_head list;
+	spinlock_t lock;
+	struct memfile_node* (*lookup_memfile_node)(struct file *file);
+	int (*get_lock_pfn)(struct file *file, pgoff_t offset, pfn_t *pfn,
+			    int *order);
+	void (*put_unlock_pfn)(pfn_t pfn);
+};
+
+struct memfile_notifier;
+struct memfile_notifier_ops {
+	void (*populate)(struct memfile_notifier *notifier,
+			 pgoff_t start, pgoff_t end);
+	void (*invalidate)(struct memfile_notifier *notifier,
+			   pgoff_t start, pgoff_t end);
+};
+
+struct memfile_notifier {
+	struct list_head list;
+	struct memfile_notifier_ops *ops;
+	struct memfile_backing_store *bs;
+};
+
+static inline void memfile_node_init(struct memfile_node *node)
+{
+	INIT_LIST_HEAD(&node->notifiers);
+	node->flags = 0;
+}
+
+#ifdef CONFIG_MEMFILE_NOTIFIER
+/* APIs for backing stores */
+extern void memfile_register_backing_store(struct memfile_backing_store *bs);
+extern int memfile_node_set_flags(struct file *file, unsigned long flags);
+extern void memfile_notifier_populate(struct memfile_node *node,
+				      pgoff_t start, pgoff_t end);
+extern void memfile_notifier_invalidate(struct memfile_node *node,
+					pgoff_t start, pgoff_t end);
+/* APIs for notifier consumers */
+extern int memfile_register_notifier(struct file *file, unsigned long flags,
+				     struct memfile_notifier *notifier);
+extern void memfile_unregister_notifier(struct memfile_notifier *notifier);
+
+#else /* !CONFIG_MEMFILE_NOTIFIER */
+static inline void memfile_register_backing_store(struct memfile_backing_store *bs)
+{
+}
+
+static inline int memfile_node_set_flags(struct file *file, unsigned long flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void memfile_notifier_populate(struct memfile_node *node,
+					     pgoff_t start, pgoff_t end)
+{
+}
+
+static inline void memfile_notifier_invalidate(struct memfile_node *node,
+					       pgoff_t start, pgoff_t end)
+{
+}
+
+static inline int memfile_register_notifier(struct file *file,
+		unsigned long flags, struct memfile_notifier *notifier)
+{
+	return -EOPNOTSUPP;	/* stub: CONFIG_MEMFILE_NOTIFIER is disabled */
+}
+
+static inline void memfile_unregister_notifier(struct memfile_notifier *notifier)
+{
+}
+
+#endif /* CONFIG_MEMFILE_NOTIFIER */
+
+#endif /* _LINUX_MEMFILE_NOTIFIER_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 034d87953600..e551e99cd42a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -909,6 +909,10 @@ config ANON_VMA_NAME
 	  area from being merged with adjacent virtual memory areas due to the
 	  difference in their name.
 
+config MEMFILE_NOTIFIER
+	bool
+	select SRCU
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 4cc13f3179a5..261a5cb315f9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
 obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_MEMFILE_NOTIFIER) += memfile_notifier.o
diff --git a/mm/memfile_notifier.c b/mm/memfile_notifier.c
new file mode 100644
index 000000000000..ab9461cb874e
--- /dev/null
+++ b/mm/memfile_notifier.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  linux/mm/memfile_notifier.c
+ *
+ *  Copyright (C) 2022  Intel Corporation.
+ *             Chao Peng <chao.p.peng@linux.intel.com>
+ */
+
+#include <linux/memfile_notifier.h>
+#include <linux/pagemap.h>
+#include <linux/srcu.h>
+
+DEFINE_STATIC_SRCU(memfile_srcu);
+static __ro_after_init LIST_HEAD(backing_store_list);
+
+void memfile_notifier_populate(struct memfile_node *node,
+			       pgoff_t start, pgoff_t end)
+{
+	struct memfile_notifier *notifier;
+	int id;
+
+	id = srcu_read_lock(&memfile_srcu);
+	list_for_each_entry_srcu(notifier, &node->notifiers, list,
+				 srcu_read_lock_held(&memfile_srcu)) {
+		if (notifier->ops->populate)
+			notifier->ops->populate(notifier, start, end);
+	}
+	srcu_read_unlock(&memfile_srcu, id);
+}
+
+void memfile_notifier_invalidate(struct memfile_node *node,
+				 pgoff_t start, pgoff_t end)
+{
+	struct memfile_notifier *notifier;
+	int id;
+
+	id = srcu_read_lock(&memfile_srcu);
+	list_for_each_entry_srcu(notifier, &node->notifiers, list,
+				 srcu_read_lock_held(&memfile_srcu)) {
+		if (notifier->ops->invalidate)
+			notifier->ops->invalidate(notifier, start, end);
+	}
+	srcu_read_unlock(&memfile_srcu, id);
+}
+
+void __init memfile_register_backing_store(struct memfile_backing_store *bs)
+{
+	spin_lock_init(&bs->lock);
+	list_add_tail(&bs->list, &backing_store_list);
+}
+
+static void memfile_node_update_flags(struct file *file, unsigned long flags)
+{
+	struct address_space *mapping = file_inode(file)->i_mapping;
+	gfp_t gfp;
+
+	gfp = mapping_gfp_mask(mapping);
+	if (flags & MEMFILE_F_UNMOVABLE)
+		gfp &= ~__GFP_MOVABLE;
+	else
+		gfp |= __GFP_MOVABLE;
+	mapping_set_gfp_mask(mapping, gfp);
+
+	if (flags & MEMFILE_F_UNRECLAIMABLE)
+		mapping_set_unevictable(mapping);
+	else
+		mapping_clear_unevictable(mapping);
+}
+
+int memfile_node_set_flags(struct file *file, unsigned long flags)
+{
+	struct memfile_backing_store *bs;
+	struct memfile_node *node;
+
+	if (flags & ~MEMFILE_F_ALLOWED_MASK)
+		return -EINVAL;
+
+	list_for_each_entry(bs, &backing_store_list, list) {
+		node = bs->lookup_memfile_node(file);
+		if (node) {
+			spin_lock(&bs->lock);
+			node->flags = flags;
+			spin_unlock(&bs->lock);
+			memfile_node_update_flags(file, flags);
+			return 0;
+		}
+	}
+
+	return -EOPNOTSUPP;
+}
+
+int memfile_register_notifier(struct file *file, unsigned long flags,
+			      struct memfile_notifier *notifier)
+{
+	struct memfile_backing_store *bs;
+	struct memfile_node *node;
+	struct list_head *list;
+
+	if (!file || !notifier || !notifier->ops)
+		return -EINVAL;
+	if (flags & ~MEMFILE_F_ALLOWED_MASK)
+		return -EINVAL;
+
+	list_for_each_entry(bs, &backing_store_list, list) {
+		node = bs->lookup_memfile_node(file);
+		if (node) {
+			list = &node->notifiers;
+
+			spin_lock(&bs->lock);
+			if (list_empty(list)) {
+				node->flags = flags;
+			} else if (node->flags != flags) {
+				spin_unlock(&bs->lock);
+				return -EINVAL;
+			}
+
+			notifier->bs = bs;	/* set only on success */
+			list_add_rcu(&notifier->list, list);
+			spin_unlock(&bs->lock);
+			memfile_node_update_flags(file, flags);
+			return 0;
+		}
+	}
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(memfile_register_notifier);
+
+void memfile_unregister_notifier(struct memfile_notifier *notifier)
+{
+	spin_lock(&notifier->bs->lock);
+	list_del_rcu(&notifier->list);
+	spin_unlock(&notifier->bs->lock);
+
+	synchronize_srcu(&memfile_srcu);
+}
+EXPORT_SYMBOL_GPL(memfile_unregister_notifier);
-- 
2.25.1


  reply	other threads:[~2022-05-19 15:42 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-19 15:37 [PATCH v6 0/8] KVM: mm: fd-based approach for supporting KVM guest private memory Chao Peng
2022-05-19 15:37 ` Chao Peng [this message]
2022-05-19 15:37 ` [PATCH v6 2/8] mm/shmem: Support memfile_notifier Chao Peng
2022-05-19 15:37 ` [PATCH v6 3/8] mm/memfd: Introduce MFD_INACCESSIBLE flag Chao Peng
2022-05-31 19:15   ` Vishal Annapurve
2022-06-01 10:17     ` Chao Peng
2022-06-01 12:11       ` Gupta, Pankaj
2022-06-02 10:07         ` Chao Peng
2022-06-14 20:23           ` Sean Christopherson
2022-06-15  8:53             ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 4/8] KVM: Extend the memslot to support fd-based private memory Chao Peng
2022-05-20 17:57   ` Andy Lutomirski
2022-05-20 18:31     ` Sean Christopherson
2022-05-22  4:03       ` Andy Lutomirski
2022-05-23 13:21       ` Chao Peng
2022-05-23 15:22         ` Sean Christopherson
2022-05-30 13:26           ` Chao Peng
2022-06-10 16:14             ` Sean Christopherson
2022-06-14  6:45               ` Chao Peng
2022-06-23 22:59       ` Michael Roth
2022-06-24  8:54         ` Chao Peng
2022-06-24 13:01           ` Michael Roth
2022-06-17 20:52   ` Sean Christopherson
2022-06-17 21:27     ` Sean Christopherson
2022-06-20 14:09       ` Chao Peng
2022-06-20 14:08     ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 5/8] KVM: Add KVM_EXIT_MEMORY_FAULT exit Chao Peng
2022-05-19 15:37 ` [PATCH v6 6/8] KVM: Handle page fault for private memory Chao Peng
2022-06-17 21:30   ` Sean Christopherson
2022-06-20 14:16     ` Chao Peng
2022-08-19  0:40     ` Kirill A. Shutemov
2022-08-25 23:43       ` Sean Christopherson
2022-06-24  3:58   ` Nikunj A. Dadhania
2022-06-24  9:02     ` Chao Peng
2022-06-30 19:14       ` Vishal Annapurve
2022-06-30 22:21         ` Michael Roth
2022-07-01  1:21           ` Xiaoyao Li
2022-07-07 20:08             ` Sean Christopherson
2022-07-08  3:29               ` Xiaoyao Li
2022-07-20 23:08                 ` Vishal Annapurve
2022-07-21  9:45                   ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 7/8] KVM: Enable and expose KVM_MEM_PRIVATE Chao Peng
2022-06-23 22:07   ` Michael Roth
2022-06-24  8:43     ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 8/8] memfd_create.2: Describe MFD_INACCESSIBLE flag Chao Peng
2022-06-06 20:09 ` [PATCH v6 0/8] KVM: mm: fd-based approach for supporting KVM guest private memory Vishal Annapurve
2022-06-07  6:57   ` Chao Peng
2022-06-08  0:55     ` Marc Orr
2022-06-08  2:18       ` Chao Peng
2022-06-08 19:37         ` Vishal Annapurve
2022-06-09 20:29           ` Sean Christopherson
2022-06-14  7:28             ` Chao Peng
2022-06-14 17:37               ` Andy Lutomirski
2022-06-14 19:08                 ` Sean Christopherson
2022-06-14 20:59                   ` Andy Lutomirski
2022-06-15  9:17                     ` Chao Peng
2022-06-15 14:29                       ` Sean Christopherson
2022-06-10  0:11         ` Marc Orr

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220519153713.819591-2-chao.p.peng@linux.intel.com \
    --to=chao.p.peng@linux.intel.com \
    --cc=aarcange@redhat.com \
    --cc=ak@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=bfields@fieldses.org \
    --cc=bp@alien8.de \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@intel.com \
    --cc=david@redhat.com \
    --cc=ddutile@redhat.com \
    --cc=dhildenb@redhat.com \
    --cc=hpa@zytor.com \
    --cc=hughd@google.com \
    --cc=jlayton@kernel.org \
    --cc=jmattson@google.com \
    --cc=joro@8bytes.org \
    --cc=jun.nakajima@intel.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=mail@maciej.szmigiero.name \
    --cc=mhocko@suse.com \
    --cc=michael.roth@amd.com \
    --cc=mingo@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=qperret@google.com \
    --cc=rppt@kernel.org \
    --cc=seanjc@google.com \
    --cc=steven.price@arm.com \
    --cc=tglx@linutronix.de \
    --cc=vannapurve@google.com \
    --cc=vbabka@suse.cz \
    --cc=vkuznets@redhat.com \
    --cc=wanpengli@tencent.com \
    --cc=x86@kernel.org \
    --cc=yu.c.zhang@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).