All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chao Peng <chao.p.peng@linux.intel.com>
To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org, linux-doc@vger.kernel.org,
	qemu-devel@nongnu.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
	Jonathan Corbet <corbet@lwn.net>,
	Sean Christopherson <seanjc@google.com>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	x86@kernel.org, "H . Peter Anvin" <hpa@zytor.com>,
	Hugh Dickins <hughd@google.com>, Jeff Layton <jlayton@kernel.org>,
	"J . Bruce Fields" <bfields@fieldses.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mike Rapoport <rppt@kernel.org>,
	Steven Price <steven.price@arm.com>,
	"Maciej S . Szmigiero" <mail@maciej.szmigiero.name>,
	Vlastimil Babka <vbabka@suse.cz>,
	Vishal Annapurve <vannapurve@google.com>,
	Yu Zhang <yu.c.zhang@linux.intel.com>,
	Chao Peng <chao.p.peng@linux.intel.com>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	luto@kernel.org, jun.nakajima@intel.com, dave.hansen@intel.com,
	ak@linux.intel.com, david@redhat.com, aarcange@redhat.com,
	ddutile@redhat.com, dhildenb@redhat.com,
	Quentin Perret <qperret@google.com>,
	Michael Roth <michael.roth@amd.com>,
	mhocko@suse.com
Subject: [PATCH v6 1/8] mm: Introduce memfile_notifier
Date: Thu, 19 May 2022 23:37:06 +0800	[thread overview]
Message-ID: <20220519153713.819591-2-chao.p.peng@linux.intel.com> (raw)
In-Reply-To: <20220519153713.819591-1-chao.p.peng@linux.intel.com>

This patch introduces memfile_notifier facility so existing memory file
subsystems (e.g. tmpfs/hugetlbfs) can provide memory pages to allow a
third kernel component to make use of memory bookmarked in the memory
file and gets notified when the pages in the memory file become
allocated/invalidated.

It will be used for KVM to use a file descriptor as the guest memory
backing store and KVM will use this memfile_notifier interface to
interact with memory file subsystems. In the future there might be other
consumers (e.g. VFIO with encrypted device memory).

It consists below components:
 - memfile_backing_store: Each supported memory file subsystem can be
   implemented as a memory backing store which bookmarks memory and
   provides callbacks for other kernel systems (memfile_notifier
   consumers) to interact with.
 - memfile_notifier: memfile_notifier consumers defines callbacks and
   associate them to a file using memfile_register_notifier().
 - memfile_node: A memfile_node is associated with the file (inode) from
   the backing store and includes feature flags and a list of registered
   memfile_notifier for notifying.

Userspace is in charge of guest memory lifecycle: it first allocates
pages in memory backing store and then passes the fd to KVM and lets KVM
register memory slot to memory backing store via memfile_register_notifier.

Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/linux/memfile_notifier.h |  99 ++++++++++++++++++++++
 mm/Kconfig                       |   4 +
 mm/Makefile                      |   1 +
 mm/memfile_notifier.c            | 137 +++++++++++++++++++++++++++++++
 4 files changed, 241 insertions(+)
 create mode 100644 include/linux/memfile_notifier.h
 create mode 100644 mm/memfile_notifier.c

diff --git a/include/linux/memfile_notifier.h b/include/linux/memfile_notifier.h
new file mode 100644
index 000000000000..dcb3ee6ed626
--- /dev/null
+++ b/include/linux/memfile_notifier.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMFILE_NOTIFIER_H
+#define _LINUX_MEMFILE_NOTIFIER_H
+
+#include <linux/pfn_t.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+#include <linux/fs.h>
+
+
+#define MEMFILE_F_USER_INACCESSIBLE	BIT(0)	/* memory allocated in the file is inaccessible from userspace (e.g. read/write/mmap) */
+#define MEMFILE_F_UNMOVABLE		BIT(1)	/* memory allocated in the file is unmovable (e.g. via pagemigration)*/
+#define MEMFILE_F_UNRECLAIMABLE		BIT(2)	/* memory allocated in the file is unreclaimable (e.g. via kswapd) */
+
+#define MEMFILE_F_ALLOWED_MASK		(MEMFILE_F_USER_INACCESSIBLE | \
+					MEMFILE_F_UNMOVABLE | \
+					MEMFILE_F_UNRECLAIMABLE)
+
+struct memfile_node {
+	struct list_head	notifiers;	/* registered memfile_notifier list on the file */
+	unsigned long		flags;		/* MEMFILE_F_* flags */
+};
+
+struct memfile_backing_store {
+	struct list_head list;
+	spinlock_t lock;
+	struct memfile_node* (*lookup_memfile_node)(struct file *file);
+	int (*get_lock_pfn)(struct file *file, pgoff_t offset, pfn_t *pfn,
+			    int *order);
+	void (*put_unlock_pfn)(pfn_t pfn);
+};
+
+struct memfile_notifier;
+struct memfile_notifier_ops {
+	void (*populate)(struct memfile_notifier *notifier,
+			 pgoff_t start, pgoff_t end);
+	void (*invalidate)(struct memfile_notifier *notifier,
+			   pgoff_t start, pgoff_t end);
+};
+
+struct memfile_notifier {
+	struct list_head list;
+	struct memfile_notifier_ops *ops;
+	struct memfile_backing_store *bs;
+};
+
+static inline void memfile_node_init(struct memfile_node *node)
+{
+	INIT_LIST_HEAD(&node->notifiers);
+	node->flags = 0;
+}
+
+#ifdef CONFIG_MEMFILE_NOTIFIER
+/* APIs for backing stores */
+extern void memfile_register_backing_store(struct memfile_backing_store *bs);
+extern int memfile_node_set_flags(struct file *file, unsigned long flags);
+extern void memfile_notifier_populate(struct memfile_node *node,
+				      pgoff_t start, pgoff_t end);
+extern void memfile_notifier_invalidate(struct memfile_node *node,
+					pgoff_t start, pgoff_t end);
+/*APIs for notifier consumers */
+extern int memfile_register_notifier(struct file *file, unsigned long flags,
+				     struct memfile_notifier *notifier);
+extern void memfile_unregister_notifier(struct memfile_notifier *notifier);
+
+#else /* !CONFIG_MEMFILE_NOTIFIER */
+static void memfile_register_backing_store(struct memfile_backing_store *bs)
+{
+}
+
+static int memfile_node_set_flags(struct file *file, unsigned long flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static void memfile_notifier_populate(struct memfile_node *node,
+				      pgoff_t start, pgoff_t end)
+{
+}
+
+static void memfile_notifier_invalidate(struct memfile_node *node,
+					pgoff_t start, pgoff_t end)
+{
+}
+
+static int memfile_register_notifier(struct file *file, flags,
+				     struct memfile_notifier *notifier)
+{
+	return -EOPNOTSUPP;
+}
+
+static void memfile_unregister_notifier(struct memfile_notifier *notifier)
+{
+}
+
+#endif /* CONFIG_MEMFILE_NOTIFIER */
+
+#endif /* _LINUX_MEMFILE_NOTIFIER_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 034d87953600..e551e99cd42a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -909,6 +909,10 @@ config ANON_VMA_NAME
 	  area from being merged with adjacent virtual memory areas due to the
 	  difference in their name.
 
+config MEMFILE_NOTIFIER
+	bool
+	select SRCU
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 4cc13f3179a5..261a5cb315f9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
 obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_MEMFILE_NOTIFIER) += memfile_notifier.o
diff --git a/mm/memfile_notifier.c b/mm/memfile_notifier.c
new file mode 100644
index 000000000000..ab9461cb874e
--- /dev/null
+++ b/mm/memfile_notifier.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  linux/mm/memfile_notifier.c
+ *
+ *  Copyright (C) 2022  Intel Corporation.
+ *             Chao Peng <chao.p.peng@linux.intel.com>
+ */
+
+#include <linux/memfile_notifier.h>
+#include <linux/pagemap.h>
+#include <linux/srcu.h>
+
+DEFINE_STATIC_SRCU(memfile_srcu);
+static __ro_after_init LIST_HEAD(backing_store_list);
+
+void memfile_notifier_populate(struct memfile_node *node,
+			       pgoff_t start, pgoff_t end)
+{
+	struct memfile_notifier *notifier;
+	int id;
+
+	id = srcu_read_lock(&memfile_srcu);
+	list_for_each_entry_srcu(notifier, &node->notifiers, list,
+				 srcu_read_lock_held(&memfile_srcu)) {
+		if (notifier->ops->populate)
+			notifier->ops->populate(notifier, start, end);
+	}
+	srcu_read_unlock(&memfile_srcu, id);
+}
+
+void memfile_notifier_invalidate(struct memfile_node *node,
+				 pgoff_t start, pgoff_t end)
+{
+	struct memfile_notifier *notifier;
+	int id;
+
+	id = srcu_read_lock(&memfile_srcu);
+	list_for_each_entry_srcu(notifier, &node->notifiers, list,
+				 srcu_read_lock_held(&memfile_srcu)) {
+		if (notifier->ops->invalidate)
+			notifier->ops->invalidate(notifier, start, end);
+	}
+	srcu_read_unlock(&memfile_srcu, id);
+}
+
+void __init memfile_register_backing_store(struct memfile_backing_store *bs)
+{
+	spin_lock_init(&bs->lock);
+	list_add_tail(&bs->list, &backing_store_list);
+}
+
+static void memfile_node_update_flags(struct file *file, unsigned long flags)
+{
+	struct address_space *mapping = file_inode(file)->i_mapping;
+	gfp_t gfp;
+
+	gfp = mapping_gfp_mask(mapping);
+	if (flags & MEMFILE_F_UNMOVABLE)
+		gfp &= ~__GFP_MOVABLE;
+	else
+		gfp |= __GFP_MOVABLE;
+	mapping_set_gfp_mask(mapping, gfp);
+
+	if (flags & MEMFILE_F_UNRECLAIMABLE)
+		mapping_set_unevictable(mapping);
+	else
+		mapping_clear_unevictable(mapping);
+}
+
+int memfile_node_set_flags(struct file *file, unsigned long flags)
+{
+	struct memfile_backing_store *bs;
+	struct memfile_node *node;
+
+	if (flags & ~MEMFILE_F_ALLOWED_MASK)
+		return -EINVAL;
+
+	list_for_each_entry(bs, &backing_store_list, list) {
+		node = bs->lookup_memfile_node(file);
+		if (node) {
+			spin_lock(&bs->lock);
+			node->flags = flags;
+			spin_unlock(&bs->lock);
+			memfile_node_update_flags(file, flags);
+			return 0;
+		}
+	}
+
+	return -EOPNOTSUPP;
+}
+
+int memfile_register_notifier(struct file *file, unsigned long flags,
+			      struct memfile_notifier *notifier)
+{
+	struct memfile_backing_store *bs;
+	struct memfile_node *node;
+	struct list_head *list;
+
+	if (!file || !notifier || !notifier->ops)
+		return -EINVAL;
+	if (flags & ~MEMFILE_F_ALLOWED_MASK)
+		return -EINVAL;
+
+	list_for_each_entry(bs, &backing_store_list, list) {
+		node = bs->lookup_memfile_node(file);
+		if (node) {
+			list = &node->notifiers;
+			notifier->bs = bs;
+
+			spin_lock(&bs->lock);
+			if (list_empty(list))
+				node->flags = flags;
+			else if (node->flags ^ flags) {
+				spin_unlock(&bs->lock);
+				return -EINVAL;
+			}
+
+			list_add_rcu(&notifier->list, list);
+			spin_unlock(&bs->lock);
+			memfile_node_update_flags(file, flags);
+			return 0;
+		}
+	}
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(memfile_register_notifier);
+
+void memfile_unregister_notifier(struct memfile_notifier *notifier)
+{
+	spin_lock(&notifier->bs->lock);
+	list_del_rcu(&notifier->list);
+	spin_unlock(&notifier->bs->lock);
+
+	synchronize_srcu(&memfile_srcu);
+}
+EXPORT_SYMBOL_GPL(memfile_unregister_notifier);
-- 
2.25.1


  reply	other threads:[~2022-05-19 15:43 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-19 15:37 [PATCH v6 0/8] KVM: mm: fd-based approach for supporting KVM guest private memory Chao Peng
2022-05-19 15:37 ` Chao Peng [this message]
2022-05-19 15:37 ` [PATCH v6 2/8] mm/shmem: Support memfile_notifier Chao Peng
2022-05-19 15:37 ` [PATCH v6 3/8] mm/memfd: Introduce MFD_INACCESSIBLE flag Chao Peng
2022-05-31 19:15   ` Vishal Annapurve
2022-06-01 10:17     ` Chao Peng
2022-06-01 12:11       ` Gupta, Pankaj
2022-06-02 10:07         ` Chao Peng
2022-06-14 20:23           ` Sean Christopherson
2022-06-15  8:53             ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 4/8] KVM: Extend the memslot to support fd-based private memory Chao Peng
2022-05-20 17:57   ` Andy Lutomirski
2022-05-20 18:31     ` Sean Christopherson
2022-05-22  4:03       ` Andy Lutomirski
2022-05-23 13:21       ` Chao Peng
2022-05-23 15:22         ` Sean Christopherson
2022-05-30 13:26           ` Chao Peng
2022-06-10 16:14             ` Sean Christopherson
2022-06-14  6:45               ` Chao Peng
2022-06-23 22:59       ` Michael Roth
2022-06-24  8:54         ` Chao Peng
2022-06-24 13:01           ` Michael Roth
2022-06-17 20:52   ` Sean Christopherson
2022-06-17 21:27     ` Sean Christopherson
2022-06-20 14:09       ` Chao Peng
2022-06-20 14:08     ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 5/8] KVM: Add KVM_EXIT_MEMORY_FAULT exit Chao Peng
2022-05-19 15:37 ` [PATCH v6 6/8] KVM: Handle page fault for private memory Chao Peng
2022-06-17 21:30   ` Sean Christopherson
2022-06-20 14:16     ` Chao Peng
2022-08-19  0:40     ` Kirill A. Shutemov
2022-08-25 23:43       ` Sean Christopherson
2022-06-24  3:58   ` Nikunj A. Dadhania
2022-06-24  9:02     ` Chao Peng
2022-06-30 19:14       ` Vishal Annapurve
2022-06-30 22:21         ` Michael Roth
2022-07-01  1:21           ` Xiaoyao Li
2022-07-07 20:08             ` Sean Christopherson
2022-07-08  3:29               ` Xiaoyao Li
2022-07-20 23:08                 ` Vishal Annapurve
2022-07-21  9:45                   ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 7/8] KVM: Enable and expose KVM_MEM_PRIVATE Chao Peng
2022-06-23 22:07   ` Michael Roth
2022-06-24  8:43     ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 8/8] memfd_create.2: Describe MFD_INACCESSIBLE flag Chao Peng
2022-06-06 20:09 ` [PATCH v6 0/8] KVM: mm: fd-based approach for supporting KVM guest private memory Vishal Annapurve
2022-06-07  6:57   ` Chao Peng
2022-06-08  0:55     ` Marc Orr
2022-06-08  2:18       ` Chao Peng
2022-06-08 19:37         ` Vishal Annapurve
2022-06-09 20:29           ` Sean Christopherson
2022-06-14  7:28             ` Chao Peng
2022-06-14 17:37               ` Andy Lutomirski
2022-06-14 19:08                 ` Sean Christopherson
2022-06-14 20:59                   ` Andy Lutomirski
2022-06-15  9:17                     ` Chao Peng
2022-06-15 14:29                       ` Sean Christopherson
2022-06-10  0:11         ` Marc Orr

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220519153713.819591-2-chao.p.peng@linux.intel.com \
    --to=chao.p.peng@linux.intel.com \
    --cc=aarcange@redhat.com \
    --cc=ak@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=bfields@fieldses.org \
    --cc=bp@alien8.de \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@intel.com \
    --cc=david@redhat.com \
    --cc=ddutile@redhat.com \
    --cc=dhildenb@redhat.com \
    --cc=hpa@zytor.com \
    --cc=hughd@google.com \
    --cc=jlayton@kernel.org \
    --cc=jmattson@google.com \
    --cc=joro@8bytes.org \
    --cc=jun.nakajima@intel.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=mail@maciej.szmigiero.name \
    --cc=mhocko@suse.com \
    --cc=michael.roth@amd.com \
    --cc=mingo@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=qperret@google.com \
    --cc=rppt@kernel.org \
    --cc=seanjc@google.com \
    --cc=steven.price@arm.com \
    --cc=tglx@linutronix.de \
    --cc=vannapurve@google.com \
    --cc=vbabka@suse.cz \
    --cc=vkuznets@redhat.com \
    --cc=wanpengli@tencent.com \
    --cc=x86@kernel.org \
    --cc=yu.c.zhang@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.