Linux-api Archive on lore.kernel.org
 help / color / Atom feed
From: Mike Rapoport <rppt@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andy Lutomirski <luto@kernel.org>, Arnd Bergmann <arnd@arndb.de>,
	Borislav Petkov <bp@alien8.de>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Christopher Lameter <cl@linux.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Elena Reshetova <elena.reshetova@intel.com>,
	"H. Peter Anvin" <hpa@zytor.com>, Idan Yaniv <idan.yaniv@ibm.com>,
	Ingo Molnar <mingo@redhat.com>,
	James Bottomley <jejb@linux.ibm.com>,
	"Kirill A. Shutemov" <kirill@shutemov.name>,
	Matthew Wilcox <willy@infradead.org>,
	Mike Rapoport <rppt@linux.ibm.com>,
	Mike Rapoport <rppt@kernel.org>,
	Michael Kerrisk <mtk.manpages@gmail.com>,
	Palmer Dabbelt <palmer@dabbelt.com>,
	Paul Walmsley <paul.walmsley@sifive.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Tycho Andersen <tycho@tycho.ws>, Will Deacon <will@kernel.org>,
	linux-api@vger.kernel.org, linux-arch@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-nvdimm@lists.01.org, linux-riscv@lists.infradead.org,
	x86@kernel.org
Subject: [PATCH v2 5/7] mm: secretmem: use PMD-size pages to amortize direct map fragmentation
Date: Mon, 27 Jul 2020 19:29:33 +0300
Message-ID: <20200727162935.31714-6-rppt@kernel.org> (raw)
In-Reply-To: <20200727162935.31714-1-rppt@kernel.org>

From: Mike Rapoport <rppt@linux.ibm.com>

Removing a PAGE_SIZE page from the direct map every time such page is
allocated for a secret memory mapping will cause severe fragmentation of
the direct map. This fragmentation can be reduced by using PMD-size pages
as a pool for small pages for secret memory mappings.

Add a gen_pool per secretmem inode and lazily populate this pool with
PMD-size pages.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 mm/secretmem.c | 107 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 88 insertions(+), 19 deletions(-)

diff --git a/mm/secretmem.c b/mm/secretmem.c
index 9d29f3e1c49d..da609701e10e 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -6,6 +6,7 @@
 #include <linux/bitops.h>
 #include <linux/printk.h>
 #include <linux/pagemap.h>
+#include <linux/genalloc.h>
 #include <linux/syscalls.h>
 #include <linux/pseudo_fs.h>
 #include <linux/set_memory.h>
@@ -25,24 +26,66 @@
 #define SECRETMEM_FLAGS_MASK	SECRETMEM_MODE_MASK
 
 struct secretmem_ctx {
+	struct gen_pool *pool;
 	unsigned int mode;
 };
 
-static struct page *secretmem_alloc_page(gfp_t gfp)
+static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
 {
-	/*
-	 * FIXME: use a cache of large pages to reduce the direct map
-	 * fragmentation
-	 */
-	return alloc_page(gfp);
+	unsigned long nr_pages = (1 << PMD_PAGE_ORDER);
+	struct gen_pool *pool = ctx->pool;
+	unsigned long addr;
+	struct page *page;
+	int err;
+
+	page = alloc_pages(gfp, PMD_PAGE_ORDER);
+	if (!page)
+		return -ENOMEM;
+
+	addr = (unsigned long)page_address(page);
+	split_page(page, PMD_PAGE_ORDER);
+
+	err = gen_pool_add(pool, addr, PMD_SIZE, NUMA_NO_NODE);
+	if (err) {
+		__free_pages(page, PMD_PAGE_ORDER);
+		return err;
+	}
+
+	__kernel_map_pages(page, nr_pages, 0);
+
+	return 0;
+}
+
+static struct page *secretmem_alloc_page(struct secretmem_ctx *ctx,
+					 gfp_t gfp)
+{
+	struct gen_pool *pool = ctx->pool;
+	unsigned long addr;
+	struct page *page;
+	int err;
+
+	if (gen_pool_avail(pool) < PAGE_SIZE) {
+		err = secretmem_pool_increase(ctx, gfp);
+		if (err)
+			return NULL;
+	}
+
+	addr = gen_pool_alloc(pool, PAGE_SIZE);
+	if (!addr)
+		return NULL;
+
+	page = virt_to_page(addr);
+	get_page(page);
+
+	return page;
 }
 
 static vm_fault_t secretmem_fault(struct vm_fault *vmf)
 {
+	struct secretmem_ctx *ctx = vmf->vma->vm_file->private_data;
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	pgoff_t offset = vmf->pgoff;
-	unsigned long addr;
 	struct page *page;
 	int ret = 0;
 
@@ -51,7 +94,7 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
 
 	page = find_get_entry(mapping, offset);
 	if (!page) {
-		page = secretmem_alloc_page(vmf->gfp_mask);
+		page = secretmem_alloc_page(ctx, vmf->gfp_mask);
 		if (!page)
 			return vmf_error(-ENOMEM);
 
@@ -59,14 +102,8 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
 		if (unlikely(ret))
 			goto err_put_page;
 
-		ret = set_direct_map_invalid_noflush(page);
-		if (ret)
-			goto err_del_page_cache;
-
-		addr = (unsigned long)page_address(page);
-		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-
 		__SetPageUptodate(page);
+		set_page_private(page, (unsigned long)ctx);
 
 		ret = VM_FAULT_LOCKED;
 	}
@@ -74,8 +111,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
 	vmf->page = page;
 	return ret;
 
-err_del_page_cache:
-	delete_from_page_cache(page);
 err_put_page:
 	put_page(page);
 	return vmf_error(ret);
@@ -134,7 +169,11 @@ static int secretmem_migratepage(struct address_space *mapping,
 
 static void secretmem_freepage(struct page *page)
 {
-	set_direct_map_default_noflush(page);
+	unsigned long addr = (unsigned long)page_address(page);
+	struct secretmem_ctx *ctx = (struct secretmem_ctx *)page_private(page);
+	struct gen_pool *pool = ctx->pool;
+
+	gen_pool_free(pool, addr, PAGE_SIZE);
 }
 
 static const struct address_space_operations secretmem_aops = {
@@ -159,13 +198,18 @@ static struct file *secretmem_file_create(unsigned long flags)
 	if (!ctx)
 		goto err_free_inode;
 
+	ctx->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
+	if (!ctx->pool)
+		goto err_free_ctx;
+
 	file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
 				 O_RDWR, &secretmem_fops);
 	if (IS_ERR(file))
-		goto err_free_ctx;
+		goto err_free_pool;
 
 	mapping_set_unevictable(inode->i_mapping);
 
+	inode->i_private = ctx;
 	inode->i_mapping->private_data = ctx;
 	inode->i_mapping->a_ops = &secretmem_aops;
 
@@ -179,6 +223,8 @@ static struct file *secretmem_file_create(unsigned long flags)
 
 	return file;
 
+err_free_pool:
+	gen_pool_destroy(ctx->pool);
 err_free_ctx:
 	kfree(ctx);
 err_free_inode:
@@ -223,11 +269,34 @@ SYSCALL_DEFINE1(memfd_secret, unsigned long, flags)
 	return err;
 }
 
+static void secretmem_cleanup_chunk(struct gen_pool *pool,
+				    struct gen_pool_chunk *chunk, void *data)
+{
+	unsigned long start = chunk->start_addr;
+	unsigned long end = chunk->end_addr;
+	unsigned long nr_pages, addr;
+
+	nr_pages = (end - start + 1) / PAGE_SIZE;
+	__kernel_map_pages(virt_to_page(start), nr_pages, 1);
+
+	for (addr = start; addr < end; addr += PAGE_SIZE)
+		put_page(virt_to_page(addr));
+}
+
+static void secretmem_cleanup_pool(struct secretmem_ctx *ctx)
+{
+	struct gen_pool *pool = ctx->pool;
+
+	gen_pool_for_each_chunk(pool, secretmem_cleanup_chunk, ctx);
+	gen_pool_destroy(pool);
+}
+
 static void secretmem_evict_inode(struct inode *inode)
 {
 	struct secretmem_ctx *ctx = inode->i_private;
 
 	truncate_inode_pages_final(&inode->i_data);
+	secretmem_cleanup_pool(ctx);
 	clear_inode(inode);
 	kfree(ctx);
 }
-- 
2.26.2


  parent reply index

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-27 16:29 [PATCH v2 0/7] mm: introduce memfd_secret system call to create "secret" memory areas Mike Rapoport
2020-07-27 16:29 ` [PATCH v2 1/7] mm: add definition of PMD_PAGE_ORDER Mike Rapoport
2020-07-27 16:29 ` [PATCH v2 2/7] mmap: make mlock_future_check() global Mike Rapoport
2020-07-27 16:29 ` [PATCH v2 3/7] mm: introduce memfd_secret system call to create "secret" memory areas Mike Rapoport
2020-07-30 16:22   ` Catalin Marinas
2020-07-30 20:44     ` Mike Rapoport
2020-07-31 14:10       ` Catalin Marinas
2020-07-31 14:29     ` Mark Rutland
2020-07-31 16:22       ` Catalin Marinas
2020-07-27 16:29 ` [PATCH v2 4/7] arch, mm: wire up memfd_secret system call were relevant Mike Rapoport
2020-07-27 17:25   ` Arnd Bergmann
2020-07-27 16:29 ` Mike Rapoport [this message]
2020-07-27 16:29 ` [PATCH v2 6/7] mm: secretmem: add ability to reserve memory at boot Mike Rapoport
2020-07-27 16:29 ` [PATCH v2 7/7] " Mike Rapoport
2020-07-27 17:11   ` Mike Rapoport

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200727162935.31714-6-rppt@kernel.org \
    --to=rppt@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=arnd@arndb.de \
    --cc=bp@alien8.de \
    --cc=catalin.marinas@arm.com \
    --cc=cl@linux.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=elena.reshetova@intel.com \
    --cc=hpa@zytor.com \
    --cc=idan.yaniv@ibm.com \
    --cc=jejb@linux.ibm.com \
    --cc=kirill@shutemov.name \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=linux-riscv@lists.infradead.org \
    --cc=luto@kernel.org \
    --cc=mingo@redhat.com \
    --cc=mtk.manpages@gmail.com \
    --cc=palmer@dabbelt.com \
    --cc=paul.walmsley@sifive.com \
    --cc=peterz@infradead.org \
    --cc=rppt@linux.ibm.com \
    --cc=tglx@linutronix.de \
    --cc=tycho@tycho.ws \
    --cc=viro@zeniv.linux.org.uk \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-api Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-api/0 linux-api/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-api linux-api/ https://lore.kernel.org/linux-api \
		linux-api@vger.kernel.org
	public-inbox-index linux-api

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-api


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git