All of lore.kernel.org
 help / color / mirror / Atom feed
From: ira.weiny@intel.com
To: lsf-pc@lists.linux-foundation.org
Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, "Dan Williams" <dan.j.williams@intel.com>,
	"Jan Kara" <jack@suse.cz>, "Jérôme Glisse" <jglisse@redhat.com>,
	"John Hubbard" <jhubbard@nvidia.com>,
	"Michal Hocko" <mhocko@suse.com>,
	"Ira Weiny" <ira.weiny@intel.com>
Subject: [RFC PATCH 05/10] mm/gup: Take FL_LONGTERM lease if not set by user
Date: Sun, 28 Apr 2019 21:53:54 -0700	[thread overview]
Message-ID: <20190429045359.8923-6-ira.weiny@intel.com> (raw)
In-Reply-To: <20190429045359.8923-1-ira.weiny@intel.com>

From: Ira Weiny <ira.weiny@intel.com>

If a user has failed to take a F_LONGTERM lease on a file and they
do a longterm pin on the pages associated with a file, take a
FL_LONGTERM lease for them.

If the user has not taken a lease on the file they are trying to pin
create a FL_LONGTERM lease and attach it to the inode associated with
the memory being pinned.

If the user has already taken a lease ref count the lease such that it
will not be removed until all the GUP pins have been removed.  This
prevents the user from removing the GUP lease and tricking the kernel
into thinking the memory is free.

Follow on patches will send a SIGBUS if the user does not remove their
GUP pins and the FS needs the pages in question.  This should only
happen if they have not planned the use of the file correctly and are
allowing other processes to truncate/hold punch a file they are actively
trying to access.

This is similar to what would happen if the memory was accessed through
a regular CPU instruction with a couple of exceptions.

1) The SIGBUS is sent when the memory becomes invalid rather than
   waiting for an access by the process.  This is because we don't know
   when the device may try to access the page.  So we assume that the
   page gets "accessed immediately."

2) Hole punch is treated like a truncate.  As such SIGBUS is sent rather
   than attempting to allocate file space as a normal CPU access would.
---
 fs/locks.c         | 179 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h |   4 +
 mm/gup.c           |   7 +-
 mm/huge_memory.c   |   6 +-
 4 files changed, 187 insertions(+), 9 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 31c8b761a578..ae508d192223 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -343,8 +343,10 @@ struct file_lock *locks_alloc_lock(void)
 {
 	struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
 
-	if (fl)
+	if (fl) {
 		locks_init_lock_heads(fl);
+		kref_init(&fl->gup_ref);
+	}
 
 	return fl;
 }
@@ -607,6 +609,14 @@ static const struct lock_manager_operations lease_manager_ops = {
 	.lm_setup = lease_setup,
 };
 
+static int lease_modify_longterm(struct file_lock *fl, int arg,
+				 struct list_head *dispose);
+static const struct lock_manager_operations lease_longterm_ops = {
+	.lm_break = lease_break_callback,
+	.lm_change = lease_modify_longterm,
+	.lm_setup = lease_setup,
+};
+
 /*
  * Initialize a lease, use the default lock manager operations
  */
@@ -621,12 +631,15 @@ static int lease_init(struct file *filp, long type, unsigned int flags,
 
 	fl->fl_file = filp;
 	fl->fl_flags = FL_LEASE;
-	if (flags & FL_LONGTERM)
+	if (flags & FL_LONGTERM) {
 		fl->fl_flags |= FL_LONGTERM;
+		fl->fl_lmops = &lease_longterm_ops;
+	} else {
+		fl->fl_lmops = &lease_manager_ops;
+	}
 	fl->fl_start = 0;
 	fl->fl_end = OFFSET_MAX;
 	fl->fl_ops = NULL;
-	fl->fl_lmops = &lease_manager_ops;
 	return 0;
 }
 
@@ -1506,6 +1519,55 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
 }
 EXPORT_SYMBOL(lease_modify);
 
+static void release_longterm_lease(struct kref *kref)
+{
+	struct file_lock *fl = container_of(kref, struct file_lock, gup_ref);
+
+	locks_delete_lock_ctx(fl, NULL);
+}
+
+/*
+ * LONGTERM leases are special in that they may be held by the GUP code and
+ * therefore can't be modified in the same way as regular file leases.
+ *
+ * Specifically the lease is refcounted by GUP based on the number of pages are
+ * which want to hold the lease.
+ */
+static int lease_modify_longterm(struct file_lock *fl, int arg,
+				 struct list_head *dispose)
+{
+	int error = assign_type(fl, arg);
+
+	if (error)
+		return error;
+	lease_clear_pending(fl, arg);
+	locks_wake_up_blocks(fl);
+
+	if (arg == F_UNLCK) {
+		struct file *filp = fl->fl_file;
+
+		/*
+		 * Users who take the longterm lease get a reference to it.
+		 * This modify will remove that reference if it exists.  But
+		 * only that reference.  This means that the GUP code must exit
+		 * before the LONGTERM lease will be fully removed.
+		 */
+		if (filp) {
+			f_delown(filp);
+			filp->f_owner.signum = 0;
+
+			fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
+			if (fl->fl_fasync != NULL) {
+				printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
+				fl->fl_fasync = NULL;
+			}
+
+			kref_put(&fl->gup_ref, release_longterm_lease);
+		}
+	}
+	return 0;
+}
+
 static bool past_time(unsigned long then)
 {
 	if (!then)
@@ -1794,6 +1856,33 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
 	return ret;
 }
 
+/*
+ * Note the locks could eventually be optimized to lock over smaller areas
+ * of the file.  But for now we do this per inode.
+ *
+ * The rational is due to the most common use case where we don't expect users
+ * to to be removing any of the pages of the file while it is being used by the
+ * longterm pin.  Should the user want to alter the file in this way they will
+ * be required to release the pins alter the file and restablish the pins.
+ *
+ * inode->i_flctx->flc_lock must be held.
+ */
+static struct file_lock *find_longterm_lease(struct inode *inode)
+{
+	struct file_lock *ret = NULL;
+	struct file_lock *fl;
+
+	list_for_each_entry(fl, &inode->i_flctx->flc_lease, fl_list) {
+		if (fl->fl_flags & FL_LONGTERM &&
+		    fl->fl_pid == current->tgid) {
+			ret = fl;
+			break;
+		}
+	}
+
+	return ret;
+}
+
 static int
 generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
 {
@@ -2986,3 +3075,87 @@ bool mapping_inode_has_longterm(struct page *page)
 }
 EXPORT_SYMBOL_GPL(mapping_inode_has_longterm);
 
+/*
+ * if the user has not already taken a longterm lease on a devmap FS page do it
+ * for them.
+ *
+ * Heavily borrowed frem the NFS code.
+ */
+bool page_set_longterm_lease(struct page *page)
+{
+	struct file_lock_context *ctx;
+	struct inode *inode;
+	struct file_lock *new_fl, *existing_fl;
+
+	/*
+	 * We should never be here unless we are a "page cache" page
+	 * And we are a devm managed page
+	 */
+	if (WARN_ON(!page) ||
+	    WARN_ON(PageAnon(page)) ||
+	    WARN_ON(!page->mapping) ||
+	    WARN_ON(!page->mapping->host) ||
+	    WARN_ON(!page_is_devmap_managed(page)))
+		return false;
+
+	new_fl = lease_alloc(NULL, F_RDLCK, FL_LONGTERM);
+	if (IS_ERR(new_fl))
+		return false;
+
+	/* Ensure page->mapping isn't freed while we look at it */
+	/* No locking needed...  mm sem is held. */
+	inode = page->mapping->host;
+
+	ctx = locks_get_lock_context(inode, F_RDLCK);
+	percpu_down_read(&file_rwsem);
+	spin_lock(&ctx->flc_lock);
+
+	existing_fl = find_longterm_lease(inode);
+	if (!existing_fl) {
+		existing_fl = new_fl;
+		locks_insert_lock_ctx(new_fl, &ctx->flc_lease);
+	} else {
+		kref_get(&existing_fl->gup_ref);
+	}
+
+	spin_unlock(&ctx->flc_lock);
+	percpu_up_read(&file_rwsem);
+
+	if (existing_fl != new_fl)
+		locks_free_lock(new_fl);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(page_set_longterm_lease);
+
+void page_remove_longterm_lease(struct page *page)
+{
+	struct file_lock_context *ctx;
+	struct inode *inode;
+	struct file_lock *found;
+
+	/*
+	 * We should never be here unless we are a "page cache" page
+	 * And we are a devm managed page
+	 */
+	if (WARN_ON(!page) ||
+	    WARN_ON(PageAnon(page)) ||
+	    WARN_ON(!page->mapping) ||
+	    WARN_ON(!page->mapping->host) ||
+	    WARN_ON(!page_is_devmap_managed(page)))
+		return;
+
+	inode = page->mapping->host;
+
+	ctx = locks_get_lock_context(inode, F_RDLCK);
+
+	found = NULL;
+	percpu_down_read(&file_rwsem);
+	spin_lock(&ctx->flc_lock);
+	found = find_longterm_lease(inode);
+	if (found)
+		kref_put(&found->gup_ref, release_longterm_lease);
+	spin_unlock(&ctx->flc_lock);
+	percpu_up_read(&file_rwsem);
+}
+EXPORT_SYMBOL_GPL(page_remove_longterm_lease);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ace21c6feb19..be2d08080aa5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -40,6 +40,7 @@
 #include <linux/fs_types.h>
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
+#include <linux/kref.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1093,6 +1094,7 @@ struct file_lock {
 			int state;		/* state of grant or error if -ve */
 		} afs;
 	} fl_u;
+	struct kref gup_ref;
 } __randomize_layout;
 
 struct file_lock_context {
@@ -1152,6 +1154,8 @@ extern int lease_modify(struct file_lock *, int, struct list_head *);
 struct files_struct;
 extern void show_fd_locks(struct seq_file *f,
 			 struct file *filp, struct files_struct *files);
+bool page_set_longterm_lease(struct page *page);
+void page_remove_longterm_lease(struct page *page);
 #else /* !CONFIG_FILE_LOCKING */
 static inline int fcntl_getlk(struct file *file, unsigned int cmd,
 			      struct flock __user *user)
diff --git a/mm/gup.c b/mm/gup.c
index 5ae1dd31a58d..1ee17f2339f7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -51,6 +51,9 @@ void put_user_page(struct page *page)
 {
 	page = compound_head(page);
 
+	if (page_is_devmap_managed(page))
+		page_remove_longterm_lease(page);
+
 	/*
 	 * For devmap managed pages we need to catch refcount transition from
 	 * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
@@ -294,7 +297,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 			goto no_page;
 
 		if (unlikely(flags & FOLL_LONGTERM) &&
-		    !mapping_inode_has_longterm(page)) {
+		    !page_set_longterm_lease(page)) {
 			page = ERR_PTR(-EINVAL);
 			goto out;
 		}
@@ -1877,7 +1880,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		pages[*nr] = page;
 
 		if (unlikely(flags & FOLL_LONGTERM) &&
-		    !mapping_inode_has_longterm(page)) {
+		    !page_set_longterm_lease(page)) {
 			undo_dev_pagemap(nr, nr_start, pages);
 			return 0;
 		}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8819624c740f..6a8c039fe6ff 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -915,9 +915,8 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 	// FIXME combine logic
 	if (unlikely(flags & FOLL_LONGTERM)) {
 		WARN_ON_ONCE(PageAnon(page));
-		if (!mapping_inode_has_longterm(page)) {
+		if (!page_set_longterm_lease(page))
 			return NULL;
-		}
 	}
 
 	get_page(page);
@@ -1065,9 +1064,8 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 	// FIXME combine logic remove Warn
 	if (unlikely(flags & FOLL_LONGTERM)) {
 		WARN_ON_ONCE(PageAnon(page));
-		if (!mapping_inode_has_longterm(page)) {
+		if (!page_set_longterm_lease(page))
 			return NULL;
-		}
 	}
 
 	get_page(page);
-- 
2.20.1


  parent reply	other threads:[~2019-04-29  4:54 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-29  4:53 [RFC PATCH 00/10] RDMA/FS DAX "LONGTERM" lease proposal ira.weiny
2019-04-29  4:53 ` [RFC PATCH 01/10] fs/locks: Add trace_leases_conflict ira.weiny
2019-04-29  4:53 ` [RFC PATCH 02/10] fs/locks: Introduce FL_LONGTERM file lease ira.weiny
2019-04-29  4:53 ` [RFC PATCH 03/10] mm/gup: Pass flags down to __gup_device_huge* calls ira.weiny
2019-04-29  4:53 ` [RFC PATCH 04/10] WIP: mm/gup: Ensure F_LONGTERM lease is held on GUP pages ira.weiny
2019-04-29  4:53 ` ira.weiny [this message]
2019-04-29  4:53 ` [RFC PATCH 06/10] fs/locks: Add longterm lease traces ira.weiny
2019-04-29  4:53 ` [RFC PATCH 07/10] fs/dax: Create function dax_mapping_is_dax() ira.weiny
2019-04-29  4:53 ` [RFC PATCH 08/10] mm/gup: fs: Send SIGBUS on truncate of active file ira.weiny
2019-04-29  4:53 ` [RFC PATCH 09/10] fs/locks: Add tracepoint for SIGBUS on LONGTERM expiration ira.weiny
2019-04-29  4:53 ` [RFC PATCH 10/10] mm/gup: Remove FOLL_LONGTERM DAX exclusion ira.weiny

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190429045359.8923-6-ira.weiny@intel.com \
    --to=ira.weiny@intel.com \
    --cc=dan.j.williams@intel.com \
    --cc=jack@suse.cz \
    --cc=jglisse@redhat.com \
    --cc=jhubbard@nvidia.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lsf-pc@lists.linux-foundation.org \
    --cc=mhocko@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.