All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Herrmann <dh.herrmann@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: Michael Kerrisk <mtk.manpages@gmail.com>,
	Ryan Lortie <desrt@desrt.ca>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org, Greg Kroah-Hartman <greg@kroah.com>,
	john.stultz@linaro.org,
	Lennart Poettering <lennart@poettering.net>,
	Daniel Mack <zonque@gmail.com>, Kay Sievers <kay@vrfy.org>,
	Hugh Dickins <hughd@google.com>,
	Andy Lutomirski <luto@amacapital.net>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	David Herrmann <dh.herrmann@gmail.com>
Subject: [PATCH v4 6/6] shm: wait for pins to be released when sealing
Date: Sun, 20 Jul 2014 19:34:40 +0200	[thread overview]
Message-ID: <1405877680-999-7-git-send-email-dh.herrmann@gmail.com> (raw)
In-Reply-To: <1405877680-999-1-git-send-email-dh.herrmann@gmail.com>

If we set SEAL_WRITE on a file, we must make sure there cannot be any
ongoing write-operations on the file. For write() calls, we simply lock
the inode mutex, for mmap() we simply verify there're no writable
mappings. However, there might be pages pinned by AIO, Direct-IO and
similar operations via GUP. We must make sure those do not write to the
memfd file after we set SEAL_WRITE.

As there is no way to notify GUP users to drop pages or to wait for them
to be done, we implement the wait ourself: When setting SEAL_WRITE, we
check all pages for their ref-count. If it's bigger than 1, we know
there's some user of the page. We then mark the page and wait for up to
150ms for those ref-counts to be dropped. If the ref-counts are not
dropped in time, we refuse the seal operation.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
---
 mm/shmem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 109 insertions(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 770e072..df1aceb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1780,9 +1780,117 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 	return offset;
 }
 
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on shmem.
+ */
+#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN               4       /* about 150ms max */
+
+static void shmem_tag_pins(struct address_space *mapping)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	pgoff_t start;
+	struct page *page;
+
+	lru_add_drain();
+	start = 0;
+	rcu_read_lock();
+
+restart:
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+		page = radix_tree_deref_slot(slot);
+		if (!page || radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto restart;
+		} else if (page_count(page) - page_mapcount(page) > 1) {
+			spin_lock_irq(&mapping->tree_lock);
+			radix_tree_tag_set(&mapping->page_tree, iter.index,
+					   SHMEM_TAG_PINNED);
+			spin_unlock_irq(&mapping->tree_lock);
+		}
+
+		if (need_resched()) {
+			cond_resched_rcu();
+			start = iter.index + 1;
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether it has an elevated ref-count. If so, we tag them and wait for
+ * them to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ */
 static int shmem_wait_for_pins(struct address_space *mapping)
 {
-	return 0;
+	struct radix_tree_iter iter;
+	void **slot;
+	pgoff_t start;
+	struct page *page;
+	int error, scan;
+
+	shmem_tag_pins(mapping);
+
+	error = 0;
+	for (scan = 0; scan <= LAST_SCAN; scan++) {
+		if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
+			break;
+
+		if (!scan)
+			lru_add_drain_all();
+		else if (schedule_timeout_killable((HZ << scan) / 200))
+			scan = LAST_SCAN;
+
+		start = 0;
+		rcu_read_lock();
+restart:
+		radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
+					   start, SHMEM_TAG_PINNED) {
+
+			page = radix_tree_deref_slot(slot);
+			if (radix_tree_exception(page)) {
+				if (radix_tree_deref_retry(page))
+					goto restart;
+
+				page = NULL;
+			}
+
+			if (page &&
+			    page_count(page) - page_mapcount(page) != 1) {
+				if (scan < LAST_SCAN)
+					goto continue_resched;
+
+				/*
+				 * On the last scan, we clean up all those tags
+				 * we inserted; but make a note that we still
+				 * found pages pinned.
+				 */
+				error = -EBUSY;
+			}
+
+			spin_lock_irq(&mapping->tree_lock);
+			radix_tree_tag_clear(&mapping->page_tree,
+					     iter.index, SHMEM_TAG_PINNED);
+			spin_unlock_irq(&mapping->tree_lock);
+continue_resched:
+			if (need_resched()) {
+				cond_resched_rcu();
+				start = iter.index + 1;
+				goto restart;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	return error;
 }
 
 #define F_ALL_SEALS (F_SEAL_SEAL | \
-- 
2.0.2


WARNING: multiple messages have this Message-ID (diff)
From: David Herrmann <dh.herrmann@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: Michael Kerrisk <mtk.manpages@gmail.com>,
	Ryan Lortie <desrt@desrt.ca>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org, Greg Kroah-Hartman <greg@kroah.com>,
	john.stultz@linaro.org,
	Lennart Poettering <lennart@poettering.net>,
	Daniel Mack <zonque@gmail.com>, Kay Sievers <kay@vrfy.org>,
	Hugh Dickins <hughd@google.com>,
	Andy Lutomirski <luto@amacapital.net>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	David Herrmann <dh.herrmann@gmail.com>
Subject: [PATCH v4 6/6] shm: wait for pins to be released when sealing
Date: Sun, 20 Jul 2014 19:34:40 +0200	[thread overview]
Message-ID: <1405877680-999-7-git-send-email-dh.herrmann@gmail.com> (raw)
In-Reply-To: <1405877680-999-1-git-send-email-dh.herrmann@gmail.com>

If we set SEAL_WRITE on a file, we must make sure there cannot be any
ongoing write-operations on the file. For write() calls, we simply lock
the inode mutex, for mmap() we simply verify there're no writable
mappings. However, there might be pages pinned by AIO, Direct-IO and
similar operations via GUP. We must make sure those do not write to the
memfd file after we set SEAL_WRITE.

As there is no way to notify GUP users to drop pages or to wait for them
to be done, we implement the wait ourself: When setting SEAL_WRITE, we
check all pages for their ref-count. If it's bigger than 1, we know
there's some user of the page. We then mark the page and wait for up to
150ms for those ref-counts to be dropped. If the ref-counts are not
dropped in time, we refuse the seal operation.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
---
 mm/shmem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 109 insertions(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 770e072..df1aceb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1780,9 +1780,117 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 	return offset;
 }
 
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on shmem.
+ */
+#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN               4       /* about 150ms max */
+
+static void shmem_tag_pins(struct address_space *mapping)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	pgoff_t start;
+	struct page *page;
+
+	lru_add_drain();
+	start = 0;
+	rcu_read_lock();
+
+restart:
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+		page = radix_tree_deref_slot(slot);
+		if (!page || radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto restart;
+		} else if (page_count(page) - page_mapcount(page) > 1) {
+			spin_lock_irq(&mapping->tree_lock);
+			radix_tree_tag_set(&mapping->page_tree, iter.index,
+					   SHMEM_TAG_PINNED);
+			spin_unlock_irq(&mapping->tree_lock);
+		}
+
+		if (need_resched()) {
+			cond_resched_rcu();
+			start = iter.index + 1;
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether it has an elevated ref-count. If so, we tag them and wait for
+ * them to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ */
 static int shmem_wait_for_pins(struct address_space *mapping)
 {
-	return 0;
+	struct radix_tree_iter iter;
+	void **slot;
+	pgoff_t start;
+	struct page *page;
+	int error, scan;
+
+	shmem_tag_pins(mapping);
+
+	error = 0;
+	for (scan = 0; scan <= LAST_SCAN; scan++) {
+		if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
+			break;
+
+		if (!scan)
+			lru_add_drain_all();
+		else if (schedule_timeout_killable((HZ << scan) / 200))
+			scan = LAST_SCAN;
+
+		start = 0;
+		rcu_read_lock();
+restart:
+		radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
+					   start, SHMEM_TAG_PINNED) {
+
+			page = radix_tree_deref_slot(slot);
+			if (radix_tree_exception(page)) {
+				if (radix_tree_deref_retry(page))
+					goto restart;
+
+				page = NULL;
+			}
+
+			if (page &&
+			    page_count(page) - page_mapcount(page) != 1) {
+				if (scan < LAST_SCAN)
+					goto continue_resched;
+
+				/*
+				 * On the last scan, we clean up all those tags
+				 * we inserted; but make a note that we still
+				 * found pages pinned.
+				 */
+				error = -EBUSY;
+			}
+
+			spin_lock_irq(&mapping->tree_lock);
+			radix_tree_tag_clear(&mapping->page_tree,
+					     iter.index, SHMEM_TAG_PINNED);
+			spin_unlock_irq(&mapping->tree_lock);
+continue_resched:
+			if (need_resched()) {
+				cond_resched_rcu();
+				start = iter.index + 1;
+				goto restart;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	return error;
 }
 
 #define F_ALL_SEALS (F_SEAL_SEAL | \
-- 
2.0.2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2014-07-20 17:35 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-07-20 17:34 [PATCH v4 0/6] File Sealing & memfd_create() David Herrmann
2014-07-20 17:34 ` David Herrmann
2014-07-20 17:34 ` [PATCH v4 1/6] mm: allow drivers to prevent new writable mappings David Herrmann
2014-07-20 17:34   ` David Herrmann
2014-07-24  4:07   ` Hugh Dickins
2014-07-24  4:07     ` Hugh Dickins
2014-07-20 17:34 ` [PATCH v4 2/6] shm: add sealing API David Herrmann
2014-07-20 17:34   ` David Herrmann
2014-07-24  4:11   ` Hugh Dickins
2014-07-24  4:11     ` Hugh Dickins
2014-07-20 17:34 ` [PATCH v4 3/6] shm: add memfd_create() syscall David Herrmann
2014-07-20 17:34   ` David Herrmann
2014-07-24  4:19   ` Hugh Dickins
2014-07-24  4:19     ` Hugh Dickins
2014-07-20 17:34 ` [PATCH v4 4/6] selftests: add memfd_create() + sealing tests David Herrmann
2014-07-20 17:34   ` David Herrmann
2014-07-24  4:20   ` Hugh Dickins
2014-07-24  4:20     ` Hugh Dickins
2014-07-20 17:34 ` [PATCH v4 5/6] selftests: add memfd/sealing page-pinning tests David Herrmann
2014-07-20 17:34   ` David Herrmann
2014-07-24  4:28   ` Hugh Dickins
2014-07-24  4:28     ` Hugh Dickins
2014-07-20 17:34 ` David Herrmann [this message]
2014-07-20 17:34   ` [PATCH v4 6/6] shm: wait for pins to be released when sealing David Herrmann
2014-07-24  4:32   ` Hugh Dickins
2014-07-24  4:32     ` Hugh Dickins
2014-07-24  4:48 ` [PATCH v4 0/6] File Sealing & memfd_create() Hugh Dickins
2014-07-24  4:48   ` Hugh Dickins
2014-07-24 21:47 ` Andrew Morton
2014-07-24 21:47   ` Andrew Morton
2014-07-24 22:44   ` David Herrmann
2014-07-24 22:44     ` David Herrmann

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1405877680-999-7-git-send-email-dh.herrmann@gmail.com \
    --to=dh.herrmann@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=desrt@desrt.ca \
    --cc=greg@kroah.com \
    --cc=hughd@google.com \
    --cc=john.stultz@linaro.org \
    --cc=kay@vrfy.org \
    --cc=lennart@poettering.net \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@amacapital.net \
    --cc=mtk.manpages@gmail.com \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=zonque@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.