linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: darrick.wong@oracle.com
Cc: Jan Kara <jack@suse.cz>,
	linux-nvdimm@lists.01.org, linux-api@vger.kernel.org,
	Dave Chinner <david@fromorbit.com>,
	linux-kernel@vger.kernel.org, linux-xfs@vger.kernel.org,
	linux-mm@kvack.org, Jeff Moyer <jmoyer@redhat.com>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	luto@kernel.org, linux-fsdevel@vger.kernel.org,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	Christoph Hellwig <hch@lst.de>
Subject: [PATCH v4 3/3] fs, xfs: introduce MAP_DIRECT for creating block-map-sealed file ranges
Date: Mon, 14 Aug 2017 23:12:22 -0700	[thread overview]
Message-ID: <150277754211.23945.458876600578531019.stgit@dwillia2-desk3.amr.corp.intel.com> (raw)
In-Reply-To: <150277752553.23945.13932394738552748440.stgit@dwillia2-desk3.amr.corp.intel.com>

MAP_DIRECT is an mmap(2) flag with the following semantics:

  MAP_DIRECT
  In addition to this mapping having MAP_SHARED semantics, successful
  faults in this range may assume that the block map (logical-file-offset
  to physical memory address) is pinned for the lifetime of the mapping.
  Successful MAP_DIRECT faults establish mappings that bypass any kernel
  indirections like the page-cache. All updates are carried directly
  through to the underlying file physical blocks (modulo cpu cache
  effects).

  ETXTBSY is returned on attempts to change the block map (allocate blocks
  / convert unwritten extents / break shared extents) in the mapped range.
  Some filesystems may extend these same restrictions outside the mapped
  range and return ETXTBSY to any file operations that might mutate the
  block map. MAP_DIRECT faults may fail with a SIGSEGV if the filesystem
  needs to write the block map to satisfy the fault. For example, if the
  mapping was established over a hole in a sparse file.

  The kernel ignores attempts to mark a MAP_DIRECT mapping MAP_PRIVATE and
  will silently fall back to MAP_SHARED semantics.

  ERRORS
  EACCES A MAP_DIRECT mapping was requested and PROT_WRITE was not set.

  EINVAL MAP_ANONYMOUS was specified with MAP_DIRECT.

  EOPNOTSUPP The filesystem explicitly does not support the flag

  SIGSEGV Attempted to write a MAP_DIRECT mapping at a file offset that
          might require block-map updates.

Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c                               |    2 +
 fs/xfs/xfs_file.c                      |  109 ++++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.h                     |    1 
 fs/xfs/xfs_super.c                     |    1 
 include/linux/mm_types.h               |    1 
 include/linux/mman.h                   |    2 -
 include/uapi/asm-generic/mman-common.h |    1 
 mm/mmap.c                              |    2 +
 8 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 306c2b603fb8..a654b2dd9016 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1121,6 +1121,8 @@ static int dax_fault_return(int error)
 		return VM_FAULT_NOPAGE;
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM;
+	if (error == -ETXTBSY)
+		return VM_FAULT_SIGSEGV;
 	return VM_FAULT_SIGBUS;
 }
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4893e226fd8..fcdf6d5768aa 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -40,6 +40,7 @@
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
 
+#include <linux/mman.h>
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
@@ -1001,6 +1002,23 @@ xfs_file_llseek(
 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 }
 
+STATIC int
+xfs_vma_checks(
+	struct vm_area_struct	*vma,
+	struct inode		*inode)
+{
+	if ((vma->fs_flags & MAP_DIRECT) != MAP_DIRECT)
+		return 0;
+
+	if (xfs_is_reflink_inode(XFS_I(inode)))
+		return VM_FAULT_SIGSEGV;
+
+	if (!IS_DAX(inode))
+		return VM_FAULT_SIGSEGV;
+
+	return 0;
+}
+
 /*
  * Locking for serialisation of IO during page faults. This results in a lock
  * ordering of:
@@ -1031,6 +1049,10 @@ xfs_filemap_page_mkwrite(
 	file_update_time(vmf->vma->vm_file);
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
+	ret = xfs_vma_checks(vmf->vma, inode);
+	if (ret)
+		goto out_unlock;
+
 	if (IS_DAX(inode)) {
 		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	} else {
@@ -1038,6 +1060,7 @@ xfs_filemap_page_mkwrite(
 		ret = block_page_mkwrite_return(ret);
 	}
 
+out_unlock:
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 
@@ -1058,10 +1081,15 @@ xfs_filemap_fault(
 		return xfs_filemap_page_mkwrite(vmf);
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	ret = xfs_vma_checks(vmf->vma, inode);
+	if (ret)
+		goto out_unlock;
+
 	if (IS_DAX(inode))
 		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	else
 		ret = filemap_fault(vmf);
+out_unlock:
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	return ret;
@@ -1094,7 +1122,9 @@ xfs_filemap_huge_fault(
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
+	ret = xfs_vma_checks(vmf->vma, inode);
+	if (ret == 0)
+		ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -1137,12 +1167,63 @@ xfs_filemap_pfn_mkwrite(
 
 }
 
+STATIC void
+xfs_filemap_open(
+	struct vm_area_struct	*vma)
+{
+	struct file		*filp = vma->vm_file;
+	struct inode		*inode = file_inode(filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	if ((vma->fs_flags & MAP_DIRECT) != MAP_DIRECT)
+		return;
+	atomic_inc(&ip->i_mapdcount);
+}
+
+STATIC int
+atomic_dec_and_xfs_ilock(
+	atomic_t		*atomic,
+	struct xfs_inode	*ip,
+	uint			lock_flags)
+{
+	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
+	if (atomic_add_unless(atomic, -1, 1))
+		return 0;
+
+	/* Otherwise do it the slow way */
+	xfs_ilock(ip, lock_flags);
+	if (atomic_dec_and_test(atomic))
+		return 1;
+	xfs_iunlock(ip, lock_flags);
+	return 0;
+}
+
+STATIC void
+xfs_filemap_close(
+	struct vm_area_struct	*vma)
+{
+	struct file		*filp = vma->vm_file;
+	struct inode		*inode = file_inode(filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	if ((vma->fs_flags & MAP_DIRECT) != MAP_DIRECT)
+		return;
+
+	if (!atomic_dec_and_xfs_ilock(&ip->i_mapdcount, ip,
+				XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL))
+		return;
+	inode->i_flags &= ~S_IOMAP_SEALED;
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+}
+
 static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= xfs_filemap_fault,
 	.huge_fault	= xfs_filemap_huge_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
+	.open		= xfs_filemap_open,
+	.close		= xfs_filemap_close,
 };
 
 STATIC int
@@ -1157,6 +1238,31 @@ xfs_file_mmap(
 	return 0;
 }
 
+#define	XFS_MAP_SUPPORTED (MAP_DIRECT)
+
+STATIC int
+xfs_file_fmmap(
+	struct file		*filp,
+	struct vm_area_struct	*vma,
+	unsigned long		flags)
+{
+	struct inode		*inode = file_inode(filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	if (flags & ~(XFS_MAP_SUPPORTED))
+		return -EOPNOTSUPP;
+
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL|XFS_IOLOCK_EXCL);
+	if ((flags & MAP_DIRECT) == MAP_DIRECT) {
+		vma->fs_flags |= MAP_DIRECT;
+		inode->i_flags |= S_IOMAP_SEALED;
+		atomic_inc(&ip->i_mapdcount);
+	}
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL|XFS_IOLOCK_EXCL);
+
+	return xfs_file_mmap(filp, vma);
+}
+
 const struct file_operations xfs_file_operations = {
 	.llseek		= xfs_file_llseek,
 	.read_iter	= xfs_file_read_iter,
@@ -1168,6 +1274,7 @@ const struct file_operations xfs_file_operations = {
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
 	.mmap		= xfs_file_mmap,
+	.fmmap		= xfs_file_fmmap,
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0ee453de239a..50d3e1bca1a9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -58,6 +58,7 @@ typedef struct xfs_inode {
 	mrlock_t		i_lock;		/* inode lock */
 	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
+	atomic_t		i_mapdcount;	/* inode MAP_DIRECT count */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
 	unsigned long		i_flags;	/* see defined flags below */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 664db709cd1a..2604568354db 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1011,6 +1011,7 @@ xfs_fs_inode_init_once(
 
 	/* xfs inode */
 	atomic_set(&ip->i_pincount, 0);
+	atomic_set(&ip->i_mapdcount, 0);
 	spin_lock_init(&ip->i_flags_lock);
 
 	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ff151814a02d..73fdc0ada9ee 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -306,6 +306,7 @@ struct vm_area_struct {
 	struct mm_struct *vm_mm;	/* The address space we belong to. */
 	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
 	unsigned long vm_flags;		/* Flags, see mm.h. */
+	unsigned long fs_flags;		/* fs flags, see MAP_DIRECT etc */
 
 	/*
 	 * For areas with an address space and backing store,
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 73d4ac7e7136..dc120995f684 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -8,7 +8,7 @@
 #include <uapi/linux/mman.h>
 
 /* the MAP_VALIDATE set of supported flags */
-#define	MAP_SUPPORTED_MASK (0)
+#define	MAP_SUPPORTED_MASK (MAP_DIRECT)
 
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 8bf8c7828275..a16184402c45 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -25,6 +25,7 @@
 # define MAP_UNINITIALIZED 0x0		/* Don't support this flag */
 #endif
 #define MAP_VALIDATE (MAP_SHARED|MAP_PRIVATE) /* mechanism to define new shared semantics */
+#define MAP_DIRECT (MAP_VALIDATE | 0x40)	/* shared, sealed, and no page cache */
 
 /*
  * Flags for mlock
diff --git a/mm/mmap.c b/mm/mmap.c
index d2919a9e25bf..f12de3859fec 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1393,6 +1393,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 				return -EINVAL;
 			if (!file->f_op->fmmap)
 				return -EOPNOTSUPP;
+			if ((flags & MAP_DIRECT) && !(prot & PROT_WRITE))
+				return -EACCES;
 			/* fall through */
 		case MAP_SHARED:
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))

  parent reply	other threads:[~2017-08-15  6:12 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-08-15  6:12 [PATCH v4 0/3] MAP_DIRECT and block-map sealed files Dan Williams
2017-08-15  6:12 ` [PATCH v4 1/3] fs, xfs: introduce S_IOMAP_SEALED Dan Williams
     [not found] ` <150277752553.23945.13932394738552748440.stgit-p8uTFz9XbKj2zm6wflaqv1nYeNYlB/vhral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2017-08-15  6:12   ` [PATCH v4 2/3] mm: introduce MAP_VALIDATE a mechanism for adding new mmap flags Dan Williams
2017-08-15 12:27     ` Jan Kara
     [not found]       ` <20170815122701.GF27505-4I4JzKEfoa/jFM9bn6wA6Q@public.gmane.org>
2017-08-15 16:24         ` Dan Williams
2017-09-17  3:44       ` Dan Williams
     [not found]         ` <CAA9_cmc0vejxCsc1NWp5b4C0CSsO5xetF3t6LCoCuEYB6yPiwQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-09-17 17:39           ` Christoph Hellwig
     [not found]             ` <20170917173945.GA22200-jcswGhMUV9g@public.gmane.org>
2017-09-18  9:31               ` Jan Kara
2017-09-18 15:47                 ` Dan Williams
2017-09-18  9:26         ` Jan Kara
2017-08-15 16:28     ` Andy Lutomirski
2017-08-15 22:31       ` Dan Williams
2017-08-17  8:06     ` kbuild test robot
2017-08-15  6:12 ` Dan Williams [this message]
2017-08-15  9:18   ` [PATCH v4 3/3] fs, xfs: introduce MAP_DIRECT for creating block-map-sealed file ranges Kirill A. Shutemov
2017-08-15 17:11     ` Dan Williams
2017-08-16 10:25       ` Kirill A. Shutemov
     [not found]   ` <150277754211.23945.458876600578531019.stgit-p8uTFz9XbKj2zm6wflaqv1nYeNYlB/vhral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2017-08-15 12:42     ` Jan Kara
     [not found]       ` <20170815124250.GG27505-4I4JzKEfoa/jFM9bn6wA6Q@public.gmane.org>
2017-08-15 16:29         ` Dan Williams
     [not found]           ` <CAPcyv4h01os0Gc6bYmaGdMXt5q4G4zfirNRPWG3=gQi5POrpmg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-08-16  1:15             ` Dan Williams
2017-08-17  8:49     ` kbuild test robot
2017-08-15  9:01 ` [PATCH v4 0/3] MAP_DIRECT and block-map sealed files Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=150277754211.23945.458876600578531019.stgit@dwillia2-desk3.amr.corp.intel.com \
    --to=dan.j.williams@intel.com \
    --cc=darrick.wong@oracle.com \
    --cc=david@fromorbit.com \
    --cc=hch@lst.de \
    --cc=jack@suse.cz \
    --cc=jmoyer@redhat.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=ross.zwisler@linux.intel.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).