linux-nvdimm.lists.01.org archive mirror
 help / color / mirror / Atom feed
From: Liu Bo <bo.liu@linux.alibaba.com>
To: Vivek Goyal <vgoyal@redhat.com>
Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-nvdimm@lists.01.org, virtio-fs@redhat.com,
	miklos@szeredi.hu, stefanha@redhat.com, dgilbert@redhat.com,
	mst@redhat.com, Miklos Szeredi <mszeredi@redhat.com>,
	Peng Tao <tao.peng@linux.alibaba.com>
Subject: Re: [PATCH 13/20] fuse, dax: Implement dax read/write operations
Date: Sat, 4 Apr 2020 08:25:21 +0800	[thread overview]
Message-ID: <20200404002521.GA125697@rsjd01523.et2sqa> (raw)
In-Reply-To: <20200304165845.3081-14-vgoyal@redhat.com>

On Wed, Mar 04, 2020 at 11:58:38AM -0500, Vivek Goyal wrote:
> This patch implements basic DAX support. mmap() is not implemented
> yet and will come in later patches. This patch looks into implemeting
> read/write.
> 
> We make use of interval tree to keep track of per inode dax mappings.
> 
> Do not use dax for file extending writes, instead just send WRITE message
> to daemon (like we do for direct I/O path). This will keep write and
> i_size change atomic w.r.t crash.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
> Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
> ---
>  fs/fuse/file.c            | 597 +++++++++++++++++++++++++++++++++++++-
>  fs/fuse/fuse_i.h          |  23 ++
>  fs/fuse/inode.c           |   6 +
>  include/uapi/linux/fuse.h |   1 +
>  4 files changed, 621 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 9d67b830fb7a..9effdd3dc6d6 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -18,6 +18,12 @@
>  #include <linux/swap.h>
>  #include <linux/falloc.h>
>  #include <linux/uio.h>
> +#include <linux/dax.h>
> +#include <linux/iomap.h>
> +#include <linux/interval_tree_generic.h>
> +
> +INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
> +                     START, LAST, static inline, fuse_dax_interval_tree);
>  
>  static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
>  				      struct fuse_page_desc **desc)
> @@ -187,6 +193,242 @@ static void fuse_link_write_file(struct file *file)
>  	spin_unlock(&fi->lock);
>  }
>  
> +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
> +{
> +	struct fuse_dax_mapping *dmap = NULL;
> +
> +	spin_lock(&fc->lock);
> +
> +	if (fc->nr_free_ranges <= 0) {
> +		spin_unlock(&fc->lock);
> +		return NULL;
> +	}
> +
> +	WARN_ON(list_empty(&fc->free_ranges));
> +
> +	/* Take a free range */
> +	dmap = list_first_entry(&fc->free_ranges, struct fuse_dax_mapping,
> +					list);
> +	list_del_init(&dmap->list);
> +	fc->nr_free_ranges--;
> +	spin_unlock(&fc->lock);
> +	return dmap;
> +}
> +
> +/* This assumes fc->lock is held */
> +static void __dmap_add_to_free_pool(struct fuse_conn *fc,
> +				struct fuse_dax_mapping *dmap)
> +{
> +	list_add_tail(&dmap->list, &fc->free_ranges);
> +	fc->nr_free_ranges++;
> +}
> +
> +static void dmap_add_to_free_pool(struct fuse_conn *fc,
> +				struct fuse_dax_mapping *dmap)
> +{
> +	/* Return fuse_dax_mapping to free list */
> +	spin_lock(&fc->lock);
> +	__dmap_add_to_free_pool(fc, dmap);
> +	spin_unlock(&fc->lock);
> +}
> +
> +/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> +static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> +				  struct fuse_dax_mapping *dmap, bool writable,
> +				  bool upgrade)
> +{
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_setupmapping_in inarg;
> +	FUSE_ARGS(args);
> +	ssize_t err;
> +
> +	WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ);
> +	WARN_ON(fc->nr_free_ranges < 0);
> +
> +	/* Ask fuse daemon to setup mapping */
> +	memset(&inarg, 0, sizeof(inarg));
> +	inarg.foffset = offset;
> +	inarg.fh = -1;
> +	inarg.moffset = dmap->window_offset;
> +	inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> +	inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> +	if (writable)
> +		inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> +	args.opcode = FUSE_SETUPMAPPING;
> +	args.nodeid = fi->nodeid;
> +	args.in_numargs = 1;
> +	args.in_args[0].size = sizeof(inarg);
> +	args.in_args[0].value = &inarg;
> +	err = fuse_simple_request(fc, &args);
> +	if (err < 0) {
> +		printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n",
> +				 __func__, dmap->window_offset, err);
> +		return err;
> +	}
> +
> +	pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx writable=%d"
> +		 " err=%zd\n", offset, writable, err);
> +
> +	dmap->writable = writable;
> +	if (!upgrade) {
> +		dmap->start = offset;
> +		dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
> +		/* Protected by fi->i_dmap_sem */
> +		fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
> +		fi->nr_dmaps++;
> +	}
> +	return 0;
> +}
> +
> +static int
> +fuse_send_removemapping(struct inode *inode,
> +			struct fuse_removemapping_in *inargp,
> +			struct fuse_removemapping_one *remove_one)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	FUSE_ARGS(args);
> +
> +	args.opcode = FUSE_REMOVEMAPPING;
> +	args.nodeid = fi->nodeid;
> +	args.in_numargs = 2;
> +	args.in_args[0].size = sizeof(*inargp);
> +	args.in_args[0].value = inargp;
> +	args.in_args[1].size = inargp->count * sizeof(*remove_one);
> +	args.in_args[1].value = remove_one;
> +	return fuse_simple_request(fc, &args);
> +}
> +
> +static int dmap_removemapping_list(struct inode *inode, unsigned num,
> +				   struct list_head *to_remove)
> +{
> +	struct fuse_removemapping_one *remove_one, *ptr;
> +	struct fuse_removemapping_in inarg;
> +	struct fuse_dax_mapping *dmap;
> +	int ret, i = 0, nr_alloc;
> +
> +	nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
> +	remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
> +	if (!remove_one)
> +		return -ENOMEM;
> +
> +	ptr = remove_one;
> +	list_for_each_entry(dmap, to_remove, list) {
> +		ptr->moffset = dmap->window_offset;
> +		ptr->len = dmap->length;
> +		ptr++;
> +		i++;
> +		num--;
> +		if (i >= nr_alloc || num == 0) {
> +			memset(&inarg, 0, sizeof(inarg));
> +			inarg.count = i;
> +			ret = fuse_send_removemapping(inode, &inarg,
> +						      remove_one);
> +			if (ret)
> +				goto out;
> +			ptr = remove_one;
> +			i = 0;
> +		}
> +	}
> +out:
> +	kfree(remove_one);
> +	return ret;
> +}
> +
> +/*
> + * Cleanup dmap entry and add back to free list. This should be called with
> + * fc->lock held.
> + */
> +static void dmap_reinit_add_to_free_pool(struct fuse_conn *fc,
> +					    struct fuse_dax_mapping *dmap)
> +{
> +	pr_debug("fuse: freeing memory range start=0x%llx end=0x%llx "
> +		 "window_offset=0x%llx length=0x%llx\n", dmap->start,
> +		 dmap->end, dmap->window_offset, dmap->length);
> +	dmap->start = dmap->end = 0;
> +	__dmap_add_to_free_pool(fc, dmap);
> +}
> +
> +/*
> + * Free inode dmap entries whose range falls entirely inside [start, end].
> + * Does not take any locks. At this point of time it should only be
> + * called from evict_inode() path where we know all dmap entries can be
> + * reclaimed.
> + */
> +static void inode_reclaim_dmap_range(struct fuse_conn *fc, struct inode *inode,
> +				      loff_t start, loff_t end)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_dax_mapping *dmap, *n;
> +	int err, num = 0;
> +	LIST_HEAD(to_remove);
> +
> +	pr_debug("fuse: %s: start=0x%llx, end=0x%llx\n", __func__, start, end);
> +
> +	/*
> +	 * Interval tree search matches intersecting entries. Adjust the range
> +	 * to avoid dropping partial valid entries.
> +	 */
> +	start = ALIGN(start, FUSE_DAX_MEM_RANGE_SZ);
> +	end = ALIGN_DOWN(end, FUSE_DAX_MEM_RANGE_SZ);
> +
> +	while (1) {
> +		dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, start,
> +							 end);
> +		if (!dmap)
> +			break;
> +		fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
> +		num++;
> +		list_add(&dmap->list, &to_remove);
> +	}
> +
> +	/* Nothing to remove */
> +	if (list_empty(&to_remove))
> +		return;
> +
> +	WARN_ON(fi->nr_dmaps < num);
> +	fi->nr_dmaps -= num;
> +	/*
> +	 * During umount/shutdown, fuse connection is dropped first
> +	 * and evict_inode() is called later. That means any
> +	 * removemapping messages are going to fail. Send messages
> +	 * only if connection is up. Otherwise fuse daemon is
> +	 * responsible for cleaning up any leftover references and
> +	 * mappings.
> +	 */
> +	if (fc->connected) {
> +		err = dmap_removemapping_list(inode, num, &to_remove);
> +		if (err) {
> +			pr_warn("Failed to removemappings. start=0x%llx"
> +				" end=0x%llx\n", start, end);
> +		}
> +	}
> +	spin_lock(&fc->lock);
> +	list_for_each_entry_safe(dmap, n, &to_remove, list) {
> +		list_del_init(&dmap->list);
> +		dmap_reinit_add_to_free_pool(fc, dmap);
> +	}
> +	spin_unlock(&fc->lock);
> +}
> +
> +/*
> + * It is called from evict_inode() and by that time inode is going away. So
> + * this function does not take any locks like fi->i_dmap_sem for traversing
> + * that fuse inode interval tree. If that lock is taken then lock validator
> + * complains of deadlock situation w.r.t fs_reclaim lock.
> + */
> +void fuse_cleanup_inode_mappings(struct inode *inode)
> +{
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	/*
> +	 * fuse_evict_inode() has alredy called truncate_inode_pages_final()
> +	 * before we arrive here. So we should not have to worry about
> +	 * any pages/exception entries still associated with inode.
> +	 */
> +	inode_reclaim_dmap_range(fc, inode, 0, -1);
> +}
> +
>  void fuse_finish_open(struct inode *inode, struct file *file)
>  {
>  	struct fuse_file *ff = file->private_data;
> @@ -1562,32 +1804,364 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	return res;
>  }
>  
> +static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
>  static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct fuse_file *ff = file->private_data;
> +	struct inode *inode = file->f_mapping->host;
>  
>  	if (is_bad_inode(file_inode(file)))
>  		return -EIO;
>  
> -	if (!(ff->open_flags & FOPEN_DIRECT_IO))
> -		return fuse_cache_read_iter(iocb, to);
> -	else
> +	if (IS_DAX(inode))
> +		return fuse_dax_read_iter(iocb, to);
> +
> +	if (ff->open_flags & FOPEN_DIRECT_IO)
>  		return fuse_direct_read_iter(iocb, to);
> +
> +	return fuse_cache_read_iter(iocb, to);
>  }
>  
> +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
>  static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct fuse_file *ff = file->private_data;
> +	struct inode *inode = file->f_mapping->host;
>  
>  	if (is_bad_inode(file_inode(file)))
>  		return -EIO;
>  
> -	if (!(ff->open_flags & FOPEN_DIRECT_IO))
> -		return fuse_cache_write_iter(iocb, from);
> -	else
> +	if (IS_DAX(inode))
> +		return fuse_dax_write_iter(iocb, from);
> +
> +	if (ff->open_flags & FOPEN_DIRECT_IO)
>  		return fuse_direct_write_iter(iocb, from);
> +
> +	return fuse_cache_write_iter(iocb, from);
> +}
> +
> +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
> +{
> +	iomap->addr = IOMAP_NULL_ADDR;
> +	iomap->length = length;
> +	iomap->type = IOMAP_HOLE;
> +}
> +
> +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
> +			struct iomap *iomap, struct fuse_dax_mapping *dmap,
> +			unsigned flags)
> +{
> +	loff_t offset, len;
> +	loff_t i_size = i_size_read(inode);
> +
> +	offset = pos - dmap->start;
> +	len = min(length, dmap->length - offset);
> +
> +	/* If length is beyond end of file, truncate further */
> +	if (pos + len > i_size)
> +		len = i_size - pos;
> +
> +	if (len > 0) {
> +		iomap->addr = dmap->window_offset + offset;
> +		iomap->length = len;
> +		if (flags & IOMAP_FAULT)
> +			iomap->length = ALIGN(len, PAGE_SIZE);
> +		iomap->type = IOMAP_MAPPED;
> +		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> +				" length 0x%llx\n", __func__, iomap->addr,
> +				iomap->offset, iomap->length);
> +	} else {
> +		/* Mapping beyond end of file is hole */
> +		fuse_fill_iomap_hole(iomap, length);
> +		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> +				"length 0x%llx\n", __func__, iomap->addr,
> +				iomap->offset, iomap->length);
> +	}
> +}
> +
> +static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> +					 loff_t length, unsigned flags,
> +					 struct iomap *iomap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
> +	int ret;
> +	bool writable = flags & IOMAP_WRITE;
> +
> +	alloc_dmap = alloc_dax_mapping(fc);
> +	if (!alloc_dmap)
> +		return -EBUSY;
> +
> +	/*
> +	 * Take write lock so that only one caller can try to setup mapping
> +	 * and other waits.
> +	 */
> +	down_write(&fi->i_dmap_sem);
> +	/*
> +	 * We dropped lock. Check again if somebody else setup
> +	 * mapping already.
> +	 */
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos,
> +						pos);
> +	if (dmap) {
> +		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +		dmap_add_to_free_pool(fc, alloc_dmap);
> +		up_write(&fi->i_dmap_sem);
> +		return 0;
> +	}
> +
> +	/* Setup one mapping */
> +	ret = fuse_setup_one_mapping(inode,
> +				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> +				     alloc_dmap, writable, false);
> +	if (ret < 0) {
> +		printk("fuse_setup_one_mapping() failed. err=%d"
> +			" pos=0x%llx, writable=%d\n", ret, pos, writable);
> +		dmap_add_to_free_pool(fc, alloc_dmap);
> +		up_write(&fi->i_dmap_sem);
> +		return ret;
> +	}
> +	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
> +	up_write(&fi->i_dmap_sem);
> +	return 0;
> +}
> +
> +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
> +					 loff_t length, unsigned flags,
> +					 struct iomap *iomap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_dax_mapping *dmap;
> +	int ret;
> +
> +	/*
> +	 * Take exclusive lock so that only one caller can try to setup
> +	 * mapping and others wait.
> +	 */
> +	down_write(&fi->i_dmap_sem);
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> +
> +	/* We are holding either inode lock or i_mmap_sem, and that should
> +	 * ensure that dmap can't reclaimed or truncated and it should still
> +	 * be there in tree despite the fact we dropped and re-acquired the
> +	 * lock.
> +	 */
> +	ret = -EIO;
> +	if (WARN_ON(!dmap))
> +		goto out_err;
> +
> +	/* Maybe another thread already upgraded mapping while we were not
> +	 * holding lock.
> +	 */
> +	if (dmap->writable)

oops, looks like it's still returning -EIO here, %ret should be zero.

thanks,
liubo

> +		goto out_fill_iomap;
> +
> +	ret = fuse_setup_one_mapping(inode,
> +				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> +				     dmap, true, true);
> +	if (ret < 0) {
> +		printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n",
> +		       ret, pos);
> +		goto out_err;
> +	}
> +
> +out_fill_iomap:
> +	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +out_err:
> +	up_write(&fi->i_dmap_sem);
> +	return ret;
> +}
> +
> +/* This is just for DAX and the mapping is ephemeral, do not use it for other
> + * purposes since there is no block device with a permanent mapping.
> + */
> +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
> +			    unsigned flags, struct iomap *iomap,
> +			    struct iomap *srcmap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_dax_mapping *dmap;
> +	bool writable = flags & IOMAP_WRITE;
> +
> +	/* We don't support FIEMAP */
> +	BUG_ON(flags & IOMAP_REPORT);
> +
> +	pr_debug("fuse_iomap_begin() called. pos=0x%llx length=0x%llx\n",
> +			pos, length);
> +
> +	/*
> +	 * Writes beyond end of file are not handled using dax path. Instead
> +	 * a fuse write message is sent to daemon
> +	 */
> +	if (flags & IOMAP_WRITE && pos >= i_size_read(inode))
> +		return -EIO;
> +
> +	iomap->offset = pos;
> +	iomap->flags = 0;
> +	iomap->bdev = NULL;
> +	iomap->dax_dev = fc->dax_dev;
> +
> +	/*
> +	 * Both read/write and mmap path can race here. So we need something
> +	 * to make sure if we are setting up mapping, then other path waits
> +	 *
> +	 * For now, use a semaphore for this. It probably needs to be
> +	 * optimized later.
> +	 */
> +	down_read(&fi->i_dmap_sem);
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> +
> +	if (dmap) {
> +		if (writable && !dmap->writable) {
> +			/* Upgrade read-only mapping to read-write. This will
> +			 * require exclusive i_dmap_sem lock as we don't want
> +			 * two threads to be trying to this simultaneously
> +			 * for same dmap. So drop shared lock and acquire
> +			 * exclusive lock.
> +			 */
> +			up_read(&fi->i_dmap_sem);
> +			pr_debug("%s: Upgrading mapping at offset 0x%llx"
> +				 " length 0x%llx\n", __func__, pos, length);
> +			return iomap_begin_upgrade_mapping(inode, pos, length,
> +							   flags, iomap);
> +		} else {
> +			fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +			up_read(&fi->i_dmap_sem);
> +			return 0;
> +		}
> +	} else {
> +		up_read(&fi->i_dmap_sem);
> +		pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
> +				__func__, pos, length);
> +		if (pos >= i_size_read(inode))
> +			goto iomap_hole;
> +
> +		return iomap_begin_setup_new_mapping(inode, pos, length, flags,
> +						     iomap);
> +	}
> +
> +	/*
> +	 * If read beyond end of file happnes, fs code seems to return
> +	 * it as hole
> +	 */
> +iomap_hole:
> +	fuse_fill_iomap_hole(iomap, length);
> +	pr_debug("fuse_iomap_begin() returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", pos, length, iomap->length);
> +	return 0;
> +}
> +
> +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
> +			  ssize_t written, unsigned flags,
> +			  struct iomap *iomap)
> +{
> +	/* DAX writes beyond end-of-file aren't handled using iomap, so the
> +	 * file size is unchanged and there is nothing to do here.
> +	 */
> +	return 0;
> +}
> +
> +static const struct iomap_ops fuse_iomap_ops = {
> +	.iomap_begin = fuse_iomap_begin,
> +	.iomap_end = fuse_iomap_end,
> +};
> +
> +static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ssize_t ret;
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		if (!inode_trylock_shared(inode))
> +			return -EAGAIN;
> +	} else {
> +		inode_lock_shared(inode);
> +	}
> +
> +	ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
> +	inode_unlock_shared(inode);
> +
> +	/* TODO file_accessed(iocb->f_filp) */
> +	return ret;
> +}
> +
> +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	return (iov_iter_rw(from) == WRITE &&
> +		((iocb->ki_pos) >= i_size_read(inode)));
> +}
> +
> +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
> +	ssize_t ret;
> +
> +	ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
> +	if (ret < 0)
> +		return ret;
> +
> +	fuse_invalidate_attr(inode);
> +	fuse_write_update_size(inode, iocb->ki_pos);
> +	return ret;
> +}
> +
> +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ssize_t ret, count;
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		if (!inode_trylock(inode))
> +			return -EAGAIN;
> +	} else {
> +		inode_lock(inode);
> +	}
> +
> +	ret = generic_write_checks(iocb, from);
> +	if (ret <= 0)
> +		goto out;
> +
> +	ret = file_remove_privs(iocb->ki_filp);
> +	if (ret)
> +		goto out;
> +	/* TODO file_update_time() but we don't want metadata I/O */
> +
> +	/* Do not use dax for file extending writes as its an mmap and
> +	 * trying to write beyong end of existing page will generate
> +	 * SIGBUS.
> +	 */
> +	if (file_extending_write(iocb, from)) {
> +		ret = fuse_dax_direct_write(iocb, from);
> +		goto out;
> +	}
> +
> +	ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
> +	if (ret < 0)
> +		goto out;
> +
> +	/*
> +	 * If part of the write was file extending, fuse dax path will not
> +	 * take care of that. Do direct write instead.
> +	 */
> +	if (iov_iter_count(from) && file_extending_write(iocb, from)) {
> +		count = fuse_dax_direct_write(iocb, from);
> +		if (count < 0)
> +			goto out;
> +		ret += count;
> +	}
> +
> +out:
> +	inode_unlock(inode);
> +
> +	if (ret > 0)
> +		ret = generic_write_sync(iocb, ret);
> +	return ret;
>  }
>  
>  static void fuse_writepage_free(struct fuse_writepage_args *wpa)
> @@ -2318,6 +2892,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
>  	return 0;
>  }
>  
> +static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	return -EINVAL; /* TODO */
> +}
> +
>  static int convert_fuse_file_lock(struct fuse_conn *fc,
>  				  const struct fuse_file_lock *ffl,
>  				  struct file_lock *fl)
> @@ -3387,6 +3966,7 @@ static const struct address_space_operations fuse_file_aops  = {
>  void fuse_init_file_inode(struct inode *inode)
>  {
>  	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
>  
>  	inode->i_fop = &fuse_file_operations;
>  	inode->i_data.a_ops = &fuse_file_aops;
> @@ -3396,4 +3976,9 @@ void fuse_init_file_inode(struct inode *inode)
>  	fi->writectr = 0;
>  	init_waitqueue_head(&fi->page_waitq);
>  	INIT_LIST_HEAD(&fi->writepages);
> +	fi->dmap_tree = RB_ROOT_CACHED;
> +
> +	if (fc->dax_dev) {
> +		inode->i_flags |= S_DAX;
> +	}
>  }
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index b41275f73e4c..490549862bda 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -70,16 +70,29 @@ struct fuse_forget_link {
>  	struct fuse_forget_link *next;
>  };
>  
> +#define START(node) ((node)->start)
> +#define LAST(node) ((node)->end)
> +
>  /** Translation information for file offsets to DAX window offsets */
>  struct fuse_dax_mapping {
>  	/* Will connect in fc->free_ranges to keep track of free memory */
>  	struct list_head list;
>  
> +	/* For interval tree in file/inode */
> +	struct rb_node rb;
> +	/** Start Position in file */
> +	__u64 start;
> +	/** End Position in file */
> +	__u64 end;
> +	__u64 __subtree_last;
>  	/** Position in DAX window */
>  	u64 window_offset;
>  
>  	/** Length of mapping, in bytes */
>  	loff_t length;
> +
> +	/* Is this mapping read-only or read-write */
> +	bool writable;
>  };
>  
>  /** FUSE inode */
> @@ -167,6 +180,15 @@ struct fuse_inode {
>  
>  	/** Lock to protect write related fields */
>  	spinlock_t lock;
> +
> +	/*
> +	 * Semaphore to protect modifications to dmap_tree
> +	 */
> +	struct rw_semaphore i_dmap_sem;
> +
> +	/** Sorted rb tree of struct fuse_dax_mapping elements */
> +	struct rb_root_cached dmap_tree;
> +	unsigned long nr_dmaps;
>  };
>  
>  /** FUSE inode state bits */
> @@ -1127,5 +1149,6 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
>   */
>  u64 fuse_get_unique(struct fuse_iqueue *fiq);
>  void fuse_free_conn(struct fuse_conn *fc);
> +void fuse_cleanup_inode_mappings(struct inode *inode);
>  
>  #endif /* _FS_FUSE_I_H */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 36cb9c00bbe5..93bc65607a15 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -86,7 +86,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>  	fi->attr_version = 0;
>  	fi->orig_ino = 0;
>  	fi->state = 0;
> +	fi->nr_dmaps = 0;
>  	mutex_init(&fi->mutex);
> +	init_rwsem(&fi->i_dmap_sem);
>  	spin_lock_init(&fi->lock);
>  	fi->forget = fuse_alloc_forget();
>  	if (!fi->forget) {
> @@ -114,6 +116,10 @@ static void fuse_evict_inode(struct inode *inode)
>  	clear_inode(inode);
>  	if (inode->i_sb->s_flags & SB_ACTIVE) {
>  		struct fuse_conn *fc = get_fuse_conn(inode);
> +		if (IS_DAX(inode)) {
> +			fuse_cleanup_inode_mappings(inode);
> +			WARN_ON(fi->nr_dmaps);
> +		}
>  		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
>  		fi->forget = NULL;
>  	}
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 62633555d547..36d824b82ebc 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -896,6 +896,7 @@ struct fuse_copy_file_range_in {
>  
>  #define FUSE_SETUPMAPPING_ENTRIES 8
>  #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
> +#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
>  struct fuse_setupmapping_in {
>  	/* An already open handle */
>  	uint64_t	fh;
> -- 
> 2.20.1
_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org

  parent reply	other threads:[~2020-04-04  0:25 UTC|newest]

Thread overview: 67+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-04 16:58 [PATCH 00/20] virtiofs: Add DAX support Vivek Goyal
2020-03-04 16:58 ` [PATCH 01/20] dax: Modify bdev_dax_pgoff() to handle NULL bdev Vivek Goyal
2020-03-04 16:58 ` [PATCH 02/20] dax: Create a range version of dax_layout_busy_page() Vivek Goyal
2020-03-10 15:19   ` Ira Weiny
2020-03-10 20:29     ` Vivek Goyal
2020-03-04 16:58 ` [PATCH 03/20] virtio: Add get_shm_region method Vivek Goyal
2020-03-10 10:53   ` Stefan Hajnoczi
2020-03-04 16:58 ` [PATCH 04/20] virtio: Implement get_shm_region for PCI transport Vivek Goyal
2020-03-10 11:04   ` Stefan Hajnoczi
2020-03-10 18:19     ` Vivek Goyal
2020-03-11 17:34       ` Stefan Hajnoczi
2020-03-11 19:29         ` Vivek Goyal
2020-03-10 11:12   ` Michael S. Tsirkin
2020-03-10 18:47     ` Vivek Goyal
2020-03-10 21:27       ` Michael S. Tsirkin
2020-03-04 16:58 ` [PATCH 05/20] virtio: Implement get_shm_region for MMIO transport Vivek Goyal
2020-03-10 11:06   ` Stefan Hajnoczi
2020-03-04 16:58 ` [PATCH 06/20] virtiofs: Provide a helper function for virtqueue initialization Vivek Goyal
2020-03-10 14:10   ` Miklos Szeredi
2020-03-04 16:58 ` [PATCH 07/20] fuse: Get rid of no_mount_options Vivek Goyal
2020-03-10 14:12   ` Miklos Szeredi
2020-03-04 16:58 ` [PATCH 08/20] fuse,virtiofs: Add a mount option to enable dax Vivek Goyal
2020-03-10 14:16   ` Miklos Szeredi
2020-03-04 16:58 ` [PATCH 09/20] virtio_fs, dax: Set up virtio_fs dax_device Vivek Goyal
2020-03-04 16:58 ` [PATCH 10/20] fuse,virtiofs: Keep a list of free dax memory ranges Vivek Goyal
2020-03-10 19:29   ` Miklos Szeredi
2020-03-04 16:58 ` [PATCH 11/20] fuse: implement FUSE_INIT map_alignment field Vivek Goyal
2020-03-10 19:31   ` Miklos Szeredi
2020-03-04 16:58 ` [PATCH 12/20] fuse: Introduce setupmapping/removemapping commands Vivek Goyal
2020-03-10 19:49   ` Miklos Szeredi
2020-03-10 20:33     ` Vivek Goyal
2020-03-11  7:03       ` Amir Goldstein
2020-03-11 14:19         ` Miklos Szeredi
2020-03-11 14:41           ` Vivek Goyal
2020-03-11 15:12             ` Miklos Szeredi
2020-03-04 16:58 ` [PATCH 13/20] fuse, dax: Implement dax read/write operations Vivek Goyal
2020-03-12  9:43   ` Miklos Szeredi
2020-03-12 16:02     ` Vivek Goyal
2020-03-13 10:18       ` Miklos Szeredi
2020-03-13 13:41         ` Vivek Goyal
2020-04-04  0:25   ` Liu Bo [this message]
2020-04-14 12:54     ` Vivek Goyal
2020-03-04 16:58 ` [PATCH 14/20] fuse,dax: add DAX mmap support Vivek Goyal
2020-03-04 16:58 ` [PATCH 15/20] fuse, dax: Take ->i_mmap_sem lock during dax page fault Vivek Goyal
2020-03-04 16:58 ` [PATCH 16/20] fuse,virtiofs: Define dax address space operations Vivek Goyal
2020-03-04 16:58 ` [PATCH 17/20] fuse,virtiofs: Maintain a list of busy elements Vivek Goyal
2020-03-04 16:58 ` [PATCH 18/20] fuse: Release file in process context Vivek Goyal
2020-03-04 16:58 ` [PATCH 19/20] fuse: Take inode lock for dax inode truncation Vivek Goyal
2020-03-04 16:58 ` [PATCH 20/20] fuse,virtiofs: Add logic to free up a memory range Vivek Goyal
2020-03-11  5:16   ` Liu Bo
2020-03-11 12:59     ` Vivek Goyal
2020-03-11 17:24       ` Liu Bo
2020-03-26  0:09   ` Liu Bo
2020-03-27 14:01     ` Vivek Goyal
2020-03-27 22:06       ` Liu Bo
2020-04-14 19:30         ` Vivek Goyal
2020-04-15 17:22           ` Liu Bo
2020-04-16 19:05             ` Vivek Goyal
2020-04-17 18:05               ` Liu Bo
2020-03-11  5:22 ` [PATCH 00/20] virtiofs: Add DAX support Amir Goldstein
2020-03-11 13:09   ` Vivek Goyal
2020-03-11 18:48   ` Vivek Goyal
2020-03-11 19:32     ` Amir Goldstein
2020-03-11 19:39       ` Vivek Goyal
2020-03-11 13:38 ` Patrick Ohly
2020-03-16 13:02   ` Vivek Goyal
2020-03-17  8:28     ` Patrick Ohly

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200404002521.GA125697@rsjd01523.et2sqa \
    --to=bo.liu@linux.alibaba.com \
    --cc=dgilbert@redhat.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=miklos@szeredi.hu \
    --cc=mst@redhat.com \
    --cc=mszeredi@redhat.com \
    --cc=stefanha@redhat.com \
    --cc=tao.peng@linux.alibaba.com \
    --cc=vgoyal@redhat.com \
    --cc=virtio-fs@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).