* [RFC PATCH] ceph: serialize the direct writes when couldn't submit in a single req
@ 2020-02-04  1:54 xiubli
  2020-02-05 16:24 ` Jeff Layton
  0 siblings, 1 reply; 4+ messages in thread
From: xiubli @ 2020-02-04  1:54 UTC (permalink / raw)
  To: jlayton, idryomov, zyan; +Cc: sage, pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

If the direct I/O can't be submitted in a single request, concurrent
writers may overlap each other's data.

For example, with the file layout:
ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"

fd = open(, O_DIRECT | O_WRONLY, );

Writer1:
posix_memalign(&buffer, 4194304, SIZE);
memset(buffer, 'T', SIZE);
write(fd, buffer, SIZE);

Writer2:
posix_memalign(&buffer, 4194304, SIZE);
memset(buffer, 'X', SIZE);
write(fd, buffer, SIZE);

From the test results, the data in the file may end up as:
TTT...TTT <---> object1
XXX...XXX <---> object2

The expected result should be all "XX.." or "TT.." in both object1
and object2.
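
A stand-alone version of the reproducer might look like the sketch
below. The mount path, SIZE and the command-line fill byte are
assumptions for illustration; the original test only shows the
fragments above. Run two instances concurrently, each with a different
fill byte, then inspect the two backing objects:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define OBJ_SIZE 4194304UL
#define SIZE     (2 * OBJ_SIZE)	/* spans two objects => two OSD requests */

int main(int argc, char **argv)
{
	char fill = argc > 1 ? argv[1][0] : 'T';	/* 'T' or 'X' */
	void *buffer;
	int fd;

	fd = open("/mnt/cephfs/testfile", O_WRONLY | O_DIRECT | O_CREAT, 0644);
	if (fd < 0)
		return 1;

	if (posix_memalign(&buffer, OBJ_SIZE, SIZE))
		return 1;
	memset(buffer, fill, SIZE);

	if (write(fd, buffer, SIZE) != (ssize_t)SIZE)
		return 1;

	free(buffer);
	close(fd);
	return 0;
}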

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/file.c  | 38 +++++++++++++++++++++++++++++++++++---
 fs/ceph/inode.c |  2 ++
 fs/ceph/super.h |  3 +++
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1cedba452a66..2741070a58a9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -961,6 +961,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t pos = iocb->ki_pos;
 	bool write = iov_iter_rw(iter) == WRITE;
 	bool should_dirty = !write && iter_is_iovec(iter);
+	bool shared_lock = false;
+	u64 size;
 
 	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
@@ -977,14 +979,27 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			dout("invalidate_inode_pages2_range returned %d\n", ret2);
 
 		flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
+
+		/*
+		 * If we cannot submit the whole iter in a single request,
+		 * block all subsequent requests to avoid the writers'
+		 * data being interleaved with each other.
+		 *
+		 * Writes that fit in a single request can still execute
+		 * in parallel.
+		 *
+		 * Take the exclusive lock first.
+		 */
+		down_write(&ci->i_direct_rwsem);
 	} else {
 		flags = CEPH_OSD_FLAG_READ;
 	}
 
 	while (iov_iter_count(iter) > 0) {
-		u64 size = iov_iter_count(iter);
 		ssize_t len;
 
+		size = iov_iter_count(iter);
+
 		if (write)
 			size = min_t(u64, size, fsc->mount_options->wsize);
 		else
@@ -1011,9 +1026,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			ret = len;
 			break;
 		}
+
 		if (len != size)
 			osd_req_op_extent_update(req, 0, len);
 
+		if (write && pos == iocb->ki_pos && len == count) {
+			/* Switch to shared lock */
+			downgrade_write(&ci->i_direct_rwsem);
+			shared_lock = true;
+		}
+
 		/*
 		 * To simplify error handling, allow AIO when IO within i_size
 		 * or IO can be satisfied by single OSD request.
@@ -1110,7 +1132,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
 		if (aio_req->num_reqs == 0) {
 			kfree(aio_req);
-			return ret;
+			goto unlock;
 		}
 
 		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
@@ -1131,13 +1153,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 				ceph_aio_complete_req(req);
 			}
 		}
-		return -EIOCBQUEUED;
+		ret = -EIOCBQUEUED;
+		goto unlock;
 	}
 
 	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
 		ret = pos - iocb->ki_pos;
 		iocb->ki_pos = pos;
 	}
+
+unlock:
+	if (write) {
+		if (shared_lock)
+			up_read(&ci->i_direct_rwsem);
+		else
+			up_write(&ci->i_direct_rwsem);
+	}
+
 	return ret;
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index aee7a24bf1bc..e5d634acd273 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -518,6 +518,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 
 	ceph_fscache_inode_init(ci);
 
+	init_rwsem(&ci->i_direct_rwsem);
+
 	ci->i_meta_err = 0;
 
 	return &ci->vfs_inode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ee81920bb1a4..213c11bf41be 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -409,6 +409,9 @@ struct ceph_inode_info {
 	struct fscache_cookie *fscache;
 	u32 i_fscache_gen;
 #endif
+
+	struct rw_semaphore i_direct_rwsem;
+
 	errseq_t i_meta_err;
 
 	struct inode vfs_inode; /* at end */
-- 
2.21.0


* Re: [RFC PATCH] ceph: serialize the direct writes when couldn't submit in a single req
  2020-02-04  1:54 [RFC PATCH] ceph: serialize the direct writes when couldn't submit in a single req xiubli
@ 2020-02-05 16:24 ` Jeff Layton
  2020-02-05 16:38   ` Jeff Layton
  0 siblings, 1 reply; 4+ messages in thread
From: Jeff Layton @ 2020-02-05 16:24 UTC (permalink / raw)
  To: xiubli, idryomov, zyan; +Cc: sage, pdonnell, ceph-devel

On Mon, 2020-02-03 at 20:54 -0500, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> If the direct I/O can't be submitted in a single request, concurrent
> writers may overlap each other's data.
> 
> For example, with the file layout:
> ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"
> 
> fd = open(, O_DIRECT | O_WRONLY, );
> 
> Writer1:
> posix_memalign(&buffer, 4194304, SIZE);
> memset(buffer, 'T', SIZE);
> write(fd, buffer, SIZE);
> 
> Writer2:
> posix_memalign(&buffer, 4194304, SIZE);
> memset(buffer, 'X', SIZE);
> write(fd, buffer, SIZE);
> 
> From the test results, the data in the file may end up as:
> TTT...TTT <---> object1
> XXX...XXX <---> object2
> 
> The expected result should be all "XX.." or "TT.." in both object1
> and object2.
> 

I really don't see this as broken. If you're using O_DIRECT, I don't
believe there is any expectation that the write operations (or even read
operations) will be atomic with respect to one another.

Basically, when you do this, you're saying "I know what I'm doing", and
need to provide synchronization yourself between competing applications
and clients (typically via file locking).
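
(To make that concrete, here is a minimal sketch of the kind of
application-side serialization meant above: a blocking whole-file
fcntl() write lock taken around each O_DIRECT write. The helper name
and the whole-file lock range are purely illustrative.)

#include <fcntl.h>
#include <unistd.h>

/* Illustrative helper: serialize competing O_DIRECT writers with a
 * POSIX advisory lock covering the whole file. */
static ssize_t locked_write(int fd, const void *buf, size_t count)
{
	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,	/* 0 = from l_start to EOF, i.e. whole file */
	};
	ssize_t ret;

	if (fcntl(fd, F_SETLKW, &fl) < 0)	/* block until we hold the lock */
		return -1;

	ret = write(fd, buf, count);

	fl.l_type = F_UNLCK;
	fcntl(fd, F_SETLK, &fl);
	return ret;
}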


> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/file.c  | 38 +++++++++++++++++++++++++++++++++++---
>  fs/ceph/inode.c |  2 ++
>  fs/ceph/super.h |  3 +++
>  3 files changed, 40 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 1cedba452a66..2741070a58a9 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -961,6 +961,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  	loff_t pos = iocb->ki_pos;
>  	bool write = iov_iter_rw(iter) == WRITE;
>  	bool should_dirty = !write && iter_is_iovec(iter);
> +	bool shared_lock = false;
> +	u64 size;
>  
>  	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
>  		return -EROFS;
> @@ -977,14 +979,27 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  			dout("invalidate_inode_pages2_range returned %d\n", ret2);
>  
>  		flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
> +
> +		/*
> +		 * If we cannot submit the whole iter in a single request,
> +		 * block all subsequent requests to avoid the writers'
> +		 * data being interleaved with each other.
> +		 *
> +		 * Writes that fit in a single request can still execute
> +		 * in parallel.
> +		 *
> +		 * Take the exclusive lock first.
> +		 */
> +		down_write(&ci->i_direct_rwsem);
>  	} else {
>  		flags = CEPH_OSD_FLAG_READ;
>  	}
>  
>  	while (iov_iter_count(iter) > 0) {
> -		u64 size = iov_iter_count(iter);
>  		ssize_t len;
>  
> +		size = iov_iter_count(iter);
> +
>  		if (write)
>  			size = min_t(u64, size, fsc->mount_options->wsize);
>  		else
> @@ -1011,9 +1026,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  			ret = len;
>  			break;
>  		}
> +
>  		if (len != size)
>  			osd_req_op_extent_update(req, 0, len);
>  
> +		if (write && pos == iocb->ki_pos && len == count) {
> +			/* Switch to shared lock */
> +			downgrade_write(&ci->i_direct_rwsem);
> +			shared_lock = true;
> +		}
> +
>  		/*
>  		 * To simplify error handling, allow AIO when IO within i_size
>  		 * or IO can be satisfied by single OSD request.
> @@ -1110,7 +1132,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  
>  		if (aio_req->num_reqs == 0) {
>  			kfree(aio_req);
> -			return ret;
> +			goto unlock;
>  		}
>  
>  		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
> @@ -1131,13 +1153,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  				ceph_aio_complete_req(req);
>  			}
>  		}
> -		return -EIOCBQUEUED;
> +		ret = -EIOCBQUEUED;
> +		goto unlock;
>  	}
>  
>  	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
>  		ret = pos - iocb->ki_pos;
>  		iocb->ki_pos = pos;
>  	}
> +
> +unlock:
> +	if (write) {
> +		if (shared_lock)
> +			up_read(&ci->i_direct_rwsem);
> +		else
> +			up_write(&ci->i_direct_rwsem);
> +	}
> +
>  	return ret;
>  }
>  
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index aee7a24bf1bc..e5d634acd273 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -518,6 +518,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  
>  	ceph_fscache_inode_init(ci);
>  
> +	init_rwsem(&ci->i_direct_rwsem);
> +
>  	ci->i_meta_err = 0;
>  
>  	return &ci->vfs_inode;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index ee81920bb1a4..213c11bf41be 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -409,6 +409,9 @@ struct ceph_inode_info {
>  	struct fscache_cookie *fscache;
>  	u32 i_fscache_gen;
>  #endif
> +
> +	struct rw_semaphore i_direct_rwsem;
> +
>  	errseq_t i_meta_err;
>  
>  	struct inode vfs_inode; /* at end */

-- 
Jeff Layton <jlayton@kernel.org>


* Re: [RFC PATCH] ceph: serialize the direct writes when couldn't submit in a single req
  2020-02-05 16:24 ` Jeff Layton
@ 2020-02-05 16:38   ` Jeff Layton
  2020-02-06  2:51     ` Xiubo Li
  0 siblings, 1 reply; 4+ messages in thread
From: Jeff Layton @ 2020-02-05 16:38 UTC (permalink / raw)
  To: xiubli, idryomov, zyan; +Cc: sage, pdonnell, ceph-devel

On Wed, 2020-02-05 at 11:24 -0500, Jeff Layton wrote:
> On Mon, 2020-02-03 at 20:54 -0500, xiubli@redhat.com wrote:
> > From: Xiubo Li <xiubli@redhat.com>
> > 
> > If the direct I/O can't be submitted in a single request, concurrent
> > writers may overlap each other's data.
> > 
> > For example, with the file layout:
> > ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"
> > 
> > fd = open(, O_DIRECT | O_WRONLY, );
> > 
> > Writer1:
> > posix_memalign(&buffer, 4194304, SIZE);
> > memset(buffer, 'T', SIZE);
> > write(fd, buffer, SIZE);
> > 
> > Writer2:
> > posix_memalign(&buffer, 4194304, SIZE);
> > memset(buffer, 'X', SIZE);
> > write(fd, buffer, SIZE);
> > 
> >  From the test results, the data in the file may end up as:
> > TTT...TTT <---> object1
> > XXX...XXX <---> object2
> > 
> > The expected result should be all "XX.." or "TT.." in both object1
> > and object2.
> > 
> 
> I really don't see this as broken. If you're using O_DIRECT, I don't
> believe there is any expectation that the write operations (or even read
> operations) will be atomic with respect to one another.
> 
> Basically, when you do this, you're saying "I know what I'm doing", and
> need to provide synchronization yourself between competing applications
> and clients (typically via file locking).
> 

In fact, here's a discussion about this from a couple of years ago. This
one mostly applies to local filesystems, but the same concepts apply to
CephFS as well:


https://www.spinics.net/lists/linux-fsdevel/msg118115.html

-- 
Jeff Layton <jlayton@kernel.org>


* Re: [RFC PATCH] ceph: serialize the direct writes when couldn't submit in a single req
  2020-02-05 16:38   ` Jeff Layton
@ 2020-02-06  2:51     ` Xiubo Li
  0 siblings, 0 replies; 4+ messages in thread
From: Xiubo Li @ 2020-02-06  2:51 UTC (permalink / raw)
  To: Jeff Layton, idryomov, zyan; +Cc: sage, pdonnell, ceph-devel

On 2020/2/6 0:38, Jeff Layton wrote:
> On Wed, 2020-02-05 at 11:24 -0500, Jeff Layton wrote:
>> On Mon, 2020-02-03 at 20:54 -0500, xiubli@redhat.com wrote:
>>> From: Xiubo Li <xiubli@redhat.com>
>>>
>>> If the direct I/O can't be submitted in a single request, concurrent
>>> writers may overlap each other's data.
>>>
>>> For example, with the file layout:
>>> ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"
>>>
>>> fd = open(, O_DIRECT | O_WRONLY, );
>>>
>>> Writer1:
>>> posix_memalign(&buffer, 4194304, SIZE);
>>> memset(buffer, 'T', SIZE);
>>> write(fd, buffer, SIZE);
>>>
>>> Writer2:
>>> posix_memalign(&buffer, 4194304, SIZE);
>>> memset(buffer, 'X', SIZE);
>>> write(fd, buffer, SIZE);
>>>
>>>  From the test results, the data in the file may end up as:
>>> TTT...TTT <---> object1
>>> XXX...XXX <---> object2
>>>
>>> The expected result should be all "XX.." or "TT.." in both object1
>>> and object2.
>>>
>> I really don't see this as broken. If you're using O_DIRECT, I don't
>> believe there is any expectation that the write operations (or even read
>> operations) will be atomic with respect to one another.
>>
>> Basically, when you do this, you're saying "I know what I'm doing", and
>> need to provide synchronization yourself between competing applications
>> and clients (typically via file locking).
>>
> In fact, here's a discussion about this from a couple of years ago. This
> one mostly applies to local filesystems, but the same concepts apply to
> CephFS as well:
>
>
> https://www.spinics.net/lists/linux-fsdevel/msg118115.html
>
Okay. So for O_DIRECT reads and writes, we don't guarantee they will
be atomic at the fs layer.

Thanks,

