From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Yan, Zheng" Subject: Re: [RFC PATCH 8/9] ceph: copy layout, max_size and truncate_size on successful sync create Date: Tue, 14 Jan 2020 00:37:13 +0800 Message-ID: <85ef5b94-92c6-2b97-3da5-08a5e0d575ff@redhat.com> References: <20200110205647.311023-1-jlayton@kernel.org> <20200110205647.311023-9-jlayton@kernel.org> <98bb5af6-415f-e035-2d54-15ed492914b4@redhat.com> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 7bit Return-path: Received: from us-smtp-delivery-1.mimecast.com ([207.211.31.120]:26297 "EHLO us-smtp-1.mimecast.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1726943AbgAMQhZ (ORCPT ); Mon, 13 Jan 2020 11:37:25 -0500 In-Reply-To: Content-Language: en-US Sender: ceph-devel-owner@vger.kernel.org List-ID: To: Jeff Layton , ceph-devel@vger.kernel.org Cc: sage@redhat.com, idryomov@gmail.com, pdonnell@redhat.com On 1/13/20 11:13 PM, Jeff Layton wrote: > On Mon, 2020-01-13 at 22:56 +0800, Yan, Zheng wrote: >> On 1/13/20 9:26 PM, Jeff Layton wrote: >>> On Mon, 2020-01-13 at 11:51 +0800, Yan, Zheng wrote: >>>> On 1/11/20 4:56 AM, Jeff Layton wrote: >>>>> It doesn't do much good to do an asynchronous create unless we can do >>>>> I/O to it before the create reply comes in. That means we need layout >>>>> info the new file before we've gotten the response from the MDS. >>>>> >>>>> All files created in a directory will initially inherit the same layout, >>>>> so copy off the requisite info from the first synchronous create in the >>>>> directory. Save it in the same fields in the directory inode, as those >>>>> are otherwise unsed for dir inodes. This means we need to be a bit >>>>> careful about only updating layout info on non-dir inodes. >>>>> >>>>> Also, zero out the layout when we drop Dc caps in the dir. >>>>> >>>>> Signed-off-by: Jeff Layton >>>>> --- >>>>> fs/ceph/caps.c | 24 ++++++++++++++++++++---- >>>>> fs/ceph/file.c | 24 +++++++++++++++++++++++- >>>>> fs/ceph/inode.c | 4 ++-- >>>>> 3 files changed, 45 insertions(+), 7 deletions(-) >>>>> >>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c >>>>> index 7fc87b693ba4..b96fb1378479 100644 >>>>> --- a/fs/ceph/caps.c >>>>> +++ b/fs/ceph/caps.c >>>>> @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want, >>>>> return ret; >>>>> } >>>>> >>>>> - if (S_ISREG(ci->vfs_inode.i_mode) && >>>>> + if (!S_ISDIR(ci->vfs_inode.i_mode) && >>>>> ci->i_inline_version != CEPH_INLINE_NONE && >>>>> (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && >>>>> i_size_read(inode) > 0) { >>>>> @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) >>>>> if (had & CEPH_CAP_FILE_RD) >>>>> if (--ci->i_rd_ref == 0) >>>>> last++; >>>>> - if (had & CEPH_CAP_FILE_CACHE) >>>>> - if (--ci->i_rdcache_ref == 0) >>>>> + if (had & CEPH_CAP_FILE_CACHE) { >>>>> + if (--ci->i_rdcache_ref == 0) { >>>>> last++; >>>>> + /* Zero out layout if we lost CREATE caps */ >>>>> + if (S_ISDIR(inode->i_mode) && >>>>> + !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) { >>>>> + ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); >>>>> + memset(&ci->i_layout, 0, sizeof(ci->i_layout)); >>>>> + } >>>>> + } >>>>> + } >>>> >>>> should do this in __check_cap_issue >>>> >>> >>> That function doesn't get called from the put codepath. Suppose one task >>> is setting up an async create and a Dc (DIR_CREATE) cap revoke races in. >>> We zero out the layout and then the inode has a bogus layout set in it. >>> >>> We can't wipe the cached layout until all of the Dc references are put. >> >> how about: >> >> get_caps_for_async_create() return the layout. >> pass the returned layout into ceph_finish_async_open() >> > > That still sounds racy. > > What guarantees the stability of the cached layout while we're copying > it in get_caps_for_async_create()? I guess we could protect the new > cached layout field with the i_ceph_lock. > Yes. copy cached layout while holding i_ceph_lock after getting Fsc caps. Besides, we need to make sure ceph_finish_async_open get called before dropping Fsc. > Is that a real improvement? I'm not sure. > I think it's more clean. >> >> >>>>> if (had & CEPH_CAP_FILE_EXCL) >>>>> if (--ci->i_fx_ref == 0) >>>>> last++; >>>>> @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode, >>>>> ci->i_subdirs = extra_info->nsubdirs; >>>>> } >>>>> >>>>> - if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { >>>>> + if (!S_ISDIR(inode->i_mode) && >>>>> + (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { >>>>> /* file layout may have changed */ >>>>> s64 old_pool = ci->i_layout.pool_id; >>>>> struct ceph_string *old_ns; >>>>> @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode, >>>>> ceph_cap_string(cap->issued), >>>>> ceph_cap_string(newcaps), >>>>> ceph_cap_string(revoking)); >>>>> + >>>>> + if (S_ISDIR(inode->i_mode) && >>>>> + (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) { >>>>> + ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); >>>>> + memset(&ci->i_layout, 0, sizeof(ci->i_layout)); >>>>> + } >>>> >>>> same here >>>> >>>>> + >>>>> if (S_ISREG(inode->i_mode) && >>>>> (revoking & used & CEPH_CAP_FILE_BUFFER)) >>>>> writeback = true; /* initiate writeback; will delay ack */ >>>>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >>>>> index 1e6cdf2dfe90..d4d7a277faf1 100644 >>>>> --- a/fs/ceph/file.c >>>>> +++ b/fs/ceph/file.c >>>>> @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file) >>>>> return err; >>>>> } >>>>> >>>>> +/* Clone the layout from a synchronous create, if the dir now has Dc caps */ >>>>> +static void >>>>> +copy_file_layout(struct inode *dst, struct inode *src) >>>>> +{ >>>>> + struct ceph_inode_info *cdst = ceph_inode(dst); >>>>> + struct ceph_inode_info *csrc = ceph_inode(src); >>>>> + >>>>> + spin_lock(&cdst->i_ceph_lock); >>>>> + if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && >>>>> + !ceph_file_layout_is_valid(&cdst->i_layout)) { >>>>> + memcpy(&cdst->i_layout, &csrc->i_layout, >>>>> + sizeof(cdst->i_layout)); >>>>> + rcu_assign_pointer(cdst->i_layout.pool_ns, >>>>> + ceph_try_get_string(csrc->i_layout.pool_ns)); >>>>> + cdst->i_max_size = csrc->i_max_size; >>>>> + cdst->i_truncate_size = csrc->i_truncate_size; >>>>> + } >>>>> + spin_unlock(&cdst->i_ceph_lock); >>>>> +} >>>>> >>>>> /* >>>>> * Do a lookup + open with a single request. If we get a non-existent >>>>> @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, >>>>> } else { >>>>> dout("atomic_open finish_open on dn %p\n", dn); >>>>> if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { >>>>> - ceph_init_inode_acls(d_inode(dentry), &as_ctx); >>>>> + struct inode *newino = d_inode(dentry); >>>>> + >>>>> + copy_file_layout(dir, newino); >>>>> + ceph_init_inode_acls(newino, &as_ctx); >>>>> file->f_mode |= FMODE_CREATED; >>>>> } >>>>> err = finish_open(file, dentry, ceph_open); >>>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c >>>>> index 9cfc093fd273..8b51051b79b0 100644 >>>>> --- a/fs/ceph/inode.c >>>>> +++ b/fs/ceph/inode.c >>>>> @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, >>>>> ci->i_subdirs = le64_to_cpu(info->subdirs); >>>>> } >>>>> >>>>> - if (new_version || >>>>> - (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { >>>>> + if (!S_ISDIR(inode->i_mode) && (new_version || >>>>> + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) { >>>>> s64 old_pool = ci->i_layout.pool_id; >>>>> struct ceph_string *old_ns; >>>>> >>>>> >