From mboxrd@z Thu Jan 1 00:00:00 1970 From: wcheng@sourceware.org Date: 14 Feb 2007 23:15:44 -0000 Subject: [Cluster-devel] cluster/gfs-kernel/src/gfs inode.c inode.h ops ... Message-ID: <20070214231544.12927.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL4 Changes by: wcheng at sourceware.org 2007-02-14 23:15:44 Modified files: gfs-kernel/src/gfs: inode.c inode.h ops_inode.c Log message: Final patch for bugzilla 1904745 - there is a deadlock documented in the bugzilla entry comment #26 that is messy to get around under current GFS lock order rule. To compromise the issue, we will fail the rename with the original error return code (-ENOENT), if the new ino has any possibility to cause deadlock. This implies, if GFS ino number is allocated via true random manner, we only reduce this error return code by 25%. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/inode.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.20.2.3&r2=1.20.2.4 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/inode.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.3.2.1&r2=1.3.2.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/ops_inode.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.6.2.5&r2=1.6.2.6 --- cluster/gfs-kernel/src/gfs/inode.c 2006/10/10 18:35:27 1.20.2.3 +++ cluster/gfs-kernel/src/gfs/inode.c 2007/02/14 23:15:44 1.20.2.4 @@ -335,7 +335,7 @@ /* Read dinode from disk */ error = gfs_copyin_dinode(ip); - if (error) + if (error) goto fail_iopen; gfs_glock_hold(i_gl); @@ -1566,7 +1566,8 @@ */ int -gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip) +gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name, + struct gfs_inode *ip, struct gfs_inum *inump) { struct gfs_inum inum; unsigned int type; @@ -1589,11 +1590,22 @@ return error; error = gfs_dir_search(dip, name, &inum, &type); - if (error) + if (error) return error; - if (inum.no_formal_ino != ip->i_num.no_formal_ino) + if (inum.no_formal_ino != ip->i_num.no_formal_ino) { + /* + * Add for rename - bugzilla 190475 + * return new inum such that gfs_rename can + * do its silly rename. + */ + if (unlikely(inump)) { + inump->no_formal_ino = inum.no_formal_ino; + inump->no_addr = inum.no_addr; + } + return -ENOENT; + } if (ip->i_di.di_type != type) { gfs_consist_inode(dip); --- cluster/gfs-kernel/src/gfs/inode.h 2007/01/16 20:39:03 1.3.2.1 +++ cluster/gfs-kernel/src/gfs/inode.h 2007/02/14 23:15:44 1.3.2.2 @@ -37,7 +37,7 @@ int gfs_unlinki(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip); int gfs_rmdiri(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip); int gfs_unlink_ok(struct gfs_inode *dip, struct qstr *name, - struct gfs_inode *ip); + struct gfs_inode *ip, struct gfs_inum *inum); int gfs_ok_to_move(struct gfs_inode *this, struct gfs_inode *to); int gfs_readlinki(struct gfs_inode *ip, char **buf, unsigned int *len); --- cluster/gfs-kernel/src/gfs/ops_inode.c 2007/01/18 20:40:12 1.6.2.5 +++ cluster/gfs-kernel/src/gfs/ops_inode.c 2007/02/14 23:15:44 1.6.2.6 @@ -537,7 +537,7 @@ if (error) goto fail; - error = gfs_unlink_ok(dip, &dentry->d_name, ip); + error = gfs_unlink_ok(dip, &dentry->d_name, ip, NULL); if (error) goto fail_gunlock; @@ -772,7 +772,7 @@ if (error) goto fail; - error = gfs_unlink_ok(dip, &dentry->d_name, ip); + error = gfs_unlink_ok(dip, &dentry->d_name, ip, NULL); if (error) goto fail_gunlock; @@ -914,6 +914,75 @@ return 0; } +/* + * Obtain a clone dentry with correct inode: + * + * This is to work around the window between lock_rename() + * and gfs_rename() where another node may have renamed the + * target file (bugzilla 190475). The bug most likely occurs + * in a mail server environment. + * + * Return true if the dentry has been successfully updated. + */ +static int +gfs_refresh_dentry(struct gfs_sbd *sdp, struct dentry **f_dentry, + struct gfs_inum *inump) +{ + struct inode *inode; + struct dentry *dentry, *sdentry=*f_dentry; + + dentry = d_alloc(sdentry->d_parent, &sdentry->d_name); + if (!dentry) + return 0; + + /* Get the fully updated inode */ + inode = NULL; + if (inump->no_formal_ino) { + inode = gfs_refresh_iobj(sdp, inump, NULL); + if (IS_ERR(inode)) { + dput(dentry); + return 0; + } + } + + /* + * Remove the old dentry from cache to avoid + * another lookup. + */ + if (!d_unhashed(sdentry)) { + d_drop(sdentry); + } + + /* Instantiate the new dentry */ + dentry->d_op = &gfs_dops; + + d_splice_alias(inode, dentry); + + /* Return with success */ + *f_dentry = dentry; + return 1; +} + +/* + * Check lock order for the new ino: + * return 1: if it could lead to deadlock + * return 0: if ordering is ok. + */ + +static int +gfs_check_lock_order(struct gfs_holder *ghs, int n_gh, uint64_t ino) +{ + int x; + + /* no fancy algorithm here since max n_gh is 4 */ + for (x = 0; x < n_gh; x++) { + if (ino < ghs[x].gh_gl->gl_name.ln_number) + return 1; + } + + return 0; +} + /** * gfs_rename - Rename a file * @odir: Parent directory of old file name @@ -935,17 +1004,21 @@ struct gfs_sbd *sdp = odip->i_sbd; struct qstr name; struct gfs_alloc *al; - struct gfs_holder ghs[4], r_gh; + struct gfs_holder ghs[5], r_gh; unsigned int num_gh; int dir_rename = FALSE; int alloc_required; unsigned int x; int error; + int once_cnt=0, has_ndentry=0; + struct gfs_inum new_inum; atomic_inc(&sdp->sd_ops_inode); gfs_unlinked_limit(sdp); + odir->i_sb->s_type->fs_flags &= ~FS_ODD_RENAME; + if (ndentry->d_inode) { nip = vn2ip(ndentry->d_inode); if (ip == nip) @@ -968,36 +1041,78 @@ goto fail; } - num_gh = 1; + num_gh = 1; gfs_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (odip != ndip) { gfs_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs+num_gh); num_gh++; } + gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs+num_gh); num_gh++; - if (nip) { gfs_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); num_gh++; } error = gfs_glock_nq_m(num_gh, ghs); - if (error) - goto fail_uninit; + if (error) + goto fail_uninit; /* Check out the old directory */ - error = gfs_unlink_ok(odip, &odentry->d_name, ip); + error = gfs_unlink_ok(odip, &odentry->d_name, ip, NULL); if (error) goto fail_gunlock; +gfs_rename_retry: + /* Check out the new directory */ if (nip) { - error = gfs_unlink_ok(ndip, &ndentry->d_name, nip); - if (error) + /* Zero out this field so we can know whether gfs_unlink_ok + * could fill in the correct inode numbers. + */ + new_inum.no_formal_ino = 0; + error = gfs_unlink_ok(ndip, &ndentry->d_name, nip, &new_inum); + + if (unlikely(error)) { + /* + * Handle a rare race condition that the new file + * could have been renamed by another node - the + * correct inode number is passed into new_inum. + * We only allow this to happen once. + */ + if ((error == -ENOENT) && (!once_cnt) + && (new_inum.no_formal_ino)) + { + if (gfs_check_lock_order(ghs, num_gh, + new_inum.no_formal_ino)) + /* fail rename with -ENOENT */ + goto fail_gunlock; + + if (gfs_refresh_dentry(sdp,&ndentry,&new_inum)) + { + odir->i_sb->s_type->fs_flags + |= FS_ODD_RENAME; + has_ndentry = 1; + nip = vn2ip(ndentry->d_inode); + if (nip) { + once_cnt++; + error = gfs_glock_nq_init(nip->i_gl, + LM_ST_EXCLUSIVE, 0, + &ghs[num_gh++]); + if (error) { + num_gh--; + error = -ENOENT; + } + else + goto gfs_rename_retry; + } + } + } goto fail_gunlock; + } if (nip->i_di.di_type == GFS_FILE_DIR) { if (nip->i_di.di_entries < 2) { @@ -1150,6 +1265,14 @@ if (dir_rename) gfs_glock_dq_uninit(&r_gh); + + /* dput new dentry as vfs layer still uses old entry */ + if (unlikely(has_ndentry)) { + d_move(odentry, ndentry); + dput(ndentry); + dput(ndentry); + } + return 0; fail_end_trans: @@ -1178,6 +1301,12 @@ if (dir_rename) gfs_glock_dq_uninit(&r_gh); + if (unlikely(has_ndentry)) { + d_move(odentry, ndentry); + dput(ndentry); + dput(ndentry); + } + return error; }