From mboxrd@z Thu Jan 1 00:00:00 1970 From: Bob Peterson Date: Tue, 15 May 2018 13:29:42 -0500 Subject: [Cluster-devel] [GFS2 PATCH 2/2] GFS2: Introduce rgrp sharing In-Reply-To: <20180515182942.22547-1-rpeterso@redhat.com> References: <20180515182942.22547-1-rpeterso@redhat.com> Message-ID: <20180515182942.22547-3-rpeterso@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit This patch takes advantage of the new glock holder sharing feature for rgrps. New functions rgrp_lock and rgrp_unlock use a new semaphore to gain temporary exclusive access to the rgrps. This is needed whenever a multi-block reservation is reserved, and every time a function needs to add an rgrp to a transaction. Since multiple rgrp glocks held by gfs2_rlist_alloc all require the same access, the state parameter has been removed, and the new holder flag is specified. Also, function gfs2_check_blk_type, which had previously been using LM_ST_SHARED for access to the rgrp has been switched to LM_ST_EXCLUSIVE with the new holder flag. This means multiple nodes no longer hold the rgrp in SH at the same time. However, with the new flag, local holders will still share the rgrp, but we avoid the glock state transition from EX to SH and back, (which may involve both DLM and the workqueue) which is another performance boost for the block allocator. Signed-off-by: Bob Peterson --- fs/gfs2/bmap.c | 2 +- fs/gfs2/dir.c | 2 +- fs/gfs2/incore.h | 2 ++ fs/gfs2/inode.c | 7 ++-- fs/gfs2/rgrp.c | 86 ++++++++++++++++++++++++++++++++++++++++-------- fs/gfs2/rgrp.h | 2 +- fs/gfs2/super.c | 8 +++-- fs/gfs2/xattr.c | 8 +++-- 8 files changed, 91 insertions(+), 26 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 0590e93494f7..ac484e6623c9 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1108,7 +1108,7 @@ static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh, goto out; } ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - 0, rd_gh); + LM_FLAG_NODE_EX, rd_gh); if (ret) goto out; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index d9fb0ad6cc30..7327a9d43692 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -2020,7 +2020,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, l_blocks++; } - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + gfs2_rlist_alloc(&rlist); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0bbbaa9b05cb..03ddd01d58d4 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -23,6 +23,7 @@ #include #include #include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -100,6 +101,7 @@ struct gfs2_rgrpd { #define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ spinlock_t rd_rsspin; /* protects reservation related vars */ + struct semaphore rd_sem; /* ensures local rgrp exclusivity */ struct rb_root rd_rstree; /* multi-block reservation tree */ }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8700eb815638..3176cadfc309 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1121,8 +1121,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) if (!rgd) goto out_inodes; - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); - + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_EX, + ghs + 2); error = gfs2_glock_nq(ghs); /* parent */ if (error) @@ -1409,7 +1409,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, */ nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1); if (nrgd) - gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_EX, ghs + num_gh++); } for (x = 0; x < num_gh; x++) { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8b683917a27e..7891229b836d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -904,6 +904,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_data = be32_to_cpu(buf.ri_data); rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin); + sema_init(&rgd->rd_sem, 1); error = compute_bitstructs(rgd); if (error) @@ -1344,6 +1345,25 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, return -EIO; } +/** + * rgrp_lock - gain exclusive access to an rgrp + * @gl: the glock + * + * This function waits for exclusive access to an rgrp whose glock is held in + * EX, but shared via the LM_FLAG_NODE_EX holder flag. + */ +static void rgrp_lock(struct gfs2_rgrpd *rgd) +{ + BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl)); + down(&rgd->rd_sem); +} + +static void rgrp_unlock(struct gfs2_rgrpd *rgd) +{ + BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl)); + up(&rgd->rd_sem); +} + /** * gfs2_fitrim - Generate discard requests for unused bits of the filesystem * @filp: Any file on the filesystem @@ -1399,7 +1419,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) while (1) { - ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_EX, &gh); if (ret) goto out; @@ -1420,11 +1441,13 @@ int gfs2_fitrim(struct file *filp, void __user *argp) /* Mark rgrp as having been trimmed */ ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); if (ret == 0) { + rgrp_lock(rgd); bh = rgd->rd_bits[0].bi_bh; rgd->rd_flags |= GFS2_RGF_TRIMMED; gfs2_trans_add_meta(rgd->rd_gl, bh); gfs2_rgrp_out(rgd, bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); + rgrp_unlock(rgd); gfs2_trans_end(sdp); } } @@ -1768,6 +1791,16 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, * * Returns: 0 if no error * The inode, if one has been found, in inode. + * We must be careful to avoid deadlock here: + * + * All transactions expect: sd_log_flush_lock followed by rgrp ex (if neeeded), + * but try_rgrp_unlink takes sd_log_flush_lock outside a transaction and + * therefore must not have the rgrp ex already held. To avoid deadlock, we + * drop the rgrp ex lock before taking the log_flush_lock, then reacquire it + * to protect our call to gfs2_rbm_find. + * + * Also note that rgrp_unlock must come AFTER the caller does gfs2_rs_deltree + * because rgrp ex needs to be held before making reservations. */ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) @@ -1781,7 +1814,12 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 }; while (1) { + /* As explained above, we need to drop the rgrp ex lock and + * reacquire it after we get for sd_log_flush_lock. + */ + rgrp_unlock(rgd); down_write(&sdp->sd_log_flush_lock); + rgrp_lock(rgd); error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, true); up_write(&sdp->sd_log_flush_lock); @@ -1980,7 +2018,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; struct gfs2_blkreserv *rs = &ip->i_res; - int error = 0, rg_locked, flags = 0; + int error = 0, rg_locked, flags = LM_FLAG_NODE_EX; u64 last_unlinked = NO_BLOCK; int loops = 0; u32 skip = 0; @@ -1991,6 +2029,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) return -EINVAL; if (gfs2_rs_active(rs)) { begin = rs->rs_rbm.rgd; + flags = LM_FLAG_NODE_EX; } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { rs->rs_rbm.rgd = begin = ip->i_rgd; } else { @@ -2023,16 +2062,20 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) &rs->rs_rgd_gh); if (unlikely(error)) return error; + rgrp_lock(rs->rs_rbm.rgd); if (!gfs2_rs_active(rs) && (loops < 2) && gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) { error = update_rgrp_lvb(rs->rs_rbm.rgd); if (unlikely(error)) { + rgrp_unlock(rs->rs_rbm.rgd); gfs2_glock_dq_uninit(&rs->rs_rgd_gh); return error; } } + } else { + rgrp_lock(rs->rs_rbm.rgd); } /* Skip unuseable resource groups */ @@ -2058,14 +2101,21 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) { ip->i_rgd = rs->rs_rbm.rgd; ap->allowed = ip->i_rgd->rd_free_clone; + rgrp_unlock(rs->rs_rbm.rgd); return 0; } check_rgrp: /* Check for unlinked inodes which can be reclaimed */ - if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) + if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) { + /* Drop reservation if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(rs); try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, ip->i_no_addr); + } skip_rgrp: + rgrp_unlock(rs->rs_rbm.rgd); + /* Drop reservation, if we couldn't use reserved rgrp */ if (gfs2_rs_active(rs)) gfs2_rs_deltree(rs); @@ -2169,7 +2219,7 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, } /** - * rgblk_free - Change alloc state of given block(s) + * rgblk_free_and_lock - Change alloc state of given block(s) and lock rgrp ex * @sdp: the filesystem * @bstart: the start of a run of blocks to free * @blen: the length of the block run (all must lie within ONE RG!) @@ -2178,8 +2228,8 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, * Returns: Resource group containing the block(s) */ -static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, - u32 blen, unsigned char new_state) +static struct gfs2_rgrpd *rgblk_free_and_lock(struct gfs2_sbd *sdp, u64 bstart, + u32 blen, unsigned char new_state) { struct gfs2_rbm rbm; struct gfs2_bitmap *bi, *bi_prev = NULL; @@ -2192,6 +2242,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, } gfs2_rbm_from_block(&rbm, bstart); + rgrp_lock(rbm.rgd); while (blen--) { bi = rbm_bi(&rbm); if (bi != bi_prev) { @@ -2341,6 +2392,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, int error; gfs2_set_alloc_start(&rbm, ip, dinode); + rgrp_lock(rbm.rgd); error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false); if (error == -ENOSPC) { @@ -2395,6 +2447,8 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); + rgrp_unlock(rbm.rgd); + gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); if (dinode) gfs2_trans_add_unrevoke(sdp, block, *nblocks); @@ -2408,6 +2462,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, return 0; rgrp_error: + rgrp_unlock(rbm.rgd); gfs2_rgrp_error(rbm.rgd); return -EIO; } @@ -2426,7 +2481,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; - rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + rgd = rgblk_free_and_lock(sdp, bstart, blen, GFS2_BLKST_FREE); if (!rgd) return; trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); @@ -2435,6 +2490,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + rgrp_unlock(rgd); /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth) @@ -2465,7 +2521,7 @@ void gfs2_unlink_di(struct inode *inode) struct gfs2_rgrpd *rgd; u64 blkno = ip->i_no_addr; - rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); + rgd = rgblk_free_and_lock(sdp, blkno, 1, GFS2_BLKST_UNLINKED); if (!rgd) return; trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); @@ -2473,6 +2529,7 @@ void gfs2_unlink_di(struct inode *inode) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, 1); + rgrp_unlock(rgd); } void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) @@ -2480,7 +2537,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_rgrpd *tmp_rgd; - tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE); + tmp_rgd = rgblk_free_and_lock(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE); if (!tmp_rgd) return; gfs2_assert_withdraw(sdp, rgd == tmp_rgd); @@ -2494,6 +2551,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, -1); + rgrp_unlock(rgd); gfs2_statfs_change(sdp, 0, +1, -1); trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); @@ -2522,7 +2580,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) if (!rgd) goto fail; - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_EX, &rgd_gh); if (error) goto fail; @@ -2601,16 +2660,15 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, * */ -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist) { unsigned int x; rlist->rl_ghs = kmalloc(rlist->rl_rgrps * sizeof(struct gfs2_holder), GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) - gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, - state, 0, - &rlist->rl_ghs[x]); + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_EX, &rlist->rl_ghs[x]); } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index e90478e2f545..ca40cdcc4b7e 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -68,7 +68,7 @@ struct gfs2_rgrp_list { extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block); -extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index cf5c7f3080d2..57f30f96b3b6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1131,8 +1131,9 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host done = 0; else if (rgd_next && !error) { error = gfs2_glock_nq_init(rgd_next->rd_gl, - LM_ST_SHARED, - GL_ASYNC, + LM_ST_EXCLUSIVE, + GL_ASYNC| + LM_FLAG_NODE_EX, gh); rgd_next = gfs2_rgrpd_get_next(rgd_next); done = 0; @@ -1507,7 +1508,8 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) goto out_qs; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_EX, &gh); if (error) goto out_qs; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f2bce1e0f6fb..6a698a74dcb5 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -262,7 +262,8 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_EX, &rg_gh); if (error) return error; @@ -1314,7 +1315,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) else goto out; - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + gfs2_rlist_alloc(&rlist); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); @@ -1397,7 +1398,8 @@ static int ea_dealloc_block(struct gfs2_inode *ip) return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_EX, &gh); if (error) return error; -- 2.17.0