All of lore.kernel.org
 help / color / mirror / Atom feed
* [Cluster-devel] [PATCH 0/2] Extended attribute readahead
@ 2015-11-01 19:02 Andreas Gruenbacher
  2015-11-01 19:02 ` [Cluster-devel] [PATCH 1/2] gfs2: " Andreas Gruenbacher
  2015-11-01 19:02 ` [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization Andreas Gruenbacher
  0 siblings, 2 replies; 13+ messages in thread
From: Andreas Gruenbacher @ 2015-11-01 19:02 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hello,

here are two patches that implement extended attribute readahead for those
cases where the extended attribute block is allocated immediately upon
inode creation (i.e., with SELinux or inherited permissions).  The first
patch submits a READA request for the extended attributes after submitting
a READ request for the inode.

The second patch submits both requests in a single bio instead.  As you can
see, this is somewhat messy.  I'm not convinced that this optimization buys
us anything, so I will not recommend merging this patch.

Git tree:

 git://git.kernel.org/pub/scm/linux/kernel/git/agruen/linux.git gfs2-wip

Could you please review?

Thanks,
Andreas

Andreas Gruenbacher (2):
  gfs2: Extended attribute readahead
  gfs2: Extended attribute readahead optimization

 fs/gfs2/dir.c     | 15 ++++++---
 fs/gfs2/incore.h  |  1 +
 fs/gfs2/meta_io.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/gfs2/meta_io.h |  2 +-
 fs/gfs2/quota.c   |  2 +-
 fs/gfs2/rgrp.c    |  2 +-
 fs/gfs2/super.c   |  1 +
 fs/gfs2/xattr.c   | 10 +++---
 8 files changed, 109 insertions(+), 19 deletions(-)

-- 
2.5.0



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 1/2] gfs2: Extended attribute readahead
  2015-11-01 19:02 [Cluster-devel] [PATCH 0/2] Extended attribute readahead Andreas Gruenbacher
@ 2015-11-01 19:02 ` Andreas Gruenbacher
  2015-11-03 17:29   ` Bob Peterson
  2015-11-01 19:02 ` [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization Andreas Gruenbacher
  1 sibling, 1 reply; 13+ messages in thread
From: Andreas Gruenbacher @ 2015-11-01 19:02 UTC (permalink / raw)
  To: cluster-devel.redhat.com

When gfs2 allocates an inode and its extended attribute block next to
each other at inode create time, the inode's directory entry indicates
that in de_rahead.  In that case, we can readahead the extended
attribute block when we read in the inode.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/dir.c     | 15 +++++++++++----
 fs/gfs2/incore.h  |  1 +
 fs/gfs2/meta_io.c | 27 +++++++++++++++++++++++++--
 fs/gfs2/meta_io.h |  2 +-
 fs/gfs2/quota.c   |  2 +-
 fs/gfs2/rgrp.c    |  2 +-
 fs/gfs2/super.c   |  1 +
 fs/gfs2/xattr.c   | 10 +++++-----
 8 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 487527b..b377bdc 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -108,7 +108,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
 	struct buffer_head *bh;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
+	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh);
 	if (error)
 		return error;
 	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
@@ -305,7 +305,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
 			BUG_ON(extlen < 1);
 			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
 		} else {
-			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh);
 			if (error)
 				goto fail;
 		}
@@ -718,7 +718,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
 {
 	int error;
 
-	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp);
 	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
 		/* pr_info("block num=%llu\n", leaf_no); */
 		error = -EIO;
@@ -1555,15 +1555,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
 
 	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
 	if (dent) {
+		struct inode *inode;
+		u16 rahead;
+
 		if (IS_ERR(dent))
 			return ERR_CAST(dent);
 		dtype = be16_to_cpu(dent->de_type);
+		rahead = be16_to_cpu(dent->de_rahead);
 		addr = be64_to_cpu(dent->de_inum.no_addr);
 		formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
 		brelse(bh);
 		if (fail_on_exist)
 			return ERR_PTR(-EEXIST);
-		return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+		inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+		if (!IS_ERR(inode))
+			GFS2_I(inode)->i_rahead = rahead;
+		return inode;
 	}
 	return ERR_PTR(-ENOENT);
 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 121ed08..722bf69 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -403,6 +403,7 @@ struct gfs2_inode {
 	u32 i_diskflags;
 	u8 i_height;
 	u8 i_depth;
+	u16 i_rahead;
 };
 
 /*
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0e1d4be..0f24828 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -187,6 +187,21 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 	return bh;
 }
 
+static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
+{
+	struct buffer_head *bh;
+
+	bh = gfs2_getbuf(gl, blkno, 1);
+	lock_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		brelse(bh);
+		return;
+	}
+	bh->b_end_io = end_buffer_read_sync;
+	submit_bh(READA | REQ_META | REQ_PRIO, bh);
+}
+
 /**
  * gfs2_meta_read - Read a block from disk
  * @gl: The glock covering the block
@@ -198,7 +213,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
  */
 
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
-		   struct buffer_head **bhp)
+		   int rahead, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	struct buffer_head *bh;
@@ -213,11 +228,15 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	lock_buffer(bh);
 	if (buffer_uptodate(bh)) {
 		unlock_buffer(bh);
+		if (rahead)
+			gfs2_meta_readahead(gl, blkno + 1);
 		return 0;
 	}
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
 	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
+	if (rahead)
+		gfs2_meta_readahead(gl, blkno + 1);
 	if (!(flags & DIO_WAIT))
 		return 0;
 
@@ -341,8 +360,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 	struct buffer_head *bh;
 	int ret = 0;
 	u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+	int rahead = 0;
+
+	if (num == ip->i_no_addr)
+		rahead = ip->i_rahead;
 
-	ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+	ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh);
 	if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
 		brelse(bh);
 		ret = -EIO;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 8ca1615..c5086c8 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
 
 extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
 extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
-			  struct buffer_head **bhp);
+			  int rahead, struct buffer_head **bhp);
 extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
 extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
 				       int create);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3a31226..e01298d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd)
 	error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
 	if (error)
 		goto fail;
-	error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+	error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh);
 	if (error)
 		goto fail;
 	error = -EIO;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 475985d..a2a6075 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1157,7 +1157,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 
 	for (x = 0; x < length; x++) {
 		bi = rgd->rd_bits + x;
-		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
+		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
 		if (error)
 			goto fail;
 	}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 894fb01..8f94282 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1633,6 +1633,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
 		ip->i_gl = NULL;
 		ip->i_rgd = NULL;
 		ip->i_res = NULL;
+		ip->i_rahead = 0;
 	}
 	return &ip->i_inode;
 }
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4c096fa..f0fe884 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 	__be64 *eablk, *end;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh);
 	if (error)
 		return error;
 
@@ -143,7 +143,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 			break;
 		bn = be64_to_cpu(*eablk);
 
-		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh);
 		if (error)
 			break;
 		error = ea_foreach_i(ip, eabh, ea_call, data);
@@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 		return -ENOMEM;
 
 	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0,
 				       bh + x);
 		if (error) {
 			while (x--)
@@ -977,7 +977,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
 		__be64 *end;
 
-		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0,
 				       &indbh);
 		if (error)
 			return error;
@@ -1303,7 +1303,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh);
 	if (error)
 		return error;
 
-- 
2.5.0



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
  2015-11-01 19:02 [Cluster-devel] [PATCH 0/2] Extended attribute readahead Andreas Gruenbacher
  2015-11-01 19:02 ` [Cluster-devel] [PATCH 1/2] gfs2: " Andreas Gruenbacher
@ 2015-11-01 19:02 ` Andreas Gruenbacher
  2015-11-12 13:44   ` Steven Whitehouse
  1 sibling, 1 reply; 13+ messages in thread
From: Andreas Gruenbacher @ 2015-11-01 19:02 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Instead of submitting a separate bio for the inode and its extended
attributes, submit a single bio for both when possible.  The entire
request becomes a normal read, not a readahead.

To keep track of the buffer heads that make up the bio, we allocate
temporary buffer head arrays: in the endio handler, it would also be
possible to compute the buffer head block numbers from the bio and to
grab the buffer heads with gfs2_getbuf, but the code would become even
messier.
---
 fs/gfs2/meta_io.c | 94 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 76 insertions(+), 18 deletions(-)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0f24828..e650127 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 	return bh;
 }
 
-static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
+struct gfs2_meta_read {
+	int num;
+	struct buffer_head *bhs[0];
+};
+
+static void gfs2_meta_read_endio(struct bio *bio) {
+	struct gfs2_meta_read *r = bio->bi_private;
+	int i;
+
+	for (i = 0; i < r->num; i++) {
+		struct buffer_head *bh = r->bhs[i];
+
+		if (unlikely(bio_flagged(bio, BIO_QUIET)))
+			set_bit(BH_Quiet, &bh->b_state);
+
+		bh->b_end_io(bh, !bio->bi_error);
+	}
+	bio_put(bio);
+	kfree(r);
+}
+
+/*
+ * (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
 {
-	struct buffer_head *bh;
+	struct gfs2_meta_read *r;
+	struct buffer_head *bh = bhs[0];
+	struct bio *bio;
+	int i;
 
-	bh = gfs2_getbuf(gl, blkno, 1);
-	lock_buffer(bh);
-	if (buffer_uptodate(bh)) {
-		unlock_buffer(bh);
-		brelse(bh);
+	if (!num)
+		return;
+
+	if (num == 1) {
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(rw, bh);
 		return;
 	}
-	bh->b_end_io = end_buffer_read_sync;
-	submit_bh(READA | REQ_META | REQ_PRIO, bh);
+
+	r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]),
+		    GFP_NOIO | __GFP_NOFAIL);
+	r->num = num;
+	for (i = 0; i < num; i++)
+		r->bhs[i] = bhs[i];
+
+	bio = bio_alloc(GFP_NOIO, num);
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	for (i = 0; i < num; i++) {
+		bh = bhs[i];
+		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+	}
+	bio->bi_end_io = gfs2_meta_read_endio;
+	bio->bi_private = r;
+
+	submit_bio(rw, bio);
 }
 
 /**
@@ -216,7 +260,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		   int rahead, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-	struct buffer_head *bh;
+	struct buffer_head *bh, *bhs[2];
+	int num = 0;
 
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
 		*bhp = NULL;
@@ -228,18 +273,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	lock_buffer(bh);
 	if (buffer_uptodate(bh)) {
 		unlock_buffer(bh);
-		if (rahead)
-			gfs2_meta_readahead(gl, blkno + 1);
-		return 0;
+		flags &= ~DIO_WAIT;
+	} else {
+		bh->b_end_io = end_buffer_read_sync;
+		get_bh(bh);
+		bhs[num++] = bh;
 	}
-	bh->b_end_io = end_buffer_read_sync;
-	get_bh(bh);
-	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
-	if (rahead)
-		gfs2_meta_readahead(gl, blkno + 1);
+
+	if (rahead) {
+		bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+		lock_buffer(bh);
+		if (buffer_uptodate(bh)) {
+			unlock_buffer(bh);
+			brelse(bh);
+		} else {
+			bh->b_end_io = end_buffer_read_sync;
+			bhs[num++] = bh;
+		}
+	}
+
+	gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
 	if (!(flags & DIO_WAIT))
 		return 0;
 
+	bh = *bhp;
 	wait_on_buffer(bh);
 	if (unlikely(!buffer_uptodate(bh))) {
 		struct gfs2_trans *tr = current->journal_info;
-- 
2.5.0



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 1/2] gfs2: Extended attribute readahead
  2015-11-01 19:02 ` [Cluster-devel] [PATCH 1/2] gfs2: " Andreas Gruenbacher
@ 2015-11-03 17:29   ` Bob Peterson
  2015-11-03 19:02     ` Steven Whitehouse
  0 siblings, 1 reply; 13+ messages in thread
From: Bob Peterson @ 2015-11-03 17:29 UTC (permalink / raw)
  To: cluster-devel.redhat.com

----- Original Message -----
> When gfs2 allocates an inode and its extended attribute block next to
> each other at inode create time, the inode's directory entry indicates
> that in de_rahead.  In that case, we can readahead the extended
> attribute block when we read in the inode.
> 
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> ---
>  fs/gfs2/dir.c     | 15 +++++++++++----
>  fs/gfs2/incore.h  |  1 +
>  fs/gfs2/meta_io.c | 27 +++++++++++++++++++++++++--
>  fs/gfs2/meta_io.h |  2 +-
>  fs/gfs2/quota.c   |  2 +-
>  fs/gfs2/rgrp.c    |  2 +-
>  fs/gfs2/super.c   |  1 +
>  fs/gfs2/xattr.c   | 10 +++++-----
>  8 files changed, 46 insertions(+), 14 deletions(-)
> 

Hi Andreas,

Most of this looks good. However, two comments:

1. I don't like adding a new u16 to the gfs2_inode. I've been working to
   reduce the size of gfs2's inodes lately, so I'd rather see this
   implemented as a new GIF_RAHEAD (or similar) flag in gfs2_inode's i_flags.
2. It seems to me like we should take advantage of function gfs2_meta_ra()
   which already submits one block and a variable number of additional
   blocks for read-ahead, then waits for the first block IO to complete.

Regards,

Bob Peterson
Red Hat File Systems



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 1/2] gfs2: Extended attribute readahead
  2015-11-03 17:29   ` Bob Peterson
@ 2015-11-03 19:02     ` Steven Whitehouse
  2015-11-03 20:18       ` Andreas Gruenbacher
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Whitehouse @ 2015-11-03 19:02 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On 03/11/15 17:29, Bob Peterson wrote:
> ----- Original Message -----
>> When gfs2 allocates an inode and its extended attribute block next to
>> each other at inode create time, the inode's directory entry indicates
>> that in de_rahead.  In that case, we can readahead the extended
>> attribute block when we read in the inode.
>>
>> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
>> ---
>>   fs/gfs2/dir.c     | 15 +++++++++++----
>>   fs/gfs2/incore.h  |  1 +
>>   fs/gfs2/meta_io.c | 27 +++++++++++++++++++++++++--
>>   fs/gfs2/meta_io.h |  2 +-
>>   fs/gfs2/quota.c   |  2 +-
>>   fs/gfs2/rgrp.c    |  2 +-
>>   fs/gfs2/super.c   |  1 +
>>   fs/gfs2/xattr.c   | 10 +++++-----
>>   8 files changed, 46 insertions(+), 14 deletions(-)
>>
> Hi Andreas,
>
> Most of this looks good. However, two comments:
>
> 1. I don't like adding a new u16 to the gfs2_inode. I've been working to
>     reduce the size of gfs2's inodes lately, so I'd rather see this
>     implemented as a new GIF_RAHEAD (or similar) flag in gfs2_inode's i_flags.
> 2. It seems to me like we should take advantage of function gfs2_meta_ra()
>     which already submits one block and a variable number of additional
>     blocks for read-ahead, then waits for the first block IO to complete.
The meta_ra thing is a bit of a hack, and best avoided for this use. We 
want to only send out a single I/O here rather than let the I/O stack do 
the merging after the fact,

Steve.

> Regards,
>
> Bob Peterson
> Red Hat File Systems



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 1/2] gfs2: Extended attribute readahead
  2015-11-03 19:02     ` Steven Whitehouse
@ 2015-11-03 20:18       ` Andreas Gruenbacher
  0 siblings, 0 replies; 13+ messages in thread
From: Andreas Gruenbacher @ 2015-11-03 20:18 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Bob and Steve,

On Tue, Nov 3, 2015 at 8:02 PM, Steven Whitehouse <swhiteho@redhat.com> wrote:
>> Most of this looks good. However, two comments:
>>
>> 1. I don't like adding a new u16 to the gfs2_inode. I've been working to
>>     reduce the size of gfs2's inodes lately, so I'd rather see this
>>     implemented as a new GIF_RAHEAD (or similar) flag in gfs2_inode's
>> i_flags.

that's how I had this implemented in the previous version, before
Steve requested to have it changed. It seems pretty unlikely to me
that we'll readahead more than one block here, so one bit should
actually be enough.

>> 2. It seems to me like we should take advantage of function gfs2_meta_ra()
>>     which already submits one block and a variable number of additional
>>     blocks for read-ahead, then waits for the first block IO to complete.
>
> The meta_ra thing is a bit of a hack, and best avoided for this use. We want
> to only send out a single I/O here rather than let the I/O stack do the
> merging after the fact,

Well, the first patch might as well use gfs2_meta_ra I guess. I'm
unsure whether submitting a single I/O request is worth it given that
the code for doing that is quite ugly.

If it is, we could make the argument that it's also worth it for
gfs2_meta_ra. The block size might make a big difference here as well.

Thanks,
Andreas



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
  2015-11-01 19:02 ` [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization Andreas Gruenbacher
@ 2015-11-12 13:44   ` Steven Whitehouse
  2015-11-12 15:33     ` Andreas Gruenbacher
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Whitehouse @ 2015-11-12 13:44 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On 01/11/15 19:02, Andreas Gruenbacher wrote:
> Instead of submitting separate bio for the inode and its extended
> attributes, submit a single bio for both when possible.  The entire
> request becomes a normal read, not a readahead.
>
> To keep track of the buffer heads that make up the bio, we allocate
> temporary buffer head arrays: in the endio handler, it would also be
> possible to compute the buffer head block numbers from the bio and to
> grab the buffer heads with gfs2_getbuf, but the code would become even
> messier.
> ---
>   fs/gfs2/meta_io.c | 94 ++++++++++++++++++++++++++++++++++++++++++++-----------
>   1 file changed, 76 insertions(+), 18 deletions(-)
>
> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
> index 0f24828..e650127 100644
> --- a/fs/gfs2/meta_io.c
> +++ b/fs/gfs2/meta_io.c
> @@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
>   	return bh;
>   }
>   
> -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
> +struct gfs2_meta_read {
> +	int num;
> +	struct buffer_head *bhs[0];
> +};
> +
> +static void gfs2_meta_read_endio(struct bio *bio) {
> +	struct gfs2_meta_read *r = bio->bi_private;
> +	int i;
> +
> +	for (i = 0; i < r->num; i++) {
> +		struct buffer_head *bh = r->bhs[i];
> +
> +		if (unlikely(bio_flagged(bio, BIO_QUIET)))
> +			set_bit(BH_Quiet, &bh->b_state);
> +
> +		bh->b_end_io(bh, !bio->bi_error);
> +	}
> +	bio_put(bio);
> +	kfree(r);
> +}
> +
> +/*
> + * (See submit_bh_wbc.)
> + */
> +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
>   {
> -	struct buffer_head *bh;
> +	struct gfs2_meta_read *r;
> +	struct buffer_head *bh = bhs[0];
> +	struct bio *bio;
> +	int i;
>   
> -	bh = gfs2_getbuf(gl, blkno, 1);
> -	lock_buffer(bh);
> -	if (buffer_uptodate(bh)) {
> -		unlock_buffer(bh);
> -		brelse(bh);
> +	if (!num)
> +		return;
> +
> +	if (num == 1) {
> +		bh->b_end_io = end_buffer_read_sync;
> +		submit_bh(rw, bh);
>   		return;
>   	}
> -	bh->b_end_io = end_buffer_read_sync;
> -	submit_bh(READA | REQ_META | REQ_PRIO, bh);
> +
> +	r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]),
> +		    GFP_NOIO | __GFP_NOFAIL);
Can we avoid doing this I wonder? I would like to avoid adding 
GFP_NOFAIL allocations, and I think it should be possible to figure out 
where the buffers are using the bio itself (i.e. iterating over it, 
similar to gfs2_end_log_write() in lops.c) so that we don't need to 
allocate this additional memory. Otherwise this looks like the right 
approach I think,

Steve.

> +	r->num = num;
> +	for (i = 0; i < num; i++)
> +		r->bhs[i] = bhs[i];
> +
> +	bio = bio_alloc(GFP_NOIO, num);
> +	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
> +	bio->bi_bdev = bh->b_bdev;
> +	for (i = 0; i < num; i++) {
> +		bh = bhs[i];
> +		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
> +	}
> +	bio->bi_end_io = gfs2_meta_read_endio;
> +	bio->bi_private = r;
> +
> +	submit_bio(rw, bio);
>   }
>   
>   /**
> @@ -216,7 +260,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
>   		   int rahead, struct buffer_head **bhp)
>   {
>   	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> -	struct buffer_head *bh;
> +	struct buffer_head *bh, *bhs[2];
> +	int num = 0;
>   
>   	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
>   		*bhp = NULL;
> @@ -228,18 +273,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
>   	lock_buffer(bh);
>   	if (buffer_uptodate(bh)) {
>   		unlock_buffer(bh);
> -		if (rahead)
> -			gfs2_meta_readahead(gl, blkno + 1);
> -		return 0;
> +		flags &= ~DIO_WAIT;
> +	} else {
> +		bh->b_end_io = end_buffer_read_sync;
> +		get_bh(bh);
> +		bhs[num++] = bh;
>   	}
> -	bh->b_end_io = end_buffer_read_sync;
> -	get_bh(bh);
> -	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
> -	if (rahead)
> -		gfs2_meta_readahead(gl, blkno + 1);
> +
> +	if (rahead) {
> +		bh = gfs2_getbuf(gl, blkno + 1, CREATE);
> +
> +		lock_buffer(bh);
> +		if (buffer_uptodate(bh)) {
> +			unlock_buffer(bh);
> +			brelse(bh);
> +		} else {
> +			bh->b_end_io = end_buffer_read_sync;
> +			bhs[num++] = bh;
> +		}
> +	}
> +
> +	gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
>   	if (!(flags & DIO_WAIT))
>   		return 0;
>   
> +	bh = *bhp;
>   	wait_on_buffer(bh);
>   	if (unlikely(!buffer_uptodate(bh))) {
>   		struct gfs2_trans *tr = current->journal_info;



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
  2015-11-12 13:44   ` Steven Whitehouse
@ 2015-11-12 15:33     ` Andreas Gruenbacher
  2015-11-12 20:15       ` Andreas Gruenbacher
  0 siblings, 1 reply; 13+ messages in thread
From: Andreas Gruenbacher @ 2015-11-12 15:33 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Thu, Nov 12, 2015 at 2:44 PM, Steven Whitehouse <swhiteho@redhat.com> wrote:
> Hi,
>
>
> On 01/11/15 19:02, Andreas Gruenbacher wrote:
>>
>> Instead of submitting separate bio for the inode and its extended
>> attributes, submit a single bio for both when possible.  The entire
>> request becomes a normal read, not a readahead.
>>
>> To keep track of the buffer heads that make up the bio, we allocate
>> temporary buffer head arrays: in the endio handler, it would also be
>> possible to compute the buffer head block numbers from the bio and to
>> grab the buffer heads with gfs2_getbuf, but the code would become even
>> messier.
>> ---
>>   fs/gfs2/meta_io.c | 94
>> ++++++++++++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 76 insertions(+), 18 deletions(-)
>>
>> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
>> index 0f24828..e650127 100644
>> --- a/fs/gfs2/meta_io.c
>> +++ b/fs/gfs2/meta_io.c
>> @@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock
>> *gl, u64 blkno)
>>         return bh;
>>   }
>>   -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
>> +struct gfs2_meta_read {
>> +       int num;
>> +       struct buffer_head *bhs[0];
>> +};
>> +
>> +static void gfs2_meta_read_endio(struct bio *bio) {
>> +       struct gfs2_meta_read *r = bio->bi_private;
>> +       int i;
>> +
>> +       for (i = 0; i < r->num; i++) {
>> +               struct buffer_head *bh = r->bhs[i];
>> +
>> +               if (unlikely(bio_flagged(bio, BIO_QUIET)))
>> +                       set_bit(BH_Quiet, &bh->b_state);
>> +
>> +               bh->b_end_io(bh, !bio->bi_error);
>> +       }
>> +       bio_put(bio);
>> +       kfree(r);
>> +}
>> +
>> +/*
>> + * (See submit_bh_wbc.)
>> + */
>> +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
>>   {
>> -       struct buffer_head *bh;
>> +       struct gfs2_meta_read *r;
>> +       struct buffer_head *bh = bhs[0];
>> +       struct bio *bio;
>> +       int i;
>>   -     bh = gfs2_getbuf(gl, blkno, 1);
>> -       lock_buffer(bh);
>> -       if (buffer_uptodate(bh)) {
>> -               unlock_buffer(bh);
>> -               brelse(bh);
>> +       if (!num)
>> +               return;
>> +
>> +       if (num == 1) {
>> +               bh->b_end_io = end_buffer_read_sync;
>> +               submit_bh(rw, bh);
>>                 return;
>>         }
>> -       bh->b_end_io = end_buffer_read_sync;
>> -       submit_bh(READA | REQ_META | REQ_PRIO, bh);
>> +
>> +       r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]),
>> +                   GFP_NOIO | __GFP_NOFAIL);
>
> Can we avoid doing this I wonder? I would like to avoid adding GFP_NOFAIL
> allocations, and I think it should be possible to figure out where the
> buffers are using the bio itself (i.e. iterating over it, similar to
> gfs2_end_log_write() in lops.c) so that we don't need to allocate this
> additional memory.

Yes, that's better than going through gfs2_getbuf() again. I'll change that.

Thanks,
Andreas



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
  2015-11-12 15:33     ` Andreas Gruenbacher
@ 2015-11-12 20:15       ` Andreas Gruenbacher
  2015-11-12 20:33         ` Steven Whitehouse
  2015-11-13 13:48         ` Bob Peterson
  0 siblings, 2 replies; 13+ messages in thread
From: Andreas Gruenbacher @ 2015-11-12 20:15 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Here is an updated version of this patch, please review.

Thanks,
Andreas

--

Instead of submitting a READ_SYNC bio for the inode and a READA bio for
the inode's extended attributes through submit_bh, submit a single READ
bio for both through submit_bio when possible.  This can be more
efficient on some kinds of block devices.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/meta_io.c | 81 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 18 deletions(-)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0f24828..e137d96 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 	return bh;
 }
 
-static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
+static void gfs2_meta_read_endio(struct bio *bio)
 {
-	struct buffer_head *bh;
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+		struct buffer_head *bh = page_buffers(page);
+		unsigned int len = bvec->bv_len;
+
+		while (bh_offset(bh) < bvec->bv_offset)
+			bh = bh->b_this_page;
+		do {
+			struct buffer_head *next = bh->b_this_page;
+			len -= bh->b_size;
+			bh->b_end_io(bh, !bio->bi_error);
+			bh = next;
+		} while (bh && len);
+	}
+	bio_put(bio);
+}
 
-	bh = gfs2_getbuf(gl, blkno, 1);
-	lock_buffer(bh);
-	if (buffer_uptodate(bh)) {
-		unlock_buffer(bh);
-		brelse(bh);
+/*
+ * Submit several consecutive buffer head I/O requests as a single bio I/O
+ * request.  (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
+{
+	struct buffer_head *bh = bhs[0];
+	struct bio *bio;
+	int i;
+
+	if (!num)
 		return;
+
+	bio = bio_alloc(GFP_NOIO, num);
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	for (i = 0; i < num; i++) {
+		bh = bhs[i];
+		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 	}
-	bh->b_end_io = end_buffer_read_sync;
-	submit_bh(READA | REQ_META | REQ_PRIO, bh);
+	bio->bi_end_io = gfs2_meta_read_endio;
+	submit_bio(rw, bio);
 }
 
 /**
@@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		   int rahead, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-	struct buffer_head *bh;
+	struct buffer_head *bh, *bhs[2];
+	int num = 0;
 
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
 		*bhp = NULL;
@@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	lock_buffer(bh);
 	if (buffer_uptodate(bh)) {
 		unlock_buffer(bh);
-		if (rahead)
-			gfs2_meta_readahead(gl, blkno + 1);
-		return 0;
+		flags &= ~DIO_WAIT;
+	} else {
+		bh->b_end_io = end_buffer_read_sync;
+		get_bh(bh);
+		bhs[num++] = bh;
 	}
-	bh->b_end_io = end_buffer_read_sync;
-	get_bh(bh);
-	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
-	if (rahead)
-		gfs2_meta_readahead(gl, blkno + 1);
+
+	if (rahead) {
+		bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+		lock_buffer(bh);
+		if (buffer_uptodate(bh)) {
+			unlock_buffer(bh);
+			brelse(bh);
+		} else {
+			bh->b_end_io = end_buffer_read_sync;
+			bhs[num++] = bh;
+		}
+	}
+
+	gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
 	if (!(flags & DIO_WAIT))
 		return 0;
 
+	bh = *bhp;
 	wait_on_buffer(bh);
 	if (unlikely(!buffer_uptodate(bh))) {
 		struct gfs2_trans *tr = current->journal_info;
-- 
2.5.0



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
  2015-11-12 20:15       ` Andreas Gruenbacher
@ 2015-11-12 20:33         ` Steven Whitehouse
  2015-11-13 22:24           ` Andreas Gruenbacher
  2015-11-13 13:48         ` Bob Peterson
  1 sibling, 1 reply; 13+ messages in thread
From: Steven Whitehouse @ 2015-11-12 20:33 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On 12/11/15 20:15, Andreas Gruenbacher wrote:
> Here is an updated version of this patch, please review.
>
> Thanks,
> Andreas

That looks good to me, and I assume that it should be faster too?

Steve.

>
> --
>
> Instead of submitting a READ_SYNC bio for the inode and a READA bio for
> the inode's extended attributes through submit_bh, submit a single READ
> bio for both through submit_bio when possible.  This can be more
> efficient on some kinds of block devices.
>
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> ---
>   fs/gfs2/meta_io.c | 81 ++++++++++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 63 insertions(+), 18 deletions(-)
>
> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
> index 0f24828..e137d96 100644
> --- a/fs/gfs2/meta_io.c
> +++ b/fs/gfs2/meta_io.c
> @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
>   	return bh;
>   }
>   
> -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
> +static void gfs2_meta_read_endio(struct bio *bio)
>   {
> -	struct buffer_head *bh;
> +	struct bio_vec *bvec;
> +	int i;
> +
> +	bio_for_each_segment_all(bvec, bio, i) {
> +		struct page *page = bvec->bv_page;
> +		struct buffer_head *bh = page_buffers(page);
> +		unsigned int len = bvec->bv_len;
> +
> +		while (bh_offset(bh) < bvec->bv_offset)
> +			bh = bh->b_this_page;
> +		do {
> +			struct buffer_head *next = bh->b_this_page;
> +			len -= bh->b_size;
> +			bh->b_end_io(bh, !bio->bi_error);
> +			bh = next;
> +		} while (bh && len);
> +	}
> +	bio_put(bio);
> +}
>   
> -	bh = gfs2_getbuf(gl, blkno, 1);
> -	lock_buffer(bh);
> -	if (buffer_uptodate(bh)) {
> -		unlock_buffer(bh);
> -		brelse(bh);
> +/*
> + * Submit several consecutive buffer head I/O requests as a single bio I/O
> + * request.  (See submit_bh_wbc.)
> + */
> +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
> +{
> +	struct buffer_head *bh = bhs[0];
> +	struct bio *bio;
> +	int i;
> +
> +	if (!num)
>   		return;
> +
> +	bio = bio_alloc(GFP_NOIO, num);
> +	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
> +	bio->bi_bdev = bh->b_bdev;
> +	for (i = 0; i < num; i++) {
> +		bh = bhs[i];
> +		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
>   	}
> -	bh->b_end_io = end_buffer_read_sync;
> -	submit_bh(READA | REQ_META | REQ_PRIO, bh);
> +	bio->bi_end_io = gfs2_meta_read_endio;
> +	submit_bio(rw, bio);
>   }
>   
>   /**
> @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
>   		   int rahead, struct buffer_head **bhp)
>   {
>   	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> -	struct buffer_head *bh;
> +	struct buffer_head *bh, *bhs[2];
> +	int num = 0;
>   
>   	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
>   		*bhp = NULL;
> @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
>   	lock_buffer(bh);
>   	if (buffer_uptodate(bh)) {
>   		unlock_buffer(bh);
> -		if (rahead)
> -			gfs2_meta_readahead(gl, blkno + 1);
> -		return 0;
> +		flags &= ~DIO_WAIT;
> +	} else {
> +		bh->b_end_io = end_buffer_read_sync;
> +		get_bh(bh);
> +		bhs[num++] = bh;
>   	}
> -	bh->b_end_io = end_buffer_read_sync;
> -	get_bh(bh);
> -	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
> -	if (rahead)
> -		gfs2_meta_readahead(gl, blkno + 1);
> +
> +	if (rahead) {
> +		bh = gfs2_getbuf(gl, blkno + 1, CREATE);
> +
> +		lock_buffer(bh);
> +		if (buffer_uptodate(bh)) {
> +			unlock_buffer(bh);
> +			brelse(bh);
> +		} else {
> +			bh->b_end_io = end_buffer_read_sync;
> +			bhs[num++] = bh;
> +		}
> +	}
> +
> +	gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
>   	if (!(flags & DIO_WAIT))
>   		return 0;
>   
> +	bh = *bhp;
>   	wait_on_buffer(bh);
>   	if (unlikely(!buffer_uptodate(bh))) {
>   		struct gfs2_trans *tr = current->journal_info;



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
  2015-11-12 20:15       ` Andreas Gruenbacher
  2015-11-12 20:33         ` Steven Whitehouse
@ 2015-11-13 13:48         ` Bob Peterson
  1 sibling, 0 replies; 13+ messages in thread
From: Bob Peterson @ 2015-11-13 13:48 UTC (permalink / raw)
  To: cluster-devel.redhat.com

----- Original Message -----
> Here is an updated version of this patch, please review.
> 
> Thanks,
> Andreas
> 
> --
> 
> Instead of submitting a READ_SYNC bio for the inode and a READA bio for
> the inode's extended attributes through submit_bh, submit a single READ
> bio for both through submit_bio when possible.  This can be more
> efficient on some kinds of block devices.
> 
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> ---
>  fs/gfs2/meta_io.c | 81
>  ++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 63 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
> index 0f24828..e137d96 100644
> --- a/fs/gfs2/meta_io.c
> +++ b/fs/gfs2/meta_io.c
> @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock
> *gl, u64 blkno)
>  	return bh;
>  }
>  
> -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
> +static void gfs2_meta_read_endio(struct bio *bio)
>  {
> -	struct buffer_head *bh;
> +	struct bio_vec *bvec;
> +	int i;
> +
> +	bio_for_each_segment_all(bvec, bio, i) {
> +		struct page *page = bvec->bv_page;
> +		struct buffer_head *bh = page_buffers(page);
> +		unsigned int len = bvec->bv_len;
> +
> +		while (bh_offset(bh) < bvec->bv_offset)
> +			bh = bh->b_this_page;
> +		do {
> +			struct buffer_head *next = bh->b_this_page;
> +			len -= bh->b_size;
> +			bh->b_end_io(bh, !bio->bi_error);
> +			bh = next;
> +		} while (bh && len);
> +	}
> +	bio_put(bio);
> +}
>  
> -	bh = gfs2_getbuf(gl, blkno, 1);
> -	lock_buffer(bh);
> -	if (buffer_uptodate(bh)) {
> -		unlock_buffer(bh);
> -		brelse(bh);
> +/*
> + * Submit several consecutive buffer head I/O requests as a single bio I/O
> + * request.  (See submit_bh_wbc.)
> + */
> +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
> +{
> +	struct buffer_head *bh = bhs[0];
> +	struct bio *bio;
> +	int i;
> +
> +	if (!num)
>  		return;
> +
> +	bio = bio_alloc(GFP_NOIO, num);
> +	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
> +	bio->bi_bdev = bh->b_bdev;
> +	for (i = 0; i < num; i++) {
> +		bh = bhs[i];
> +		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
>  	}
> -	bh->b_end_io = end_buffer_read_sync;
> -	submit_bh(READA | REQ_META | REQ_PRIO, bh);
> +	bio->bi_end_io = gfs2_meta_read_endio;
> +	submit_bio(rw, bio);
>  }
>  
>  /**
> @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int
> flags,
>  		   int rahead, struct buffer_head **bhp)
>  {
>  	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> -	struct buffer_head *bh;
> +	struct buffer_head *bh, *bhs[2];
> +	int num = 0;
>  
>  	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
>  		*bhp = NULL;
> @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
> int flags,
>  	lock_buffer(bh);
>  	if (buffer_uptodate(bh)) {
>  		unlock_buffer(bh);
> -		if (rahead)
> -			gfs2_meta_readahead(gl, blkno + 1);
> -		return 0;
> +		flags &= ~DIO_WAIT;
> +	} else {
> +		bh->b_end_io = end_buffer_read_sync;
> +		get_bh(bh);
> +		bhs[num++] = bh;
>  	}
> -	bh->b_end_io = end_buffer_read_sync;
> -	get_bh(bh);
> -	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
> -	if (rahead)
> -		gfs2_meta_readahead(gl, blkno + 1);
> +
> +	if (rahead) {
> +		bh = gfs2_getbuf(gl, blkno + 1, CREATE);
> +
> +		lock_buffer(bh);
> +		if (buffer_uptodate(bh)) {
> +			unlock_buffer(bh);
> +			brelse(bh);
> +		} else {
> +			bh->b_end_io = end_buffer_read_sync;
> +			bhs[num++] = bh;
> +		}
> +	}
> +
> +	gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
>  	if (!(flags & DIO_WAIT))
>  		return 0;
>  
> +	bh = *bhp;
>  	wait_on_buffer(bh);
>  	if (unlikely(!buffer_uptodate(bh))) {
>  		struct gfs2_trans *tr = current->journal_info;
> --
> 2.5.0
> 
> 
Hi,

ACK to both patches

Looks good. I'll hold onto them until this merge window closes.

Regards,

Bob Peterson
Red Hat File Systems



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
  2015-11-12 20:33         ` Steven Whitehouse
@ 2015-11-13 22:24           ` Andreas Gruenbacher
  2015-11-16 18:14             ` Bob Peterson
  0 siblings, 1 reply; 13+ messages in thread
From: Andreas Gruenbacher @ 2015-11-13 22:24 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Thu, Nov 12, 2015 at 9:33 PM, Steven Whitehouse <swhiteho@redhat.com> wrote:
> That looks good to me, and I assume that it should be faster too?

I did some tests with a directory tree with 3439 directories and 51556
files in it.

In that tree, 47313 or 86% of the 54995 files and directories ended up
with readahead enabled, while 7682 didn't.

On a slow SATA magnetic disc, I compared the time required for running
"find", then "find | xargs stat", then "find | xargs getfattr". I
booted with selinux=0 for these measurements, so "find" and "find |
xargs stat" didn't use the xattrs.

The results are attached as a spreadsheet. They show that xattr
readahead can save quite some time (here, about -75%). The optimized
version of xattr readahead is about 3% faster than the simple version
in this test, so I'll leave it up to you whether that's worth it for
you.

It makes no difference whether the scheduler is "noop" or "deadline"
here; the "noop" scheduler still seems to merge adjacent I/O requests
(which makes sense).

Andreas
-------------- next part --------------
A non-text attachment was scrubbed...
Name: gfs2-readahead.ods
Type: application/vnd.oasis.opendocument.spreadsheet
Size: 11411 bytes
Desc: not available
URL: <http://listman.redhat.com/archives/cluster-devel/attachments/20151113/9323ca98/attachment.ods>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
  2015-11-13 22:24           ` Andreas Gruenbacher
@ 2015-11-16 18:14             ` Bob Peterson
  0 siblings, 0 replies; 13+ messages in thread
From: Bob Peterson @ 2015-11-16 18:14 UTC (permalink / raw)
  To: cluster-devel.redhat.com

----- Original Message -----
> On Thu, Nov 12, 2015 at 9:33 PM, Steven Whitehouse <swhiteho@redhat.com>
> wrote:
> > That looks good to me, and I assume that it should be faster too?
> 
> I did some tests with a directory tree with 3439 directories and 51556
> files in it.
> 
> In that tree, 47313 or 86% of the 54995 files and directories ended up
> with readahead enabled, while 7682 didn't.
> 
> On a slow SATA magnetic disc, I compared the time required for running
> "find", then "find | xargs stat", then "find | xargs getfattr". I
> booted with selinux=0 for these measurements, so "find" and "find |
> xargs stat" didn't use the xattrs.
> 
> The results are attached as a spreadsheet. They show that xattr
> readahead can save quite some time (here, about -75%). The optimized
> version of xattr readahead is about 3% faster than the simple version
> in this test, so I'll leave it up to you whether that's worth it for
> you.
> 
> It makes no difference whether the scheduler is "noop" or "deadline"
> here; the "noop" scheduler still seems to merge adjacent I/O requests
> (which makes sense).
> 
> Andreas
> 
Hi,

Thanks. Both patches are now applied to the for-next branch of the linux-gfs2 tree:
https://git.kernel.org/cgit/linux/kernel/git/gfs2/linux-gfs2.git/commit/fs/gfs2?h=for-next&id=c8d577038449a718ad0027d1790b6ef4441715d4
https://git.kernel.org/cgit/linux/kernel/git/gfs2/linux-gfs2.git/commit/fs/gfs2?h=for-next&id=843396ad9886458bddbc7d07137b2975876b029e

Regards,

Bob Peterson
Red Hat File Systems



^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2015-11-16 18:14 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-01 19:02 [Cluster-devel] [PATCH 0/2] Extended attribute readahead Andreas Gruenbacher
2015-11-01 19:02 ` [Cluster-devel] [PATCH 1/2] gfs2: " Andreas Gruenbacher
2015-11-03 17:29   ` Bob Peterson
2015-11-03 19:02     ` Steven Whitehouse
2015-11-03 20:18       ` Andreas Gruenbacher
2015-11-01 19:02 ` [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization Andreas Gruenbacher
2015-11-12 13:44   ` Steven Whitehouse
2015-11-12 15:33     ` Andreas Gruenbacher
2015-11-12 20:15       ` Andreas Gruenbacher
2015-11-12 20:33         ` Steven Whitehouse
2015-11-13 22:24           ` Andreas Gruenbacher
2015-11-16 18:14             ` Bob Peterson
2015-11-13 13:48         ` Bob Peterson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.