linux-fsdevel.vger.kernel.org archive mirror
* gfs2 iomap deadlock, IOMAP_F_UNBALANCED
@ 2019-03-21 13:13 Andreas Gruenbacher
  2019-03-21 21:43 ` Dave Chinner
  2019-03-28 16:51 ` Christoph Hellwig
  0 siblings, 2 replies; 12+ messages in thread
From: Andreas Gruenbacher @ 2019-03-21 13:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andreas Gruenbacher, cluster-devel, Dave Chinner, Ross Lagerwall,
	Mark Syms, Edwin Török, linux-fsdevel

Hi Christoph,

we need your help fixing a gfs2 deadlock involving iomap.  What's going
on is the following:

* During iomap_file_buffered_write, gfs2_iomap_begin grabs the log flush
  lock and keeps it until gfs2_iomap_end.  It currently always does that
  even though there is no point other than for journaled data writes.

* iomap_file_buffered_write then calls balance_dirty_pages_ratelimited.
  If that ends up calling gfs2_write_inode, gfs2 will try to grab the
  log flush lock again and deadlock.

We can stop gfs2_iomap_begin from keeping the log flush lock held for
non-journaled data writes, but that still leaves us with the deadlock
for journaled data writes: for them, we need to add the data pages to
the running transaction, so dropping the log flush lock wouldn't work.

I've tried to work around the unwanted balance_dirty_pages_ratelimited
in gfs2_write_inode as originally suggested by Ross.  That works to a
certain degree, but only if we always skip inodes in the WB_SYNC_NONE
case, and that means that gfs2_write_inode becomes quite ineffective.

So it seems that it would be more reasonable to avoid the dirty page
balancing under the log flush lock in the first place.

The attached patch changes gfs2_iomap_begin to only hold on to the log
flush lock for journaled data writes.  In that case, we also make sure
to limit the write size so that it doesn't overrun the dirty limits,
using logic similar to that in balance_dirty_pages_ratelimited; there
is precedent for that approach in btrfs_buffered_write.  Then, we
prevent iomap from balancing dirty pages via the new IOMAP_F_UNBALANCED
flag.

Does that seem like an acceptable approach?

Thanks,
Andreas

---
 fs/gfs2/bmap.c        | 62 +++++++++++++++++++++++++++++++++----------
 fs/gfs2/file.c        |  2 ++
 fs/iomap.c            |  3 ++-
 include/linux/iomap.h |  1 +
 4 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 02b2646d84b3a..dad7d3810533e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -862,6 +862,24 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 	iomap->offset = lblock << inode->i_blkbits;
 	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
 	len = lblock_stop - lblock + 1;
+
+	if ((flags & IOMAP_WRITE) && gfs2_is_jdata(ip)) {
+		int max_pages;
+		u64 max_blocks;
+
+		/*
+		 * Limit the write size: this ensures that write throttling
+		 * will kick in fast enough even when we don't call
+		 * balance_dirty_pages_ratelimited for each page written.
+		 */
+		max_pages = min((int)DIV_ROUND_UP(length, PAGE_SIZE),
+				current->nr_dirtied_pause - current->nr_dirtied);
+		max_pages = max(max_pages, 8);
+		max_blocks = (u64)max_pages << (PAGE_SHIFT - inode->i_blkbits);
+		if (len > max_blocks)
+			len = max_blocks;
+	}
+
 	iomap->length = len << inode->i_blkbits;
 
 	height = ip->i_height;
@@ -974,6 +992,19 @@ static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos,
 	gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
 }
 
+static void gfs2_write_end(struct inode *inode, struct buffer_head *dibh)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_trans *tr = current->journal_info;
+
+	gfs2_ordered_add_inode(ip);
+
+	if (tr->tr_num_buf_new)
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	else
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+}
+
 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 				  loff_t length, unsigned flags,
 				  struct iomap *iomap,
@@ -1052,6 +1083,13 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 	}
 	if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
 		iomap->page_done = gfs2_iomap_journaled_page_done;
+
+	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip) || inode == sdp->sd_rindex) {
+		iomap->flags |= IOMAP_F_UNBALANCED;
+	} else {
+		gfs2_write_end(inode, mp->mp_bh[0]);
+		gfs2_trans_end(sdp);
+	}
 	return 0;
 
 out_trans_end:
@@ -1103,28 +1141,24 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_trans *tr = current->journal_info;
 	struct buffer_head *dibh = iomap->private;
 
 	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
 		goto out;
 
-	if (iomap->type != IOMAP_INLINE) {
-		gfs2_ordered_add_inode(ip);
+	if (current->journal_info) {
+		struct gfs2_sbd *sdp = GFS2_SB(inode);
 
-		if (tr->tr_num_buf_new)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-		else
-			gfs2_trans_add_meta(ip->i_gl, dibh);
-	}
+		if (iomap->type != IOMAP_INLINE)
+			gfs2_write_end(inode, dibh);
 
-	if (inode == sdp->sd_rindex) {
-		adjust_fs_space(inode);
-		sdp->sd_rindex_uptodate = 0;
-	}
+		if (inode == sdp->sd_rindex) {
+			adjust_fs_space(inode);
+			sdp->sd_rindex_uptodate = 0;
+		}
 
-	gfs2_trans_end(sdp);
+		gfs2_trans_end(sdp);
+	}
 	gfs2_inplace_release(ip);
 
 	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 58a768e59712e..542bd149c4aa3 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -867,6 +867,8 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			iocb->ki_pos += ret;
 	}
 
+	balance_dirty_pages_ratelimited(file->f_mapping);
+
 out2:
 	current->backing_dev_info = NULL;
 out:
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7da..0d8ea3f29b2ef 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -863,7 +863,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		written += copied;
 		length -= copied;
 
-		balance_dirty_pages_ratelimited(inode->i_mapping);
+		if (!(iomap->flags & IOMAP_F_UNBALANCED))
+			balance_dirty_pages_ratelimited(inode->i_mapping);
 	} while (iov_iter_count(i) && length);
 
 	return written ? written : status;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0fefb5455bdaf..e9a04e76a3217 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -35,6 +35,7 @@ struct vm_fault;
 #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
 #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
 #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
+#define IOMAP_F_UNBALANCED	0x08	/* don't balance dirty pages */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
-- 
2.20.1



* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-03-21 13:13 gfs2 iomap deadlock, IOMAP_F_UNBALANCED Andreas Gruenbacher
@ 2019-03-21 21:43 ` Dave Chinner
  2019-03-21 23:01   ` Andreas Gruenbacher
  2019-03-22  0:21   ` Andreas Gruenbacher
  2019-03-28 16:51 ` Christoph Hellwig
  1 sibling, 2 replies; 12+ messages in thread
From: Dave Chinner @ 2019-03-21 21:43 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Christoph Hellwig, cluster-devel, Ross Lagerwall, Mark Syms,
	Edwin Török, linux-fsdevel

On Thu, Mar 21, 2019 at 02:13:04PM +0100, Andreas Gruenbacher wrote:
> Hi Christoph,
> 
> we need your help fixing a gfs2 deadlock involving iomap.  What's going
> on is the following:
> 
> * During iomap_file_buffered_write, gfs2_iomap_begin grabs the log flush
>   lock and keeps it until gfs2_iomap_end.  It currently always does that
>   even though there is no point other than for journaled data writes.
> 
> * iomap_file_buffered_write then calls balance_dirty_pages_ratelimited.
>   If that ends up calling gfs2_write_inode, gfs2 will try to grab the
>   log flush lock again and deadlock.
> 
> We can stop gfs2_iomap_begin from keeping the log flush lock held for
> non-journaled data writes, but that still leaves us with the deadlock
> for journaled data writes: for them, we need to add the data pages to
> the running transaction, so dropping the log flush lock wouldn't work.
> 
> I've tried to work around the unwanted balance_dirty_pages_ratelimited
> in gfs2_write_inode as originally suggested by Ross.  That works to a
> certain degree, but only if we always skip inodes in the WB_SYNC_NONE
> case, and that means that gfs2_write_inode becomes quite ineffective.
> 
> So it seems that it would be more reasonable to avoid the dirty page
> balancing under the log flush lock in the first place.
> 
> The attached patch changes gfs2_iomap_begin to only hold on to the log
> flush lock for journaled data writes.  In that case, we also make sure
> to limit the write size so that it doesn't overrun the dirty limits,
> using logic similar to that in balance_dirty_pages_ratelimited; there
> is precedent for that approach in btrfs_buffered_write.  Then, we
> prevent iomap from balancing dirty pages via the new IOMAP_F_UNBALANCED
> flag.

....
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 58a768e59712e..542bd149c4aa3 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -867,6 +867,8 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  			iocb->ki_pos += ret;
>  	}
>  
> +	balance_dirty_pages_ratelimited(file->f_mapping);
> +
>  out2:
>  	current->backing_dev_info = NULL;
>  out:

The problem is calling balance_dirty_pages() inside the
->iomap_begin/->iomap_end calls and not that it is called by the
iomap infrastructure itself, right?

If so, I'd prefer to see this in iomap_apply() after the call to
ops->iomap_end because iomap_file_buffered_write() can iterate and
call iomap_apply() multiple times. This would keep the balancing to
a per-iomap granularity, rather than a per-syscall granularity.

i.e. if we do write(2GB), we want more than one balancing call
during that syscall, so it would be up to the filesystem to a) limit
the size of write mappings to something smaller (e.g. 1024 pages)
so that there are still frequent balancing calls for large writes.
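
Roughly, I'm thinking of something like this at the tail of
iomap_apply(), after the ->iomap_end call (untested sketch; the flag
check is just one way to restrict it to buffered writes):

	if (ops->iomap_end) {
		ret = ops->iomap_end(inode, pos, length,
				     written > 0 ? written : 0,
				     flags, &iomap);
	}

	/* sketch: throttle once per mapping, for buffered writes only */
	if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT))
		balance_dirty_pages_ratelimited(inode->i_mapping);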

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-03-21 21:43 ` Dave Chinner
@ 2019-03-21 23:01   ` Andreas Gruenbacher
  2019-03-22  0:21   ` Andreas Gruenbacher
  1 sibling, 0 replies; 12+ messages in thread
From: Andreas Gruenbacher @ 2019-03-21 23:01 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, cluster-devel, Ross Lagerwall, Mark Syms,
	Edwin Török, linux-fsdevel

On Thu, 21 Mar 2019 at 22:43, Dave Chinner <david@fromorbit.com> wrote:
> The problem is calling balance_dirty_pages() inside the
> ->iomap_begin/->iomap_end calls and not that it is called by the
> iomap infrastructure itself, right?
>
> If so, I'd prefer to see this in iomap_apply() after the call to
> ops->iomap_end because iomap_file_buffered_write() can iterate and
> call iomap_apply() multiple times. This would keep the balancing to
> a per-iomap granularity, rather than a per-syscall granularity.
>
> i.e. if we do write(2GB), we want more than one balancing call
> during that syscall, so it would be up to the filesystem to a) limit
> the size of write mappings to something smaller (e.g. 1024 pages)
> so that there are still frequent balancing calls for large writes.

Hmm. The looping across multiple mappings isn't done in iomap_apply
but in iomap_file_buffered_write, so the balancing could go into
iomap_apply or iomap_file_buffered_write, but can't go further up the
stack. Given that, iomap_file_buffered_write seems the better place,
but this is still quite horrible.
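
For illustration, in iomap_file_buffered_write that would amount to
something like the following (sketch only, loop shown from memory):

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter),
				IOMAP_WRITE, ops, iter, iomap_write_actor);
		if (ret <= 0)
			break;
		pos += ret;
		written += ret;

		/* sketch: throttle once per mapping returned by the fs */
		balance_dirty_pages_ratelimited(inode->i_mapping);
	}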

Thanks,
Andreas


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-03-21 21:43 ` Dave Chinner
  2019-03-21 23:01   ` Andreas Gruenbacher
@ 2019-03-22  0:21   ` Andreas Gruenbacher
  2019-03-27 16:49     ` Ross Lagerwall
  1 sibling, 1 reply; 12+ messages in thread
From: Andreas Gruenbacher @ 2019-03-22  0:21 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Andreas Gruenbacher, Christoph Hellwig, cluster-devel,
	Ross Lagerwall, Mark Syms, Edwin Török, linux-fsdevel

On Fri, 22 Mar 2019 at 00:01, Andreas Gruenbacher <agruenba@redhat.com> wrote:
> On Thu, 21 Mar 2019 at 22:43, Dave Chinner <david@fromorbit.com> wrote:
> > The problem is calling balance_dirty_pages() inside the
> > ->iomap_begin/->iomap_end calls and not that it is called by the
> > iomap infrastructure itself, right?
> >
> > If so, I'd prefer to see this in iomap_apply() after the call to
> > ops->iomap_end because iomap_file_buffered_write() can iterate and
> > call iomap_apply() multiple times. This would keep the balancing to
> > a per-iomap granularity, rather than a per-syscall granularity.
> >
> > i.e. if we do write(2GB), we want more than one balancing call
> > during that syscall, so it would be up to the filesystem to a) limit
> > the size of write mappings to something smaller (e.g. 1024 pages)
> > so that there are still frequent balancing calls for large writes.
>
> Hmm. The looping across multiple mappings isn't done in iomap_apply
> but in iomap_file_buffered_write, so the balancing could go into
> iomap_apply or iomap_file_buffered_write, but can't go further up the
> stack. Given that, iomap_file_buffered_write seems the better place,
> but this is still quite horrible.

Here's a more reasonable version of my first patch, with a cleaned up
and hopefully fixed gfs2 part.

In addition, this checks for IOMAP_F_UNBALANCED in iomap_dirty_actor,
the actor for iomap_file_dirty.  We don't use iomap_file_dirty in gfs2,
but we should probably allow skipping the dirty page balancing there
as well.

Thanks,
Andreas
---
 fs/gfs2/bmap.c        | 64 +++++++++++++++++++++++++++++++++----------
 fs/iomap.c            |  6 ++--
 include/linux/iomap.h |  1 +
 3 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 02b2646d84b3a..628d66d07fc6c 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -974,6 +974,19 @@ static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos,
 	gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
 }
 
+static void gfs2_write_end(struct inode *inode, struct buffer_head *dibh)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_trans *tr = current->journal_info;
+
+	gfs2_ordered_add_inode(ip);
+
+	if (tr->tr_num_buf_new)
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	else
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+}
+
 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 				  loff_t length, unsigned flags,
 				  struct iomap *iomap,
@@ -996,6 +1009,25 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 	if (ret)
 		goto out_unlock;
 
+	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip) || inode == sdp->sd_rindex) {
+		int max_pages;
+		u64 max_length;
+
+		iomap->flags |= IOMAP_F_UNBALANCED;
+
+		/*
+		 * Limit the write size: this ensures that write throttling
+		 * will kick in fast enough even when we don't call
+		 * balance_dirty_pages_ratelimited for each page written.
+		 */
+		max_pages = current->nr_dirtied_pause - current->nr_dirtied;
+		if (max_pages < 8)
+			max_pages = 8;
+		max_length = (u64)max_pages << PAGE_SHIFT;
+		if (iomap->length > max_length)
+			iomap->length = max_length;
+	}
+
 	alloc_required = unstuff || iomap->type == IOMAP_HOLE;
 
 	if (alloc_required || gfs2_is_jdata(ip))
@@ -1052,6 +1084,11 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 	}
 	if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
 		iomap->page_done = gfs2_iomap_journaled_page_done;
+
+	if (!(iomap->flags & IOMAP_F_UNBALANCED)) {
+		gfs2_write_end(inode, mp->mp_bh[0]);
+		gfs2_trans_end(sdp);
+	}
 	return 0;
 
 out_trans_end:
@@ -1103,30 +1140,29 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_trans *tr = current->journal_info;
 	struct buffer_head *dibh = iomap->private;
 
 	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
 		goto out;
 
-	if (iomap->type != IOMAP_INLINE) {
-		gfs2_ordered_add_inode(ip);
+	if (current->journal_info) {
+		struct gfs2_sbd *sdp = GFS2_SB(inode);
 
-		if (tr->tr_num_buf_new)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-		else
-			gfs2_trans_add_meta(ip->i_gl, dibh);
-	}
+		if (iomap->type != IOMAP_INLINE)
+			gfs2_write_end(inode, dibh);
 
-	if (inode == sdp->sd_rindex) {
-		adjust_fs_space(inode);
-		sdp->sd_rindex_uptodate = 0;
-	}
+		if (inode == sdp->sd_rindex) {
+			adjust_fs_space(inode);
+			sdp->sd_rindex_uptodate = 0;
+		}
 
-	gfs2_trans_end(sdp);
+		gfs2_trans_end(sdp);
+	}
 	gfs2_inplace_release(ip);
 
+	if (iomap->flags & IOMAP_F_UNBALANCED)
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+
 	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
 		/* Deallocate blocks that were just allocated. */
 		loff_t blockmask = i_blocksize(inode) - 1;
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7da..5f950fee0834f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -863,7 +863,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		written += copied;
 		length -= copied;
 
-		balance_dirty_pages_ratelimited(inode->i_mapping);
+		if (!(iomap->flags & IOMAP_F_UNBALANCED))
+			balance_dirty_pages_ratelimited(inode->i_mapping);
 	} while (iov_iter_count(i) && length);
 
 	return written ? written : status;
@@ -945,7 +946,8 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		written += status;
 		length -= status;
 
-		balance_dirty_pages_ratelimited(inode->i_mapping);
+		if (!(iomap->flags & IOMAP_F_UNBALANCED))
+			balance_dirty_pages_ratelimited(inode->i_mapping);
 	} while (length);
 
 	return written;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0fefb5455bdaf..e9a04e76a3217 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -35,6 +35,7 @@ struct vm_fault;
 #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
 #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
 #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
+#define IOMAP_F_UNBALANCED	0x08	/* don't balance dirty pages */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
-- 
2.20.1



* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-03-22  0:21   ` Andreas Gruenbacher
@ 2019-03-27 16:49     ` Ross Lagerwall
  0 siblings, 0 replies; 12+ messages in thread
From: Ross Lagerwall @ 2019-03-27 16:49 UTC (permalink / raw)
  To: Andreas Gruenbacher, Dave Chinner
  Cc: Christoph Hellwig, cluster-devel, Mark Syms,
	Edwin Török, linux-fsdevel

On 3/22/19 12:21 AM, Andreas Gruenbacher wrote:
> On Fri, 22 Mar 2019 at 00:01, Andreas Gruenbacher <agruenba@redhat.com> wrote:
>> On Thu, 21 Mar 2019 at 22:43, Dave Chinner <david@fromorbit.com> wrote:
>>> The problem is calling balance_dirty_pages() inside the
>>> ->iomap_begin/->iomap_end calls and not that it is called by the
>>> iomap infrastructure itself, right?
>>>
>>> If so, I'd prefer to see this in iomap_apply() after the call to
>>> ops->iomap_end because iomap_file_buffered_write() can iterate and
>>> call iomap_apply() multiple times. This would keep the balancing to
>>> a per-iomap granularity, rather than a per-syscall granularity.
>>>
>>> i.e. if we do write(2GB), we want more than one balancing call
>>> during that syscall, so it would be up to the filesystem to a) limit
>>> the size of write mappings to something smaller (e.g. 1024 pages)
>>> so that there are still frequent balancing calls for large writes.
>>
>> Hmm. The looping across multiple mappings isn't done in iomap_apply
>> but in iomap_file_buffered_write, so the balancing could go into
>> iomap_apply or iomap_file_buffered_write, but can't go further up the
>> stack. Given that, iomap_file_buffered_write seems the better place,
>> but this is still quite horrible.
> 
> Here's a more reasonable version of my first patch, with a cleaned up
> and hopefully fixed gfs2 part.
> 
> In addition, this checks for IOMAP_F_UNBALANCED in iomap_dirty_actor,
> the actor for iomap_file_dirty.  We don't use iomap_file_dirty in gfs2,
> but we should probably allow skipping the dirty page balancing there
> as well.
> 
> Thanks,
> Andreas
> ---
>   fs/gfs2/bmap.c        | 64 +++++++++++++++++++++++++++++++++----------
>   fs/iomap.c            |  6 ++--
>   include/linux/iomap.h |  1 +
>   3 files changed, 55 insertions(+), 16 deletions(-)
> 
Thanks, this fixes the reported deadlock. I haven't yet checked whether 
it has any performance impact.

Tested-by: Ross Lagerwall <ross.lagerwall@citrix.com>


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-03-21 13:13 gfs2 iomap deadlock, IOMAP_F_UNBALANCED Andreas Gruenbacher
  2019-03-21 21:43 ` Dave Chinner
@ 2019-03-28 16:51 ` Christoph Hellwig
  2019-03-29 22:13   ` Andreas Gruenbacher
  1 sibling, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2019-03-28 16:51 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Christoph Hellwig, cluster-devel, Dave Chinner, Ross Lagerwall,
	Mark Syms, Edwin Török, linux-fsdevel

On Thu, Mar 21, 2019 at 02:13:04PM +0100, Andreas Gruenbacher wrote:
> Hi Christoph,
> 
> we need your help fixing a gfs2 deadlock involving iomap.  What's going
> on is the following:
> 
> * During iomap_file_buffered_write, gfs2_iomap_begin grabs the log flush
>   lock and keeps it until gfs2_iomap_end.  It currently always does that
>   even though there is no point other than for journaled data writes.
> 
> * iomap_file_buffered_write then calls balance_dirty_pages_ratelimited.
>   If that ends up calling gfs2_write_inode, gfs2 will try to grab the
>   log flush lock again and deadlock.

What is the exact call chain?  balance_dirty_pages_ratelimited these
days doesn't start I/O, but just wakes up the flusher threads.  Or
do we have an issue where it is blocking on those threads?

Also why do you need to flush the log for background writeback in
->write_inode?

balance_dirty_pages_ratelimited is by definition not a data integrity
writeback, so there shouldn't be a good reason to flush the log
(which I assume the log flush lock is for).  If we look at gfs2_write_inode,
this seems to be the code:

	bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip));

        if (flush_all)
		gfs2_log_flush(GFS2_SB(inode), ip->i_gl,
			       GFS2_LOG_HEAD_FLUSH_NORMAL |
			       GFS2_LFC_WRITE_INODE);

But what is the requirement to do this in writeback context?  Can't
we move it out into another context instead?


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-03-28 16:51 ` Christoph Hellwig
@ 2019-03-29 22:13   ` Andreas Gruenbacher
  2019-04-07  7:32     ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Andreas Gruenbacher @ 2019-03-29 22:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: cluster-devel, Dave Chinner, Ross Lagerwall, Mark Syms,
	Edwin Török, linux-fsdevel

On Thu, 28 Mar 2019 at 17:51, Christoph Hellwig <hch@lst.de> wrote:
> On Thu, Mar 21, 2019 at 02:13:04PM +0100, Andreas Gruenbacher wrote:
> > Hi Christoph,
> >
> > we need your help fixing a gfs2 deadlock involving iomap.  What's going
> > on is the following:
> >
> > * During iomap_file_buffered_write, gfs2_iomap_begin grabs the log flush
> >   lock and keeps it until gfs2_iomap_end.  It currently always does that
> >   even though there is no point other than for journaled data writes.
> >
> > * iomap_file_buffered_write then calls balance_dirty_pages_ratelimited.
> >   If that ends up calling gfs2_write_inode, gfs2 will try to grab the
> >   log flush lock again and deadlock.
>
> What is the exact call chain?

It's laid out here:
https://www.redhat.com/archives/cluster-devel/2019-March/msg00000.html

> balance_dirty_pages_ratelimited these days doesn't start I/O, but just
> wakes up the flusher threads.  Or do we have an issue where it is blocking
> on those threads?

Yes, the writer is holding sd_log_flush_lock at the point where it
ends up kicking the flusher thread and waiting for writeback to
happen. The flusher thread calls gfs2_write_inode, and that tries to
grab sd_log_flush_lock again.
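
Roughly (a simplified sketch from memory, with the intermediate
writeback calls left out; the lock-taking sites are how I read the
current code):

	writer task:
	  iomap_file_buffered_write()
	    gfs2_iomap_begin()
	      gfs2_trans_begin()
	        down_read(&sdp->sd_log_flush_lock)
	    iomap_write_actor()
	      balance_dirty_pages_ratelimited()
	        balance_dirty_pages()
	          kicks the flusher, waits for writeback progress

	flusher thread:
	  writeback_sb_inodes()
	    gfs2_write_inode()
	      gfs2_log_flush()
	        down_write(&sdp->sd_log_flush_lock)   <-- blocks

The writer only releases the lock in gfs2_iomap_end, which it cannot
reach until balance_dirty_pages lets it continue, so neither side can
make progress.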

> Also why do you need to flush the log for background writeback in
> ->write_inode?

If we stop doing that in the (wbc->sync_mode == WB_SYNC_NONE) case,
then inodes will remain dirty until the journal is flushed for some
other reason (or a write_inode with WB_SYNC_ALL). That doesn't seem
right. We could perhaps trigger a background journal flush in the
WB_SYNC_NONE case, but that would remove the back pressure on
balance_dirty_pages. Not sure this is a good idea, either.
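
Concretely, "stop doing that" would amount to something like

	bool flush_all = (wbc->sync_mode == WB_SYNC_ALL);

in gfs2_write_inode, i.e. dropping the gfs2_is_jdata(ip) part of the
condition you quoted above (just a sketch of that option, not a
proposal).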

> balance_dirty_pages_ratelimited is by definition not a data integrity
> writeback, so there shouldn't be a good reason to flush the log
> (which I assume the log flush lock is for).
>
> If we look at gfs2_write_inode, this seems to be the code:
>
>         bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip));
>
>         if (flush_all)
>                 gfs2_log_flush(GFS2_SB(inode), ip->i_gl,
>                                GFS2_LOG_HEAD_FLUSH_NORMAL |
>                                GFS2_LFC_WRITE_INODE);
>
> But what is the requirement to do this in writeback context?  Can't
> we move it out into another context instead?

Indeed, this isn't for data integrity in this case but because the
dirty limit is exceeded. What other context would you suggest moving
this to?

(The iomap flag I've proposed would save us from getting into this
situation in the first place.)

Thanks,
Andreas


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-03-29 22:13   ` Andreas Gruenbacher
@ 2019-04-07  7:32     ` Christoph Hellwig
  2019-04-08  8:53       ` Andreas Gruenbacher
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2019-04-07  7:32 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Christoph Hellwig, cluster-devel, Dave Chinner, Ross Lagerwall,
	Mark Syms, Edwin Török, linux-fsdevel, Jan Kara,
	linux-mm

[adding Jan and linux-mm]

On Fri, Mar 29, 2019 at 11:13:00PM +0100, Andreas Gruenbacher wrote:
> > But what is the requirement to do this in writeback context?  Can't
> > we move it out into another context instead?
> 
> Indeed, this isn't for data integrity in this case but because the
> dirty limit is exceeded. What other context would you suggest moving
> this to?
> 
> (The iomap flag I've proposed would save us from getting into this
> situation in the first place.)

Your patch does two things:

 - it only calls balance_dirty_pages_ratelimited once per write
   operation instead of once per page.  In the past btrfs did
   hacks like that, but IIRC they caused VM balancing issues.
   That is why everyone now calls balance_dirty_pages_ratelimited
   once per page.  If calling it at a coarse granularity would
   be fine, we should do it everywhere instead of just in gfs2
   in journaled mode
 - it artificially reduces the size of writes to a low value,
   which I suspect is going to break real-life applications

So I really think we need to fix this properly.  And if that means
that you can't make use of the iomap batching for gfs2 in journaled
mode, that is still a better option.  But I really think you need
to look into the scope of your log flush and figure out a good way
to reduce that and solve the root cause.


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-04-07  7:32     ` Christoph Hellwig
@ 2019-04-08  8:53       ` Andreas Gruenbacher
  2019-04-08 13:44         ` Jan Kara
  0 siblings, 1 reply; 12+ messages in thread
From: Andreas Gruenbacher @ 2019-04-08  8:53 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: cluster-devel, Dave Chinner, Ross Lagerwall, Mark Syms,
	Edwin Török, linux-fsdevel, Jan Kara, linux-mm

On Sun, 7 Apr 2019 at 09:32, Christoph Hellwig <hch@lst.de> wrote:
>
> [adding Jan and linux-mm]
>
> On Fri, Mar 29, 2019 at 11:13:00PM +0100, Andreas Gruenbacher wrote:
> > > But what is the requirement to do this in writeback context?  Can't
> > > we move it out into another context instead?
> >
> > Indeed, this isn't for data integrity in this case but because the
> > dirty limit is exceeded. What other context would you suggest moving
> > this to?
> >
> > (The iomap flag I've proposed would save us from getting into this
> > situation in the first place.)
>
> Your patch does two things:
>
>  - it only calls balance_dirty_pages_ratelimited once per write
>    operation instead of once per page.  In the past btrfs did
>    hacks like that, but IIRC they caused VM balancing issues.
>    That is why everyone now calls balance_dirty_pages_ratelimited
>    once per page.  If calling it at a coarse granularity would
>    be fine, we should do it everywhere instead of just in gfs2
>    in journaled mode
>  - it artificially reduces the size of writes to a low value,
>    which I suspect is going to break real-life applications

Not quite, balance_dirty_pages_ratelimited is called from iomap_end,
so once per iomap mapping returned, not per write. (The first version
of this patch got that wrong by accident, but not the second.)

We can limit the size of the mappings returned just in that case. I'm
aware that there is a risk of balancing problems, but I just don't
have any better ideas.

This is a problem all filesystems with data-journaling will have with
iomap; it's not that gfs2 is doing anything particularly stupid.

> So I really think we need to fix this properly.  And if that means
> that you can't make use of the iomap batching for gfs2 in journaled
> mode, that is still a better option.

That would mean using the old-style, page-size allocations, and a
completely separate write path in that case. That would be quite a
nightmare.

> But I really think you need
> to look into the scope of your log flush and figure out a good way
> to reduce that and solve the root cause.

We won't be able to do a log flush while another transaction is
active, but that's what's needed to clean dirty pages. iomap doesn't
allow us to put the block allocation into a separate transaction from
the page writes; for that, the opposite to the page_done hook would
probably be needed.
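
Something along these lines in struct iomap, mirroring the existing
page_done member (signatures from memory; only a sketch of the idea,
the name and exact signature are placeholders rather than a worked-out
interface):

	/* existing: called after data has been copied into a page */
	void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
			struct page *page, struct iomap *iomap);

	/*
	 * Sketched counterpart: called before the copy, so that the
	 * filesystem could open a transaction per page instead of
	 * holding one across the whole mapping.
	 */
	int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len,
			struct iomap *iomap);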

Thanks,
Andreas


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-04-08  8:53       ` Andreas Gruenbacher
@ 2019-04-08 13:44         ` Jan Kara
  2019-04-09 12:15           ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Kara @ 2019-04-08 13:44 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Christoph Hellwig, cluster-devel, Dave Chinner, Ross Lagerwall,
	Mark Syms, Edwin Török, linux-fsdevel, Jan Kara,
	linux-mm

On Mon 08-04-19 10:53:34, Andreas Gruenbacher wrote:
> On Sun, 7 Apr 2019 at 09:32, Christoph Hellwig <hch@lst.de> wrote:
> >
> > [adding Jan and linux-mm]
> >
> > On Fri, Mar 29, 2019 at 11:13:00PM +0100, Andreas Gruenbacher wrote:
> > > > But what is the requirement to do this in writeback context?  Can't
> > > > we move it out into another context instead?
> > >
> > > Indeed, this isn't for data integrity in this case but because the
> > > dirty limit is exceeded. What other context would you suggest moving
> > > this to?
> > >
> > > (The iomap flag I've proposed would save us from getting into this
> > > situation in the first place.)
> >
> > Your patch does two things:
> >
> >  - it only calls balance_dirty_pages_ratelimited once per write
> >    operation instead of once per page.  In the past btrfs did
> >    hacks like that, but IIRC they caused VM balancing issues.
> >    That is why everyone now calls balance_dirty_pages_ratelimited
> >    once per page.  If calling it at a coarse granularity would
> >    be fine, we should do it everywhere instead of just in gfs2
> >    in journaled mode
> >  - it artificially reduces the size of writes to a low value,
> >    which I suspect is going to break real-life applications
> 
> Not quite, balance_dirty_pages_ratelimited is called from iomap_end,
> so once per iomap mapping returned, not per write. (The first version
> of this patch got that wrong by accident, but not the second.)
> 
> We can limit the size of the mappings returned just in that case. I'm
> aware that there is a risk of balancing problems, but I just don't
> have any better ideas.
> 
> This is a problem all filesystems with data-journaling will have with
> iomap; it's not that gfs2 is doing anything particularly stupid.

I agree that if ext4 would be using iomap, it would have similar issues.

> > So I really think we need to fix this properly.  And if that means
> > that you can't make use of the iomap batching for gfs2 in journaled
> > mode, that is still a better option.
> 
> That would mean using the old-style, page-size allocations, and a
> completely separate write path in that case. That would be quite a
> nightmare.
> 
> > But I really think you need
> > to look into the scope of your log flush and figure out a good way
> > to reduce that and solve the root cause.
> 
> We won't be able to do a log flush while another transaction is
> active, but that's what's needed to clean dirty pages. iomap doesn't
> allow us to put the block allocation into a separate transaction from
> the page writes; for that, the opposite to the page_done hook would
> probably be needed.

I agree that a ->page_prepare() hook would probably be the cleanest
solution for this.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-04-08 13:44         ` Jan Kara
@ 2019-04-09 12:15           ` Christoph Hellwig
  2019-04-09 12:27             ` Andreas Gruenbacher
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2019-04-09 12:15 UTC (permalink / raw)
  To: Jan Kara
  Cc: Andreas Gruenbacher, Christoph Hellwig, cluster-devel,
	Dave Chinner, Ross Lagerwall, Mark Syms, Edwin Török,
	linux-fsdevel, linux-mm

On Mon, Apr 08, 2019 at 03:44:05PM +0200, Jan Kara wrote:
> > We won't be able to do a log flush while another transaction is
> > active, but that's what's needed to clean dirty pages. iomap doesn't
> > allow us to put the block allocation into a separate transaction from
> > the page writes; for that, the opposite to the page_done hook would
> > probably be needed.
> 
> I agree that a ->page_prepare() hook would probably be the cleanest
> solution for this.

That doesn't sound too bad to me.


* Re: gfs2 iomap deadlock, IOMAP_F_UNBALANCED
  2019-04-09 12:15           ` Christoph Hellwig
@ 2019-04-09 12:27             ` Andreas Gruenbacher
  0 siblings, 0 replies; 12+ messages in thread
From: Andreas Gruenbacher @ 2019-04-09 12:27 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, cluster-devel, Dave Chinner, Ross Lagerwall, Mark Syms,
	Edwin Török, linux-fsdevel, linux-mm

On Tue, 9 Apr 2019 at 14:15, Christoph Hellwig <hch@lst.de> wrote:
> On Mon, Apr 08, 2019 at 03:44:05PM +0200, Jan Kara wrote:
> > > We won't be able to do a log flush while another transaction is
> > > active, but that's what's needed to clean dirty pages. iomap doesn't
> > > allow us to put the block allocation into a separate transaction from
> > > the page writes; for that, the opposite to the page_done hook would
> > > probably be needed.
> >
> > I agree that a ->page_prepare() hook would probably be the cleanest
> > solution for this.
>
> That doesn't sound too bad to me.

Okay, I'll see how the code for that will turn out.

Thanks,
Andreas


Thread overview: 12+ messages
2019-03-21 13:13 gfs2 iomap deadlock, IOMAP_F_UNBALANCED Andreas Gruenbacher
2019-03-21 21:43 ` Dave Chinner
2019-03-21 23:01   ` Andreas Gruenbacher
2019-03-22  0:21   ` Andreas Gruenbacher
2019-03-27 16:49     ` Ross Lagerwall
2019-03-28 16:51 ` Christoph Hellwig
2019-03-29 22:13   ` Andreas Gruenbacher
2019-04-07  7:32     ` Christoph Hellwig
2019-04-08  8:53       ` Andreas Gruenbacher
2019-04-08 13:44         ` Jan Kara
2019-04-09 12:15           ` Christoph Hellwig
2019-04-09 12:27             ` Andreas Gruenbacher
