* [RFC PATCH] xfs: implement per-inode writeback completion
@ 2019-03-27  3:05 Darrick J. Wong
  2019-03-27  3:06 ` [RFC PATCH] xfs: merge adjacent io completions of the same type Darrick J. Wong
  2019-03-28 14:08 ` [RFC PATCH] xfs: implement per-inode writeback completion Brian Foster
  0 siblings, 2 replies; 12+ messages in thread
From: Darrick J. Wong @ 2019-03-27  3:05 UTC (permalink / raw)
  To: xfs

Hi folks,

Here's a quick patchset reworking writeback ioend completion processing
into per-inode completion queues so that we don't have a thundering herd
of unwritten/cow completion kworkers contending for the ILOCK.  The
second patch will also combine adjacent ioends when possible to reduce
the overhead further.  Let me know what you think. :)

--D
---
From: Darrick J. Wong <darrick.wong@oracle.com>

Restructure the buffered writeback completion code to use a single work
item per inode, since it's pointless to awaken a thundering herd of
threads to contend on a single inode's ILOCK.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_aops.c   |   48 +++++++++++++++++++++++++++++++++++++-----------
 fs/xfs/xfs_aops.h   |    1 -
 fs/xfs/xfs_icache.c |    3 +++
 fs/xfs/xfs_inode.h  |    7 +++++++
 4 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3619e9e8d359..f7a9bb661826 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -234,11 +234,9 @@ xfs_setfilesize_ioend(
  * IO write completion.
  */
 STATIC void
-xfs_end_io(
-	struct work_struct *work)
+xfs_end_ioend(
+	struct xfs_ioend	*ioend)
 {
-	struct xfs_ioend	*ioend =
-		container_of(work, struct xfs_ioend, io_work);
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
@@ -278,19 +276,48 @@ xfs_end_io(
 	xfs_destroy_ioend(ioend, error);
 }
 
+/* Finish all pending io completions. */
+void
+xfs_end_io(
+	struct work_struct	*work)
+{
+	struct xfs_inode	*ip;
+	struct xfs_ioend	*ioend;
+	struct list_head	completion_list;
+	unsigned long		flags;
+
+	ip = container_of(work, struct xfs_inode, i_iodone_work);
+
+	spin_lock_irqsave(&ip->i_iodone_lock, flags);
+	list_replace_init(&ip->i_iodone_list, &completion_list);
+	spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
+
+	while (!list_empty(&completion_list)) {
+		ioend = list_first_entry(&completion_list, struct xfs_ioend,
+				io_list);
+		list_del_init(&ioend->io_list);
+		xfs_end_ioend(ioend);
+	}
+}
+
 STATIC void
 xfs_end_bio(
 	struct bio		*bio)
 {
 	struct xfs_ioend	*ioend = bio->bi_private;
-	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	unsigned long		flags;
 
 	if (ioend->io_fork == XFS_COW_FORK ||
-	    ioend->io_state == XFS_EXT_UNWRITTEN)
-		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-	else if (ioend->io_append_trans)
-		queue_work(mp->m_data_workqueue, &ioend->io_work);
-	else
+	    ioend->io_state == XFS_EXT_UNWRITTEN ||
+	    ioend->io_append_trans != NULL) {
+		spin_lock_irqsave(&ip->i_iodone_lock, flags);
+		if (list_empty(&ip->i_iodone_list))
+			queue_work(mp->m_unwritten_workqueue, &ip->i_iodone_work);
+		list_add_tail(&ioend->io_list, &ip->i_iodone_list);
+		spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
+	} else
 		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 }
 
@@ -594,7 +621,6 @@ xfs_alloc_ioend(
 	ioend->io_inode = inode;
 	ioend->io_size = 0;
 	ioend->io_offset = offset;
-	INIT_WORK(&ioend->io_work, xfs_end_io);
 	ioend->io_append_trans = NULL;
 	ioend->io_bio = bio;
 	return ioend;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 6c2615b83c5d..f62b03186c62 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,6 @@ struct xfs_ioend {
 	struct inode		*io_inode;	/* file being written to */
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
-	struct work_struct	io_work;	/* xfsdatad work queue */
 	struct xfs_trans	*io_append_trans;/* xact. for size update */
 	struct bio		*io_bio;	/* bio being built */
 	struct bio		io_inline_bio;	/* MUST BE LAST! */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 245483cc282b..e70e7db29026 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -70,6 +70,9 @@ xfs_inode_alloc(
 	ip->i_flags = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(ip->i_d));
+	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
+	INIT_LIST_HEAD(&ip->i_iodone_list);
+	spin_lock_init(&ip->i_iodone_lock);
 
 	return ip;
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e62074a5257c..88239c2dd824 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -57,6 +57,11 @@ typedef struct xfs_inode {
 
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
+
+	/* pending io completions */
+	spinlock_t		i_iodone_lock;
+	struct work_struct	i_iodone_work;
+	struct list_head	i_iodone_list;
 } xfs_inode_t;
 
 /* Convert from vfs inode to xfs inode */
@@ -503,4 +508,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip);
 int xfs_iunlink_init(struct xfs_perag *pag);
 void xfs_iunlink_destroy(struct xfs_perag *pag);
 
+void xfs_end_io(struct work_struct *work);
+
 #endif	/* __XFS_INODE_H__ */


* [RFC PATCH] xfs: merge adjacent io completions of the same type
  2019-03-27  3:05 [RFC PATCH] xfs: implement per-inode writeback completion Darrick J. Wong
@ 2019-03-27  3:06 ` Darrick J. Wong
  2019-03-28 14:10   ` Brian Foster
  2019-03-28 14:08 ` [RFC PATCH] xfs: implement per-inode writeback completion Brian Foster
  1 sibling, 1 reply; 12+ messages in thread
From: Darrick J. Wong @ 2019-03-27  3:06 UTC (permalink / raw)
  To: xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

When we're processing an ioend on the list of io completions, check to
see if the next items on the list are both adjacent and of the same
type.  If so, we can merge the completions to reduce transaction
overhead.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_aops.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f7a9bb661826..53afa2e6e3e7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -237,6 +237,7 @@ STATIC void
 xfs_end_ioend(
 	struct xfs_ioend	*ioend)
 {
+	struct list_head	ioend_list;
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
@@ -273,7 +274,89 @@ xfs_end_ioend(
 done:
 	if (ioend->io_append_trans)
 		error = xfs_setfilesize_ioend(ioend, error);
+	list_replace_init(&ioend->io_list, &ioend_list);
 	xfs_destroy_ioend(ioend, error);
+
+	while (!list_empty(&ioend_list)) {
+		ioend = list_first_entry(&ioend_list, struct xfs_ioend,
+				io_list);
+		list_del_init(&ioend->io_list);
+		xfs_destroy_ioend(ioend, error);
+	}
+}
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool
+xfs_ioend_can_merge(
+	struct xfs_ioend	*ioend,
+	int			ioend_error,
+	struct xfs_ioend	*next)
+{
+	int			next_error;
+
+	next_error = blk_status_to_errno(next->io_bio->bi_status);
+	if (ioend_error != next_error)
+		return false;
+	if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
+		return false;
+	if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
+	    (next->io_state == XFS_EXT_UNWRITTEN))
+		return false;
+	if (ioend->io_offset + ioend->io_size != next->io_offset)
+		return false;
+	if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next))
+		return false;
+	return true;
+}
+
+/* Try to merge adjacent completions. */
+STATIC void
+xfs_ioend_try_merge(
+	struct xfs_ioend	*ioend,
+	struct list_head	*more_ioends)
+{
+	struct xfs_ioend	*next_ioend;
+	int			ioend_error;
+	int			error;
+
+	if (list_empty(more_ioends))
+		return;
+
+	ioend_error = blk_status_to_errno(ioend->io_bio->bi_status);
+
+	while (!list_empty(more_ioends)) {
+		next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
+				io_list);
+		if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend))
+			break;
+		list_move_tail(&next_ioend->io_list, &ioend->io_list);
+		ioend->io_size += next_ioend->io_size;
+		if (ioend->io_append_trans) {
+			error = xfs_setfilesize_ioend(next_ioend, 1);
+			ASSERT(error == 1);
+		}
+	}
+}
+
+/* list_sort compare function for ioends */
+static int
+xfs_ioend_compare(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	struct xfs_ioend	*ia;
+	struct xfs_ioend	*ib;
+
+	ia = container_of(a, struct xfs_ioend, io_list);
+	ib = container_of(b, struct xfs_ioend, io_list);
+	if (ia->io_offset < ib->io_offset)
+		return -1;
+	else if (ia->io_offset > ib->io_offset)
+		return 1;
+	return 0;
 }
 
 /* Finish all pending io completions. */
@@ -292,10 +375,13 @@ xfs_end_io(
 	list_replace_init(&ip->i_iodone_list, &completion_list);
 	spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
 
+	list_sort(NULL, &completion_list, xfs_ioend_compare);
+
 	while (!list_empty(&completion_list)) {
 		ioend = list_first_entry(&completion_list, struct xfs_ioend,
 				io_list);
 		list_del_init(&ioend->io_list);
+		xfs_ioend_try_merge(ioend, &completion_list);
 		xfs_end_ioend(ioend);
 	}
 }


* Re: [RFC PATCH] xfs: implement per-inode writeback completion
  2019-03-27  3:05 [RFC PATCH] xfs: implement per-inode writeback completion Darrick J. Wong
  2019-03-27  3:06 ` [RFC PATCH] xfs: merge adjacent io completions of the same type Darrick J. Wong
@ 2019-03-28 14:08 ` Brian Foster
  2019-03-28 15:00   ` Darrick J. Wong
  1 sibling, 1 reply; 12+ messages in thread
From: Brian Foster @ 2019-03-28 14:08 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs

On Tue, Mar 26, 2019 at 08:05:50PM -0700, Darrick J. Wong wrote:
> Hi folks,
> 
> Here's a quick patchset reworking writeback ioend completion processing
> into per-inode completion queues so that we don't have a thundering herd
> of unwritten/cow completion kworkers contending for the ILOCK.  The
> second patch will also combine adjacent ioends when possible to reduce
> the overhead further.  Let me know what you think. :)
> 

I assume the issue is that you get a bunch of ioends associated with COW
remaps or whatever and workqueue parallelism leads to a bunch contending
on inode locks because they happen to be associated with the same inode.
Hence, it's more efficient to have a single work item per inode, let
that task complete as many ioends as available for the inode and let the
workqueue spread around workers for different inodes. Sounds like a nice
idea to me. Is there a workload that measurably benefits from this or is
this just based on observation?

A couple small nits..

> --D
> ---
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Restructure the buffered writeback completion code to use a single work
> item per inode, since it's pointless to awaken a thundering herd of
> threads to contend on a single inode's ILOCK.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_aops.c   |   48 +++++++++++++++++++++++++++++++++++++-----------
>  fs/xfs/xfs_aops.h   |    1 -
>  fs/xfs/xfs_icache.c |    3 +++
>  fs/xfs/xfs_inode.h  |    7 +++++++
>  4 files changed, 47 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 3619e9e8d359..f7a9bb661826 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -234,11 +234,9 @@ xfs_setfilesize_ioend(
>   * IO write completion.
>   */
>  STATIC void
> -xfs_end_io(
> -	struct work_struct *work)
> +xfs_end_ioend(
> +	struct xfs_ioend	*ioend)
>  {
> -	struct xfs_ioend	*ioend =
> -		container_of(work, struct xfs_ioend, io_work);
>  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
>  	xfs_off_t		offset = ioend->io_offset;
>  	size_t			size = ioend->io_size;
> @@ -278,19 +276,48 @@ xfs_end_io(
>  	xfs_destroy_ioend(ioend, error);
>  }
>  
> +/* Finish all pending io completions. */
> +void
> +xfs_end_io(
> +	struct work_struct	*work)
> +{
> +	struct xfs_inode	*ip;
> +	struct xfs_ioend	*ioend;
> +	struct list_head	completion_list;
> +	unsigned long		flags;
> +
> +	ip = container_of(work, struct xfs_inode, i_iodone_work);
> +
> +	spin_lock_irqsave(&ip->i_iodone_lock, flags);
> +	list_replace_init(&ip->i_iodone_list, &completion_list);
> +	spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
> +
> +	while (!list_empty(&completion_list)) {
> +		ioend = list_first_entry(&completion_list, struct xfs_ioend,
> +				io_list);
> +		list_del_init(&ioend->io_list);
> +		xfs_end_ioend(ioend);
> +	}
> +}
> +
>  STATIC void
>  xfs_end_bio(
>  	struct bio		*bio)
>  {
>  	struct xfs_ioend	*ioend = bio->bi_private;
> -	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
> +	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	unsigned long		flags;
>  
>  	if (ioend->io_fork == XFS_COW_FORK ||
> -	    ioend->io_state == XFS_EXT_UNWRITTEN)
> -		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
> -	else if (ioend->io_append_trans)
> -		queue_work(mp->m_data_workqueue, &ioend->io_work);
> -	else
> +	    ioend->io_state == XFS_EXT_UNWRITTEN ||
> +	    ioend->io_append_trans != NULL) {
> +		spin_lock_irqsave(&ip->i_iodone_lock, flags);
> +		if (list_empty(&ip->i_iodone_list))
> +			queue_work(mp->m_unwritten_workqueue, &ip->i_iodone_work);
> +		list_add_tail(&ioend->io_list, &ip->i_iodone_list);
> +		spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
> +	} else

Ok, so there's no longer a need for multiple ioend workqueues because
all such operations are going to be processed by the single work
execution. Perhaps we should rename m_unwritten_workqueue or just use
m_data_workqueue since it happens to have a more generic name. We can
also kill off the other wq either way since it appears to be unused.

>  		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
>  }
>  
> @@ -594,7 +621,6 @@ xfs_alloc_ioend(
>  	ioend->io_inode = inode;
>  	ioend->io_size = 0;
>  	ioend->io_offset = offset;
> -	INIT_WORK(&ioend->io_work, xfs_end_io);
>  	ioend->io_append_trans = NULL;
>  	ioend->io_bio = bio;
>  	return ioend;
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index 6c2615b83c5d..f62b03186c62 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -18,7 +18,6 @@ struct xfs_ioend {
>  	struct inode		*io_inode;	/* file being written to */
>  	size_t			io_size;	/* size of the extent */
>  	xfs_off_t		io_offset;	/* offset in the file */
> -	struct work_struct	io_work;	/* xfsdatad work queue */
>  	struct xfs_trans	*io_append_trans;/* xact. for size update */
>  	struct bio		*io_bio;	/* bio being built */
>  	struct bio		io_inline_bio;	/* MUST BE LAST! */
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 245483cc282b..e70e7db29026 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -70,6 +70,9 @@ xfs_inode_alloc(
>  	ip->i_flags = 0;
>  	ip->i_delayed_blks = 0;
>  	memset(&ip->i_d, 0, sizeof(ip->i_d));
> +	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
> +	INIT_LIST_HEAD(&ip->i_iodone_list);
> +	spin_lock_init(&ip->i_iodone_lock);
>  
>  	return ip;
>  }
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index e62074a5257c..88239c2dd824 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -57,6 +57,11 @@ typedef struct xfs_inode {
>  
>  	/* VFS inode */
>  	struct inode		i_vnode;	/* embedded VFS inode */
> +
> +	/* pending io completions */
> +	spinlock_t		i_iodone_lock;
> +	struct work_struct	i_iodone_work;
> +	struct list_head	i_iodone_list;

A nit but I guess I'd name these with ioend instead of iodone. The
latter kind of confuses with buffer iodone callbacks (to me).

Brian

>  } xfs_inode_t;
>  
>  /* Convert from vfs inode to xfs inode */
> @@ -503,4 +508,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip);
>  int xfs_iunlink_init(struct xfs_perag *pag);
>  void xfs_iunlink_destroy(struct xfs_perag *pag);
>  
> +void xfs_end_io(struct work_struct *work);
> +
>  #endif	/* __XFS_INODE_H__ */


* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type
  2019-03-27  3:06 ` [RFC PATCH] xfs: merge adjacent io completions of the same type Darrick J. Wong
@ 2019-03-28 14:10   ` Brian Foster
  2019-03-28 15:17     ` Darrick J. Wong
  0 siblings, 1 reply; 12+ messages in thread
From: Brian Foster @ 2019-03-28 14:10 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs

On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> When we're processing an ioend on the list of io completions, check to
> see if the next items on the list are both adjacent and of the same
> type.  If so, we can merge the completions to reduce transaction
> overhead.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

I'm curious of the value of this one... what situations allow for
batching on the ioend completion side that we haven't already accounted
for in the ioend construction side? The latter already batches until we
cross a change in fork type, extent state, or a break in logical or
physical contiguity. The former looks like it follows similar logic for
merging with the exceptions of allowing for merges of physically
discontiguous extents and disallowing merges of those with different
append status. That seems like a smallish window of opportunity to me..
am I missing something?

If that is the gist but there is enough benefit for the more lenient
merging, I also wonder whether it would be more efficient to try and
also accomplish that on the construction side rather than via completion
post-processing. For example, could we abstract a single ioend to cover
an arbitrary list of bio/page -> sector mappings with the same higher
level semantics? We already have a bio chaining mechanism, it's just
only used for when a bio is full. Could we reuse that for dealing with
physical discontiguity?
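
Roughly the same trick we already use when a bio fills up, just
triggered on a break in physical contiguity as well -- something like
the sketch below (names illustrative, op flag/write hint setup on the
outgoing bio omitted, not a tested patch):

/*
 * Sketch: submit the bio built so far and chain it to a fresh bio
 * aimed at the new (discontiguous) sector, keeping both under the same
 * ioend.  bio_chain() defers the final bio's completion until every
 * chained bio has finished, so xfs_end_bio() still fires once per
 * ioend.
 */
static void
xfs_ioend_switch_bio(
	struct xfs_ioend	*ioend,
	struct bio		*new)
{
	bio_chain(ioend->io_bio, new);
	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
	submit_bio(ioend->io_bio);
	ioend->io_bio = new;
}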

Brian

>  fs/xfs/xfs_aops.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 86 insertions(+)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index f7a9bb661826..53afa2e6e3e7 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -237,6 +237,7 @@ STATIC void
>  xfs_end_ioend(
>  	struct xfs_ioend	*ioend)
>  {
> +	struct list_head	ioend_list;
>  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
>  	xfs_off_t		offset = ioend->io_offset;
>  	size_t			size = ioend->io_size;
> @@ -273,7 +274,89 @@ xfs_end_ioend(
>  done:
>  	if (ioend->io_append_trans)
>  		error = xfs_setfilesize_ioend(ioend, error);
> +	list_replace_init(&ioend->io_list, &ioend_list);
>  	xfs_destroy_ioend(ioend, error);
> +
> +	while (!list_empty(&ioend_list)) {
> +		ioend = list_first_entry(&ioend_list, struct xfs_ioend,
> +				io_list);
> +		list_del_init(&ioend->io_list);
> +		xfs_destroy_ioend(ioend, error);
> +	}
> +}
> +
> +/*
> + * We can merge two adjacent ioends if they have the same set of work to do.
> + */
> +static bool
> +xfs_ioend_can_merge(
> +	struct xfs_ioend	*ioend,
> +	int			ioend_error,
> +	struct xfs_ioend	*next)
> +{
> +	int			next_error;
> +
> +	next_error = blk_status_to_errno(next->io_bio->bi_status);
> +	if (ioend_error != next_error)
> +		return false;
> +	if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
> +		return false;
> +	if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
> +	    (next->io_state == XFS_EXT_UNWRITTEN))
> +		return false;
> +	if (ioend->io_offset + ioend->io_size != next->io_offset)
> +		return false;
> +	if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next))
> +		return false;
> +	return true;
> +}
> +
> +/* Try to merge adjacent completions. */
> +STATIC void
> +xfs_ioend_try_merge(
> +	struct xfs_ioend	*ioend,
> +	struct list_head	*more_ioends)
> +{
> +	struct xfs_ioend	*next_ioend;
> +	int			ioend_error;
> +	int			error;
> +
> +	if (list_empty(more_ioends))
> +		return;
> +
> +	ioend_error = blk_status_to_errno(ioend->io_bio->bi_status);
> +
> +	while (!list_empty(more_ioends)) {
> +		next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
> +				io_list);
> +		if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend))
> +			break;
> +		list_move_tail(&next_ioend->io_list, &ioend->io_list);
> +		ioend->io_size += next_ioend->io_size;
> +		if (ioend->io_append_trans) {
> +			error = xfs_setfilesize_ioend(next_ioend, 1);
> +			ASSERT(error == 1);
> +		}
> +	}
> +}
> +
> +/* list_sort compare function for ioends */
> +static int
> +xfs_ioend_compare(
> +	void			*priv,
> +	struct list_head	*a,
> +	struct list_head	*b)
> +{
> +	struct xfs_ioend	*ia;
> +	struct xfs_ioend	*ib;
> +
> +	ia = container_of(a, struct xfs_ioend, io_list);
> +	ib = container_of(b, struct xfs_ioend, io_list);
> +	if (ia->io_offset < ib->io_offset)
> +		return -1;
> +	else if (ia->io_offset > ib->io_offset)
> +		return 1;
> +	return 0;
>  }
>  
>  /* Finish all pending io completions. */
> @@ -292,10 +375,13 @@ xfs_end_io(
>  	list_replace_init(&ip->i_iodone_list, &completion_list);
>  	spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
>  
> +	list_sort(NULL, &completion_list, xfs_ioend_compare);
> +
>  	while (!list_empty(&completion_list)) {
>  		ioend = list_first_entry(&completion_list, struct xfs_ioend,
>  				io_list);
>  		list_del_init(&ioend->io_list);
> +		xfs_ioend_try_merge(ioend, &completion_list);
>  		xfs_end_ioend(ioend);
>  	}
>  }


* Re: [RFC PATCH] xfs: implement per-inode writeback completion
  2019-03-28 14:08 ` [RFC PATCH] xfs: implement per-inode writeback completion Brian Foster
@ 2019-03-28 15:00   ` Darrick J. Wong
  2019-03-28 16:24     ` Brian Foster
  0 siblings, 1 reply; 12+ messages in thread
From: Darrick J. Wong @ 2019-03-28 15:00 UTC (permalink / raw)
  To: Brian Foster; +Cc: xfs

On Thu, Mar 28, 2019 at 10:08:59AM -0400, Brian Foster wrote:
> On Tue, Mar 26, 2019 at 08:05:50PM -0700, Darrick J. Wong wrote:
> > Hi folks,
> > 
> > Here's a quick patchset reworking writeback ioend completion processing
> > into per-inode completion queues so that we don't have a thundering herd
> > of unwritten/cow completion kworkers contending for the ILOCK.  The
> > second patch will also combine adjacent ioends when possible to reduce
> > the overhead further.  Let me know what you think. :)
> > 
> 
> I assume the issue is that you get a bunch of ioends associated with COW
> remaps or whatever and workqueue parallelism leads to a bunch contending
> on inode locks because they happen to be associated with the same inode.

Yes, as outlined in the thread "xlog_grant_head_wait deadlocks on
high-rolling transactions?"

> Hence, it's more efficient to have a single work item per inode, let
> that task complete as many ioends as available for the inode and let the
> workqueue spread around workers for different inodes. Sounds like a nice
> idea to me. Is there a workload that measurably benefits from this or is
> this just based on observation?

If you consider 'xfstests' to be my primary workload, then yes, it
reduces developer deadlock triage time considerably. :)

Uhh but more seriously, on kvm hosts that use pagecache-as-diskcache(!)
the number of kworkers spirals insanely when someone issues a cache flush
(a.k.a. fsync) because they're all contending with each other and the
host writeback slows to a crawl if it doesn't just fall over dead.

> A couple small nits..
> 
> > --D
> > ---
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Restructure the buffered writeback completion code to use a single work
> > item per inode, since it's pointless to awaken a thundering herd of
> > threads to contend on a single inode's ILOCK.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_aops.c   |   48 +++++++++++++++++++++++++++++++++++++-----------
> >  fs/xfs/xfs_aops.h   |    1 -
> >  fs/xfs/xfs_icache.c |    3 +++
> >  fs/xfs/xfs_inode.h  |    7 +++++++
> >  4 files changed, 47 insertions(+), 12 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > index 3619e9e8d359..f7a9bb661826 100644
> > --- a/fs/xfs/xfs_aops.c
> > +++ b/fs/xfs/xfs_aops.c
> > @@ -234,11 +234,9 @@ xfs_setfilesize_ioend(
> >   * IO write completion.
> >   */
> >  STATIC void
> > -xfs_end_io(
> > -	struct work_struct *work)
> > +xfs_end_ioend(
> > +	struct xfs_ioend	*ioend)
> >  {
> > -	struct xfs_ioend	*ioend =
> > -		container_of(work, struct xfs_ioend, io_work);
> >  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> >  	xfs_off_t		offset = ioend->io_offset;
> >  	size_t			size = ioend->io_size;
> > @@ -278,19 +276,48 @@ xfs_end_io(
> >  	xfs_destroy_ioend(ioend, error);
> >  }
> >  
> > +/* Finish all pending io completions. */
> > +void
> > +xfs_end_io(
> > +	struct work_struct	*work)
> > +{
> > +	struct xfs_inode	*ip;
> > +	struct xfs_ioend	*ioend;
> > +	struct list_head	completion_list;
> > +	unsigned long		flags;
> > +
> > +	ip = container_of(work, struct xfs_inode, i_iodone_work);
> > +
> > +	spin_lock_irqsave(&ip->i_iodone_lock, flags);
> > +	list_replace_init(&ip->i_iodone_list, &completion_list);
> > +	spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
> > +
> > +	while (!list_empty(&completion_list)) {
> > +		ioend = list_first_entry(&completion_list, struct xfs_ioend,
> > +				io_list);
> > +		list_del_init(&ioend->io_list);
> > +		xfs_end_ioend(ioend);
> > +	}
> > +}
> > +
> >  STATIC void
> >  xfs_end_bio(
> >  	struct bio		*bio)
> >  {
> >  	struct xfs_ioend	*ioend = bio->bi_private;
> > -	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
> > +	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> > +	struct xfs_mount	*mp = ip->i_mount;
> > +	unsigned long		flags;
> >  
> >  	if (ioend->io_fork == XFS_COW_FORK ||
> > -	    ioend->io_state == XFS_EXT_UNWRITTEN)
> > -		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
> > -	else if (ioend->io_append_trans)
> > -		queue_work(mp->m_data_workqueue, &ioend->io_work);
> > -	else
> > +	    ioend->io_state == XFS_EXT_UNWRITTEN ||
> > +	    ioend->io_append_trans != NULL) {
> > +		spin_lock_irqsave(&ip->i_iodone_lock, flags);
> > +		if (list_empty(&ip->i_iodone_list))
> > +			queue_work(mp->m_unwritten_workqueue, &ip->i_iodone_work);
> > +		list_add_tail(&ioend->io_list, &ip->i_iodone_list);
> > +		spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
> > +	} else
> 
> Ok, so there's no longer a need for multiple ioend workqueues because
> all such operations are going to be processed by the single work
> execution. Perhaps we should rename m_unwritten_workqueue or just use
> m_data_workqueue since it happens to have a more generic name. We can
> also kill off the other wq either way since it appears to be unused.

<nod> I'll add a patch to kill off one of the workqueues.

> >  		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
> >  }
> >  
> > @@ -594,7 +621,6 @@ xfs_alloc_ioend(
> >  	ioend->io_inode = inode;
> >  	ioend->io_size = 0;
> >  	ioend->io_offset = offset;
> > -	INIT_WORK(&ioend->io_work, xfs_end_io);
> >  	ioend->io_append_trans = NULL;
> >  	ioend->io_bio = bio;
> >  	return ioend;
> > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> > index 6c2615b83c5d..f62b03186c62 100644
> > --- a/fs/xfs/xfs_aops.h
> > +++ b/fs/xfs/xfs_aops.h
> > @@ -18,7 +18,6 @@ struct xfs_ioend {
> >  	struct inode		*io_inode;	/* file being written to */
> >  	size_t			io_size;	/* size of the extent */
> >  	xfs_off_t		io_offset;	/* offset in the file */
> > -	struct work_struct	io_work;	/* xfsdatad work queue */
> >  	struct xfs_trans	*io_append_trans;/* xact. for size update */
> >  	struct bio		*io_bio;	/* bio being built */
> >  	struct bio		io_inline_bio;	/* MUST BE LAST! */
> > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > index 245483cc282b..e70e7db29026 100644
> > --- a/fs/xfs/xfs_icache.c
> > +++ b/fs/xfs/xfs_icache.c
> > @@ -70,6 +70,9 @@ xfs_inode_alloc(
> >  	ip->i_flags = 0;
> >  	ip->i_delayed_blks = 0;
> >  	memset(&ip->i_d, 0, sizeof(ip->i_d));
> > +	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
> > +	INIT_LIST_HEAD(&ip->i_iodone_list);
> > +	spin_lock_init(&ip->i_iodone_lock);
> >  
> >  	return ip;
> >  }
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index e62074a5257c..88239c2dd824 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -57,6 +57,11 @@ typedef struct xfs_inode {
> >  
> >  	/* VFS inode */
> >  	struct inode		i_vnode;	/* embedded VFS inode */
> > +
> > +	/* pending io completions */
> > +	spinlock_t		i_iodone_lock;
> > +	struct work_struct	i_iodone_work;
> > +	struct list_head	i_iodone_list;
> 
> A nit but I guess I'd name these with ioend instead of iodone. The
> latter kind of confuses with buffer iodone callbacks (to me).

<nod> Will do.

--D

> Brian
> 
> >  } xfs_inode_t;
> >  
> >  /* Convert from vfs inode to xfs inode */
> > @@ -503,4 +508,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip);
> >  int xfs_iunlink_init(struct xfs_perag *pag);
> >  void xfs_iunlink_destroy(struct xfs_perag *pag);
> >  
> > +void xfs_end_io(struct work_struct *work);
> > +
> >  #endif	/* __XFS_INODE_H__ */


* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type
  2019-03-28 14:10   ` Brian Foster
@ 2019-03-28 15:17     ` Darrick J. Wong
  2019-03-28 16:46       ` Brian Foster
  2019-03-28 21:06       ` Dave Chinner
  0 siblings, 2 replies; 12+ messages in thread
From: Darrick J. Wong @ 2019-03-28 15:17 UTC (permalink / raw)
  To: Brian Foster; +Cc: xfs

On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote:
> On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > When we're processing an ioend on the list of io completions, check to
> > see if the next items on the list are both adjacent and of the same
> > type.  If so, we can merge the completions to reduce transaction
> > overhead.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> I'm curious of the value of this one... what situations allow for
> batching on the ioend completion side that we haven't already accounted
> for in the ioend construction side?

I was skeptical too, but Dave (I think?) pointed out that writeback can
split into 1GB chunks so it actually is possible to end up with adjacent
ioends.  So I wrote this patch and added a tracepoint, and lo it
actually did trigger when there's a lot of data to flush out, and we
succeed at allocating a single extent for the entire delalloc reservation.

> The latter already batches until we
> cross a change in fork type, extent state, or a break in logical or
> physical contiguity. The former looks like it follows similar logic for
> merging with the exceptions of allowing for merges of physically
> discontiguous extents and disallowing merges of those with different
> append status. That seems like a smallish window of opportunity to me..
> am I missing something?

Yep, it's a smallish window; small discontiguous writes don't benefit
here at all.

> If that is the gist but there is enough benefit for the more lenient
> merging, I also wonder whether it would be more efficient to try and
> also accomplish that on the construction side rather than via completion
> post-processing. For example, could we abstract a single ioend to cover
> an arbitrary list of bio/page -> sector mappings with the same higher
> level semantics? We already have a bio chaining mechanism, it's just
> only used for when a bio is full. Could we reuse that for dealing with
> physical discontiguity?

I suppose we could, though the bigger the ioend the longer it'll take to
process responses.  Also, I think it's the case that if any of the bios
fail then we treat all of the chained ones as failed?  (Meh, it's
writeback, it's not like you get to know /which/ writes failed unless
you do a stupid write()/fsync() dance...)

The other thing is that directio completions look very similar to
writeback completions, including the potential for having the thundering
herds pounding on the ILOCK.  I was thinking about refactoring those to
use the per-inode queue as a next step, though the directio completion
paths are murky.

> Brian
> 
> >  fs/xfs/xfs_aops.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 86 insertions(+)
> > 
> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > index f7a9bb661826..53afa2e6e3e7 100644
> > --- a/fs/xfs/xfs_aops.c
> > +++ b/fs/xfs/xfs_aops.c
> > @@ -237,6 +237,7 @@ STATIC void
> >  xfs_end_ioend(
> >  	struct xfs_ioend	*ioend)
> >  {
> > +	struct list_head	ioend_list;
> >  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> >  	xfs_off_t		offset = ioend->io_offset;
> >  	size_t			size = ioend->io_size;
> > @@ -273,7 +274,89 @@ xfs_end_ioend(
> >  done:
> >  	if (ioend->io_append_trans)
> >  		error = xfs_setfilesize_ioend(ioend, error);
> > +	list_replace_init(&ioend->io_list, &ioend_list);
> >  	xfs_destroy_ioend(ioend, error);
> > +
> > +	while (!list_empty(&ioend_list)) {
> > +		ioend = list_first_entry(&ioend_list, struct xfs_ioend,
> > +				io_list);
> > +		list_del_init(&ioend->io_list);
> > +		xfs_destroy_ioend(ioend, error);
> > +	}
> > +}
> > +
> > +/*
> > + * We can merge two adjacent ioends if they have the same set of work to do.
> > + */
> > +static bool
> > +xfs_ioend_can_merge(
> > +	struct xfs_ioend	*ioend,
> > +	int			ioend_error,
> > +	struct xfs_ioend	*next)
> > +{
> > +	int			next_error;
> > +
> > +	next_error = blk_status_to_errno(next->io_bio->bi_status);
> > +	if (ioend_error != next_error)
> > +		return false;
> > +	if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
> > +		return false;
> > +	if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
> > +	    (next->io_state == XFS_EXT_UNWRITTEN))
> > +		return false;
> > +	if (ioend->io_offset + ioend->io_size != next->io_offset)
> > +		return false;
> > +	if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next))
> > +		return false;
> > +	return true;
> > +}
> > +
> > +/* Try to merge adjacent completions. */
> > +STATIC void
> > +xfs_ioend_try_merge(
> > +	struct xfs_ioend	*ioend,
> > +	struct list_head	*more_ioends)
> > +{
> > +	struct xfs_ioend	*next_ioend;
> > +	int			ioend_error;
> > +	int			error;
> > +
> > +	if (list_empty(more_ioends))
> > +		return;
> > +
> > +	ioend_error = blk_status_to_errno(ioend->io_bio->bi_status);
> > +
> > +	while (!list_empty(more_ioends)) {
> > +		next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
> > +				io_list);
> > +		if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend))
> > +			break;
> > +		list_move_tail(&next_ioend->io_list, &ioend->io_list);
> > +		ioend->io_size += next_ioend->io_size;
> > +		if (ioend->io_append_trans) {
> > +			error = xfs_setfilesize_ioend(next_ioend, 1);
> > +			ASSERT(error == 1);
> > +		}
> > +	}
> > +}
> > +
> > +/* list_sort compare function for ioends */
> > +static int
> > +xfs_ioend_compare(
> > +	void			*priv,
> > +	struct list_head	*a,
> > +	struct list_head	*b)
> > +{
> > +	struct xfs_ioend	*ia;
> > +	struct xfs_ioend	*ib;
> > +
> > +	ia = container_of(a, struct xfs_ioend, io_list);
> > +	ib = container_of(b, struct xfs_ioend, io_list);
> > +	if (ia->io_offset < ib->io_offset)
> > +		return -1;
> > +	else if (ia->io_offset > ib->io_offset)
> > +		return 1;
> > +	return 0;
> >  }
> >  
> >  /* Finish all pending io completions. */
> > @@ -292,10 +375,13 @@ xfs_end_io(
> >  	list_replace_init(&ip->i_iodone_list, &completion_list);
> >  	spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
> >  
> > +	list_sort(NULL, &completion_list, xfs_ioend_compare);
> > +
> >  	while (!list_empty(&completion_list)) {
> >  		ioend = list_first_entry(&completion_list, struct xfs_ioend,
> >  				io_list);
> >  		list_del_init(&ioend->io_list);
> > +		xfs_ioend_try_merge(ioend, &completion_list);
> >  		xfs_end_ioend(ioend);
> >  	}
> >  }


* Re: [RFC PATCH] xfs: implement per-inode writeback completion
  2019-03-28 15:00   ` Darrick J. Wong
@ 2019-03-28 16:24     ` Brian Foster
  0 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2019-03-28 16:24 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs

On Thu, Mar 28, 2019 at 08:00:22AM -0700, Darrick J. Wong wrote:
> On Thu, Mar 28, 2019 at 10:08:59AM -0400, Brian Foster wrote:
> > On Tue, Mar 26, 2019 at 08:05:50PM -0700, Darrick J. Wong wrote:
> > > Hi folks,
> > > 
> > > Here's a quick patchset reworking writeback ioend completion processing
> > > into per-inode completion queues so that we don't have a thundering herd
> > > of unwritten/cow completion kworkers contending for the ILOCK.  The
> > > second patch will also combine adjacent ioends when possible to reduce
> > > the overhead further.  Let me know what you think. :)
> > > 
> > 
> > I assume the issue is that you get a bunch of ioends associated with COW
> > remaps or whatever and workqueue parallelism leads to a bunch contending
> > on inode locks because they happen to be associated with the same inode.
> 
> Yes, as outlined in the thread "xlog_grant_head_wait deadlocks on
> high-rolling transactions?"
> 
> > Hence, it's more efficient to have a single work item per inode, let
> > that task complete as many ioends as available for the inode and let the
> > workqueue spread around workers for different inodes. Sounds like a nice
> > idea to me. Is there a workload that measurably benefits from this or is
> > this just based on observation?
> 
> If you consider 'xfstests' to be my primary workload, then yes, it
> reduces developer deadlock triage time considerably. :)
> 

Ah, Ok. So this is not primarily a performance issue but rather
serializes the per inode I/O completion work to address that parallel
transaction alloc -> ilock -> ... problem of exhausting log reservation
space. We should probably document that in the commit log. ;)

> Uhh but more seriously, on kvm hosts that use pagecache-as-diskcache(!)
> the number of kworkers spirals insanely when someone issues a cache flush
> (a.k.a. fsync) because they're all contending with each other and the
> host writeback slows to a crawl if it doesn't just fall over dead.
> 

That's more of what I was expecting, but it makes sense for the above
too.

Brian

> > A couple small nits..
> > 
> > > --D
> > > ---
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Restructure the buffered writeback completion code to use a single work
> > > item per inode, since it's pointless to awaken a thundering herd of
> > > threads to contend on a single inode's ILOCK.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/xfs_aops.c   |   48 +++++++++++++++++++++++++++++++++++++-----------
> > >  fs/xfs/xfs_aops.h   |    1 -
> > >  fs/xfs/xfs_icache.c |    3 +++
> > >  fs/xfs/xfs_inode.h  |    7 +++++++
> > >  4 files changed, 47 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > > index 3619e9e8d359..f7a9bb661826 100644
> > > --- a/fs/xfs/xfs_aops.c
> > > +++ b/fs/xfs/xfs_aops.c
> > > @@ -234,11 +234,9 @@ xfs_setfilesize_ioend(
> > >   * IO write completion.
> > >   */
> > >  STATIC void
> > > -xfs_end_io(
> > > -	struct work_struct *work)
> > > +xfs_end_ioend(
> > > +	struct xfs_ioend	*ioend)
> > >  {
> > > -	struct xfs_ioend	*ioend =
> > > -		container_of(work, struct xfs_ioend, io_work);
> > >  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> > >  	xfs_off_t		offset = ioend->io_offset;
> > >  	size_t			size = ioend->io_size;
> > > @@ -278,19 +276,48 @@ xfs_end_io(
> > >  	xfs_destroy_ioend(ioend, error);
> > >  }
> > >  
> > > +/* Finish all pending io completions. */
> > > +void
> > > +xfs_end_io(
> > > +	struct work_struct	*work)
> > > +{
> > > +	struct xfs_inode	*ip;
> > > +	struct xfs_ioend	*ioend;
> > > +	struct list_head	completion_list;
> > > +	unsigned long		flags;
> > > +
> > > +	ip = container_of(work, struct xfs_inode, i_iodone_work);
> > > +
> > > +	spin_lock_irqsave(&ip->i_iodone_lock, flags);
> > > +	list_replace_init(&ip->i_iodone_list, &completion_list);
> > > +	spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
> > > +
> > > +	while (!list_empty(&completion_list)) {
> > > +		ioend = list_first_entry(&completion_list, struct xfs_ioend,
> > > +				io_list);
> > > +		list_del_init(&ioend->io_list);
> > > +		xfs_end_ioend(ioend);
> > > +	}
> > > +}
> > > +
> > >  STATIC void
> > >  xfs_end_bio(
> > >  	struct bio		*bio)
> > >  {
> > >  	struct xfs_ioend	*ioend = bio->bi_private;
> > > -	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
> > > +	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> > > +	struct xfs_mount	*mp = ip->i_mount;
> > > +	unsigned long		flags;
> > >  
> > >  	if (ioend->io_fork == XFS_COW_FORK ||
> > > -	    ioend->io_state == XFS_EXT_UNWRITTEN)
> > > -		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
> > > -	else if (ioend->io_append_trans)
> > > -		queue_work(mp->m_data_workqueue, &ioend->io_work);
> > > -	else
> > > +	    ioend->io_state == XFS_EXT_UNWRITTEN ||
> > > +	    ioend->io_append_trans != NULL) {
> > > +		spin_lock_irqsave(&ip->i_iodone_lock, flags);
> > > +		if (list_empty(&ip->i_iodone_list))
> > > +			queue_work(mp->m_unwritten_workqueue, &ip->i_iodone_work);
> > > +		list_add_tail(&ioend->io_list, &ip->i_iodone_list);
> > > +		spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
> > > +	} else
> > 
> > Ok, so there's no longer a need for multiple ioend workqueues because
> > all such operations are going to be processed by the single work
> > execution. Perhaps we should rename m_unwritten_workqueue or just use
> > m_data_workqueue since it happens to have a more generic name. We can
> > also kill off the other wq either way since it appears to be unused.
> 
> <nod> I'll add a patch to kill off one of the workqueues.
> 
> > >  		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
> > >  }
> > >  
> > > @@ -594,7 +621,6 @@ xfs_alloc_ioend(
> > >  	ioend->io_inode = inode;
> > >  	ioend->io_size = 0;
> > >  	ioend->io_offset = offset;
> > > -	INIT_WORK(&ioend->io_work, xfs_end_io);
> > >  	ioend->io_append_trans = NULL;
> > >  	ioend->io_bio = bio;
> > >  	return ioend;
> > > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> > > index 6c2615b83c5d..f62b03186c62 100644
> > > --- a/fs/xfs/xfs_aops.h
> > > +++ b/fs/xfs/xfs_aops.h
> > > @@ -18,7 +18,6 @@ struct xfs_ioend {
> > >  	struct inode		*io_inode;	/* file being written to */
> > >  	size_t			io_size;	/* size of the extent */
> > >  	xfs_off_t		io_offset;	/* offset in the file */
> > > -	struct work_struct	io_work;	/* xfsdatad work queue */
> > >  	struct xfs_trans	*io_append_trans;/* xact. for size update */
> > >  	struct bio		*io_bio;	/* bio being built */
> > >  	struct bio		io_inline_bio;	/* MUST BE LAST! */
> > > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > > index 245483cc282b..e70e7db29026 100644
> > > --- a/fs/xfs/xfs_icache.c
> > > +++ b/fs/xfs/xfs_icache.c
> > > @@ -70,6 +70,9 @@ xfs_inode_alloc(
> > >  	ip->i_flags = 0;
> > >  	ip->i_delayed_blks = 0;
> > >  	memset(&ip->i_d, 0, sizeof(ip->i_d));
> > > +	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
> > > +	INIT_LIST_HEAD(&ip->i_iodone_list);
> > > +	spin_lock_init(&ip->i_iodone_lock);
> > >  
> > >  	return ip;
> > >  }
> > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > > index e62074a5257c..88239c2dd824 100644
> > > --- a/fs/xfs/xfs_inode.h
> > > +++ b/fs/xfs/xfs_inode.h
> > > @@ -57,6 +57,11 @@ typedef struct xfs_inode {
> > >  
> > >  	/* VFS inode */
> > >  	struct inode		i_vnode;	/* embedded VFS inode */
> > > +
> > > +	/* pending io completions */
> > > +	spinlock_t		i_iodone_lock;
> > > +	struct work_struct	i_iodone_work;
> > > +	struct list_head	i_iodone_list;
> > 
> > A nit but I guess I'd name these with ioend instead of iodone. The
> > latter kind of confuses with buffer iodone callbacks (to me).
> 
> <nod> Will do.
> 
> --D
> 
> > Brian
> > 
> > >  } xfs_inode_t;
> > >  
> > >  /* Convert from vfs inode to xfs inode */
> > > @@ -503,4 +508,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip);
> > >  int xfs_iunlink_init(struct xfs_perag *pag);
> > >  void xfs_iunlink_destroy(struct xfs_perag *pag);
> > >  
> > > +void xfs_end_io(struct work_struct *work);
> > > +
> > >  #endif	/* __XFS_INODE_H__ */


* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type
  2019-03-28 15:17     ` Darrick J. Wong
@ 2019-03-28 16:46       ` Brian Foster
  2019-03-28 21:06       ` Dave Chinner
  1 sibling, 0 replies; 12+ messages in thread
From: Brian Foster @ 2019-03-28 16:46 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs

On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote:
> On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote:
> > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > When we're processing an ioend on the list of io completions, check to
> > > see if the next items on the list are both adjacent and of the same
> > > type.  If so, we can merge the completions to reduce transaction
> > > overhead.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > I'm curious of the value of this one... what situations allow for
> > batching on the ioend completion side that we haven't already accounted
> > for in the ioend construction side?
> 
> I was skeptical too, but Dave (I think?) pointed out that writeback can
> split into 1GB chunks so it actually is possible to end up with adjacent
> ioends.  So I wrote this patch and added a tracepoint, and lo it
> actually did trigger when there's a lot of data to flush out, and we
> succeed at allocating a single extent for the entire delalloc reservation.
> 

That doesn't seem like a huge overhead to me, but I am curious where
that splitting logic is. Somewhere in the writeback code..? (I assume
Dave can chime in on some of this stuff if he's more familiar with
it..).

> > The latter already batches until we
> > cross a change in fork type, extent state, or a break in logical or
> > physical contiguity. The former looks like it follows similar logic for
> > merging with the exceptions of allowing for merges of physically
> > discontiguous extents and disallowing merges of those with different
> > append status. That seems like a smallish window of opportunity to me..
> > am I missing something?
> 
> Yep, it's a smallish window; small discontiguous writes don't benefit
> here at all.
> 

Ok. The whole append thing is somewhat dynamic/non-deterministic as
well, along with the fact that we're subject to completion/wq timing to
allow for this kind of batching to occur. E.g., if we're completing a
series of 1GB ioends, what are the odds the next ioend completes before
the current one starts in the wq? I'd guess not great, but tbh I have no
idea..

> > If that is the gist but there is enough benefit for the more lenient
> > merging, I also wonder whether it would be more efficient to try and
> > also accomplish that on the construction side rather than via completion
> > post-processing. For example, could we abstract a single ioend to cover
> > an arbitrary list of bio/page -> sector mappings with the same higher
> > level semantics? We already have a bio chaining mechanism, it's just
> > only used for when a bio is full. Could we reuse that for dealing with
> > physical discontiguity?
> 
> I suppose we could, though the bigger the ioend the longer it'll take to
> process responses.  Also, I think it's the case that if any of the bios
> fail then we treat all of the chained ones as failed?  (Meh, it's
> writeback, it's not like you get to know /which/ writes failed unless
> you do a stupid write()/fsync() dance...)
> 

This already seems to be the case to a large degree with our current
batching. That 1GB ioend above is already a fairly large bio chain, as I
think that a bio can't have more than 256 (BIO_MAX_PAGES) pages.

Eh, I guess my take is that this doesn't necessarily seem like an
unreasonable change and it's not like it's a huge amount of code, but it
does seem like potentially more standalone code than it's worth for
minimal benefit. It seems the amount of code and processing could be
reduced and benefit slightly increased by allocating fewer ioends in the
first place if this were pre-processed vs. post-processed. I'd prefer to
see more analysis of the potential benefits either way...

> The other thing is that directio completions look very similar to
> writeback completions, including the potential for having the thundering
> herds pounding on the ILOCK.  I was thinking about refactoring those to
> use the per-inode queue as a next step, though the directio completion
> paths are murky.
> 

Indeed, though we don't have the degree of submission batching taking
place for direct I/O. Perhaps it just depends more on the workload since
aio can presumably drive a deeper queue.

A couple things to note..

I vaguely recall that the dio completion code has a bit of a messy
history of being switched back and forth between using ioends or
bitmasks or whatever else to avoid ioend allocations. This doesn't rule
out potential new benefits of using ioends that might be achieved in
light of new features like reflink, but FWIW I'd be more skeptical of
refactors along those lines that don't come along with a measurable
benefit or solve a tangible problem.

Another thing to be aware of here is that the iomap code looks like it
already can invoke our callback in a wq and it may complete the aio as
soon as we return...

Brian

> > Brian
> > 
> > >  fs/xfs/xfs_aops.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 86 insertions(+)
> > > 
> > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > > index f7a9bb661826..53afa2e6e3e7 100644
> > > --- a/fs/xfs/xfs_aops.c
> > > +++ b/fs/xfs/xfs_aops.c
> > > @@ -237,6 +237,7 @@ STATIC void
> > >  xfs_end_ioend(
> > >  	struct xfs_ioend	*ioend)
> > >  {
> > > +	struct list_head	ioend_list;
> > >  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> > >  	xfs_off_t		offset = ioend->io_offset;
> > >  	size_t			size = ioend->io_size;
> > > @@ -273,7 +274,89 @@ xfs_end_ioend(
> > >  done:
> > >  	if (ioend->io_append_trans)
> > >  		error = xfs_setfilesize_ioend(ioend, error);
> > > +	list_replace_init(&ioend->io_list, &ioend_list);
> > >  	xfs_destroy_ioend(ioend, error);
> > > +
> > > +	while (!list_empty(&ioend_list)) {
> > > +		ioend = list_first_entry(&ioend_list, struct xfs_ioend,
> > > +				io_list);
> > > +		list_del_init(&ioend->io_list);
> > > +		xfs_destroy_ioend(ioend, error);
> > > +	}
> > > +}
> > > +
> > > +/*
> > > + * We can merge two adjacent ioends if they have the same set of work to do.
> > > + */
> > > +static bool
> > > +xfs_ioend_can_merge(
> > > +	struct xfs_ioend	*ioend,
> > > +	int			ioend_error,
> > > +	struct xfs_ioend	*next)
> > > +{
> > > +	int			next_error;
> > > +
> > > +	next_error = blk_status_to_errno(next->io_bio->bi_status);
> > > +	if (ioend_error != next_error)
> > > +		return false;
> > > +	if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
> > > +		return false;
> > > +	if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
> > > +	    (next->io_state == XFS_EXT_UNWRITTEN))
> > > +		return false;
> > > +	if (ioend->io_offset + ioend->io_size != next->io_offset)
> > > +		return false;
> > > +	if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next))
> > > +		return false;
> > > +	return true;
> > > +}
> > > +
> > > +/* Try to merge adjacent completions. */
> > > +STATIC void
> > > +xfs_ioend_try_merge(
> > > +	struct xfs_ioend	*ioend,
> > > +	struct list_head	*more_ioends)
> > > +{
> > > +	struct xfs_ioend	*next_ioend;
> > > +	int			ioend_error;
> > > +	int			error;
> > > +
> > > +	if (list_empty(more_ioends))
> > > +		return;
> > > +
> > > +	ioend_error = blk_status_to_errno(ioend->io_bio->bi_status);
> > > +
> > > +	while (!list_empty(more_ioends)) {
> > > +		next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
> > > +				io_list);
> > > +		if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend))
> > > +			break;
> > > +		list_move_tail(&next_ioend->io_list, &ioend->io_list);
> > > +		ioend->io_size += next_ioend->io_size;
> > > +		if (ioend->io_append_trans) {
> > > +			error = xfs_setfilesize_ioend(next_ioend, 1);
> > > +			ASSERT(error == 1);
> > > +		}
> > > +	}
> > > +}
> > > +
> > > +/* list_sort compare function for ioends */
> > > +static int
> > > +xfs_ioend_compare(
> > > +	void			*priv,
> > > +	struct list_head	*a,
> > > +	struct list_head	*b)
> > > +{
> > > +	struct xfs_ioend	*ia;
> > > +	struct xfs_ioend	*ib;
> > > +
> > > +	ia = container_of(a, struct xfs_ioend, io_list);
> > > +	ib = container_of(b, struct xfs_ioend, io_list);
> > > +	if (ia->io_offset < ib->io_offset)
> > > +		return -1;
> > > +	else if (ia->io_offset > ib->io_offset)
> > > +		return 1;
> > > +	return 0;
> > >  }
> > >  
> > >  /* Finish all pending io completions. */
> > > @@ -292,10 +375,13 @@ xfs_end_io(
> > >  	list_replace_init(&ip->i_iodone_list, &completion_list);
> > >  	spin_unlock_irqrestore(&ip->i_iodone_lock, flags);
> > >  
> > > +	list_sort(NULL, &completion_list, xfs_ioend_compare);
> > > +
> > >  	while (!list_empty(&completion_list)) {
> > >  		ioend = list_first_entry(&completion_list, struct xfs_ioend,
> > >  				io_list);
> > >  		list_del_init(&ioend->io_list);
> > > +		xfs_ioend_try_merge(ioend, &completion_list);
> > >  		xfs_end_ioend(ioend);
> > >  	}
> > >  }


* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type
  2019-03-28 15:17     ` Darrick J. Wong
  2019-03-28 16:46       ` Brian Foster
@ 2019-03-28 21:06       ` Dave Chinner
  2019-03-29 12:51         ` Brian Foster
  1 sibling, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2019-03-28 21:06 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Brian Foster, xfs

On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote:
> On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote:
> > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > When we're processing an ioend on the list of io completions, check to
> > > see if the next items on the list are both adjacent and of the same
> > > type.  If so, we can merge the completions to reduce transaction
> > > overhead.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > I'm curious of the value of this one... what situations allow for
> > batching on the ioend completion side that we haven't already accounted
> > for in the ioend construction side?
> 
> I was skeptical too, but Dave (I think?) pointed out that writeback can
> split into 1GB chunks so it actually is possible to end up with adjacent
> ioends.

When the amount of writeback for a single file exceeds the
measured bandwidth of the device, or there are a number of dirty files
that the writeback bandwidth is shared between, then the writeback code
breaks up the amount of data that can be written in any single
writepages call to any single inode. This can get down as low as
MIN_WRITEBACK_PAGES (which ends up being 4MB of pages), and so we
can end up writing large files in lots of very small chunks.
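
In rough terms (a much-simplified sketch of that chunking, not the
actual fs/fs-writeback.c logic):

/*
 * Sketch: background writeback caps each writepages pass over a single
 * inode at a chunk sized from the measured write bandwidth, but never
 * smaller than MIN_WRITEBACK_PAGES (1024 x 4k pages = 4MB).
 */
#define MIN_WRITEBACK_PAGES	1024

static long writeback_chunk_pages(long avg_write_bandwidth_pages)
{
	long pages = avg_write_bandwidth_pages / 2;

	return pages < MIN_WRITEBACK_PAGES ? MIN_WRITEBACK_PAGES : pages;
}

which is how a single large delalloc extent can end up completing as a
run of adjacent ioends.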

> So I wrote this patch and added a tracepoint, and lo it
> actually did trigger when there's a lot of data to flush out, and we
> succeed at allocating a single extent for the entire delalloc reservation.

I'd expect it to fire more when there are lots of large files being
written concurrently than just for single files (i.e. the writeback
interleaving fragmentation problem that speculative delalloc
avoids).

> > The latter already batches until we
> > cross a change in fork type, extent state, or a break in logical or
> > physical contiguity. The former looks like it follows similar logic for
> > merging with the exceptions of allowing for merges of physically
> > discontiguous extents and disallowing merges of those with different
> > append status. That seems like a smallish window of opportunity to me..
> > am I missing something?
> 
> Yep, it's a smallish window; small discontiguous writes don't benefit
> here at all.
> 
> > If that is the gist but there is enough benefit for the more lenient
> > merging, I also wonder whether it would be more efficient to try and
> > also accomplish that on the construction side rather than via completion
> > post-processing. For example, could we abstract a single ioend to cover
> > an arbitrary list of bio/page -> sector mappings with the same higher
> > level semantics? We already have a bio chaining mechanism, it's just
> > only used for when a bio is full. Could we reuse that for dealing with
> > physical discontiguity?

While possible, I think that's way more complex and problematic than
merging successful completions optimistically...

In reality, we need some numbers to prove whether this is worthwhile
or not. If we can't find obvious workloads where it actually makes a
difference (either in throughput, IO latency or CPU usage) then,
like most things, we write it off as an interesting experiment that
didn't provide the benefit we thought it might...

> The other thing is that directio completions look very similar to
> writeback completions, including the potential for having the thundering
> herds pounding on the ILOCK.  I was thinking about refactoring those to
> use the per-inode queue as a next step, though the directio completion
> paths are murky.

Yeah, I think there can be benefit here for AIO+DIO when issuing
multiple concurrent sequential writes. I'm looking at making xfs-fsr
use AIO so it can pipeline the data movement (rather than being
synchronous) and I can see having completion batching for unwritten
extent conversion having a positive effect on such operations....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type
  2019-03-28 21:06       ` Dave Chinner
@ 2019-03-29 12:51         ` Brian Foster
  2019-03-31 22:24           ` Dave Chinner
  0 siblings, 1 reply; 12+ messages in thread
From: Brian Foster @ 2019-03-29 12:51 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Darrick J. Wong, xfs

On Fri, Mar 29, 2019 at 08:06:28AM +1100, Dave Chinner wrote:
> On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote:
> > On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote:
> > > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > When we're processing an ioend on the list of io completions, check to
> > > > see if the next items on the list are both adjacent and of the same
> > > > type.  If so, we can merge the completions to reduce transaction
> > > > overhead.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > 
> > > I'm curious of the value of this one... what situations allow for
> > > batching on the ioend completion side that we haven't already accounted
> > > for in the ioend construction side?
> > 
> > I was skeptical too, but Dave (I think?) pointed out that writeback can
> > split into 1GB chunks so it actually is possible to end up with adjacent
> > ioends.
> 
> When the amount of writeback for a single file exceeds the
> measured bandwidth of the device, or there are a number of dirty files
> that the writeback bandwidth is shared between, then the writeback code
> breaks up the amount of data that can be written in any single
> writepages call to any single inode. This can get down as low as
> MIN_WRITEBACK_PAGES (which ends up being 4MB of pages), and so we
> can end up writing large files in lots of very small chunks.
> 

Ok, so in general this has more to do with working around higher level
writeback behavior than improving our own ioend batching/mapping from a
single ->writepages() instance.

Taking a look at the writeback code, this sounds more relevant to
background writeback because integrity writeback uses the LONG_MAX chunk
size for ->writepages() calls. Background writeback calculates a chunk
size based on bandwidth, etc. as you've noted and looks like it rotors
across dirty inodes in a given superblock until a higher level writeback
count is achieved. Makes sense.
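
For concreteness, here is a minimal sketch of that chunk sizing policy.
It is illustrative only -- the real logic lives in writeback_chunk_size()
in fs/fs-writeback.c and has more inputs and different rounding -- but it
captures the two cases: integrity writeback is effectively unbounded,
while background writeback scales with bandwidth down to a ~4MB floor.

/* illustrative floor: 4MB worth of pages */
#define SKETCH_MIN_WRITEBACK_PAGES	(4UL * 1024 * 1024 / PAGE_SIZE)

static long
writeback_chunk_pages_sketch(
	bool		for_integrity,
	long		avg_write_bandwidth_pages)
{
	long		pages;

	if (for_integrity)
		return LONG_MAX;	/* flush everything that's dirty */

	/* background writeback: scale the chunk with measured bandwidth */
	pages = avg_write_bandwidth_pages / 2;

	/* but don't thrash the disk with tiny per-inode chunks */
	return max_t(long, pages, SKETCH_MIN_WRITEBACK_PAGES);
}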

> > So I wrote this patch and added a tracepoint, and lo it
> > actually did trigger when there's a lot of data to flush out, and we
> > succeed at allocating a single extent for the entire delalloc reservation.
> 
> I'd expect it to fire more when there are lots of large files being
> written concurrently than just for single files (i.e. the writeback
> interleaving fragmentation problem that speculative delalloc
> avoids).
> 
> > > The latter already batches until we
> > > cross a change in fork type, extent state, or a break in logical or
> > > physical contiguity. The former looks like it follows similar logic for
> > > merging with the exceptions of allowing for merges of physically
> > > discontiguous extents and disallowing merges of those with different
> > > append status. That seems like a smallish window of opportunity to me..
> > > am I missing something?
> > 
> > Yep, it's a smallish window; small discontiguous writes don't benefit
> > here at all.
> > 
> > > If that is the gist but there is enough benefit for the more lenient
> > > merging, I also wonder whether it would be more efficient to try and
> > > also accomplish that on the construction side rather than via completion
> > > post-processing. For example, could we abstract a single ioend to cover
> > > an arbitrary list of bio/page -> sector mappings with the same higher
> > > level semantics? We already have a bio chaining mechanism, it's just
> > > only used for when a bio is full. Could we reuse that for dealing with
> > > physical discontiguity?
> 
> While possible, I think that's way more complex and problematic than
> merging successful completions optimistically...
> 

Note again that the suggestion above applies only to ioend batching
within a single ->writepages() instance as opposed to across multiple
writebacks. It's less relevant given the context you added above around
potentially optimizing background writeback completion across multiple
->writepages() calls.

That said, I don't agree that it's complex or problematic to implement.
I was going to elaborate, but on looking again I realize that it's easy
enough to just try. See the appended diff[1]. This allows a single ioend
to cover multiple logically contiguous but physically discontiguous
extents so long as they have the same general ioend state (i.e.,
unwritten, etc.). This is not thoroughly tested of course, but works
for a couple quick tests.

There are caveats to this, having now taken a closer look. This doesn't
reduce transaction load for COW or unwritten extents because we
convert/remap with a transaction per extent either way. I suppose it
could reduce ioend memory allocation overhead a bit and potentially the
need to allocate multiple append transactions in some cases.

> In reality, we need some numbers to prove whether this is worthwhile
> or not. If we can't find obvious workloads where it actually makes a
> difference (either in throughput, IO latency or CPU usage) then,
> like most things, we write it off as an interesting experiment that
> didn't provide the benefit we thought it might...
> 

Agreed.

Brian

[1] Quick hack to batch physically discontiguous extents in a single
ioend.

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3619e9e8d359..c9bed8f3cb90 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -654,13 +654,13 @@ xfs_add_to_ioend(
 	if (!wpc->ioend ||
 	    wpc->fork != wpc->ioend->io_fork ||
 	    wpc->imap.br_state != wpc->ioend->io_state ||
-	    sector != bio_end_sector(wpc->ioend->io_bio) ||
 	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 		if (wpc->ioend)
 			list_add(&wpc->ioend->io_list, iolist);
 		wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
 				wpc->imap.br_state, offset, bdev, sector);
-	}
+	} else if (sector != bio_end_sector(wpc->ioend->io_bio))
+		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
 
 	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
 		if (iop)

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type
  2019-03-29 12:51         ` Brian Foster
@ 2019-03-31 22:24           ` Dave Chinner
  2019-04-01 14:59             ` Brian Foster
  0 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2019-03-31 22:24 UTC (permalink / raw)
  To: Brian Foster; +Cc: Darrick J. Wong, xfs

On Fri, Mar 29, 2019 at 08:51:29AM -0400, Brian Foster wrote:
> On Fri, Mar 29, 2019 at 08:06:28AM +1100, Dave Chinner wrote:
> > On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote:
> > > On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote:
> > > > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > 
> > > > > When we're processing an ioend on the list of io completions, check to
> > > > > see if the next items on the list are both adjacent and of the same
> > > > > type.  If so, we can merge the completions to reduce transaction
> > > > > overhead.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > ---
> > > > 
> > > > I'm curious of the value of this one... what situations allow for
> > > > batching on the ioend completion side that we haven't already accounted
> > > > for in the ioend construction side?
> > > 
> > > I was skeptical too, but Dave (I think?) pointed out that writeback can
> > > split into 1GB chunks so it actually is possible to end up with adjacent
> > > ioends.
> > 
> > When the amount of writeback for a single file exceeds the
> > measured bandwidth of the device, or there are a number of dirty files
> > that the writeback bandwidth is shared between, then the writeback code
> > breaks up the amount of data that can be written in any single
> > writepages call to any single inode. This can get down as low as
> > MIN_WRITEBACK_PAGES (which ends up being 4MB of pages), and so we
> > can end up writing large files in lots of very small chunks.
> > 
> 
> Ok, so in general this has more to do with working around higher level
> writeback behavior than improving our own ioend batching/mapping from a
> single ->writepages() instance.

I wouldn't say "working around", because the higher level stuff is
avoiding inode writeback starvation which is entirely necessary. The
MIN_WRITEBACK_PAGES limit is essentially a mechanism to prevent
falling into thrashing with small IOs to lots of files and becoming
seek bound rather than bandwidth bound on spinning disks.  It's only
when there are so many writers that dirty page throttling occurs
before a process can dirty 4MB of pages that we see writeback chunks
drop below this limit, and that's where the IO being really
efficient at cleaning pages really matters....

> Taking a look at the writeback code, this sounds more relevant to
> background writeback because integrity writeback uses the LONG_MAX chunk
> size for ->writepages() calls. Background writeback calculates a chunk
> size based on bandwidth, etc. as you've noted and looks like it rotors
> across dirty inodes in a given superblock until a higher level writeback
> count is achieved. Makes sense.

*nod*

> 
> > > So I wrote this patch and added a tracepoint, and lo it
> > > actually did trigger when there's a lot of data to flush out, and we
> > > succeed at allocating a single extent for the entire delalloc reservation.
> > 
> > I'd expect it to fire more when there are lots of large files being
> > written concurrently than just for single files (i.e. the writeback
> > interleaving fragmentation problem that speculative delalloc
> > avoids).
> > 
> > > > The latter already batches until we
> > > > cross a change in fork type, extent state, or a break in logical or
> > > > physical contiguity. The former looks like it follows similar logic for
> > > > merging with the exceptions of allowing for merges of physically
> > > > discontiguous extents and disallowing merges of those with different
> > > > append status. That seems like a smallish window of opportunity to me..
> > > > am I missing something?
> > > 
> > > Yep, it's a smallish window; small discontiguous writes don't benefit
> > > here at all.
> > > 
> > > > If that is the gist but there is enough benefit for the more lenient
> > > > merging, I also wonder whether it would be more efficient to try and
> > > > also accomplish that on the construction side rather than via completion
> > > > post-processing. For example, could we abstract a single ioend to cover
> > > > an arbitrary list of bio/page -> sector mappings with the same higher
> > > > level semantics? We already have a bio chaining mechanism, it's just
> > > > only used for when a bio is full. Could we reuse that for dealing with
> > > > physical discontiguity?
> > 
> > While possible, I think that's way more complex and problematic than
> > merging successful completions optimistically...
> > 
> 
> Note again that the suggestion above applies only to ioend batching
> within a single ->writepages() instance as opposed to across multiple
> writebacks. It's less relevant given the context you added above around
> potentially optimizing background writeback completion across multiple
> ->writepages() calls.

Ah, ok, I was thinking you were talking about cross-writepages()
clustering....

> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 3619e9e8d359..c9bed8f3cb90 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -654,13 +654,13 @@ xfs_add_to_ioend(
>  	if (!wpc->ioend ||
>  	    wpc->fork != wpc->ioend->io_fork ||
>  	    wpc->imap.br_state != wpc->ioend->io_state ||
> -	    sector != bio_end_sector(wpc->ioend->io_bio) ||
>  	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
>  		if (wpc->ioend)
>  			list_add(&wpc->ioend->io_list, iolist);
>  		wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
>  				wpc->imap.br_state, offset, bdev, sector);
> -	}
> +	} else if (sector != bio_end_sector(wpc->ioend->io_bio))
> +		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);

That might make sense from a completion optimisation POV, but it's
going to have a significant impact on IO latency, i.e. we now delay
the completion of the first IO until after we've seeked to the second
IO and completed that (and potentially many more). This will
increase the latency of cleaning dirty pages on spinning disks, so I
suspect this would be detrimental under heavy memory pressure and
lots of dirty pages to write back....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type
  2019-03-31 22:24           ` Dave Chinner
@ 2019-04-01 14:59             ` Brian Foster
  0 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2019-04-01 14:59 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Darrick J. Wong, xfs

On Mon, Apr 01, 2019 at 09:24:56AM +1100, Dave Chinner wrote:
> On Fri, Mar 29, 2019 at 08:51:29AM -0400, Brian Foster wrote:
> > On Fri, Mar 29, 2019 at 08:06:28AM +1100, Dave Chinner wrote:
> > > On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote:
> > > > On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote:
> > > > > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote:
> > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > 
> > > > > > When we're processing an ioend on the list of io completions, check to
> > > > > > see if the next items on the list are both adjacent and of the same
> > > > > > type.  If so, we can merge the completions to reduce transaction
> > > > > > overhead.
> > > > > > 
> > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > ---
> > > > > 
> > > > > I'm curious of the value of this one... what situations allow for
> > > > > batching on the ioend completion side that we haven't already accounted
> > > > > for in the ioend construction side?
> > > > 
> > > > I was skeptical too, but Dave (I think?) pointed out that writeback can
> > > > split into 1GB chunks so it actually is possible to end up with adjacent
> > > > ioends.
> > > 
> > > When the amount of writeback for a single file exceeds the
> > > measured bandwidth of the device, or there are a number of dirty files
> > > that the writeback bandwidth is shared between, then the writeback code
> > > breaks up the amount of data that can be written in any single
> > > writepages call to any single inode. This can get down as low as
> > > MIN_WRITEBACK_PAGES (which ends up being 4MB of pages), and so we
> > > can end up writing large files in lots of very small chunks.
> > > 
> > 
> > Ok, so in general this has more to do with working around higher level
> > writeback behavior than improving our own ioend batching/mapping from a
> > single ->writepages() instance.
> 
> I wouldn't say "working around", because the higher level stuff is
> avoiding inode writeback starvation which is entirely necessary. The
> MIN_WRITEBACK_PAGES limit is essentially a mechanism to prevent
> falling into thrashing with small IOs to lots of files and becoming
> seek bound rather than bandwidth bound on spinning disks.  It's only
> when there are so many writers that dirty page throttling occurs
> before a process can dirty 4MB of pages that we see writeback chunks
> drop below this limit, and that's where the IO being really
> efficient at cleaning pages really matters....
> 

Sure, I just mean that the potential optimization targets
inter-writepages patterns as opposed to intra-writepages..

> > Taking a look at the writeback code, this sounds more relevant to
> > background writeback because integrity writeback uses the LONG_MAX chunk
> > size for ->writepages() calls. Background writeback calculates a chunk
> > size based on bandwidth, etc. as you've noted and looks like it rotors
> > across dirty inodes in a given superblock until a higher level writeback
> > count is achieved. Makes sense.
> 
> *nod*
> 
> > 
> > > > So I wrote this patch and added a tracepoint, and lo it
> > > > actually did trigger when there's a lot of data to flush out, and we
> > > > succeed at allocating a single extent for the entire delalloc reservation.
> > > 
> > > I'd expect it to fire more when there are lots of large files being
> > > written concurrently than just for single files (i.e. the writeback
> > > interleaving fragmentation problem that speculative delalloc
> > > avoids).
> > > 
> > > > > The latter already batches until we
> > > > > cross a change in fork type, extent state, or a break in logical or
> > > > > physical contiguity. The former looks like it follows similar logic for
> > > > > merging with the exceptions of allowing for merges of physically
> > > > > discontiguous extents and disallowing merges of those with different
> > > > > append status. That seems like a smallish window of opportunity to me..
> > > > > am I missing something?
> > > > 
> > > > Yep, it's a smallish window; small discontiguous writes don't benefit
> > > > here at all.
> > > > 
> > > > > If that is the gist but there is enough benefit for the more lenient
> > > > > merging, I also wonder whether it would be more efficient to try and
> > > > > also accomplish that on the construction side rather than via completion
> > > > > post-processing. For example, could we abstract a single ioend to cover
> > > > > an arbitrary list of bio/page -> sector mappings with the same higher
> > > > > level semantics? We already have a bio chaining mechanism, it's just
> > > > > only used for when a bio is full. Could we reuse that for dealing with
> > > > > physical discontiguity?
> > > 
> > > While possible, I think that's way more complex and problematic than
> > > merging successful completions optimistically...
> > > 
> > 
> > Note again that the suggestion above applies only to ioend batching
> > within a single ->writepages() instance as opposed to across multiple
> > writebacks. It's less relevant given the context you added above around
> > potentially optimizing background writeback completion across multiple
> > ->writepages() calls.
> 
> Ah, ok, I was thinking you were talking about cross-writepages()
> clustering....
> 
> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > index 3619e9e8d359..c9bed8f3cb90 100644
> > --- a/fs/xfs/xfs_aops.c
> > +++ b/fs/xfs/xfs_aops.c
> > @@ -654,13 +654,13 @@ xfs_add_to_ioend(
> >  	if (!wpc->ioend ||
> >  	    wpc->fork != wpc->ioend->io_fork ||
> >  	    wpc->imap.br_state != wpc->ioend->io_state ||
> > -	    sector != bio_end_sector(wpc->ioend->io_bio) ||
> >  	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
> >  		if (wpc->ioend)
> >  			list_add(&wpc->ioend->io_list, iolist);
> >  		wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
> >  				wpc->imap.br_state, offset, bdev, sector);
> > -	}
> > +	} else if (sector != bio_end_sector(wpc->ioend->io_bio))
> > +		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
> 
> That might make sense from a completion optimisation POV, but it's
> going to have a significant impact on IO latency, i.e. we now delay
> the completion of the first IO until after we've seeked to the second
> IO and completed that (and potentially many more). This will
> increase the latency of cleaning dirty pages on spinning disks, so I
> suspect this would be detrimental under heavy memory pressure and
> lots of dirty pages to write back....
> 

(By cleaning pages I assume you refer to making dirty pages available
for reclaim/reuse and not clearing of the dirty bit, which happens
before writeback submission.)

Yeah, I could see the potential for that. I don't think it's necessarily
imminent if you consider all of the existing writeback chunk size
heuristics, plugging, and the fact that a large contiguous writeback can
impose a similar per-page writeback latency cost by constructing a huge
bio chain. Sequential I/O is certainly much faster in the spinning disk
example you cited, but we can already block a single page writeback
waiter on potentially GBs of bios in certain cases without any currently
known problems. We could always implement a heuristic to restrict the
length of more "random" bio chains for ioends that don't have enough
anticipated transaction completion overhead to trade off for the
additional completion latency. (In fact, if this is a concern I wonder
if we should have some chain limiting logic anyways..).
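
As a strawman for that chain limiting idea (purely hypothetical -- the
io_chain_len field and XFS_IOEND_MAX_CHAIN cap below are invented for
illustration and are not part of either patch), the submission-side
check could simply refuse to chain past some cap:

#define XFS_IOEND_MAX_CHAIN	16	/* hypothetical per-ioend bio cap */

	if (!wpc->ioend ||
	    wpc->fork != wpc->ioend->io_fork ||
	    wpc->imap.br_state != wpc->ioend->io_state ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size ||
	    wpc->ioend->io_chain_len >= XFS_IOEND_MAX_CHAIN) {
		/* start a fresh ioend rather than chaining further */
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
				wpc->imap.br_state, offset, bdev, sector);
	} else if (sector != bio_end_sector(wpc->ioend->io_bio)) {
		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
		wpc->ioend->io_chain_len++;
	}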

Anyways, I think the larger takeaway right now is that if
post-completion batching is useful and effective across multiple
writepages() calls, it's probably not worth the additional complexity of
this kind of submission batching when we can get the same primary
transaction overhead reduction benefit from a more comprehensive
implementation.

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2019-04-01 14:59 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-27  3:05 [RFC PATCH] xfs: implement per-inode writeback completion Darrick J. Wong
2019-03-27  3:06 ` [RFC PATCH] xfs: merge adjacent io completions of the same type Darrick J. Wong
2019-03-28 14:10   ` Brian Foster
2019-03-28 15:17     ` Darrick J. Wong
2019-03-28 16:46       ` Brian Foster
2019-03-28 21:06       ` Dave Chinner
2019-03-29 12:51         ` Brian Foster
2019-03-31 22:24           ` Dave Chinner
2019-04-01 14:59             ` Brian Foster
2019-03-28 14:08 ` [RFC PATCH] xfs: implement per-inode writeback completion Brian Foster
2019-03-28 15:00   ` Darrick J. Wong
2019-03-28 16:24     ` Brian Foster
