* [RFC PATCH] xfs: implement per-inode writeback completion @ 2019-03-27 3:05 Darrick J. Wong 2019-03-27 3:06 ` [RFC PATCH] xfs: merge adjacent io completions of the same type Darrick J. Wong 2019-03-28 14:08 ` [RFC PATCH] xfs: implement per-inode writeback completion Brian Foster 0 siblings, 2 replies; 12+ messages in thread From: Darrick J. Wong @ 2019-03-27 3:05 UTC (permalink / raw) To: xfs Hi folks, Here's a quick patchset reworking writeback ioend completion processing into per-inode completion queues so that we don't have a thundering herd of unwritten/cow completion kworkers contending for the ILOCK. The second patch will also combine adjacent ioends when possible to reduce the overhead further. Let me know what you think. :) --D --- From: Darrick J. Wong <darrick.wong@oracle.com> Restructure the buffered writeback completion code to use a single work item per inode, since it's pointless to awaken a thundering herd of threads to contend on a single inode's ILOCK. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/xfs_aops.c | 48 +++++++++++++++++++++++++++++++++++++----------- fs/xfs/xfs_aops.h | 1 - fs/xfs/xfs_icache.c | 3 +++ fs/xfs/xfs_inode.h | 7 +++++++ 4 files changed, 47 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3619e9e8d359..f7a9bb661826 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -234,11 +234,9 @@ xfs_setfilesize_ioend( * IO write completion. */ STATIC void -xfs_end_io( - struct work_struct *work) +xfs_end_ioend( + struct xfs_ioend *ioend) { - struct xfs_ioend *ioend = - container_of(work, struct xfs_ioend, io_work); struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; @@ -278,19 +276,48 @@ xfs_end_io( xfs_destroy_ioend(ioend, error); } +/* Finish all pending io completions. 
*/ +void +xfs_end_io( + struct work_struct *work) +{ + struct xfs_inode *ip; + struct xfs_ioend *ioend; + struct list_head completion_list; + unsigned long flags; + + ip = container_of(work, struct xfs_inode, i_iodone_work); + + spin_lock_irqsave(&ip->i_iodone_lock, flags); + list_replace_init(&ip->i_iodone_list, &completion_list); + spin_unlock_irqrestore(&ip->i_iodone_lock, flags); + + while (!list_empty(&completion_list)) { + ioend = list_first_entry(&completion_list, struct xfs_ioend, + io_list); + list_del_init(&ioend->io_list); + xfs_end_ioend(ioend); + } +} + STATIC void xfs_end_bio( struct bio *bio) { struct xfs_ioend *ioend = bio->bi_private; - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + unsigned long flags; if (ioend->io_fork == XFS_COW_FORK || - ioend->io_state == XFS_EXT_UNWRITTEN) - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) - queue_work(mp->m_data_workqueue, &ioend->io_work); - else + ioend->io_state == XFS_EXT_UNWRITTEN || + ioend->io_append_trans != NULL) { + spin_lock_irqsave(&ip->i_iodone_lock, flags); + if (list_empty(&ip->i_iodone_list)) + queue_work(mp->m_unwritten_workqueue, &ip->i_iodone_work); + list_add_tail(&ioend->io_list, &ip->i_iodone_list); + spin_unlock_irqrestore(&ip->i_iodone_lock, flags); + } else xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } @@ -594,7 +621,6 @@ xfs_alloc_ioend( ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; - INIT_WORK(&ioend->io_work, xfs_end_io); ioend->io_append_trans = NULL; ioend->io_bio = bio; return ioend; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 6c2615b83c5d..f62b03186c62 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,6 @@ struct xfs_ioend { struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ - struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 245483cc282b..e70e7db29026 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -70,6 +70,9 @@ xfs_inode_alloc( ip->i_flags = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(ip->i_d)); + INIT_WORK(&ip->i_iodone_work, xfs_end_io); + INIT_LIST_HEAD(&ip->i_iodone_list); + spin_lock_init(&ip->i_iodone_lock); return ip; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e62074a5257c..88239c2dd824 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -57,6 +57,11 @@ typedef struct xfs_inode { /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ + + /* pending io completions */ + spinlock_t i_iodone_lock; + struct work_struct i_iodone_work; + struct list_head i_iodone_list; } xfs_inode_t; /* Convert from vfs inode to xfs inode */ @@ -503,4 +508,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip); int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag); +void xfs_end_io(struct work_struct *work); + #endif /* __XFS_INODE_H__ */ ^ permalink raw reply related [flat|nested] 12+ messages in thread
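The core of the patch is a "splice and drain" pattern: completions are appended to a per-inode list under a spinlock, the work item is queued only when the list goes from empty to non-empty, and the worker splices the whole list out before processing it. A minimal userspace analog of that pattern — only a sketch, with all names invented for illustration; this is not XFS code — shows the moving parts:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for xfs_ioend and xfs_inode; names are invented. */
struct toy_ioend {
	struct toy_ioend	*next;
	int			id;
};

struct toy_inode {
	pthread_mutex_t		lock;		/* ~ ip->i_iodone_lock */
	struct toy_ioend	*head;		/* ~ ip->i_iodone_list */
	struct toy_ioend	**tail;
	int			work_queued;	/* ~ "work item already queued" */
};

/* ~ xfs_end_io(): splice the whole pending list out, then process it. */
static void toy_end_io(struct toy_inode *ip)
{
	struct toy_ioend *list;

	pthread_mutex_lock(&ip->lock);
	list = ip->head;
	ip->head = NULL;
	ip->tail = &ip->head;
	ip->work_queued = 0;
	pthread_mutex_unlock(&ip->lock);

	while (list) {
		struct toy_ioend *next = list->next;

		printf("completing ioend %d\n", list->id);
		free(list);
		list = next;
	}
}

/* ~ xfs_end_bio(): append; only the empty->non-empty transition queues work. */
static void toy_end_bio(struct toy_inode *ip, int id)
{
	struct toy_ioend *io = malloc(sizeof(*io));

	io->next = NULL;
	io->id = id;
	pthread_mutex_lock(&ip->lock);
	if (!ip->work_queued) {
		ip->work_queued = 1;	/* the kernel would queue_work() here */
		printf("queueing work for inode\n");
	}
	*ip->tail = io;
	ip->tail = &io->next;
	pthread_mutex_unlock(&ip->lock);
}

int main(void)
{
	struct toy_inode ip = { .lock = PTHREAD_MUTEX_INITIALIZER };

	ip.tail = &ip.head;
	/* Three bios complete before the worker gets to run... */
	for (int i = 0; i < 3; i++)
		toy_end_bio(&ip, i);
	/* ...and a single "work item" execution drains all of them. */
	toy_end_io(&ip);
	return 0;
}

The property the kernel patch relies on is visible here too: the drain clears the queued state and empties the list atomically with respect to new arrivals, so a completion that lands after the splice starts a fresh queueing cycle instead of being stranded.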
* [RFC PATCH] xfs: merge adjacent io completions of the same type 2019-03-27 3:05 [RFC PATCH] xfs: implement per-inode writeback completion Darrick J. Wong @ 2019-03-27 3:06 ` Darrick J. Wong 2019-03-28 14:10 ` Brian Foster 2019-03-28 14:08 ` [RFC PATCH] xfs: implement per-inode writeback completion Brian Foster 1 sibling, 1 reply; 12+ messages in thread From: Darrick J. Wong @ 2019-03-27 3:06 UTC (permalink / raw) To: xfs From: Darrick J. Wong <darrick.wong@oracle.com> When we're processing an ioend on the list of io completions, check to see if the next items on the list are both adjacent and of the same type. If so, we can merge the completions to reduce transaction overhead. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/xfs_aops.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f7a9bb661826..53afa2e6e3e7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -237,6 +237,7 @@ STATIC void xfs_end_ioend( struct xfs_ioend *ioend) { + struct list_head ioend_list; struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; @@ -273,7 +274,89 @@ xfs_end_ioend( done: if (ioend->io_append_trans) error = xfs_setfilesize_ioend(ioend, error); + list_replace_init(&ioend->io_list, &ioend_list); xfs_destroy_ioend(ioend, error); + + while (!list_empty(&ioend_list)) { + ioend = list_first_entry(&ioend_list, struct xfs_ioend, + io_list); + list_del_init(&ioend->io_list); + xfs_destroy_ioend(ioend, error); + } +} + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool +xfs_ioend_can_merge( + struct xfs_ioend *ioend, + int ioend_error, + struct xfs_ioend *next) +{ + int next_error; + + next_error = blk_status_to_errno(next->io_bio->bi_status); + if (ioend_error != next_error) + return false; + if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) + return false; + if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ + (next->io_state == XFS_EXT_UNWRITTEN)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next)) + return false; + return true; +} + +/* Try to merge adjacent completions. */ +STATIC void +xfs_ioend_try_merge( + struct xfs_ioend *ioend, + struct list_head *more_ioends) +{ + struct xfs_ioend *next_ioend; + int ioend_error; + int error; + + if (list_empty(more_ioends)) + return; + + ioend_error = blk_status_to_errno(ioend->io_bio->bi_status); + + while (!list_empty(more_ioends)) { + next_ioend = list_first_entry(more_ioends, struct xfs_ioend, + io_list); + if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend)) + break; + list_move_tail(&next_ioend->io_list, &ioend->io_list); + ioend->io_size += next_ioend->io_size; + if (ioend->io_append_trans) { + error = xfs_setfilesize_ioend(next_ioend, 1); + ASSERT(error == 1); + } + } +} + +/* list_sort compare function for ioends */ +static int +xfs_ioend_compare( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_ioend *ia; + struct xfs_ioend *ib; + + ia = container_of(a, struct xfs_ioend, io_list); + ib = container_of(b, struct xfs_ioend, io_list); + if (ia->io_offset < ib->io_offset) + return -1; + else if (ia->io_offset > ib->io_offset) + return 1; + return 0; } /* Finish all pending io completions. 
*/ @@ -292,10 +375,13 @@ xfs_end_io( list_replace_init(&ip->i_iodone_list, &completion_list); spin_unlock_irqrestore(&ip->i_iodone_lock, flags); + list_sort(NULL, &completion_list, xfs_ioend_compare); + while (!list_empty(&completion_list)) { ioend = list_first_entry(&completion_list, struct xfs_ioend, io_list); list_del_init(&ioend->io_list); + xfs_ioend_try_merge(ioend, &completion_list); xfs_end_ioend(ioend); } } ^ permalink raw reply related [flat|nested] 12+ messages in thread
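A standalone toy model of the sort-then-merge pass may make the intent concrete: ioends are reduced to (offset, size, unwritten) tuples, sorted by offset as the list_sort() call above does, then coalesced while each next entry starts exactly where the current one ends and needs the same completion work. The real code also compares fork, error and append state; the names below are invented and this is not XFS code:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_ioend {
	long long	offset;
	long long	size;
	bool		unwritten;
};

/* analog of xfs_ioend_compare() for qsort */
static int cmp_offset(const void *a, const void *b)
{
	const struct toy_ioend *ia = a, *ib = b;

	if (ia->offset < ib->offset)
		return -1;
	if (ia->offset > ib->offset)
		return 1;
	return 0;
}

int main(void)
{
	/* e.g. three 1GB writeback chunks of one allocation, plus a stray */
	struct toy_ioend io[] = {
		{ 2LL << 30, 1LL << 30, true },		/* third 1GB chunk */
		{ 0,         1LL << 30, true },		/* first 1GB chunk */
		{ 1LL << 30, 1LL << 30, true },		/* second 1GB chunk */
		{ 5LL << 30, 1LL << 20, false },	/* unrelated small write */
	};
	int n = sizeof(io) / sizeof(io[0]);

	qsort(io, n, sizeof(io[0]), cmp_offset);	/* ~ list_sort() */

	for (int i = 0; i < n; ) {
		struct toy_ioend cur = io[i++];

		/* ~ xfs_ioend_try_merge(): absorb adjacent same-type ioends */
		while (i < n && io[i].offset == cur.offset + cur.size &&
		       io[i].unwritten == cur.unwritten) {
			cur.size += io[i].size;
			i++;
		}
		printf("complete [%lld, %lld) unwritten=%d\n",
		       cur.offset, cur.offset + cur.size, cur.unwritten);
	}
	return 0;
}

Here the three 1GB chunks of the same unwritten allocation collapse into one 3GB completion — one completion pass and one ILOCK cycle instead of three — while the discontiguous ioend stays separate.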
* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type 2019-03-27 3:06 ` [RFC PATCH] xfs: merge adjacent io completions of the same type Darrick J. Wong @ 2019-03-28 14:10 ` Brian Foster 2019-03-28 15:17 ` Darrick J. Wong 0 siblings, 1 reply; 12+ messages in thread From: Brian Foster @ 2019-03-28 14:10 UTC (permalink / raw) To: Darrick J. Wong; +Cc: xfs On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@oracle.com> > > When we're processing an ioend on the list of io completions, check to > see if the next items on the list are both adjacent and of the same > type. If so, we can merge the completions to reduce transaction > overhead. > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > --- I'm curious of the value of this one... what situations allow for batching on the ioend completion side that we haven't already accounted for in the ioend construction side? The latter already batches until we cross a change in fork type, extent state, or a break in logical or physical contiguity. The former looks like it follows similar logic for merging with the exceptions of allowing for merges of physically discontiguous extents and disallowing merges of those with different append status. That seems like a smallish window of opportunity to me.. am I missing something? If that is the gist but there is enough benefit for the more lenient merging, I also wonder whether it would be more efficient to try and also accomplish that on the construction side rather than via completion post-processing. For example, could we abstract a single ioend to cover an arbitrary list of bio/page -> sector mappings with the same higher level semantics? We already have a bio chaining mechanism, it's just only used for when a bio is full. Could we reuse that for dealing with physical discontiguity? Brian > fs/xfs/xfs_aops.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 86 insertions(+) > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index f7a9bb661826..53afa2e6e3e7 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -237,6 +237,7 @@ STATIC void > xfs_end_ioend( > struct xfs_ioend *ioend) > { > + struct list_head ioend_list; > struct xfs_inode *ip = XFS_I(ioend->io_inode); > xfs_off_t offset = ioend->io_offset; > size_t size = ioend->io_size; > @@ -273,7 +274,89 @@ xfs_end_ioend( > done: > if (ioend->io_append_trans) > error = xfs_setfilesize_ioend(ioend, error); > + list_replace_init(&ioend->io_list, &ioend_list); > xfs_destroy_ioend(ioend, error); > + > + while (!list_empty(&ioend_list)) { > + ioend = list_first_entry(&ioend_list, struct xfs_ioend, > + io_list); > + list_del_init(&ioend->io_list); > + xfs_destroy_ioend(ioend, error); > + } > +} > + > +/* > + * We can merge two adjacent ioends if they have the same set of work to do. 
> + */ > +static bool > +xfs_ioend_can_merge( > + struct xfs_ioend *ioend, > + int ioend_error, > + struct xfs_ioend *next) > +{ > + int next_error; > + > + next_error = blk_status_to_errno(next->io_bio->bi_status); > + if (ioend_error != next_error) > + return false; > + if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) > + return false; > + if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ > + (next->io_state == XFS_EXT_UNWRITTEN)) > + return false; > + if (ioend->io_offset + ioend->io_size != next->io_offset) > + return false; > + if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next)) > + return false; > + return true; > +} > + > +/* Try to merge adjacent completions. */ > +STATIC void > +xfs_ioend_try_merge( > + struct xfs_ioend *ioend, > + struct list_head *more_ioends) > +{ > + struct xfs_ioend *next_ioend; > + int ioend_error; > + int error; > + > + if (list_empty(more_ioends)) > + return; > + > + ioend_error = blk_status_to_errno(ioend->io_bio->bi_status); > + > + while (!list_empty(more_ioends)) { > + next_ioend = list_first_entry(more_ioends, struct xfs_ioend, > + io_list); > + if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend)) > + break; > + list_move_tail(&next_ioend->io_list, &ioend->io_list); > + ioend->io_size += next_ioend->io_size; > + if (ioend->io_append_trans) { > + error = xfs_setfilesize_ioend(next_ioend, 1); > + ASSERT(error == 1); > + } > + } > +} > + > +/* list_sort compare function for ioends */ > +static int > +xfs_ioend_compare( > + void *priv, > + struct list_head *a, > + struct list_head *b) > +{ > + struct xfs_ioend *ia; > + struct xfs_ioend *ib; > + > + ia = container_of(a, struct xfs_ioend, io_list); > + ib = container_of(b, struct xfs_ioend, io_list); > + if (ia->io_offset < ib->io_offset) > + return -1; > + else if (ia->io_offset > ib->io_offset) > + return 1; > + return 0; > } > > /* Finish all pending io completions. */ > @@ -292,10 +375,13 @@ xfs_end_io( > list_replace_init(&ip->i_iodone_list, &completion_list); > spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > > + list_sort(NULL, &completion_list, xfs_ioend_compare); > + > while (!list_empty(&completion_list)) { > ioend = list_first_entry(&completion_list, struct xfs_ioend, > io_list); > list_del_init(&ioend->io_list); > + xfs_ioend_try_merge(ioend, &completion_list); > xfs_end_ioend(ioend); > } > } ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type 2019-03-28 14:10 ` Brian Foster @ 2019-03-28 15:17 ` Darrick J. Wong 2019-03-28 16:46 ` Brian Foster 2019-03-28 21:06 ` Dave Chinner 0 siblings, 2 replies; 12+ messages in thread From: Darrick J. Wong @ 2019-03-28 15:17 UTC (permalink / raw) To: Brian Foster; +Cc: xfs On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote: > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > When we're processing an ioend on the list of io completions, check to > > see if the next items on the list are both adjacent and of the same > > type. If so, we can merge the completions to reduce transaction > > overhead. > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > --- > > I'm curious of the value of this one... what situations allow for > batching on the ioend completion side that we haven't already accounted > for in the ioend construction side? I was skeptical too, but Dave (I think?) pointed out that writeback can split into 1GB chunks so it actually is possible to end up with adjacent ioends. So I wrote this patch and added a tracepoint, and lo it actually did trigger when there's a lot of data to flush out, and we succeed at allocating a single extent for the entire delalloc reservation. > The latter already batches until we > cross a change in fork type, extent state, or a break in logical or > physical contiguity. The former looks like it follows similar logic for > merging with the exceptions of allowing for merges of physically > discontiguous extents and disallowing merges of those with different > append status. That seems like a smallish window of opportunity to me.. > am I missing something? Yep, it's a smallish window; small discontiguous writes don't benefit here at all. > If that is the gist but there is enough benefit for the more lenient > merging, I also wonder whether it would be more efficient to try and > also accomplish that on the construction side rather than via completion > post-processing. For example, could we abstract a single ioend to cover > an arbitrary list of bio/page -> sector mappings with the same higher > level semantics? We already have a bio chaining mechanism, it's just > only used for when a bio is full. Could we reuse that for dealing with > physical discontiguity? I suppose we could, though the bigger the ioend the longer it'll take to process responses. Also, I think it's the case that if any of the bios fail then we treat all of the chained ones as failed? (Meh, it's writeback, it's not like you get to know /which/ writes failed unless you do a stupid write()/fsync() dance...) The other thing is that directio completions look very similar to writeback completions, including the potential for having the thundering herds pounding on the ILOCK. I was thinking about refactoring those to use the per-inode queue as a next step, though the directio completion paths are murky. 
> Brian > > > fs/xfs/xfs_aops.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 86 insertions(+) > > > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > > index f7a9bb661826..53afa2e6e3e7 100644 > > --- a/fs/xfs/xfs_aops.c > > +++ b/fs/xfs/xfs_aops.c > > @@ -237,6 +237,7 @@ STATIC void > > xfs_end_ioend( > > struct xfs_ioend *ioend) > > { > > + struct list_head ioend_list; > > struct xfs_inode *ip = XFS_I(ioend->io_inode); > > xfs_off_t offset = ioend->io_offset; > > size_t size = ioend->io_size; > > @@ -273,7 +274,89 @@ xfs_end_ioend( > > done: > > if (ioend->io_append_trans) > > error = xfs_setfilesize_ioend(ioend, error); > > + list_replace_init(&ioend->io_list, &ioend_list); > > xfs_destroy_ioend(ioend, error); > > + > > + while (!list_empty(&ioend_list)) { > > + ioend = list_first_entry(&ioend_list, struct xfs_ioend, > > + io_list); > > + list_del_init(&ioend->io_list); > > + xfs_destroy_ioend(ioend, error); > > + } > > +} > > + > > +/* > > + * We can merge two adjacent ioends if they have the same set of work to do. > > + */ > > +static bool > > +xfs_ioend_can_merge( > > + struct xfs_ioend *ioend, > > + int ioend_error, > > + struct xfs_ioend *next) > > +{ > > + int next_error; > > + > > + next_error = blk_status_to_errno(next->io_bio->bi_status); > > + if (ioend_error != next_error) > > + return false; > > + if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) > > + return false; > > + if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ > > + (next->io_state == XFS_EXT_UNWRITTEN)) > > + return false; > > + if (ioend->io_offset + ioend->io_size != next->io_offset) > > + return false; > > + if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next)) > > + return false; > > + return true; > > +} > > + > > +/* Try to merge adjacent completions. */ > > +STATIC void > > +xfs_ioend_try_merge( > > + struct xfs_ioend *ioend, > > + struct list_head *more_ioends) > > +{ > > + struct xfs_ioend *next_ioend; > > + int ioend_error; > > + int error; > > + > > + if (list_empty(more_ioends)) > > + return; > > + > > + ioend_error = blk_status_to_errno(ioend->io_bio->bi_status); > > + > > + while (!list_empty(more_ioends)) { > > + next_ioend = list_first_entry(more_ioends, struct xfs_ioend, > > + io_list); > > + if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend)) > > + break; > > + list_move_tail(&next_ioend->io_list, &ioend->io_list); > > + ioend->io_size += next_ioend->io_size; > > + if (ioend->io_append_trans) { > > + error = xfs_setfilesize_ioend(next_ioend, 1); > > + ASSERT(error == 1); > > + } > > + } > > +} > > + > > +/* list_sort compare function for ioends */ > > +static int > > +xfs_ioend_compare( > > + void *priv, > > + struct list_head *a, > > + struct list_head *b) > > +{ > > + struct xfs_ioend *ia; > > + struct xfs_ioend *ib; > > + > > + ia = container_of(a, struct xfs_ioend, io_list); > > + ib = container_of(b, struct xfs_ioend, io_list); > > + if (ia->io_offset < ib->io_offset) > > + return -1; > > + else if (ia->io_offset > ib->io_offset) > > + return 1; > > + return 0; > > } > > > > /* Finish all pending io completions. 
*/ > > @@ -292,10 +375,13 @@ xfs_end_io( > > list_replace_init(&ip->i_iodone_list, &completion_list); > > spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > > > > + list_sort(NULL, &completion_list, xfs_ioend_compare); > > + > > while (!list_empty(&completion_list)) { > > ioend = list_first_entry(&completion_list, struct xfs_ioend, > > io_list); > > list_del_init(&ioend->io_list); > > + xfs_ioend_try_merge(ioend, &completion_list); > > xfs_end_ioend(ioend); > > } > > } ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type 2019-03-28 15:17 ` Darrick J. Wong @ 2019-03-28 16:46 ` Brian Foster 2019-03-28 21:06 ` Dave Chinner 1 sibling, 0 replies; 12+ messages in thread From: Brian Foster @ 2019-03-28 16:46 UTC (permalink / raw) To: Darrick J. Wong; +Cc: xfs On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote: > On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote: > > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote: > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > When we're processing an ioend on the list of io completions, check to > > > see if the next items on the list are both adjacent and of the same > > > type. If so, we can merge the completions to reduce transaction > > > overhead. > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > --- > > > > I'm curious of the value of this one... what situations allow for > > batching on the ioend completion side that we haven't already accounted > > for in the ioend construction side? > > I was skeptical too, but Dave (I think?) pointed out that writeback can > split into 1GB chunks so it actually is possible to end up with adjacent > ioends. So I wrote this patch and added a tracepoint, and lo it > actually did trigger when there's a lot of data to flush out, and we > succeed at allocating a single extent for the entire delalloc reservation. > That doesn't seem like a huge overhead to me, but I am curious where that splitting logic is. Somewhere in the writeback code..? (I assume Dave can chime in on some of this stuff if he's more familiar with it..). > > The latter already batches until we > > cross a change in fork type, extent state, or a break in logical or > > physical contiguity. The former looks like it follows similar logic for > > merging with the exceptions of allowing for merges of physically > > discontiguous extents and disallowing merges of those with different > > append status. That seems like a smallish window of opportunity to me.. > > am I missing something? > > Yep, it's a smallish window; small discontiguous writes don't benefit > here at all. > Ok. The whole append thing is somewhat dynamic/non-deterministic as well, along with the fact that we're subject to completion/wq timing to allow for this kind of batching to occur. E.g., if we're completing a series of 1GB ioends, what are the odds the next ioend completes before the current one starts in the wq? I'd guess not great, but tbh I have no idea.. > > If that is the gist but there is enough benefit for the more lenient > > merging, I also wonder whether it would be more efficient to try and > > also accomplish that on the construction side rather than via completion > > post-processing. For example, could we abstract a single ioend to cover > > an arbitrary list of bio/page -> sector mappings with the same higher > > level semantics? We already have a bio chaining mechanism, it's just > > only used for when a bio is full. Could we reuse that for dealing with > > physical discontiguity? > > I suppose we could, though the bigger the ioend the longer it'll take to > process responses. Also, I think it's the case that if any of the bios > fail then we treat all of the chained ones as failed? (Meh, it's > writeback, it's not like you get to know /which/ writes failed unless > you do a stupid write()/fsync() dance...) > This already seems to be the case to a large degree with our current batching. 
That 1GB ioend above is already a fairly large bio chain, as I think that a bio can't have more than 256 (BIO_MAX_PAGES) pages. Eh, I guess my take is that this doesn't necessarily seem like an unreasonable change and it's not like it's a huge amount of code, but it does seem like potentially more standalone code than it's worth for minimal benefit. It seems the amount of code and processing could be reduced and benefit slightly increased by allocating fewer ioends in the first place if this were pre-processed vs. post-processed. I'd prefer to see more analysis of the potential benefits either way... > The other thing is that directio completions look very similar to > writeback completions, including the potential for having the thundering > herds pounding on the ILOCK. I was thinking about refactoring those to > use the per-inode queue as a next step, though the directio completion > paths are murky. > Indeed, though we don't have the degree of submission batching taking place for direct I/O. Perhaps it just depends more on the workload since aio can presumably drive a deeper queue. A couple things to note.. I vaguely recall that the dio completion code has a bit of a messy history of being switched back and forth between using ioends or bitmasks or whatever else to avoid ioend allocations. This doesn't rule out potential new benefits of using ioends that might be achieved in light of new features like reflink, but FWIW I'd be more skeptical of refactors along those lines that don't come along with a measurable benefit or solve a tangible problem. Another thing to be aware of here is that the iomap code looks like it already can invoke our callback in a wq and it may complete the aio as soon as we return... Brian > > Brian > > > > > fs/xfs/xfs_aops.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++ > > > 1 file changed, 86 insertions(+) > > > > > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > > > index f7a9bb661826..53afa2e6e3e7 100644 > > > --- a/fs/xfs/xfs_aops.c > > > +++ b/fs/xfs/xfs_aops.c > > > @@ -237,6 +237,7 @@ STATIC void > > > xfs_end_ioend( > > > struct xfs_ioend *ioend) > > > { > > > + struct list_head ioend_list; > > > struct xfs_inode *ip = XFS_I(ioend->io_inode); > > > xfs_off_t offset = ioend->io_offset; > > > size_t size = ioend->io_size; > > > @@ -273,7 +274,89 @@ xfs_end_ioend( > > > done: > > > if (ioend->io_append_trans) > > > error = xfs_setfilesize_ioend(ioend, error); > > > + list_replace_init(&ioend->io_list, &ioend_list); > > > xfs_destroy_ioend(ioend, error); > > > + > > > + while (!list_empty(&ioend_list)) { > > > + ioend = list_first_entry(&ioend_list, struct xfs_ioend, > > > + io_list); > > > + list_del_init(&ioend->io_list); > > > + xfs_destroy_ioend(ioend, error); > > > + } > > > +} > > > + > > > +/* > > > + * We can merge two adjacent ioends if they have the same set of work to do. 
> > > + */ > > > +static bool > > > +xfs_ioend_can_merge( > > > + struct xfs_ioend *ioend, > > > + int ioend_error, > > > + struct xfs_ioend *next) > > > +{ > > > + int next_error; > > > + > > > + next_error = blk_status_to_errno(next->io_bio->bi_status); > > > + if (ioend_error != next_error) > > > + return false; > > > + if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) > > > + return false; > > > + if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ > > > + (next->io_state == XFS_EXT_UNWRITTEN)) > > > + return false; > > > + if (ioend->io_offset + ioend->io_size != next->io_offset) > > > + return false; > > > + if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next)) > > > + return false; > > > + return true; > > > +} > > > + > > > +/* Try to merge adjacent completions. */ > > > +STATIC void > > > +xfs_ioend_try_merge( > > > + struct xfs_ioend *ioend, > > > + struct list_head *more_ioends) > > > +{ > > > + struct xfs_ioend *next_ioend; > > > + int ioend_error; > > > + int error; > > > + > > > + if (list_empty(more_ioends)) > > > + return; > > > + > > > + ioend_error = blk_status_to_errno(ioend->io_bio->bi_status); > > > + > > > + while (!list_empty(more_ioends)) { > > > + next_ioend = list_first_entry(more_ioends, struct xfs_ioend, > > > + io_list); > > > + if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend)) > > > + break; > > > + list_move_tail(&next_ioend->io_list, &ioend->io_list); > > > + ioend->io_size += next_ioend->io_size; > > > + if (ioend->io_append_trans) { > > > + error = xfs_setfilesize_ioend(next_ioend, 1); > > > + ASSERT(error == 1); > > > + } > > > + } > > > +} > > > + > > > +/* list_sort compare function for ioends */ > > > +static int > > > +xfs_ioend_compare( > > > + void *priv, > > > + struct list_head *a, > > > + struct list_head *b) > > > +{ > > > + struct xfs_ioend *ia; > > > + struct xfs_ioend *ib; > > > + > > > + ia = container_of(a, struct xfs_ioend, io_list); > > > + ib = container_of(b, struct xfs_ioend, io_list); > > > + if (ia->io_offset < ib->io_offset) > > > + return -1; > > > + else if (ia->io_offset > ib->io_offset) > > > + return 1; > > > + return 0; > > > } > > > > > > /* Finish all pending io completions. */ > > > @@ -292,10 +375,13 @@ xfs_end_io( > > > list_replace_init(&ip->i_iodone_list, &completion_list); > > > spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > > > > > > + list_sort(NULL, &completion_list, xfs_ioend_compare); > > > + > > > while (!list_empty(&completion_list)) { > > > ioend = list_first_entry(&completion_list, struct xfs_ioend, > > > io_list); > > > list_del_init(&ioend->io_list); > > > + xfs_ioend_try_merge(ioend, &completion_list); > > > xfs_end_ioend(ioend); > > > } > > > } ^ permalink raw reply [flat|nested] 12+ messages in thread
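For scale on the bio chain point above: taking Brian's figure of 256 (BIO_MAX_PAGES) pages per bio and assuming 4KiB pages, one bio covers at most 1MiB, so a 1GB ioend already implies a chain on the order of a thousand bios.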
* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type 2019-03-28 15:17 ` Darrick J. Wong 2019-03-28 16:46 ` Brian Foster @ 2019-03-28 21:06 ` Dave Chinner 2019-03-29 12:51 ` Brian Foster 1 sibling, 1 reply; 12+ messages in thread From: Dave Chinner @ 2019-03-28 21:06 UTC (permalink / raw) To: Darrick J. Wong; +Cc: Brian Foster, xfs On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote: > On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote: > > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote: > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > When we're processing an ioend on the list of io completions, check to > > > see if the next items on the list are both adjacent and of the same > > > type. If so, we can merge the completions to reduce transaction > > > overhead. > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > --- > > > > I'm curious of the value of this one... what situations allow for > > batching on the ioend completion side that we haven't already accounted > > for in the ioend construction side? > > I was skeptical too, but Dave (I think?) pointed out that writeback can > split into 1GB chunks so it actually is possible to end up with adjacent > ioends. When the amount of writeback for a single file exceeds the measured bandwidth of the device, or there are a number of dirty files that the writeback bandwidth is shared between, then the writeback code breaks up the amount of data that can be written in any single writepages call to any single inode. This can get down as low as MIN_WRITEBACK_PAGES (which ends up being 4MB of pages), and so we can end up writing large files in lots of very small chunks. > So I wrote this patch and added a tracepoint, and lo it > actually did trigger when there's a lot of data to flush out, and we > succeed at allocating a single extent for the entire delalloc reservation. I'd expect it to fire more when there are lots of large files being written concurrently than just for single files (i.e. the writeback interleaving fragmentation problem that speculative delalloc avoids). > > The latter already batches until we > > cross a change in fork type, extent state, or a break in logical or > > physical contiguity. The former looks like it follows similar logic for > > merging with the exceptions of allowing for merges of physically > > discontiguous extents and disallowing merges of those with different > > append status. That seems like a smallish window of opportunity to me.. > > am I missing something? > > Yep, it's a smallish window; small discontiguous writes don't benefit > here at all. > > > If that is the gist but there is enough benefit for the more lenient > > merging, I also wonder whether it would be more efficient to try and > > also accomplish that on the construction side rather than via completion > > post-processing. For example, could we abstract a single ioend to cover > > an arbitrary list of bio/page -> sector mappings with the same higher > > level semantics? We already have a bio chaining mechanism, it's just > > only used for when a bio is full. Could we reuse that for dealing with > > physical discontiguity? While possible, I think that's way more complex and problematic than merging successful completions optimistically... In reality, we need some numbers to prove whether this is worthwhile or not.
If we can't find obvious workloads where it actually makes a difference (either in throughput, IO latency or CPU usage) then, like most things, we write it off as an interesting experiment that didn't provide the benefit we thought it might... > The other thing is that directio completions look very similar to > writeback completions, including the potential for having the thundering > herds pounding on the ILOCK. I was thinking about refactoring those to > use the per-inode queue as a next step, though the directio completion > paths are murky. Yeah, I think there can be benefit here for AIO+DIO when issuing multiple concurrent sequential writes. I'm looking at making xfs-fsr use AIO so it can pipeline the data movement (rather than being synchronous) and I can see having completion batching for unwritten extent conversion having a positive effect on such operations.... Cheers, Dave. -- Dave Chinner david@fromorbit.com ^ permalink raw reply [flat|nested] 12+ messages in thread
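For reference, the chunking Dave describes lives in the generic writeback code, not in XFS. Paraphrased from fs/fs-writeback.c of roughly this vintage — a sketch from memory, not verbatim, and details vary between kernel versions — the per-writepages() chunk is computed like this:

/*
 * Paraphrased sketch of writeback_chunk_size() from fs/fs-writeback.c
 * circa this era -- reproduced from memory for context, not verbatim.
 */
static long writeback_chunk_size(struct bdi_writeback *wb,
				 struct wb_writeback_work *work)
{
	long pages;

	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) {
		pages = LONG_MAX;	/* integrity writeback: no cap */
	} else {
		/* background: cap by half the measured write bandwidth */
		pages = min(wb->avg_write_bandwidth / 2,
			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
		pages = min(pages, work->nr_pages);
		/* round to MIN_WRITEBACK_PAGES, i.e. 4MB worth of pages */
		pages = round_down(pages + MIN_WRITEBACK_PAGES,
				   MIN_WRITEBACK_PAGES);
	}
	return pages;
}

Integrity writeback is effectively uncapped; it's the bandwidth-capped background case, rounded to MIN_WRITEBACK_PAGES (the 4MB figure Dave quotes), that produces the back-to-back per-inode chunks the merge patch can stitch together.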
* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type 2019-03-28 21:06 ` Dave Chinner @ 2019-03-29 12:51 ` Brian Foster 2019-03-31 22:24 ` Dave Chinner 0 siblings, 1 reply; 12+ messages in thread From: Brian Foster @ 2019-03-29 12:51 UTC (permalink / raw) To: Dave Chinner; +Cc: Darrick J. Wong, xfs On Fri, Mar 29, 2019 at 08:06:28AM +1100, Dave Chinner wrote: > On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote: > > On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote: > > > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote: > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > When we're processing an ioend on the list of io completions, check to > > > > see if the next items on the list are both adjacent and of the same > > > > type. If so, we can merge the completions to reduce transaction > > > > overhead. > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > --- > > > > > > I'm curious of the value of this one... what situations allow for > > > batching on the ioend completion side that we haven't already accounted > > > for in the ioend construction side? > > > > I was skeptical too, but Dave (I think?) pointed out that writeback can > > split into 1GB chunks so it actually is possible to end up with adjacent > > ioends. > > When there amount of writeback for a single file exceeds the > measured bandwidth of the device, or there are a number of dirty files > that the writeback bandwidthis shared between, then writeback code > breaks up the amount of data that can be written in any single > writepages call to any single inode. This can get down as low as > MIN_WRITEBACK_PAGES (which ends up being 4MB of pages), and so we > can end up writing large files in lots of very small chunks. > Ok, so in general this has more to do with working around higher level writeback behavior than improving our own ioend batching/mapping from a single ->writepages() instance. Taking a look at the writeback code, this sounds more relevant to background writeback because integrity writeback uses the LONG_MAX chunk size for ->writepages() calls. Background writeback calculates a chunk size based on bandwidth, etc. as you've noted and looks like it rotors across dirty inodes in a given superblock until a higher level writeback count is achieved. Makes sense. > > So I wrote this patch and added a tracepoint, and lo it > > actually did trigger when there's a lot of data to flush out, and we > > succeed at allocating a single extent for the entire delalloc reservation. > > I'd expect it to fire more when there are lots of large files being > written concurently than just for single files (i.e. the writeback > interleaving fragmentation problem that speculative delalloc > avoids). > > > > The latter already batches until we > > > cross a change in fork type, extent state, or a break in logical or > > > physical contiguity. The former looks like it follows similar logic for > > > merging with the exceptions of allowing for merges of physically > > > discontiguous extents and disallowing merges of those with different > > > append status. That seems like a smallish window of opportunity to me.. > > > am I missing something? > > > > Yep, it's a smallish window; small discontiguous writes don't benefit > > here at all. 
> > > > > If that is the gist but there is enough benefit for the more lenient > > > merging, I also wonder whether it would be more efficient to try and > > > also accomplish that on the construction side rather than via completion > > > post-processing. For example, could we abstract a single ioend to cover > > > an arbitrary list of bio/page -> sector mappings with the same higher > > > level semantics? We already have a bio chaining mechanism, it's just > > > only used for when a bio is full. Could we reuse that for dealing with > > > physical discontiguity? > > While possible, I think that's way more complex and problematic than > merging successful completions optimistically... > Note again that the suggestion above applies only to ioend batching within a single ->writepages() instance as opposed to across multiple writebacks. It's less relevant given the context you added above around potentially optimizing background writeback completion across multiple ->writepages() calls. That said, I don't agree that it's complex or problematic to implement. I was going to elaborate, but on looking again I realize that it's easy enough to just try. See the appended diff[1]. This allows a single ioend to cover multiple logically contiguous but physically discontiguous extents so long as they have the same general ioend state (i.e., unwritten, etc.). This is not thoroughly tested of course, but works for a couple quick tests. There are caveats to this having now taken a closer look. This doesn't reduce transaction load for COW or unwritten extents because we convert/remap with a transaction per extent either way. I suppose it could reduce ioend memory allocation overhead a bit and potentially the need to allocate multiple append transactions in some cases. > In reality, we need some numbers to prove whether this is worthwhile > or not. If we can't find obvious workloads where it actually makes a > difference (either in throughput, IO latency or CPU usage) then, > like most things, we write off as an interesting experiement that > didn't provide the benefit we thought it might... > Agreed. Brian [1] Quick hack to batch physically discontiguous extents in a single ioend. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3619e9e8d359..c9bed8f3cb90 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -654,13 +654,13 @@ xfs_add_to_ioend( if (!wpc->ioend || wpc->fork != wpc->ioend->io_fork || wpc->imap.br_state != wpc->ioend->io_state || - sector != bio_end_sector(wpc->ioend->io_bio) || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, wpc->imap.br_state, offset, bdev, sector); - } + } else if (sector != bio_end_sector(wpc->ioend->io_bio)) + xfs_chain_bio(wpc->ioend, wbc, bdev, sector); if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) { if (iop) ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type 2019-03-29 12:51 ` Brian Foster @ 2019-03-31 22:24 ` Dave Chinner 2019-04-01 14:59 ` Brian Foster 0 siblings, 1 reply; 12+ messages in thread From: Dave Chinner @ 2019-03-31 22:24 UTC (permalink / raw) To: Brian Foster; +Cc: Darrick J. Wong, xfs On Fri, Mar 29, 2019 at 08:51:29AM -0400, Brian Foster wrote: > On Fri, Mar 29, 2019 at 08:06:28AM +1100, Dave Chinner wrote: > > On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote: > > > On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote: > > > > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote: > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > When we're processing an ioend on the list of io completions, check to > > > > > see if the next items on the list are both adjacent and of the same > > > > > type. If so, we can merge the completions to reduce transaction > > > > > overhead. > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > --- > > > > > > > > I'm curious of the value of this one... what situations allow for > > > > batching on the ioend completion side that we haven't already accounted > > > > for in the ioend construction side? > > > > > > I was skeptical too, but Dave (I think?) pointed out that writeback can > > > split into 1GB chunks so it actually is possible to end up with adjacent > > > ioends. > > > > When there amount of writeback for a single file exceeds the > > measured bandwidth of the device, or there are a number of dirty files > > that the writeback bandwidthis shared between, then writeback code > > breaks up the amount of data that can be written in any single > > writepages call to any single inode. This can get down as low as > > MIN_WRITEBACK_PAGES (which ends up being 4MB of pages), and so we > > can end up writing large files in lots of very small chunks. > > > > Ok, so in general this has more to do with working around higher level > writeback behavior than improving our own ioend batching/mapping from a > single ->writepages() instance. I wouldn't say "working around", because the higher level stuff is avoiding inode writeback starvation which is entirely necessary. The MIN_WRITEBACK_PAGES limit is essentially a mechanism to prevent falling into thrashing with small IOs to lots of files and becoming seek bound rather than bandwidth bound on spinning disks. It's only when there are so many writers that dirty page throttling occurs before a process can dirty 4MB of pages that we see writeback chunks drop below this limit, and that's where the IO being really efficient at cleaning pages really matters.... > Taking a look at the writeback code, this sounds more relevant to > background writeback because integrity writeback uses the LONG_MAX chunk > size for ->writepages() calls. Background writeback calculates a chunk > size based on bandwidth, etc. as you've noted and looks like it rotors > across dirty inodes in a given superblock until a higher level writeback > count is achieved. Makes sense. *nod* > > > > So I wrote this patch and added a tracepoint, and lo it > > > actually did trigger when there's a lot of data to flush out, and we > > > succeed at allocating a single extent for the entire delalloc reservation. > > > > I'd expect it to fire more when there are lots of large files being > > written concurently than just for single files (i.e. the writeback > > interleaving fragmentation problem that speculative delalloc > > avoids). 
> > > > > > The latter already batches until we > > > > cross a change in fork type, extent state, or a break in logical or > > > > physical contiguity. The former looks like it follows similar logic for > > > > merging with the exceptions of allowing for merges of physically > > > > discontiguous extents and disallowing merges of those with different > > > > append status. That seems like a smallish window of opportunity to me.. > > > > am I missing something? > > > > > > Yep, it's a smallish window; small discontiguous writes don't benefit > > > here at all. > > > > > > > If that is the gist but there is enough benefit for the more lenient > > > > merging, I also wonder whether it would be more efficient to try and > > > > also accomplish that on the construction side rather than via completion > > > > post-processing. For example, could we abstract a single ioend to cover > > > > an arbitrary list of bio/page -> sector mappings with the same higher > > > > level semantics? We already have a bio chaining mechanism, it's just > > > > only used for when a bio is full. Could we reuse that for dealing with > > > > physical discontiguity? > > > > While possible, I think that's way more complex and problematic than > > merging successful completions optimistically... > > > > Note again that the suggestion above applies only to ioend batching > within a single ->writepages() instance as opposed to across multiple > writebacks. It's less relevant given the context you added above around > potentially optimizing background writeback completion across multiple > ->writepages() calls. Ah, ok, I was thinking you were talking about cross-writepages() clustering.... > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 3619e9e8d359..c9bed8f3cb90 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -654,13 +654,13 @@ xfs_add_to_ioend( > if (!wpc->ioend || > wpc->fork != wpc->ioend->io_fork || > wpc->imap.br_state != wpc->ioend->io_state || > - sector != bio_end_sector(wpc->ioend->io_bio) || > offset != wpc->ioend->io_offset + wpc->ioend->io_size) { > if (wpc->ioend) > list_add(&wpc->ioend->io_list, iolist); > wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, > wpc->imap.br_state, offset, bdev, sector); > - } > + } else if (sector != bio_end_sector(wpc->ioend->io_bio)) > + xfs_chain_bio(wpc->ioend, wbc, bdev, sector); That might make sense from a completion optimisation POV, but it's going to have a significant impact on IO latency. i.e. we now delay the completion of the first IO to after we've seeked to the second IO and completed that (and potentially many more). This will increase the latency of cleaning dirty pages on spinning disks, so I suspect this would be detrimental under heavy memory pressure and lots of dirty pages to write back.... Cheers, Dave. -- Dave Chinner david@fromorbit.com ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] xfs: merge adjacent io completions of the same type 2019-03-31 22:24 ` Dave Chinner @ 2019-04-01 14:59 ` Brian Foster 0 siblings, 0 replies; 12+ messages in thread From: Brian Foster @ 2019-04-01 14:59 UTC (permalink / raw) To: Dave Chinner; +Cc: Darrick J. Wong, xfs On Mon, Apr 01, 2019 at 09:24:56AM +1100, Dave Chinner wrote: > On Fri, Mar 29, 2019 at 08:51:29AM -0400, Brian Foster wrote: > > On Fri, Mar 29, 2019 at 08:06:28AM +1100, Dave Chinner wrote: > > > On Thu, Mar 28, 2019 at 08:17:44AM -0700, Darrick J. Wong wrote: > > > > On Thu, Mar 28, 2019 at 10:10:10AM -0400, Brian Foster wrote: > > > > > On Tue, Mar 26, 2019 at 08:06:34PM -0700, Darrick J. Wong wrote: > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > When we're processing an ioend on the list of io completions, check to > > > > > > see if the next items on the list are both adjacent and of the same > > > > > > type. If so, we can merge the completions to reduce transaction > > > > > > overhead. > > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > --- > > > > > > > > > > I'm curious of the value of this one... what situations allow for > > > > > batching on the ioend completion side that we haven't already accounted > > > > > for in the ioend construction side? > > > > > > > > I was skeptical too, but Dave (I think?) pointed out that writeback can > > > > split into 1GB chunks so it actually is possible to end up with adjacent > > > > ioends. > > > > > > When there amount of writeback for a single file exceeds the > > > measured bandwidth of the device, or there are a number of dirty files > > > that the writeback bandwidthis shared between, then writeback code > > > breaks up the amount of data that can be written in any single > > > writepages call to any single inode. This can get down as low as > > > MIN_WRITEBACK_PAGES (which ends up being 4MB of pages), and so we > > > can end up writing large files in lots of very small chunks. > > > > > > > Ok, so in general this has more to do with working around higher level > > writeback behavior than improving our own ioend batching/mapping from a > > single ->writepages() instance. > > I wouldn't say "working around", because the higher level stuff is > avoiding inode writeback starvation which is entirely necessary. The > MIN_WRITEBACK_PAGES limit is essentially a mechanism to prevent > falling into thrashing with small IOs to lots of files and becoming > seek bound rather than bandwidth bound on spinning disks. It's only > when there are so many writers that dirty page throttling occurs > before a process can dirty 4MB of pages that we see writeback chunks > drop below this limit, and that's where the IO being really > efficient at cleaning pages really matters.... > Sure, I just mean that the potential optimization targets inter-writepages patterns as opposed to intra-writepages.. > > Taking a look at the writeback code, this sounds more relevant to > > background writeback because integrity writeback uses the LONG_MAX chunk > > size for ->writepages() calls. Background writeback calculates a chunk > > size based on bandwidth, etc. as you've noted and looks like it rotors > > across dirty inodes in a given superblock until a higher level writeback > > count is achieved. Makes sense. 
> > *nod* > > > > > > > So I wrote this patch and added a tracepoint, and lo it > > > > actually did trigger when there's a lot of data to flush out, and we > > > > succeed at allocating a single extent for the entire delalloc reservation. > > > > > > I'd expect it to fire more when there are lots of large files being > > > written concurently than just for single files (i.e. the writeback > > > interleaving fragmentation problem that speculative delalloc > > > avoids). > > > > > > > > The latter already batches until we > > > > > cross a change in fork type, extent state, or a break in logical or > > > > > physical contiguity. The former looks like it follows similar logic for > > > > > merging with the exceptions of allowing for merges of physically > > > > > discontiguous extents and disallowing merges of those with different > > > > > append status. That seems like a smallish window of opportunity to me.. > > > > > am I missing something? > > > > > > > > Yep, it's a smallish window; small discontiguous writes don't benefit > > > > here at all. > > > > > > > > > If that is the gist but there is enough benefit for the more lenient > > > > > merging, I also wonder whether it would be more efficient to try and > > > > > also accomplish that on the construction side rather than via completion > > > > > post-processing. For example, could we abstract a single ioend to cover > > > > > an arbitrary list of bio/page -> sector mappings with the same higher > > > > > level semantics? We already have a bio chaining mechanism, it's just > > > > > only used for when a bio is full. Could we reuse that for dealing with > > > > > physical discontiguity? > > > > > > While possible, I think that's way more complex and problematic than > > > merging successful completions optimistically... > > > > > > > Note again that the suggestion above applies only to ioend batching > > within a single ->writepages() instance as opposed to across multiple > > writebacks. It's less relevant given the context you added above around > > potentially optimizing background writeback completion across multiple > > ->writepages() calls. > > Ah, ok, I was thinking you were talking about cross-writepages() > clustering.... > > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > > index 3619e9e8d359..c9bed8f3cb90 100644 > > --- a/fs/xfs/xfs_aops.c > > +++ b/fs/xfs/xfs_aops.c > > @@ -654,13 +654,13 @@ xfs_add_to_ioend( > > if (!wpc->ioend || > > wpc->fork != wpc->ioend->io_fork || > > wpc->imap.br_state != wpc->ioend->io_state || > > - sector != bio_end_sector(wpc->ioend->io_bio) || > > offset != wpc->ioend->io_offset + wpc->ioend->io_size) { > > if (wpc->ioend) > > list_add(&wpc->ioend->io_list, iolist); > > wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, > > wpc->imap.br_state, offset, bdev, sector); > > - } > > + } else if (sector != bio_end_sector(wpc->ioend->io_bio)) > > + xfs_chain_bio(wpc->ioend, wbc, bdev, sector); > > That might make sense from a completion optimisation POV, but it's > going to have a significant impact on IO latency. i.e. we now delay > the completion of the first IO to after we've seeked to the second > IO and compelted that (and potentially many more). This will > increase the latency of cleaning dirty pages on spinning disks, so i > suspect this would be detrimental under heavy memory pressure and > lots of dirty pages to write back.... 
> (By cleaning pages I assume you refer to making dirty pages available for reclaim/reuse and not clearing of the dirty bit, which happens before writeback submission.) Yeah, I could see the potential for that. I don't think it's necessarily imminent if you consider all of the existing writeback chunk size heuristics, plugging, and the fact that a large contiguous writeback can impose a similar per-page writeback latency cost by constructing a huge bio chain. Sequential I/O is certainly much faster in the spinning disk example you cited, but we can already block a single page writeback waiter on potentially GBs of bios in certain cases without any currently known problems. We could always implement a heuristic to restrict the length of more "random" bio chains for ioends that don't have enough anticipated transaction completion overhead to trade off for the additional completion latency. (In fact, if this is a concern I wonder if we should have some chain limiting logic anyways..). Anyways, I think the larger takeaway right now is that if post-completion batching is useful and effective across multiple writepages() calls, it's probably not worth the additional complexity of this kind of submission batching when we can get the same primary transaction overhead reduction benefit from a more comprehensive implementation. Brian > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com ^ permalink raw reply [flat|nested] 12+ messages in thread
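A hypothetical version of the chain-limiting heuristic Brian floats, bolted onto his earlier xfs_add_to_ioend() hack, might look like the fragment below. The io_chain_count field and XFS_IOEND_MAX_CHAINS limit are invented names that exist nowhere in XFS (the counter would also need initializing in xfs_alloc_ioend()); this is purely an illustrative sketch, not a proposal:

#define XFS_IOEND_MAX_CHAINS	16	/* arbitrary illustrative limit */

	if (!wpc->ioend ||
	    wpc->fork != wpc->ioend->io_fork ||
	    wpc->imap.br_state != wpc->ioend->io_state ||
	    wpc->ioend->io_chain_count >= XFS_IOEND_MAX_CHAINS ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
				wpc->imap.br_state, offset, bdev, sector);
	} else if (sector != bio_end_sector(wpc->ioend->io_bio)) {
		/* start a new discontiguous chain, but cap how many we allow */
		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
		wpc->ioend->io_chain_count++;
	}

That would bound how long the first bio's completion can be held hostage by later seeks, at the cost of giving back some batching; whether any cap short of not chaining at all answers Dave's spinning-disk latency concern would need measurement.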
* Re: [RFC PATCH] xfs: implement per-inode writeback completion 2019-03-27 3:05 [RFC PATCH] xfs: implement per-inode writeback completion Darrick J. Wong 2019-03-27 3:06 ` [RFC PATCH] xfs: merge adjacent io completions of the same type Darrick J. Wong @ 2019-03-28 14:08 ` Brian Foster 2019-03-28 15:00 ` Darrick J. Wong 1 sibling, 1 reply; 12+ messages in thread From: Brian Foster @ 2019-03-28 14:08 UTC (permalink / raw) To: Darrick J. Wong; +Cc: xfs On Tue, Mar 26, 2019 at 08:05:50PM -0700, Darrick J. Wong wrote: > Hi folks, > > Here's a quick patchset reworking writeback ioend completion processing > into per-inode completion queues so that we don't have a thundering herd > of unwritten/cow completion kworkers contending for the ILOCK. The > second patch will also combine adjacent ioends when possible to reduce > the overhead further. Let me know what you think. :) > I assume the issue is that you get a bunch of ioends associated with COW remaps or whatever and workqueue parallelism leads to a bunch contending on inode locks because they happen to be associated with the same inode. Hence, it's more efficient to have a single work item per inode, let that task complete as many ioends as available for the inode and let the workqueue spread around workers for different inodes. Sounds like a nice idea to me. Is there a workload that measurably benefits from this or is this just based on observation? A couple small nits.. > --D > --- > From: Darrick J. Wong <darrick.wong@oracle.com> > > Restructure the buffered writeback completion code to use a single work > item per inode, since it's pointless to awaken a thundering herd of > threads to contend on a single inode's ILOCK. > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > --- > fs/xfs/xfs_aops.c | 48 +++++++++++++++++++++++++++++++++++++----------- > fs/xfs/xfs_aops.h | 1 - > fs/xfs/xfs_icache.c | 3 +++ > fs/xfs/xfs_inode.h | 7 +++++++ > 4 files changed, 47 insertions(+), 12 deletions(-) > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 3619e9e8d359..f7a9bb661826 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -234,11 +234,9 @@ xfs_setfilesize_ioend( > * IO write completion. > */ > STATIC void > -xfs_end_io( > - struct work_struct *work) > +xfs_end_ioend( > + struct xfs_ioend *ioend) > { > - struct xfs_ioend *ioend = > - container_of(work, struct xfs_ioend, io_work); > struct xfs_inode *ip = XFS_I(ioend->io_inode); > xfs_off_t offset = ioend->io_offset; > size_t size = ioend->io_size; > @@ -278,19 +276,48 @@ xfs_end_io( > xfs_destroy_ioend(ioend, error); > } > > +/* Finish all pending io completions. 
*/ > +void > +xfs_end_io( > + struct work_struct *work) > +{ > + struct xfs_inode *ip; > + struct xfs_ioend *ioend; > + struct list_head completion_list; > + unsigned long flags; > + > + ip = container_of(work, struct xfs_inode, i_iodone_work); > + > + spin_lock_irqsave(&ip->i_iodone_lock, flags); > + list_replace_init(&ip->i_iodone_list, &completion_list); > + spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > + > + while (!list_empty(&completion_list)) { > + ioend = list_first_entry(&completion_list, struct xfs_ioend, > + io_list); > + list_del_init(&ioend->io_list); > + xfs_end_ioend(ioend); > + } > +} > + > STATIC void > xfs_end_bio( > struct bio *bio) > { > struct xfs_ioend *ioend = bio->bi_private; > - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; > + struct xfs_inode *ip = XFS_I(ioend->io_inode); > + struct xfs_mount *mp = ip->i_mount; > + unsigned long flags; > > if (ioend->io_fork == XFS_COW_FORK || > - ioend->io_state == XFS_EXT_UNWRITTEN) > - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); > - else if (ioend->io_append_trans) > - queue_work(mp->m_data_workqueue, &ioend->io_work); > - else > + ioend->io_state == XFS_EXT_UNWRITTEN || > + ioend->io_append_trans != NULL) { > + spin_lock_irqsave(&ip->i_iodone_lock, flags); > + if (list_empty(&ip->i_iodone_list)) > + queue_work(mp->m_unwritten_workqueue, &ip->i_iodone_work); > + list_add_tail(&ioend->io_list, &ip->i_iodone_list); > + spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > + } else Ok, so there's no longer a need for multiple ioend workqueues because all such operations are going to be processed by the single work execution. Perhaps we should rename m_unwritten_workqueue or just use m_data_workqueue since it happens to have a more generic name. We can also kill off the other wq either way since it appears to be unused. > xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); > } > > @@ -594,7 +621,6 @@ xfs_alloc_ioend( > ioend->io_inode = inode; > ioend->io_size = 0; > ioend->io_offset = offset; > - INIT_WORK(&ioend->io_work, xfs_end_io); > ioend->io_append_trans = NULL; > ioend->io_bio = bio; > return ioend; > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > index 6c2615b83c5d..f62b03186c62 100644 > --- a/fs/xfs/xfs_aops.h > +++ b/fs/xfs/xfs_aops.h > @@ -18,7 +18,6 @@ struct xfs_ioend { > struct inode *io_inode; /* file being written to */ > size_t io_size; /* size of the extent */ > xfs_off_t io_offset; /* offset in the file */ > - struct work_struct io_work; /* xfsdatad work queue */ > struct xfs_trans *io_append_trans;/* xact. for size update */ > struct bio *io_bio; /* bio being built */ > struct bio io_inline_bio; /* MUST BE LAST! 
*/ > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c > index 245483cc282b..e70e7db29026 100644 > --- a/fs/xfs/xfs_icache.c > +++ b/fs/xfs/xfs_icache.c > @@ -70,6 +70,9 @@ xfs_inode_alloc( > ip->i_flags = 0; > ip->i_delayed_blks = 0; > memset(&ip->i_d, 0, sizeof(ip->i_d)); > + INIT_WORK(&ip->i_iodone_work, xfs_end_io); > + INIT_LIST_HEAD(&ip->i_iodone_list); > + spin_lock_init(&ip->i_iodone_lock); > > return ip; > } > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index e62074a5257c..88239c2dd824 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -57,6 +57,11 @@ typedef struct xfs_inode { > > /* VFS inode */ > struct inode i_vnode; /* embedded VFS inode */ > + > + /* pending io completions */ > + spinlock_t i_iodone_lock; > + struct work_struct i_iodone_work; > + struct list_head i_iodone_list; A nit but I guess I'd name these with ioend instead of iodone. The latter kind of confuses with buffer iodone callbacks (to me). Brian > } xfs_inode_t; > > /* Convert from vfs inode to xfs inode */ > @@ -503,4 +508,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip); > int xfs_iunlink_init(struct xfs_perag *pag); > void xfs_iunlink_destroy(struct xfs_perag *pag); > > +void xfs_end_io(struct work_struct *work); > + > #endif /* __XFS_INODE_H__ */ ^ permalink raw reply [flat|nested] 12+ messages in thread
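To make the pattern under review concrete outside the kernel, here is a
minimal, self-contained C model of the per-inode completion list: a
producer appends under a lock and schedules work only on the
empty-to-non-empty transition, and the worker detaches the whole list in
one shot before processing, mirroring the list_replace_init() drain in
xfs_end_io(). It deliberately substitutes a pthread mutex for the
irq-safe spinlock and an explicit work_queued flag for the kernel's
reliance on queue_work() being a no-op on an already-queued item; all
names here are invented.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct completion {
	int id;
	struct completion *next;
};

struct fake_inode {
	pthread_mutex_t lock;		/* stands in for i_iodone_lock */
	struct completion *head, *tail;
	int work_queued;		/* at most one work item in flight */
};

/* Analogue of xfs_end_bio(): add a completion; only kick the worker on
 * the empty->non-empty transition so there is one work item per inode. */
static int queue_completion(struct fake_inode *ip, struct completion *c)
{
	int need_work;

	pthread_mutex_lock(&ip->lock);
	need_work = (ip->head == NULL) && !ip->work_queued;
	if (ip->head)
		ip->tail->next = c;
	else
		ip->head = c;
	ip->tail = c;
	if (need_work)
		ip->work_queued = 1;
	pthread_mutex_unlock(&ip->lock);
	return need_work;
}

/* Analogue of xfs_end_io(): snapshot the entire list under the lock,
 * then process it with the lock dropped. */
static void run_completion_work(struct fake_inode *ip)
{
	struct completion *c;

	pthread_mutex_lock(&ip->lock);
	c = ip->head;
	ip->head = ip->tail = NULL;
	ip->work_queued = 0;
	pthread_mutex_unlock(&ip->lock);

	while (c) {
		struct completion *next = c->next;
		printf("completing ioend %d\n", c->id);
		free(c);
		c = next;
	}
}

int main(void)
{
	struct fake_inode ip = { .lock = PTHREAD_MUTEX_INITIALIZER };

	for (int i = 0; i < 3; i++) {
		struct completion *c = calloc(1, sizeof(*c));
		c->id = i;
		if (queue_completion(&ip, c))
			printf("work scheduled by ioend %d\n", i);
	}
	run_completion_work(&ip);	/* drains all three at once */
	return 0;
}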
* Re: [RFC PATCH] xfs: implement per-inode writeback completion 2019-03-28 14:08 ` [RFC PATCH] xfs: implement per-inode writeback completion Brian Foster @ 2019-03-28 15:00 ` Darrick J. Wong 2019-03-28 16:24 ` Brian Foster 0 siblings, 1 reply; 12+ messages in thread From: Darrick J. Wong @ 2019-03-28 15:00 UTC (permalink / raw) To: Brian Foster; +Cc: xfs On Thu, Mar 28, 2019 at 10:08:59AM -0400, Brian Foster wrote: > On Tue, Mar 26, 2019 at 08:05:50PM -0700, Darrick J. Wong wrote: > > Hi folks, > > > > Here's a quick patchset reworking writeback ioend completion processing > > into per-inode completion queues so that we don't have a thundering herd > > of unwritten/cow completion kworkers contending for the ILOCK. The > > second patch will also combine adjacent ioends when possible to reduce > > the overhead further. Let me know what you think. :) > > > > I assume the issue is that you get a bunch of ioends associated with COW > remaps or whatever and workqueue parallelism leads to a bunch contending > on inode locks because they happen to be associated with the same inode. Yes, as outlined in the thread "xlog_grant_head_wait deadlocks on high-rolling transactions?" > Hence, it's more efficient to have a single work item per inode, let > that task complete as many ioends as available for the inode and let the > workqueue spread around workers for different inodes. Sounds like a nice > idea to me. Is there a workload that measurably benefits from this or is > this just based on observation? If you consider 'xfstests' to be my primary workload, then yes, it reduces developer deadlock triage time considerably. :) Uhh but more seriously, on kvm hosts that use pagecache-as-diskcache(!) the number of kworkers spirals insanely when someone issue a cache flush (a.k.a. fsync) because they're all contending with each other and the host writeback slows to a crawl if it doesn't just fall over dead. > A couple small nits.. > > > --D > > --- > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > Restructure the buffered writeback completion code to use a single work > > item per inode, since it's pointless to awaken a thundering herd of > > threads to contend on a single inode's ILOCK. > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > --- > > fs/xfs/xfs_aops.c | 48 +++++++++++++++++++++++++++++++++++++----------- > > fs/xfs/xfs_aops.h | 1 - > > fs/xfs/xfs_icache.c | 3 +++ > > fs/xfs/xfs_inode.h | 7 +++++++ > > 4 files changed, 47 insertions(+), 12 deletions(-) > > > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > > index 3619e9e8d359..f7a9bb661826 100644 > > --- a/fs/xfs/xfs_aops.c > > +++ b/fs/xfs/xfs_aops.c > > @@ -234,11 +234,9 @@ xfs_setfilesize_ioend( > > * IO write completion. > > */ > > STATIC void > > -xfs_end_io( > > - struct work_struct *work) > > +xfs_end_ioend( > > + struct xfs_ioend *ioend) > > { > > - struct xfs_ioend *ioend = > > - container_of(work, struct xfs_ioend, io_work); > > struct xfs_inode *ip = XFS_I(ioend->io_inode); > > xfs_off_t offset = ioend->io_offset; > > size_t size = ioend->io_size; > > @@ -278,19 +276,48 @@ xfs_end_io( > > xfs_destroy_ioend(ioend, error); > > } > > > > +/* Finish all pending io completions. 
*/ > > +void > > +xfs_end_io( > > + struct work_struct *work) > > +{ > > + struct xfs_inode *ip; > > + struct xfs_ioend *ioend; > > + struct list_head completion_list; > > + unsigned long flags; > > + > > + ip = container_of(work, struct xfs_inode, i_iodone_work); > > + > > + spin_lock_irqsave(&ip->i_iodone_lock, flags); > > + list_replace_init(&ip->i_iodone_list, &completion_list); > > + spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > > + > > + while (!list_empty(&completion_list)) { > > + ioend = list_first_entry(&completion_list, struct xfs_ioend, > > + io_list); > > + list_del_init(&ioend->io_list); > > + xfs_end_ioend(ioend); > > + } > > +} > > + > > STATIC void > > xfs_end_bio( > > struct bio *bio) > > { > > struct xfs_ioend *ioend = bio->bi_private; > > - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; > > + struct xfs_inode *ip = XFS_I(ioend->io_inode); > > + struct xfs_mount *mp = ip->i_mount; > > + unsigned long flags; > > > > if (ioend->io_fork == XFS_COW_FORK || > > - ioend->io_state == XFS_EXT_UNWRITTEN) > > - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); > > - else if (ioend->io_append_trans) > > - queue_work(mp->m_data_workqueue, &ioend->io_work); > > - else > > + ioend->io_state == XFS_EXT_UNWRITTEN || > > + ioend->io_append_trans != NULL) { > > + spin_lock_irqsave(&ip->i_iodone_lock, flags); > > + if (list_empty(&ip->i_iodone_list)) > > + queue_work(mp->m_unwritten_workqueue, &ip->i_iodone_work); > > + list_add_tail(&ioend->io_list, &ip->i_iodone_list); > > + spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > > + } else > > Ok, so there's no longer a need for multiple ioend workqueues because > all such operations are going to be processed by the single work > execution. Perhaps we should rename m_unwritten_workqueue or just use > m_data_workqueue since it happens to have a more generic name. We can > also kill off the other wq either way since it appears to be unused. <nod> I'll add a patch to kill off one of the workqueues. > > xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); > > } > > > > @@ -594,7 +621,6 @@ xfs_alloc_ioend( > > ioend->io_inode = inode; > > ioend->io_size = 0; > > ioend->io_offset = offset; > > - INIT_WORK(&ioend->io_work, xfs_end_io); > > ioend->io_append_trans = NULL; > > ioend->io_bio = bio; > > return ioend; > > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > > index 6c2615b83c5d..f62b03186c62 100644 > > --- a/fs/xfs/xfs_aops.h > > +++ b/fs/xfs/xfs_aops.h > > @@ -18,7 +18,6 @@ struct xfs_ioend { > > struct inode *io_inode; /* file being written to */ > > size_t io_size; /* size of the extent */ > > xfs_off_t io_offset; /* offset in the file */ > > - struct work_struct io_work; /* xfsdatad work queue */ > > struct xfs_trans *io_append_trans;/* xact. for size update */ > > struct bio *io_bio; /* bio being built */ > > struct bio io_inline_bio; /* MUST BE LAST! 
*/ > > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c > > index 245483cc282b..e70e7db29026 100644 > > --- a/fs/xfs/xfs_icache.c > > +++ b/fs/xfs/xfs_icache.c > > @@ -70,6 +70,9 @@ xfs_inode_alloc( > > ip->i_flags = 0; > > ip->i_delayed_blks = 0; > > memset(&ip->i_d, 0, sizeof(ip->i_d)); > > + INIT_WORK(&ip->i_iodone_work, xfs_end_io); > > + INIT_LIST_HEAD(&ip->i_iodone_list); > > + spin_lock_init(&ip->i_iodone_lock); > > > > return ip; > > } > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > > index e62074a5257c..88239c2dd824 100644 > > --- a/fs/xfs/xfs_inode.h > > +++ b/fs/xfs/xfs_inode.h > > @@ -57,6 +57,11 @@ typedef struct xfs_inode { > > > > /* VFS inode */ > > struct inode i_vnode; /* embedded VFS inode */ > > + > > + /* pending io completions */ > > + spinlock_t i_iodone_lock; > > + struct work_struct i_iodone_work; > > + struct list_head i_iodone_list; > > A nit but I guess I'd name these with ioend instead of iodone. The > latter kind of confuses with buffer iodone callbacks (to me). <nod> Will do. --D > Brian > > > } xfs_inode_t; > > > > /* Convert from vfs inode to xfs inode */ > > @@ -503,4 +508,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip); > > int xfs_iunlink_init(struct xfs_perag *pag); > > void xfs_iunlink_destroy(struct xfs_perag *pag); > > > > +void xfs_end_io(struct work_struct *work); > > + > > #endif /* __XFS_INODE_H__ */ ^ permalink raw reply [flat|nested] 12+ messages in thread
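The workqueue cleanup agreed above would presumably amount to deleting
the m_data_workqueue setup and teardown and letting the surviving queue
take all per-inode completion work. A rough sketch, reconstructed from
memory of the code of that era rather than from any actual patch:

/* xfs_init_mount_workqueues(): keep the conversion workqueue... */
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
		WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, mp->m_fsname);

/* ...and remove the m_data_workqueue allocation that followed it, the
 * matching destroy_workqueue(mp->m_data_workqueue) call in
 * xfs_destroy_mount_workqueues(), and the struct xfs_mount field
 * itself, since nothing queues to it any more. */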
* Re: [RFC PATCH] xfs: implement per-inode writeback completion 2019-03-28 15:00 ` Darrick J. Wong @ 2019-03-28 16:24 ` Brian Foster 0 siblings, 0 replies; 12+ messages in thread From: Brian Foster @ 2019-03-28 16:24 UTC (permalink / raw) To: Darrick J. Wong; +Cc: xfs On Thu, Mar 28, 2019 at 08:00:22AM -0700, Darrick J. Wong wrote: > On Thu, Mar 28, 2019 at 10:08:59AM -0400, Brian Foster wrote: > > On Tue, Mar 26, 2019 at 08:05:50PM -0700, Darrick J. Wong wrote: > > > Hi folks, > > > > > > Here's a quick patchset reworking writeback ioend completion processing > > > into per-inode completion queues so that we don't have a thundering herd > > > of unwritten/cow completion kworkers contending for the ILOCK. The > > > second patch will also combine adjacent ioends when possible to reduce > > > the overhead further. Let me know what you think. :) > > > > > > > I assume the issue is that you get a bunch of ioends associated with COW > > remaps or whatever and workqueue parallelism leads to a bunch contending > > on inode locks because they happen to be associated with the same inode. > > Yes, as outlined in the thread "xlog_grant_head_wait deadlocks on > high-rolling transactions?" > > > Hence, it's more efficient to have a single work item per inode, let > > that task complete as many ioends as available for the inode and let the > > workqueue spread around workers for different inodes. Sounds like a nice > > idea to me. Is there a workload that measurably benefits from this or is > > this just based on observation? > > If you consider 'xfstests' to be my primary workload, then yes, it > reduces developer deadlock triage time considerably. :) > Ah, Ok. So this is not primarily a performance issue but rather serializes the per inode I/O completion work to address that parallel transaction alloc -> ilock -> ... problem of exhausting log reservation space. We should probably document that in the commit log. ;) > Uhh but more seriously, on kvm hosts that use pagecache-as-diskcache(!) > the number of kworkers spirals insanely when someone issue a cache flush > (a.k.a. fsync) because they're all contending with each other and the > host writeback slows to a crawl if it doesn't just fall over dead. > That's more of what I was expecting, but it makes sense for the above too. Brian > > A couple small nits.. > > > > > --D > > > --- > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > Restructure the buffered writeback completion code to use a single work > > > item per inode, since it's pointless to awaken a thundering herd of > > > threads to contend on a single inode's ILOCK. > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > --- > > > fs/xfs/xfs_aops.c | 48 +++++++++++++++++++++++++++++++++++++----------- > > > fs/xfs/xfs_aops.h | 1 - > > > fs/xfs/xfs_icache.c | 3 +++ > > > fs/xfs/xfs_inode.h | 7 +++++++ > > > 4 files changed, 47 insertions(+), 12 deletions(-) > > > > > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > > > index 3619e9e8d359..f7a9bb661826 100644 > > > --- a/fs/xfs/xfs_aops.c > > > +++ b/fs/xfs/xfs_aops.c > > > @@ -234,11 +234,9 @@ xfs_setfilesize_ioend( > > > * IO write completion. 
> > > */ > > > STATIC void > > > -xfs_end_io( > > > - struct work_struct *work) > > > +xfs_end_ioend( > > > + struct xfs_ioend *ioend) > > > { > > > - struct xfs_ioend *ioend = > > > - container_of(work, struct xfs_ioend, io_work); > > > struct xfs_inode *ip = XFS_I(ioend->io_inode); > > > xfs_off_t offset = ioend->io_offset; > > > size_t size = ioend->io_size; > > > @@ -278,19 +276,48 @@ xfs_end_io( > > > xfs_destroy_ioend(ioend, error); > > > } > > > > > > +/* Finish all pending io completions. */ > > > +void > > > +xfs_end_io( > > > + struct work_struct *work) > > > +{ > > > + struct xfs_inode *ip; > > > + struct xfs_ioend *ioend; > > > + struct list_head completion_list; > > > + unsigned long flags; > > > + > > > + ip = container_of(work, struct xfs_inode, i_iodone_work); > > > + > > > + spin_lock_irqsave(&ip->i_iodone_lock, flags); > > > + list_replace_init(&ip->i_iodone_list, &completion_list); > > > + spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > > > + > > > + while (!list_empty(&completion_list)) { > > > + ioend = list_first_entry(&completion_list, struct xfs_ioend, > > > + io_list); > > > + list_del_init(&ioend->io_list); > > > + xfs_end_ioend(ioend); > > > + } > > > +} > > > + > > > STATIC void > > > xfs_end_bio( > > > struct bio *bio) > > > { > > > struct xfs_ioend *ioend = bio->bi_private; > > > - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; > > > + struct xfs_inode *ip = XFS_I(ioend->io_inode); > > > + struct xfs_mount *mp = ip->i_mount; > > > + unsigned long flags; > > > > > > if (ioend->io_fork == XFS_COW_FORK || > > > - ioend->io_state == XFS_EXT_UNWRITTEN) > > > - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); > > > - else if (ioend->io_append_trans) > > > - queue_work(mp->m_data_workqueue, &ioend->io_work); > > > - else > > > + ioend->io_state == XFS_EXT_UNWRITTEN || > > > + ioend->io_append_trans != NULL) { > > > + spin_lock_irqsave(&ip->i_iodone_lock, flags); > > > + if (list_empty(&ip->i_iodone_list)) > > > + queue_work(mp->m_unwritten_workqueue, &ip->i_iodone_work); > > > + list_add_tail(&ioend->io_list, &ip->i_iodone_list); > > > + spin_unlock_irqrestore(&ip->i_iodone_lock, flags); > > > + } else > > > > Ok, so there's no longer a need for multiple ioend workqueues because > > all such operations are going to be processed by the single work > > execution. Perhaps we should rename m_unwritten_workqueue or just use > > m_data_workqueue since it happens to have a more generic name. We can > > also kill off the other wq either way since it appears to be unused. > > <nod> I'll add a patch to kill off one of the workqueues. > > > > xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); > > > } > > > > > > @@ -594,7 +621,6 @@ xfs_alloc_ioend( > > > ioend->io_inode = inode; > > > ioend->io_size = 0; > > > ioend->io_offset = offset; > > > - INIT_WORK(&ioend->io_work, xfs_end_io); > > > ioend->io_append_trans = NULL; > > > ioend->io_bio = bio; > > > return ioend; > > > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > > > index 6c2615b83c5d..f62b03186c62 100644 > > > --- a/fs/xfs/xfs_aops.h > > > +++ b/fs/xfs/xfs_aops.h > > > @@ -18,7 +18,6 @@ struct xfs_ioend { > > > struct inode *io_inode; /* file being written to */ > > > size_t io_size; /* size of the extent */ > > > xfs_off_t io_offset; /* offset in the file */ > > > - struct work_struct io_work; /* xfsdatad work queue */ > > > struct xfs_trans *io_append_trans;/* xact. for size update */ > > > struct bio *io_bio; /* bio being built */ > > > struct bio io_inline_bio; /* MUST BE LAST! 
*/ > > > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c > > > index 245483cc282b..e70e7db29026 100644 > > > --- a/fs/xfs/xfs_icache.c > > > +++ b/fs/xfs/xfs_icache.c > > > @@ -70,6 +70,9 @@ xfs_inode_alloc( > > > ip->i_flags = 0; > > > ip->i_delayed_blks = 0; > > > memset(&ip->i_d, 0, sizeof(ip->i_d)); > > > + INIT_WORK(&ip->i_iodone_work, xfs_end_io); > > > + INIT_LIST_HEAD(&ip->i_iodone_list); > > > + spin_lock_init(&ip->i_iodone_lock); > > > > > > return ip; > > > } > > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > > > index e62074a5257c..88239c2dd824 100644 > > > --- a/fs/xfs/xfs_inode.h > > > +++ b/fs/xfs/xfs_inode.h > > > @@ -57,6 +57,11 @@ typedef struct xfs_inode { > > > > > > /* VFS inode */ > > > struct inode i_vnode; /* embedded VFS inode */ > > > + > > > + /* pending io completions */ > > > + spinlock_t i_iodone_lock; > > > + struct work_struct i_iodone_work; > > > + struct list_head i_iodone_list; > > > > A nit but I guess I'd name these with ioend instead of iodone. The > > latter kind of confuses with buffer iodone callbacks (to me). > > <nod> Will do. > > --D > > > Brian > > > > > } xfs_inode_t; > > > > > > /* Convert from vfs inode to xfs inode */ > > > @@ -503,4 +508,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip); > > > int xfs_iunlink_init(struct xfs_perag *pag); > > > void xfs_iunlink_destroy(struct xfs_perag *pag); > > > > > > +void xfs_end_io(struct work_struct *work); > > > + > > > #endif /* __XFS_INODE_H__ */ ^ permalink raw reply [flat|nested] 12+ messages in thread
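For completeness, the rename Brian suggests (and Darrick acks) would
presumably leave the new inode fields looking like this; a sketch of the
revised hunk, not the final patch:

	/* pending io completions */
	spinlock_t		i_ioend_lock;
	struct work_struct	i_ioend_work;
	struct list_head	i_ioend_list;

with the INIT_WORK()/INIT_LIST_HEAD()/spin_lock_init() calls in
xfs_inode_alloc() and the container_of() in xfs_end_io() updated to
match.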
Thread overview: 12+ messages

2019-03-27  3:05 [RFC PATCH] xfs: implement per-inode writeback completion Darrick J. Wong
2019-03-27  3:06 ` [RFC PATCH] xfs: merge adjacent io completions of the same type Darrick J. Wong
2019-03-28 14:10   ` Brian Foster
2019-03-28 15:17     ` Darrick J. Wong
2019-03-28 16:46       ` Brian Foster
2019-03-28 21:06         ` Dave Chinner
2019-03-29 12:51           ` Brian Foster
2019-03-31 22:24             ` Dave Chinner
2019-04-01 14:59               ` Brian Foster
2019-03-28 14:08 ` [RFC PATCH] xfs: implement per-inode writeback completion Brian Foster
2019-03-28 15:00   ` Darrick J. Wong
2019-03-28 16:24     ` Brian Foster