All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
@ 2017-05-15 15:46 Johannes Weiner
  2017-05-16  1:34 ` [RFC PATCH] " Andreas Dilger
  2017-05-16 14:36 ` [RFC PATCH] fs: " Jan Kara
  0 siblings, 2 replies; 10+ messages in thread
From: Johannes Weiner @ 2017-05-15 15:46 UTC (permalink / raw)
  To: Jan Kara
  Cc: Theodore Ts'o, Alexander Viro, linux-ext4, linux-fsdevel,
	linux-kernel, kernel-team

We have observed across several workloads situations where kswapd and
direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
causing allocation latencies across tasks in the system, while there
are dozens of gigabytes of clean page cache covering multiple disks.

The stack traces of such an instance looks like this:

[<ffffffff812b3225>] jbd2_log_wait_commit+0x95/0x110
[<ffffffff812b4f29>] jbd2_complete_transaction+0x59/0x90
[<ffffffff812668da>] ext4_evict_inode+0x2da/0x480
[<ffffffff811f2230>] evict+0xc0/0x190
[<ffffffff811f2339>] dispose_list+0x39/0x50
[<ffffffff811f323b>] prune_icache_sb+0x4b/0x60
[<ffffffff811dba71>] super_cache_scan+0x141/0x190
[<ffffffff8116e755>] shrink_slab+0x235/0x440
[<ffffffff81172b48>] shrink_zone+0x268/0x2d0
[<ffffffff81172f04>] do_try_to_free_pages+0x164/0x410
[<ffffffff81173265>] try_to_free_pages+0xb5/0x160
[<ffffffff811656b6>] __alloc_pages_nodemask+0x636/0xb30
[<ffffffff811acac8>] alloc_pages_current+0x88/0x120
[<ffffffff816d4e46>] skb_page_frag_refill+0xc6/0xf0
[<ffffffff816d4e8d>] sk_page_frag_refill+0x1d/0x80
[<ffffffff8173f86b>] tcp_sendmsg+0x28b/0xb10
[<ffffffff81769727>] inet_sendmsg+0x67/0xa0
[<ffffffff816d0488>] sock_sendmsg+0x38/0x50
[<ffffffff816d0518>] sock_write_iter+0x78/0xd0
[<ffffffff811d774e>] do_iter_readv_writev+0x5e/0xa0
[<ffffffff811d8468>] do_readv_writev+0x178/0x210
[<ffffffff811d871c>] vfs_writev+0x3c/0x50
[<ffffffff811d8782>] do_writev+0x52/0xd0
[<ffffffff811d9810>] SyS_writev+0x10/0x20
[<ffffffff81002910>] do_syscall_64+0x50/0xa0
[<ffffffff817eed3c>] return_from_SYSCALL_64+0x0/0x6a
[<ffffffffffffffff>] 0xffffffffffffffff

The inode shrinker has provisions to skip any inodes that require
writeback, to avoid tarpitting the entire system behind a single
object when there are many other pools to recycle memory from. But
that logic doesn't cover the situation where an ext4 inode is clean
but journaled and tied to a commit that yet needs to hit the platter.

Add a superblock operation that lets the generic inode shrinker query
the filesystem whether evicting a given inode will require any IO; add
an ext4 implementation that checks whether the journal is caught up to
the commit id associated with the inode.

Fixes: 2d859db3e4a8 ("ext4: fix data corruption in inodes with journalled data")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 fs/ext4/ext4.h       |  1 +
 fs/ext4/inode.c      | 17 +++++++++++++++++
 fs/ext4/super.c      |  1 +
 fs/inode.c           |  7 +++++++
 fs/jbd2/journal.c    |  9 ++++++++-
 include/linux/fs.h   |  1 +
 include/linux/jbd2.h |  1 +
 7 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8e8046104f4d..b552978fc31f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2463,6 +2463,7 @@ extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int ext4_evict_needs_io(struct inode *);
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
 extern int  ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5834c4d76be8..4cb6cf932d9a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -182,6 +182,23 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 	return ret;
 }
 
+int ext4_evict_needs_io(struct inode *inode)
+{
+	if (inode->i_nlink &&
+	    inode->i_ino != EXT4_JOURNAL_INO &&
+	    ext4_should_journal_data(inode) &&
+	    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+		journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+		tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+		int err;
+
+		err = __jbd2_complete_transaction(journal, commit_tid, false);
+		if (err == -EAGAIN)
+			return 1;
+	}
+	return 0;
+}
+
 /*
  * Called at the last iput() if i_nlink is zero.
  */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c90edf09b0c3..e0e7cff589eb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1287,6 +1287,7 @@ static const struct super_operations ext4_sops = {
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
 	.drop_inode	= ext4_drop_inode,
+	.evict_needs_io	= ext4_evict_needs_io,
 	.evict_inode	= ext4_evict_inode,
 	.put_super	= ext4_put_super,
 	.sync_fs	= ext4_sync_fs,
diff --git a/fs/inode.c b/fs/inode.c
index db5914783a71..730d2034b8a1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -703,6 +703,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 {
 	struct list_head *freeable = arg;
 	struct inode	*inode = container_of(item, struct inode, i_lru);
+	const struct super_operations *op = inode->i_sb->s_op;
 
 	/*
 	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
@@ -723,6 +724,12 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 		return LRU_REMOVED;
 	}
 
+	/* If eviction may block on IO, defer for now */
+	if (op->evict_needs_io && !op->evict_needs_io(inode)) {
+		spin_unlock(&inode->i_lock);
+		return LRU_ROTATE;
+	}
+
 	/* recently referenced inodes get one more pass */
 	if (inode->i_state & I_REFERENCED) {
 		inode->i_state &= ~I_REFERENCED;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5a0245e36240..e353f3a0abf5 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -745,7 +745,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
  * the transaction id is stale, it is by definition already completed,
  * so just return SUCCESS.
  */
-int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+int __jbd2_complete_transaction(journal_t *journal, tid_t tid, bool wait)
 {
 	int	need_to_wait = 1;
 
@@ -765,8 +765,15 @@ int jbd2_complete_transaction(journal_t *journal, tid_t tid)
 	if (!need_to_wait)
 		return 0;
 wait_commit:
+	if (!wait)
+		return -EAGAIN;
 	return jbd2_log_wait_commit(journal, tid);
 }
+
+int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+{
+	return __jbd2_complete_transaction(journal, tid, true);
+}
 EXPORT_SYMBOL(jbd2_complete_transaction);
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 26488b419965..3a895f31afa7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1777,6 +1777,7 @@ struct super_operations {
    	void (*dirty_inode) (struct inode *, int flags);
 	int (*write_inode) (struct inode *, struct writeback_control *wbc);
 	int (*drop_inode) (struct inode *);
+	int (*evict_needs_io) (struct inode *);
 	void (*evict_inode) (struct inode *);
 	void (*put_super) (struct super_block *);
 	int (*sync_fs)(struct super_block *sb, int wait);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 606b6bce3a5b..3b902bed6e8b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
+int __jbd2_complete_transaction(journal_t *journal, tid_t tid, bool wait);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
-- 
2.12.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-05-15 15:46 [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO Johannes Weiner
@ 2017-05-16  1:34 ` Andreas Dilger
  2017-05-16 14:36 ` [RFC PATCH] fs: " Jan Kara
  1 sibling, 0 replies; 10+ messages in thread
From: Andreas Dilger @ 2017-05-16  1:34 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Jan Kara, Theodore Ts'o, Alexander Viro, linux-ext4,
	linux-fsdevel, linux-kernel, kernel-team

[-- Attachment #1: Type: text/plain, Size: 2269 bytes --]

On May 15, 2017, at 5:46 PM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> We have observed across several workloads situations where kswapd and
> direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
> causing allocation latencies across tasks in the system, while there
> are dozens of gigabytes of clean page cache covering multiple disks.
> 
> The inode shrinker has provisions to skip any inodes that require
> writeback, to avoid tarpitting the entire system behind a single
> object when there are many other pools to recycle memory from. But
> that logic doesn't cover the situation where an ext4 inode is clean
> but journaled and tied to a commit that yet needs to hit the platter.
> 
> Add a superblock operation that lets the generic inode shrinker query
> the filesystem whether evicting a given inode will require any IO; add
> an ext4 implementation that checks whether the journal is caught up to
> the commit id associated with the inode.
[snip]

> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5834c4d76be8..4cb6cf932d9a 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -182,6 +182,23 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
> 	return ret;
> }
> 
> +int ext4_evict_needs_io(struct inode *inode)
> +{
> +	if (inode->i_nlink &&
> +	    inode->i_ino != EXT4_JOURNAL_INO &&
> +	    ext4_should_journal_data(inode) &&
> +	    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
> +		journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> +		tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
> +		int err;
> +
> +		err = __jbd2_complete_transaction(journal, commit_tid, false);

From a code style point of view, having a function exported with a name like
"__jbd2..." is probably bad.  An argument of "false" is meaningless at the
caller here.  It would be better to have a function name something like
jbd2_complete_transaction_nowait() exported, so that it is more clear what
this function does.

It would be even better to add a comment block for this function that
explains that the caller should use "jbd2_log_wait_commit(journal, tid)"
if it wants to wait for this transaction to complete commit.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-05-15 15:46 [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO Johannes Weiner
  2017-05-16  1:34 ` [RFC PATCH] " Andreas Dilger
@ 2017-05-16 14:36 ` Jan Kara
  2017-05-16 15:41   ` Johannes Weiner
  1 sibling, 1 reply; 10+ messages in thread
From: Jan Kara @ 2017-05-16 14:36 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Jan Kara, Theodore Ts'o, Alexander Viro, linux-ext4,
	linux-fsdevel, linux-kernel, kernel-team

On Mon 15-05-17 11:46:34, Johannes Weiner wrote:
> We have observed across several workloads situations where kswapd and
> direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
> causing allocation latencies across tasks in the system, while there
> are dozens of gigabytes of clean page cache covering multiple disks.
> 
> The stack traces of such an instance looks like this:
> 
> [<ffffffff812b3225>] jbd2_log_wait_commit+0x95/0x110
> [<ffffffff812b4f29>] jbd2_complete_transaction+0x59/0x90
> [<ffffffff812668da>] ext4_evict_inode+0x2da/0x480
> [<ffffffff811f2230>] evict+0xc0/0x190
> [<ffffffff811f2339>] dispose_list+0x39/0x50
> [<ffffffff811f323b>] prune_icache_sb+0x4b/0x60
> [<ffffffff811dba71>] super_cache_scan+0x141/0x190
> [<ffffffff8116e755>] shrink_slab+0x235/0x440
> [<ffffffff81172b48>] shrink_zone+0x268/0x2d0
> [<ffffffff81172f04>] do_try_to_free_pages+0x164/0x410
> [<ffffffff81173265>] try_to_free_pages+0xb5/0x160
> [<ffffffff811656b6>] __alloc_pages_nodemask+0x636/0xb30
> [<ffffffff811acac8>] alloc_pages_current+0x88/0x120
> [<ffffffff816d4e46>] skb_page_frag_refill+0xc6/0xf0
> [<ffffffff816d4e8d>] sk_page_frag_refill+0x1d/0x80
> [<ffffffff8173f86b>] tcp_sendmsg+0x28b/0xb10
> [<ffffffff81769727>] inet_sendmsg+0x67/0xa0
> [<ffffffff816d0488>] sock_sendmsg+0x38/0x50
> [<ffffffff816d0518>] sock_write_iter+0x78/0xd0
> [<ffffffff811d774e>] do_iter_readv_writev+0x5e/0xa0
> [<ffffffff811d8468>] do_readv_writev+0x178/0x210
> [<ffffffff811d871c>] vfs_writev+0x3c/0x50
> [<ffffffff811d8782>] do_writev+0x52/0xd0
> [<ffffffff811d9810>] SyS_writev+0x10/0x20
> [<ffffffff81002910>] do_syscall_64+0x50/0xa0
> [<ffffffff817eed3c>] return_from_SYSCALL_64+0x0/0x6a
> [<ffffffffffffffff>] 0xffffffffffffffff
> 
> The inode shrinker has provisions to skip any inodes that require
> writeback, to avoid tarpitting the entire system behind a single
> object when there are many other pools to recycle memory from. But
> that logic doesn't cover the situation where an ext4 inode is clean
> but journaled and tied to a commit that yet needs to hit the platter.
> 
> Add a superblock operation that lets the generic inode shrinker query
> the filesystem whether evicting a given inode will require any IO; add
> an ext4 implementation that checks whether the journal is caught up to
> the commit id associated with the inode.
> 
> Fixes: 2d859db3e4a8 ("ext4: fix data corruption in inodes with journalled data")
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

OK. I have to say I'm somewhat surprised you use data journalling on some
of your files / filesystems but whatever - maybe these are long symlink
after all which would make sense. And I'm actually doubly surprised you can
see these stack traces as these days inode_lru_isolate() checks
inode->i_data.nrpages and uncommitted pages cannot be evicted from
pagecache (ext4_releasepage() will refuse to free them) so I don't see how
such inode can get to dispose_list(). But maybe the inode doesn't really
have any pages and i_datasync_tid just happens to be set to the current
transaction because it is initialized that way and we are evicting inode
that was recently read from disk.

Anyway if you add: "&& inode->i_data.nrpages" to the test in
ext4_evict_inode() do the stalls go away?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-05-16 14:36 ` [RFC PATCH] fs: " Jan Kara
@ 2017-05-16 15:41   ` Johannes Weiner
  2017-05-16 16:03     ` Jan Kara
  0 siblings, 1 reply; 10+ messages in thread
From: Johannes Weiner @ 2017-05-16 15:41 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jan Kara, Theodore Ts'o, Alexander Viro, linux-ext4,
	linux-fsdevel, linux-kernel, kernel-team

On Tue, May 16, 2017 at 04:36:45PM +0200, Jan Kara wrote:
> On Mon 15-05-17 11:46:34, Johannes Weiner wrote:
> > We have observed across several workloads situations where kswapd and
> > direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
> > causing allocation latencies across tasks in the system, while there
> > are dozens of gigabytes of clean page cache covering multiple disks.
> > 
> > The stack traces of such an instance looks like this:
> > 
> > [<ffffffff812b3225>] jbd2_log_wait_commit+0x95/0x110
> > [<ffffffff812b4f29>] jbd2_complete_transaction+0x59/0x90
> > [<ffffffff812668da>] ext4_evict_inode+0x2da/0x480
> > [<ffffffff811f2230>] evict+0xc0/0x190
> > [<ffffffff811f2339>] dispose_list+0x39/0x50
> > [<ffffffff811f323b>] prune_icache_sb+0x4b/0x60
> > [<ffffffff811dba71>] super_cache_scan+0x141/0x190
> > [<ffffffff8116e755>] shrink_slab+0x235/0x440
> > [<ffffffff81172b48>] shrink_zone+0x268/0x2d0
> > [<ffffffff81172f04>] do_try_to_free_pages+0x164/0x410
> > [<ffffffff81173265>] try_to_free_pages+0xb5/0x160
> > [<ffffffff811656b6>] __alloc_pages_nodemask+0x636/0xb30
> > [<ffffffff811acac8>] alloc_pages_current+0x88/0x120
> > [<ffffffff816d4e46>] skb_page_frag_refill+0xc6/0xf0
> > [<ffffffff816d4e8d>] sk_page_frag_refill+0x1d/0x80
> > [<ffffffff8173f86b>] tcp_sendmsg+0x28b/0xb10
> > [<ffffffff81769727>] inet_sendmsg+0x67/0xa0
> > [<ffffffff816d0488>] sock_sendmsg+0x38/0x50
> > [<ffffffff816d0518>] sock_write_iter+0x78/0xd0
> > [<ffffffff811d774e>] do_iter_readv_writev+0x5e/0xa0
> > [<ffffffff811d8468>] do_readv_writev+0x178/0x210
> > [<ffffffff811d871c>] vfs_writev+0x3c/0x50
> > [<ffffffff811d8782>] do_writev+0x52/0xd0
> > [<ffffffff811d9810>] SyS_writev+0x10/0x20
> > [<ffffffff81002910>] do_syscall_64+0x50/0xa0
> > [<ffffffff817eed3c>] return_from_SYSCALL_64+0x0/0x6a
> > [<ffffffffffffffff>] 0xffffffffffffffff
> > 
> > The inode shrinker has provisions to skip any inodes that require
> > writeback, to avoid tarpitting the entire system behind a single
> > object when there are many other pools to recycle memory from. But
> > that logic doesn't cover the situation where an ext4 inode is clean
> > but journaled and tied to a commit that yet needs to hit the platter.
> > 
> > Add a superblock operation that lets the generic inode shrinker query
> > the filesystem whether evicting a given inode will require any IO; add
> > an ext4 implementation that checks whether the journal is caught up to
> > the commit id associated with the inode.
> > 
> > Fixes: 2d859db3e4a8 ("ext4: fix data corruption in inodes with journalled data")
> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> 
> OK. I have to say I'm somewhat surprised you use data journalling on some
> of your files / filesystems but whatever - maybe these are long symlink
> after all which would make sense.

The filesystem is actually mounted data=ordered and we didn't catch
anyone in userspace enabling journaling on individual inodes. So we
assumed this must be from symlinks.

> And I'm actually doubly surprised you can see these stack traces as
> these days inode_lru_isolate() checks inode->i_data.nrpages and
> uncommitted pages cannot be evicted from pagecache
> (ext4_releasepage() will refuse to free them) so I don't see how
> such inode can get to dispose_list(). But maybe the inode doesn't
> really have any pages and i_datasync_tid just happens to be set to
> the current transaction because it is initialized that way and we
> are evicting inode that was recently read from disk.

Hm, we're running 4.6, but that already has the nrpages check in
inode_lru_isolate(). There couldn't be any pages in those inodes by
the time the shrinker gets to them.

> Anyway if you add: "&& inode->i_data.nrpages" to the test in
> ext4_evict_inode() do the stalls go away?

Want me to still test this?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-05-16 15:41   ` Johannes Weiner
@ 2017-05-16 16:03     ` Jan Kara
  2017-06-12  8:09       ` Jan Kara
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Kara @ 2017-05-16 16:03 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Alexander Viro,
	linux-ext4, linux-fsdevel, linux-kernel, kernel-team

[-- Attachment #1: Type: text/plain, Size: 4188 bytes --]

On Tue 16-05-17 11:41:05, Johannes Weiner wrote:
> On Tue, May 16, 2017 at 04:36:45PM +0200, Jan Kara wrote:
> > On Mon 15-05-17 11:46:34, Johannes Weiner wrote:
> > > We have observed across several workloads situations where kswapd and
> > > direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
> > > causing allocation latencies across tasks in the system, while there
> > > are dozens of gigabytes of clean page cache covering multiple disks.
> > > 
> > > The stack traces of such an instance looks like this:
> > > 
> > > [<ffffffff812b3225>] jbd2_log_wait_commit+0x95/0x110
> > > [<ffffffff812b4f29>] jbd2_complete_transaction+0x59/0x90
> > > [<ffffffff812668da>] ext4_evict_inode+0x2da/0x480
> > > [<ffffffff811f2230>] evict+0xc0/0x190
> > > [<ffffffff811f2339>] dispose_list+0x39/0x50
> > > [<ffffffff811f323b>] prune_icache_sb+0x4b/0x60
> > > [<ffffffff811dba71>] super_cache_scan+0x141/0x190
> > > [<ffffffff8116e755>] shrink_slab+0x235/0x440
> > > [<ffffffff81172b48>] shrink_zone+0x268/0x2d0
> > > [<ffffffff81172f04>] do_try_to_free_pages+0x164/0x410
> > > [<ffffffff81173265>] try_to_free_pages+0xb5/0x160
> > > [<ffffffff811656b6>] __alloc_pages_nodemask+0x636/0xb30
> > > [<ffffffff811acac8>] alloc_pages_current+0x88/0x120
> > > [<ffffffff816d4e46>] skb_page_frag_refill+0xc6/0xf0
> > > [<ffffffff816d4e8d>] sk_page_frag_refill+0x1d/0x80
> > > [<ffffffff8173f86b>] tcp_sendmsg+0x28b/0xb10
> > > [<ffffffff81769727>] inet_sendmsg+0x67/0xa0
> > > [<ffffffff816d0488>] sock_sendmsg+0x38/0x50
> > > [<ffffffff816d0518>] sock_write_iter+0x78/0xd0
> > > [<ffffffff811d774e>] do_iter_readv_writev+0x5e/0xa0
> > > [<ffffffff811d8468>] do_readv_writev+0x178/0x210
> > > [<ffffffff811d871c>] vfs_writev+0x3c/0x50
> > > [<ffffffff811d8782>] do_writev+0x52/0xd0
> > > [<ffffffff811d9810>] SyS_writev+0x10/0x20
> > > [<ffffffff81002910>] do_syscall_64+0x50/0xa0
> > > [<ffffffff817eed3c>] return_from_SYSCALL_64+0x0/0x6a
> > > [<ffffffffffffffff>] 0xffffffffffffffff
> > > 
> > > The inode shrinker has provisions to skip any inodes that require
> > > writeback, to avoid tarpitting the entire system behind a single
> > > object when there are many other pools to recycle memory from. But
> > > that logic doesn't cover the situation where an ext4 inode is clean
> > > but journaled and tied to a commit that yet needs to hit the platter.
> > > 
> > > Add a superblock operation that lets the generic inode shrinker query
> > > the filesystem whether evicting a given inode will require any IO; add
> > > an ext4 implementation that checks whether the journal is caught up to
> > > the commit id associated with the inode.
> > > 
> > > Fixes: 2d859db3e4a8 ("ext4: fix data corruption in inodes with journalled data")
> > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > 
> > OK. I have to say I'm somewhat surprised you use data journalling on some
> > of your files / filesystems but whatever - maybe these are long symlink
> > after all which would make sense.
> 
> The filesystem is actually mounted data=ordered and we didn't catch
> anyone in userspace enabling journaling on individual inodes. So we
> assumed this must be from symlinks.

OK.

> > And I'm actually doubly surprised you can see these stack traces as
> > these days inode_lru_isolate() checks inode->i_data.nrpages and
> > uncommitted pages cannot be evicted from pagecache
> > (ext4_releasepage() will refuse to free them) so I don't see how
> > such inode can get to dispose_list(). But maybe the inode doesn't
> > really have any pages and i_datasync_tid just happens to be set to
> > the current transaction because it is initialized that way and we
> > are evicting inode that was recently read from disk.
> 
> Hm, we're running 4.6, but that already has the nrpages check in
> inode_lru_isolate(). There couldn't be any pages in those inodes by
> the time the shrinker gets to them.
> 
> > Anyway if you add: "&& inode->i_data.nrpages" to the test in
> > ext4_evict_inode() do the stalls go away?
> 
> Want me to still test this?

Can you try attached patch? I'd like to confirm the theory before merging
this... Thanks!

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

[-- Attachment #2: 0001-ext4-Avoid-unnecessary-stalls-in-ext4_evict_inode.patch --]
[-- Type: text/x-patch, Size: 1387 bytes --]

>From e87281dee65589e07b9251ad98191c1e6c488870 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 16 May 2017 17:56:36 +0200
Subject: [PATCH] ext4: Avoid unnecessary stalls in ext4_evict_inode()

These days inode reclaim calls evict_inode() only when it has no pages
in the mapping. In that case it is not necessary to wait for transaction
commit in ext4_evict_inode() as there can be no pages waiting to be
committed. So avoid unnecessary transaction waiting in that case.

We still have to keep the check for the case where ext4_evict_inode()
gets called from other paths (e.g. umount) where inode still can have
some page cache pages.

Reported-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5834c4d76be8..3aef67ca18ac 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -213,7 +213,8 @@ void ext4_evict_inode(struct inode *inode)
 		 */
 		if (inode->i_ino != EXT4_JOURNAL_INO &&
 		    ext4_should_journal_data(inode) &&
-		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+		    inode->i_data.nrpages) {
 			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
 
-- 
2.12.0


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-05-16 16:03     ` Jan Kara
@ 2017-06-12  8:09       ` Jan Kara
  2017-06-12 14:37         ` Johannes Weiner
  2017-07-11  7:16         ` Jerry Lee
  0 siblings, 2 replies; 10+ messages in thread
From: Jan Kara @ 2017-06-12  8:09 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Alexander Viro,
	linux-ext4, linux-fsdevel, linux-kernel, kernel-team

On Tue 16-05-17 18:03:37, Jan Kara wrote:
> On Tue 16-05-17 11:41:05, Johannes Weiner wrote:
> > On Tue, May 16, 2017 at 04:36:45PM +0200, Jan Kara wrote:
> > > On Mon 15-05-17 11:46:34, Johannes Weiner wrote:
> > > > We have observed across several workloads situations where kswapd and
> > > > direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
> > > > causing allocation latencies across tasks in the system, while there
> > > > are dozens of gigabytes of clean page cache covering multiple disks.
> > > > 
> > > > The stack traces of such an instance looks like this:
> > > > 
> > > > [<ffffffff812b3225>] jbd2_log_wait_commit+0x95/0x110
> > > > [<ffffffff812b4f29>] jbd2_complete_transaction+0x59/0x90
> > > > [<ffffffff812668da>] ext4_evict_inode+0x2da/0x480
> > > > [<ffffffff811f2230>] evict+0xc0/0x190
> > > > [<ffffffff811f2339>] dispose_list+0x39/0x50
> > > > [<ffffffff811f323b>] prune_icache_sb+0x4b/0x60
> > > > [<ffffffff811dba71>] super_cache_scan+0x141/0x190
> > > > [<ffffffff8116e755>] shrink_slab+0x235/0x440
> > > > [<ffffffff81172b48>] shrink_zone+0x268/0x2d0
> > > > [<ffffffff81172f04>] do_try_to_free_pages+0x164/0x410
> > > > [<ffffffff81173265>] try_to_free_pages+0xb5/0x160
> > > > [<ffffffff811656b6>] __alloc_pages_nodemask+0x636/0xb30
> > > > [<ffffffff811acac8>] alloc_pages_current+0x88/0x120
> > > > [<ffffffff816d4e46>] skb_page_frag_refill+0xc6/0xf0
> > > > [<ffffffff816d4e8d>] sk_page_frag_refill+0x1d/0x80
> > > > [<ffffffff8173f86b>] tcp_sendmsg+0x28b/0xb10
> > > > [<ffffffff81769727>] inet_sendmsg+0x67/0xa0
> > > > [<ffffffff816d0488>] sock_sendmsg+0x38/0x50
> > > > [<ffffffff816d0518>] sock_write_iter+0x78/0xd0
> > > > [<ffffffff811d774e>] do_iter_readv_writev+0x5e/0xa0
> > > > [<ffffffff811d8468>] do_readv_writev+0x178/0x210
> > > > [<ffffffff811d871c>] vfs_writev+0x3c/0x50
> > > > [<ffffffff811d8782>] do_writev+0x52/0xd0
> > > > [<ffffffff811d9810>] SyS_writev+0x10/0x20
> > > > [<ffffffff81002910>] do_syscall_64+0x50/0xa0
> > > > [<ffffffff817eed3c>] return_from_SYSCALL_64+0x0/0x6a
> > > > [<ffffffffffffffff>] 0xffffffffffffffff
> > > > 
> > > > The inode shrinker has provisions to skip any inodes that require
> > > > writeback, to avoid tarpitting the entire system behind a single
> > > > object when there are many other pools to recycle memory from. But
> > > > that logic doesn't cover the situation where an ext4 inode is clean
> > > > but journaled and tied to a commit that yet needs to hit the platter.
> > > > 
> > > > Add a superblock operation that lets the generic inode shrinker query
> > > > the filesystem whether evicting a given inode will require any IO; add
> > > > an ext4 implementation that checks whether the journal is caught up to
> > > > the commit id associated with the inode.
> > > > 
> > > > Fixes: 2d859db3e4a8 ("ext4: fix data corruption in inodes with journalled data")
> > > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > > 
> > > OK. I have to say I'm somewhat surprised you use data journalling on some
> > > of your files / filesystems but whatever - maybe these are long symlink
> > > after all which would make sense.
> > 
> > The filesystem is actually mounted data=ordered and we didn't catch
> > anyone in userspace enabling journaling on individual inodes. So we
> > assumed this must be from symlinks.
> 
> OK.
> 
> > > And I'm actually doubly surprised you can see these stack traces as
> > > these days inode_lru_isolate() checks inode->i_data.nrpages and
> > > uncommitted pages cannot be evicted from pagecache
> > > (ext4_releasepage() will refuse to free them) so I don't see how
> > > such inode can get to dispose_list(). But maybe the inode doesn't
> > > really have any pages and i_datasync_tid just happens to be set to
> > > the current transaction because it is initialized that way and we
> > > are evicting inode that was recently read from disk.
> > 
> > Hm, we're running 4.6, but that already has the nrpages check in
> > inode_lru_isolate(). There couldn't be any pages in those inodes by
> > the time the shrinker gets to them.
> > 
> > > Anyway if you add: "&& inode->i_data.nrpages" to the test in
> > > ext4_evict_inode() do the stalls go away?
> > 
> > Want me to still test this?
> 
> Can you try attached patch? I'd like to confirm the theory before merging
> this... Thanks!

Ping? Any result with this patch?

								Honza

> From e87281dee65589e07b9251ad98191c1e6c488870 Mon Sep 17 00:00:00 2001
> From: Jan Kara <jack@suse.cz>
> Date: Tue, 16 May 2017 17:56:36 +0200
> Subject: [PATCH] ext4: Avoid unnecessary stalls in ext4_evict_inode()
> 
> These days inode reclaim calls evict_inode() only when it has no pages
> in the mapping. In that case it is not necessary to wait for transaction
> commit in ext4_evict_inode() as there can be no pages waiting to be
> committed. So avoid unnecessary transaction waiting in that case.
> 
> We still have to keep the check for the case where ext4_evict_inode()
> gets called from other paths (e.g. umount) where inode still can have
> some page cache pages.
> 
> Reported-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/inode.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5834c4d76be8..3aef67ca18ac 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -213,7 +213,8 @@ void ext4_evict_inode(struct inode *inode)
>  		 */
>  		if (inode->i_ino != EXT4_JOURNAL_INO &&
>  		    ext4_should_journal_data(inode) &&
> -		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
> +		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
> +		    inode->i_data.nrpages) {
>  			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
>  			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
>  
> -- 
> 2.12.0
> 

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-06-12  8:09       ` Jan Kara
@ 2017-06-12 14:37         ` Johannes Weiner
  2017-06-12 15:19           ` Jan Kara
  2017-07-11  7:16         ` Jerry Lee
  1 sibling, 1 reply; 10+ messages in thread
From: Johannes Weiner @ 2017-06-12 14:37 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jan Kara, Theodore Ts'o, Alexander Viro, linux-ext4,
	linux-fsdevel, linux-kernel, kernel-team

Hi Jan,

On Mon, Jun 12, 2017 at 10:09:57AM +0200, Jan Kara wrote:
> On Tue 16-05-17 18:03:37, Jan Kara wrote:
> > On Tue 16-05-17 11:41:05, Johannes Weiner wrote:
> > > On Tue, May 16, 2017 at 04:36:45PM +0200, Jan Kara wrote:
> > > > On Mon 15-05-17 11:46:34, Johannes Weiner wrote:
> > > > > We have observed across several workloads situations where kswapd and
> > > > > direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
> > > > > causing allocation latencies across tasks in the system, while there
> > > > > are dozens of gigabytes of clean page cache covering multiple disks.
> > > > > 
> > > > > The stack traces of such an instance looks like this:
> > > > > 
> > > > > [<ffffffff812b3225>] jbd2_log_wait_commit+0x95/0x110
> > > > > [<ffffffff812b4f29>] jbd2_complete_transaction+0x59/0x90
> > > > > [<ffffffff812668da>] ext4_evict_inode+0x2da/0x480
> > > > > [<ffffffff811f2230>] evict+0xc0/0x190
> > > > > [<ffffffff811f2339>] dispose_list+0x39/0x50
> > > > > [<ffffffff811f323b>] prune_icache_sb+0x4b/0x60
> > > > > [<ffffffff811dba71>] super_cache_scan+0x141/0x190
> > > > > [<ffffffff8116e755>] shrink_slab+0x235/0x440
> > > > > [<ffffffff81172b48>] shrink_zone+0x268/0x2d0
> > > > > [<ffffffff81172f04>] do_try_to_free_pages+0x164/0x410
> > > > > [<ffffffff81173265>] try_to_free_pages+0xb5/0x160
> > > > > [<ffffffff811656b6>] __alloc_pages_nodemask+0x636/0xb30
> > > > > [<ffffffff811acac8>] alloc_pages_current+0x88/0x120
> > > > > [<ffffffff816d4e46>] skb_page_frag_refill+0xc6/0xf0
> > > > > [<ffffffff816d4e8d>] sk_page_frag_refill+0x1d/0x80
> > > > > [<ffffffff8173f86b>] tcp_sendmsg+0x28b/0xb10
> > > > > [<ffffffff81769727>] inet_sendmsg+0x67/0xa0
> > > > > [<ffffffff816d0488>] sock_sendmsg+0x38/0x50
> > > > > [<ffffffff816d0518>] sock_write_iter+0x78/0xd0
> > > > > [<ffffffff811d774e>] do_iter_readv_writev+0x5e/0xa0
> > > > > [<ffffffff811d8468>] do_readv_writev+0x178/0x210
> > > > > [<ffffffff811d871c>] vfs_writev+0x3c/0x50
> > > > > [<ffffffff811d8782>] do_writev+0x52/0xd0
> > > > > [<ffffffff811d9810>] SyS_writev+0x10/0x20
> > > > > [<ffffffff81002910>] do_syscall_64+0x50/0xa0
> > > > > [<ffffffff817eed3c>] return_from_SYSCALL_64+0x0/0x6a
> > > > > [<ffffffffffffffff>] 0xffffffffffffffff
> > > > > 
> > > > > The inode shrinker has provisions to skip any inodes that require
> > > > > writeback, to avoid tarpitting the entire system behind a single
> > > > > object when there are many other pools to recycle memory from. But
> > > > > that logic doesn't cover the situation where an ext4 inode is clean
> > > > > but journaled and tied to a commit that yet needs to hit the platter.
> > > > > 
> > > > > Add a superblock operation that lets the generic inode shrinker query
> > > > > the filesystem whether evicting a given inode will require any IO; add
> > > > > an ext4 implementation that checks whether the journal is caught up to
> > > > > the commit id associated with the inode.
> > > > > 
> > > > > Fixes: 2d859db3e4a8 ("ext4: fix data corruption in inodes with journalled data")
> > > > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > > > 
> > > > OK. I have to say I'm somewhat surprised you use data journalling on some
> > > > of your files / filesystems but whatever - maybe these are long symlink
> > > > after all which would make sense.
> > > 
> > > The filesystem is actually mounted data=ordered and we didn't catch
> > > anyone in userspace enabling journaling on individual inodes. So we
> > > assumed this must be from symlinks.
> > 
> > OK.
> > 
> > > > And I'm actually doubly surprised you can see these stack traces as
> > > > these days inode_lru_isolate() checks inode->i_data.nrpages and
> > > > uncommitted pages cannot be evicted from pagecache
> > > > (ext4_releasepage() will refuse to free them) so I don't see how
> > > > such inode can get to dispose_list(). But maybe the inode doesn't
> > > > really have any pages and i_datasync_tid just happens to be set to
> > > > the current transaction because it is initialized that way and we
> > > > are evicting inode that was recently read from disk.
> > > 
> > > Hm, we're running 4.6, but that already has the nrpages check in
> > > inode_lru_isolate(). There couldn't be any pages in those inodes by
> > > the time the shrinker gets to them.
> > > 
> > > > Anyway if you add: "&& inode->i_data.nrpages" to the test in
> > > > ext4_evict_inode() do the stalls go away?
> > > 
> > > Want me to still test this?
> > 
> > Can you try attached patch? I'd like to confirm the theory before merging
> > this... Thanks!
> 
> Ping? Any result with this patch?

Sorry for not getting back earlier.

I was waiting for the internal reporter of this issue to test it, but
they have since mitigated the issue by reducing the memory footprint
of their application; bad memory health caused other problems as well.

Since this is in a production environment, they're reluctant to muck
with it now that things are working.

However, with or without a reproducer, it seems to make sense to not
serialize evict() on commits when we don't have to from a correctness
point of view. Would you be opposed to just merging the patch anyway?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-06-12 14:37         ` Johannes Weiner
@ 2017-06-12 15:19           ` Jan Kara
  2017-06-12 21:51             ` Johannes Weiner
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Kara @ 2017-06-12 15:19 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Jan Kara, Jan Kara, Theodore Ts'o, Alexander Viro,
	linux-ext4, linux-fsdevel, linux-kernel, kernel-team

On Mon 12-06-17 10:37:27, Johannes Weiner wrote:
> On Mon, Jun 12, 2017 at 10:09:57AM +0200, Jan Kara wrote:
> > On Tue 16-05-17 18:03:37, Jan Kara wrote:
> > > On Tue 16-05-17 11:41:05, Johannes Weiner wrote:
> > > > On Tue, May 16, 2017 at 04:36:45PM +0200, Jan Kara wrote:
> > > > > On Mon 15-05-17 11:46:34, Johannes Weiner wrote:
> > > > > > We have observed across several workloads situations where kswapd and
> > > > > > direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
> > > > > > causing allocation latencies across tasks in the system, while there
> > > > > > are dozens of gigabytes of clean page cache covering multiple disks.
> > > > > > 
> > > > > > The stack traces of such an instance looks like this:
> > > > > > 
> > > > > > [<ffffffff812b3225>] jbd2_log_wait_commit+0x95/0x110
> > > > > > [<ffffffff812b4f29>] jbd2_complete_transaction+0x59/0x90
> > > > > > [<ffffffff812668da>] ext4_evict_inode+0x2da/0x480
> > > > > > [<ffffffff811f2230>] evict+0xc0/0x190
> > > > > > [<ffffffff811f2339>] dispose_list+0x39/0x50
> > > > > > [<ffffffff811f323b>] prune_icache_sb+0x4b/0x60
> > > > > > [<ffffffff811dba71>] super_cache_scan+0x141/0x190
> > > > > > [<ffffffff8116e755>] shrink_slab+0x235/0x440
> > > > > > [<ffffffff81172b48>] shrink_zone+0x268/0x2d0
> > > > > > [<ffffffff81172f04>] do_try_to_free_pages+0x164/0x410
> > > > > > [<ffffffff81173265>] try_to_free_pages+0xb5/0x160
> > > > > > [<ffffffff811656b6>] __alloc_pages_nodemask+0x636/0xb30
> > > > > > [<ffffffff811acac8>] alloc_pages_current+0x88/0x120
> > > > > > [<ffffffff816d4e46>] skb_page_frag_refill+0xc6/0xf0
> > > > > > [<ffffffff816d4e8d>] sk_page_frag_refill+0x1d/0x80
> > > > > > [<ffffffff8173f86b>] tcp_sendmsg+0x28b/0xb10
> > > > > > [<ffffffff81769727>] inet_sendmsg+0x67/0xa0
> > > > > > [<ffffffff816d0488>] sock_sendmsg+0x38/0x50
> > > > > > [<ffffffff816d0518>] sock_write_iter+0x78/0xd0
> > > > > > [<ffffffff811d774e>] do_iter_readv_writev+0x5e/0xa0
> > > > > > [<ffffffff811d8468>] do_readv_writev+0x178/0x210
> > > > > > [<ffffffff811d871c>] vfs_writev+0x3c/0x50
> > > > > > [<ffffffff811d8782>] do_writev+0x52/0xd0
> > > > > > [<ffffffff811d9810>] SyS_writev+0x10/0x20
> > > > > > [<ffffffff81002910>] do_syscall_64+0x50/0xa0
> > > > > > [<ffffffff817eed3c>] return_from_SYSCALL_64+0x0/0x6a
> > > > > > [<ffffffffffffffff>] 0xffffffffffffffff
> > > > > > 
> > > > > > The inode shrinker has provisions to skip any inodes that require
> > > > > > writeback, to avoid tarpitting the entire system behind a single
> > > > > > object when there are many other pools to recycle memory from. But
> > > > > > that logic doesn't cover the situation where an ext4 inode is clean
> > > > > > but journaled and tied to a commit that yet needs to hit the platter.
> > > > > > 
> > > > > > Add a superblock operation that lets the generic inode shrinker query
> > > > > > the filesystem whether evicting a given inode will require any IO; add
> > > > > > an ext4 implementation that checks whether the journal is caught up to
> > > > > > the commit id associated with the inode.
> > > > > > 
> > > > > > Fixes: 2d859db3e4a8 ("ext4: fix data corruption in inodes with journalled data")
> > > > > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > > > > 
> > > > > OK. I have to say I'm somewhat surprised you use data journalling on some
> > > > > of your files / filesystems but whatever - maybe these are long symlink
> > > > > after all which would make sense.
> > > > 
> > > > The filesystem is actually mounted data=ordered and we didn't catch
> > > > anyone in userspace enabling journaling on individual inodes. So we
> > > > assumed this must be from symlinks.
> > > 
> > > OK.
> > > 
> > > > > And I'm actually doubly surprised you can see these stack traces as
> > > > > these days inode_lru_isolate() checks inode->i_data.nrpages and
> > > > > uncommitted pages cannot be evicted from pagecache
> > > > > (ext4_releasepage() will refuse to free them) so I don't see how
> > > > > such inode can get to dispose_list(). But maybe the inode doesn't
> > > > > really have any pages and i_datasync_tid just happens to be set to
> > > > > the current transaction because it is initialized that way and we
> > > > > are evicting inode that was recently read from disk.
> > > > 
> > > > Hm, we're running 4.6, but that already has the nrpages check in
> > > > inode_lru_isolate(). There couldn't be any pages in those inodes by
> > > > the time the shrinker gets to them.
> > > > 
> > > > > Anyway if you add: "&& inode->i_data.nrpages" to the test in
> > > > > ext4_evict_inode() do the stalls go away?
> > > > 
> > > > Want me to still test this?
> > > 
> > > Can you try attached patch? I'd like to confirm the theory before merging
> > > this... Thanks!
> > 
> > Ping? Any result with this patch?
> 
> Sorry for not getting back earlier.
> 
> I was waiting for the internal reporter of this issue to test it, but
> they have since mitigated the issue by reducing the memory footprint
> of their application; bad memory health caused other problems as well.
> 
> Since this is in a production environment, they're reluctant to muck
> with it now that things are working.
> 
> However, with or without a reproducer, it seems to make sense to not
> serialize evict() on commits when we don't have to from a correctness
> point of view. Would you be opposed to just merging the patch anyway?

No, I guess the patch makes sense even without a reproducer. I'll send it
to Ted for inclusion.

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-06-12 15:19           ` Jan Kara
@ 2017-06-12 21:51             ` Johannes Weiner
  0 siblings, 0 replies; 10+ messages in thread
From: Johannes Weiner @ 2017-06-12 21:51 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jan Kara, Theodore Ts'o, Alexander Viro, linux-ext4,
	linux-fsdevel, linux-kernel, kernel-team

On Mon, Jun 12, 2017 at 05:19:34PM +0200, Jan Kara wrote:
> On Mon 12-06-17 10:37:27, Johannes Weiner wrote:
> > However, with or without a reproducer, it seems to make sense to not
> > serialize evict() on commits when we don't have to from a correctness
> > point of view. Would you be opposed to just merging the patch anyway?
> 
> No, I guess the patch makes sense even without a reproducer. I'll send it
> to Ted for inclusion.

Thank you, I appreciate it!

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO
  2017-06-12  8:09       ` Jan Kara
  2017-06-12 14:37         ` Johannes Weiner
@ 2017-07-11  7:16         ` Jerry Lee
  1 sibling, 0 replies; 10+ messages in thread
From: Jerry Lee @ 2017-07-11  7:16 UTC (permalink / raw)
  To: Jan Kara
  Cc: Johannes Weiner, Jan Kara, Theodore Ts'o, Alexander Viro,
	linux-ext4, linux-fsdevel, linux-kernel, kernel-team

Hi Jan,

We seemed to get this issue on a file system with data=order mode, and
it can steadily be re-produced by creating lots of symlinks and each
links to a 100KB file created by dd in advance.  During the process of
dd and making symlinks, copying a large file via samba to the file
system would get stuck with the following call stacks:

kswapd0
=======
[<ffffffff81254273>] jbd2_log_wait_commit+0x93/0x100
[<ffffffff81255700>] jbd2_complete_transaction+0x50/0x80
[<ffffffff8120f95c>] ext4_evict_inode+0x27c/0x400
[<ffffffff8117a4be>] evict+0xae/0x170
[<ffffffff8117a5b5>] dispose_list+0x35/0x50
[<ffffffff8117b396>] prune_icache_sb+0x46/0x60
[<ffffffff8116155c>] super_cache_scan+0x14c/0x1a0
[<ffffffff8111b98a>] shrink_slab.part.36+0x19a/0x250
[<ffffffff8111e38c>] shrink_zone+0x23c/0x250
[<ffffffff8111ef5a>] kswapd+0x54a/0x930
[<ffffffff8108b316>] kthread+0xd6/0xf0
[<ffffffff81a6afdf>] ret_from_fork+0x3f/0x70
[<ffffffffffffffff>] 0xffffffffffffffff

kworker/u8:4
============
[<ffffffff8124dc74>] wait_transaction_locked+0x74/0xa0
[<ffffffff8124df53>] start_this_handle+0x233/0x5d0
[<ffffffff8124e502>] jbd2__journal_start+0xf2/0x180
[<ffffffff8122f1f7>] __ext4_journal_start_sb+0x57/0x80
[<ffffffff8120e6aa>] ext4_writepages+0x2da/0xad0
[<ffffffff8111804e>] do_writepages+0x2e/0x70
[<ffffffff81188163>] __writeback_single_inode+0x33/0x170
[<ffffffff8118873a>] writeback_sb_inodes+0x20a/0x460
[<ffffffff81188a14>] __writeback_inodes_wb+0x84/0xb0
[<ffffffff81188be2>] wb_writeback+0x1a2/0x1b0
[<ffffffff81189166>] wb_workfn+0x1b6/0x320
[<ffffffff81086239>] process_one_work+0x139/0x370
[<ffffffff810867b1>] worker_thread+0x61/0x450
[<ffffffff8108b316>] kthread+0xd6/0xf0
[<ffffffff81a6afdf>] ret_from_fork+0x3f/0x70
[<ffffffffffffffff>] 0xffffffffffffffff

jbd2/dm-0-8
===========
[<ffffffff812508f9>] jbd2_journal_commit_transaction+0x1f9/0x1540
[<ffffffff81254861>] kjournald2+0xd1/0x250
[<ffffffff8108b316>] kthread+0xd6/0xf0
[<ffffffff81a6afdf>] ret_from_fork+0x3f/0x70
[<ffffffffffffffff>] 0xffffffffffffffff

We runs linux-4.2 and the issue is fixed with the patch you mentioned
in previous mail (adding "&& inode->i_data.nrpages").  If there's
anything I could help, feel free to let me know.  Thanks!


Jerry

On Mon, Jun 12, 2017 at 4:09 PM, Jan Kara <jack@suse.cz> wrote:
> On Tue 16-05-17 18:03:37, Jan Kara wrote:
>> On Tue 16-05-17 11:41:05, Johannes Weiner wrote:
>> > On Tue, May 16, 2017 at 04:36:45PM +0200, Jan Kara wrote:
>> > > On Mon 15-05-17 11:46:34, Johannes Weiner wrote:
>> > > > We have observed across several workloads situations where kswapd and
>> > > > direct reclaimers get stuck in the inode shrinker of the ext4 / mount,
>> > > > causing allocation latencies across tasks in the system, while there
>> > > > are dozens of gigabytes of clean page cache covering multiple disks.
>> > > >
>> > > > The stack traces of such an instance looks like this:
>> > > >
>> > > > [<ffffffff812b3225>] jbd2_log_wait_commit+0x95/0x110
>> > > > [<ffffffff812b4f29>] jbd2_complete_transaction+0x59/0x90
>> > > > [<ffffffff812668da>] ext4_evict_inode+0x2da/0x480
>> > > > [<ffffffff811f2230>] evict+0xc0/0x190
>> > > > [<ffffffff811f2339>] dispose_list+0x39/0x50
>> > > > [<ffffffff811f323b>] prune_icache_sb+0x4b/0x60
>> > > > [<ffffffff811dba71>] super_cache_scan+0x141/0x190
>> > > > [<ffffffff8116e755>] shrink_slab+0x235/0x440
>> > > > [<ffffffff81172b48>] shrink_zone+0x268/0x2d0
>> > > > [<ffffffff81172f04>] do_try_to_free_pages+0x164/0x410
>> > > > [<ffffffff81173265>] try_to_free_pages+0xb5/0x160
>> > > > [<ffffffff811656b6>] __alloc_pages_nodemask+0x636/0xb30
>> > > > [<ffffffff811acac8>] alloc_pages_current+0x88/0x120
>> > > > [<ffffffff816d4e46>] skb_page_frag_refill+0xc6/0xf0
>> > > > [<ffffffff816d4e8d>] sk_page_frag_refill+0x1d/0x80
>> > > > [<ffffffff8173f86b>] tcp_sendmsg+0x28b/0xb10
>> > > > [<ffffffff81769727>] inet_sendmsg+0x67/0xa0
>> > > > [<ffffffff816d0488>] sock_sendmsg+0x38/0x50
>> > > > [<ffffffff816d0518>] sock_write_iter+0x78/0xd0
>> > > > [<ffffffff811d774e>] do_iter_readv_writev+0x5e/0xa0
>> > > > [<ffffffff811d8468>] do_readv_writev+0x178/0x210
>> > > > [<ffffffff811d871c>] vfs_writev+0x3c/0x50
>> > > > [<ffffffff811d8782>] do_writev+0x52/0xd0
>> > > > [<ffffffff811d9810>] SyS_writev+0x10/0x20
>> > > > [<ffffffff81002910>] do_syscall_64+0x50/0xa0
>> > > > [<ffffffff817eed3c>] return_from_SYSCALL_64+0x0/0x6a
>> > > > [<ffffffffffffffff>] 0xffffffffffffffff
>> > > >
>> > > > The inode shrinker has provisions to skip any inodes that require
>> > > > writeback, to avoid tarpitting the entire system behind a single
>> > > > object when there are many other pools to recycle memory from. But
>> > > > that logic doesn't cover the situation where an ext4 inode is clean
>> > > > but journaled and tied to a commit that yet needs to hit the platter.
>> > > >
>> > > > Add a superblock operation that lets the generic inode shrinker query
>> > > > the filesystem whether evicting a given inode will require any IO; add
>> > > > an ext4 implementation that checks whether the journal is caught up to
>> > > > the commit id associated with the inode.
>> > > >
>> > > > Fixes: 2d859db3e4a8 ("ext4: fix data corruption in inodes with journalled data")
>> > > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
>> > >
>> > > OK. I have to say I'm somewhat surprised you use data journalling on some
>> > > of your files / filesystems but whatever - maybe these are long symlink
>> > > after all which would make sense.
>> >
>> > The filesystem is actually mounted data=ordered and we didn't catch
>> > anyone in userspace enabling journaling on individual inodes. So we
>> > assumed this must be from symlinks.
>>
>> OK.
>>
>> > > And I'm actually doubly surprised you can see these stack traces as
>> > > these days inode_lru_isolate() checks inode->i_data.nrpages and
>> > > uncommitted pages cannot be evicted from pagecache
>> > > (ext4_releasepage() will refuse to free them) so I don't see how
>> > > such inode can get to dispose_list(). But maybe the inode doesn't
>> > > really have any pages and i_datasync_tid just happens to be set to
>> > > the current transaction because it is initialized that way and we
>> > > are evicting inode that was recently read from disk.
>> >
>> > Hm, we're running 4.6, but that already has the nrpages check in
>> > inode_lru_isolate(). There couldn't be any pages in those inodes by
>> > the time the shrinker gets to them.
>> >
>> > > Anyway if you add: "&& inode->i_data.nrpages" to the test in
>> > > ext4_evict_inode() do the stalls go away?
>> >
>> > Want me to still test this?
>>
>> Can you try attached patch? I'd like to confirm the theory before merging
>> this... Thanks!
>
> Ping? Any result with this patch?
>
>                                                                 Honza
>
>> From e87281dee65589e07b9251ad98191c1e6c488870 Mon Sep 17 00:00:00 2001
>> From: Jan Kara <jack@suse.cz>
>> Date: Tue, 16 May 2017 17:56:36 +0200
>> Subject: [PATCH] ext4: Avoid unnecessary stalls in ext4_evict_inode()
>>
>> These days inode reclaim calls evict_inode() only when it has no pages
>> in the mapping. In that case it is not necessary to wait for transaction
>> commit in ext4_evict_inode() as there can be no pages waiting to be
>> committed. So avoid unnecessary transaction waiting in that case.
>>
>> We still have to keep the check for the case where ext4_evict_inode()
>> gets called from other paths (e.g. umount) where inode still can have
>> some page cache pages.
>>
>> Reported-by: Johannes Weiner <hannes@cmpxchg.org>
>> Signed-off-by: Jan Kara <jack@suse.cz>
>> ---
>>  fs/ext4/inode.c | 3 ++-
>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 5834c4d76be8..3aef67ca18ac 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -213,7 +213,8 @@ void ext4_evict_inode(struct inode *inode)
>>                */
>>               if (inode->i_ino != EXT4_JOURNAL_INO &&
>>                   ext4_should_journal_data(inode) &&
>> -                 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
>> +                 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
>> +                 inode->i_data.nrpages) {
>>                       journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
>>                       tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
>>
>> --
>> 2.12.0
>>
>
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR



-- 
Jerry Lee
QNAP Systems, Inc.
Email: jerrylee@qnap.com
Tel: (+886)-2-2393-5152 ext. 15019
Address: 13F., No.56, Sec. 1, Xinsheng S. Rd., Zhongzheng Dist.,
Taipei City, Taiwan

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-07-11  7:16 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-15 15:46 [RFC PATCH] fs: ext4: don't trap kswapd and allocating tasks on ext4 inode IO Johannes Weiner
2017-05-16  1:34 ` [RFC PATCH] " Andreas Dilger
2017-05-16 14:36 ` [RFC PATCH] fs: " Jan Kara
2017-05-16 15:41   ` Johannes Weiner
2017-05-16 16:03     ` Jan Kara
2017-06-12  8:09       ` Jan Kara
2017-06-12 14:37         ` Johannes Weiner
2017-06-12 15:19           ` Jan Kara
2017-06-12 21:51             ` Johannes Weiner
2017-07-11  7:16         ` Jerry Lee

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.