linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks
@ 2019-08-29 22:47 Tejun Heo
  2019-08-30 13:43 ` Jens Axboe
  2019-08-30 15:40 ` Jan Kara
  0 siblings, 2 replies; 8+ messages in thread
From: Tejun Heo @ 2019-08-29 22:47 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-kernel, Jan Kara

cgroup foreign inode handling has quite a bit of heuristics and
internal states which sometimes makes it difficult to understand
what's going on.  Add tracepoints to improve visibility.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 fs/fs-writeback.c                |    5 +
 include/trace/events/writeback.h |  123 +++++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c                  |    5 +
 3 files changed, 133 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 658dc16c9e6d..8aaa7eec7b74 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -389,6 +389,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	if (unlikely(inode->i_state & I_FREEING))
 		goto skip_switch;
 
+	trace_inode_switch_wbs(inode, old_wb, new_wb);
+
 	/*
 	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
 	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
@@ -673,6 +675,9 @@ void wbc_detach_inode(struct writeback_control *wbc)
 		if (wbc->wb_id != max_id)
 			history |= (1U << slots) - 1;
 
+		if (history)
+			trace_inode_foreign_history(inode, wbc, history);
+
 		/*
 		 * Switch if the current wb isn't the consistent winner.
 		 * If there are multiple closely competing dirtiers, the
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index aa7f3aeac740..3dc9fb9e7c78 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -176,6 +176,129 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 #endif	/* CREATE_TRACE_POINTS */
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+TRACE_EVENT(inode_foreign_history,
+
+	TP_PROTO(struct inode *inode, struct writeback_control *wbc,
+		 unsigned int history),
+
+	TP_ARGS(inode, wbc, history),
+
+	TP_STRUCT__entry(
+		__array(char,		name, 32)
+		__field(unsigned long,	ino)
+		__field(unsigned int,	cgroup_ino)
+		__field(unsigned int,	history)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(inode_to_bdi(inode)->dev), 32);
+		__entry->ino		= inode->i_ino;
+		__entry->cgroup_ino	= __trace_wbc_assign_cgroup(wbc);
+		__entry->history	= history;
+	),
+
+	TP_printk("bdi %s: ino=%lu cgroup_ino=%u history=0x%x",
+		__entry->name,
+		__entry->ino,
+		__entry->cgroup_ino,
+		__entry->history
+	)
+);
+
+TRACE_EVENT(inode_switch_wbs,
+
+	TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
+		 struct bdi_writeback *new_wb),
+
+	TP_ARGS(inode, old_wb, new_wb),
+
+	TP_STRUCT__entry(
+		__array(char,		name, 32)
+		__field(unsigned long,	ino)
+		__field(unsigned int,	old_cgroup_ino)
+		__field(unsigned int,	new_cgroup_ino)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,	dev_name(old_wb->bdi->dev), 32);
+		__entry->ino		= inode->i_ino;
+		__entry->old_cgroup_ino	= __trace_wb_assign_cgroup(old_wb);
+		__entry->new_cgroup_ino	= __trace_wb_assign_cgroup(new_wb);
+	),
+
+	TP_printk("bdi %s: ino=%lu old_cgroup_ino=%u new_cgroup_ino=%u",
+		__entry->name,
+		__entry->ino,
+		__entry->old_cgroup_ino,
+		__entry->new_cgroup_ino
+	)
+);
+
+TRACE_EVENT(track_foreign_dirty,
+
+	TP_PROTO(struct page *page, struct bdi_writeback *wb),
+
+	TP_ARGS(page, wb),
+
+	TP_STRUCT__entry(
+		__array(char,		name, 32)
+		__field(u64,		bdi_id)
+		__field(unsigned long,	ino)
+		__field(unsigned int,	memcg_id)
+		__field(unsigned int,	cgroup_ino)
+		__field(unsigned int,	page_cgroup_ino)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,	dev_name(wb->bdi->dev), 32);
+		__entry->bdi_id		= wb->bdi->id;
+		__entry->ino		= page->mapping->host->i_ino;
+		__entry->memcg_id	= wb->memcg_css->id;
+		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
+		__entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino;
+	),
+
+	TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%u page_cgroup_ino=%u",
+		__entry->name,
+		__entry->bdi_id,
+		__entry->ino,
+		__entry->memcg_id,
+		__entry->cgroup_ino,
+		__entry->page_cgroup_ino
+	)
+);
+
+TRACE_EVENT(flush_foreign,
+
+	TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
+		 unsigned int frn_memcg_id),
+
+	TP_ARGS(wb, frn_bdi_id, frn_memcg_id),
+
+	TP_STRUCT__entry(
+		__array(char,		name, 32)
+		__field(unsigned int,	cgroup_ino)
+		__field(unsigned int,	frn_bdi_id)
+		__field(unsigned int,	frn_memcg_id)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,	dev_name(wb->bdi->dev), 32);
+		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
+		__entry->frn_bdi_id	= frn_bdi_id;
+		__entry->frn_memcg_id	= frn_memcg_id;
+	),
+
+	TP_printk("bdi %s: cgroup_ino=%u frn_bdi_id=%u frn_memcg_id=%u",
+		__entry->name,
+		__entry->cgroup_ino,
+		__entry->frn_bdi_id,
+		__entry->frn_memcg_id
+	)
+);
+#endif
+
 DECLARE_EVENT_CLASS(writeback_write_inode_template,
 
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eb626a290d93..b74c9d143d5e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4159,6 +4159,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+#include <trace/events/writeback.h>
+
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
 {
 	return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -4296,6 +4298,8 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
 	int oldest = -1;
 	int i;
 
+	trace_track_foreign_dirty(page, wb);
+
 	/*
 	 * Pick the slot to use.  If there is already a slot for @wb, keep
 	 * using it.  If not replace the oldest one which isn't being
@@ -4356,6 +4360,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
 		if (time_after64(frn->at, now - intv) &&
 		    atomic_read(&frn->done.cnt) == 1) {
 			frn->at = 0;
+			trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
 			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
 					       WB_REASON_FOREIGN_FLUSH,
 					       &frn->done);

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks
  2019-08-29 22:47 [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks Tejun Heo
@ 2019-08-30 13:43 ` Jens Axboe
  2019-08-30 15:40 ` Jan Kara
  1 sibling, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2019-08-30 13:43 UTC (permalink / raw)
  To: Tejun Heo; +Cc: linux-block, linux-kernel, Jan Kara

On 8/29/19 4:47 PM, Tejun Heo wrote:
> cgroup foreign inode handling has quite a bit of heuristics and
> internal states which sometimes makes it difficult to understand
> what's going on.  Add tracepoints to improve visibility.

LGTM, applied.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks
  2019-08-29 22:47 [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks Tejun Heo
  2019-08-30 13:43 ` Jens Axboe
@ 2019-08-30 15:40 ` Jan Kara
  2019-08-30 15:49   ` Tejun Heo
  2019-08-30 15:51   ` Tejun Heo
  1 sibling, 2 replies; 8+ messages in thread
From: Jan Kara @ 2019-08-30 15:40 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, linux-block, linux-kernel, Jan Kara

On Thu 29-08-19 15:47:19, Tejun Heo wrote:
> cgroup foreign inode handling has quite a bit of heuristics and
> internal states which sometimes makes it difficult to understand
> what's going on.  Add tracepoints to improve visibility.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
...
> +TRACE_EVENT(track_foreign_dirty,
> +
> +	TP_PROTO(struct page *page, struct bdi_writeback *wb),
> +
> +	TP_ARGS(page, wb),
> +
> +	TP_STRUCT__entry(
> +		__array(char,		name, 32)
> +		__field(u64,		bdi_id)
> +		__field(unsigned long,	ino)
> +		__field(unsigned int,	memcg_id)
> +		__field(unsigned int,	cgroup_ino)
> +		__field(unsigned int,	page_cgroup_ino)
> +	),
> +
> +	TP_fast_assign(
> +		strncpy(__entry->name,	dev_name(wb->bdi->dev), 32);
> +		__entry->bdi_id		= wb->bdi->id;
> +		__entry->ino		= page->mapping->host->i_ino;
> +		__entry->memcg_id	= wb->memcg_css->id;
> +		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
> +		__entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino;
> +	),

Are the page dereferences above safe? I suppose lock_page_memcg() protects
the page->mem_cgroup->css.cgroup->kn->id dereference? But page->mapping
does not seem to be protected by page lock?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks
  2019-08-30 15:40 ` Jan Kara
@ 2019-08-30 15:49   ` Tejun Heo
  2019-08-30 16:42     ` Jan Kara
  2019-08-30 15:51   ` Tejun Heo
  1 sibling, 1 reply; 8+ messages in thread
From: Tejun Heo @ 2019-08-30 15:49 UTC (permalink / raw)
  To: Jan Kara; +Cc: Jens Axboe, linux-block, linux-kernel

Hello, Jan.

On Fri, Aug 30, 2019 at 05:40:23PM +0200, Jan Kara wrote:
> > +	TP_fast_assign(
> > +		strncpy(__entry->name,	dev_name(wb->bdi->dev), 32);
> > +		__entry->bdi_id		= wb->bdi->id;
> > +		__entry->ino		= page->mapping->host->i_ino;
> > +		__entry->memcg_id	= wb->memcg_css->id;
> > +		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
> > +		__entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino;
> > +	),
> 
> Are the page dereferences above safe? I suppose lock_page_memcg() protects
> the page->mem_cgroup->css.cgroup->kn->id dereference? But page->mapping
> does not seem to be protected by page lock?

Hah, I assumed it would work because there are preceding if
(page_mapping()) tests in the dirty paths -
e.g. __set_page_dirty_nobuffers().  Oh, regardless of that assumption,
I should have used page_mapping().

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks
  2019-08-30 15:40 ` Jan Kara
  2019-08-30 15:49   ` Tejun Heo
@ 2019-08-30 15:51   ` Tejun Heo
  1 sibling, 0 replies; 8+ messages in thread
From: Tejun Heo @ 2019-08-30 15:51 UTC (permalink / raw)
  To: Jan Kara; +Cc: Jens Axboe, linux-block, linux-kernel

On Fri, Aug 30, 2019 at 05:40:23PM +0200, Jan Kara wrote:
> Are the page dereferences above safe? I suppose lock_page_memcg() protects
> the page->mem_cgroup->css.cgroup->kn->id dereference? But page->mapping
> does not seem to be protected by page lock?

Forgot to respond to the mem_cgroup part.  The page to memcg
association never changes on cgroup2 as long as the page has ref, so I
think that one should be safe.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks
  2019-08-30 15:49   ` Tejun Heo
@ 2019-08-30 16:42     ` Jan Kara
  2019-08-30 17:09       ` Tejun Heo
  0 siblings, 1 reply; 8+ messages in thread
From: Jan Kara @ 2019-08-30 16:42 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jan Kara, Jens Axboe, linux-block, linux-kernel

On Fri 30-08-19 08:49:21, Tejun Heo wrote:
> Hello, Jan.
> 
> On Fri, Aug 30, 2019 at 05:40:23PM +0200, Jan Kara wrote:
> > > +	TP_fast_assign(
> > > +		strncpy(__entry->name,	dev_name(wb->bdi->dev), 32);
> > > +		__entry->bdi_id		= wb->bdi->id;
> > > +		__entry->ino		= page->mapping->host->i_ino;
> > > +		__entry->memcg_id	= wb->memcg_css->id;
> > > +		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
> > > +		__entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino;
> > > +	),
> > 
> > Are the page dereferences above safe? I suppose lock_page_memcg() protects
> > the page->mem_cgroup->css.cgroup->kn->id dereference? But page->mapping
> > does not seem to be protected by page lock?
> 
> Hah, I assumed it would work because there are preceding if
> (page_mapping()) tests in the dirty paths -
> e.g. __set_page_dirty_nobuffers().  Oh, regardless of that assumption,
> I should have used page_mapping().

Well, but if you look at __set_page_dirty_nobuffers() it is careful. It
does:

struct address_space *mapping = page_mapping(page);

if (!mapping) {
	bail
}
... use mapping

Exactly because page->mapping can become NULL under your hands if you don't
hold page lock. So I think you either need something similar in your
tracepoint or handle this in the caller.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks
  2019-08-30 16:42     ` Jan Kara
@ 2019-08-30 17:09       ` Tejun Heo
  2019-09-02  8:38         ` Jan Kara
  0 siblings, 1 reply; 8+ messages in thread
From: Tejun Heo @ 2019-08-30 17:09 UTC (permalink / raw)
  To: Jan Kara; +Cc: Jens Axboe, linux-block, linux-kernel

Hello, Jan.

On Fri, Aug 30, 2019 at 06:42:11PM +0200, Jan Kara wrote:
> Well, but if you look at __set_page_dirty_nobuffers() it is careful. It
> does:
> 
> struct address_space *mapping = page_mapping(page);
> 
> if (!mapping) {
> 	bail
> }
> ... use mapping
> 
> Exactly because page->mapping can become NULL under your hands if you don't
> hold page lock. So I think you either need something similar in your
> tracepoint or handle this in the caller.

So, account_page_dirtied() is called from two places.

__set_page_dirty() and __set_page_dirty_nobuffers().  The following is
from the latter.

	lock_page_memcg(page);
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		...

		if (!mapping) {
			unlock_page_memcg(page);
			return 1;
		}

		xa_lock_irqsave(&mapping->i_pages, flags);
		BUG_ON(page_mapping(page) != mapping);
		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		...

If I'm reading it right, it's saying that at this point if mapping
exists after setting page dirty, it must not change while locking
i_pages.

__set_page_dirty_nobuffers() is more brief but seems to be making the
same assumption.

	xa_lock_irqsave(&mapping->i_pages, flags);
	if (page->mapping) {	/* Race with truncate? */
		WARN_ON_ONCE(warn && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		__xa_set_mark(&mapping->i_pages, page_index(page),
				PAGECACHE_TAG_DIRTY);
	}
	xa_unlock_irqrestore(&mapping->i_pages, flags);

Both are clearly assuming that once i_pages is locked, mapping can't
change.  So, inside account_page_dirtied(), mapping clearly can't
change.  The TP in question - track_foreign_dirty - is invoked from
mem_cgroup_track_foreign_dirty() which is only called from
account_page_dirty(), so I'm failing to see how mapping would change
there.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks
  2019-08-30 17:09       ` Tejun Heo
@ 2019-09-02  8:38         ` Jan Kara
  0 siblings, 0 replies; 8+ messages in thread
From: Jan Kara @ 2019-09-02  8:38 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jan Kara, Jens Axboe, linux-block, linux-kernel

Hello Tejun,

On Fri 30-08-19 10:09:03, Tejun Heo wrote:
> On Fri, Aug 30, 2019 at 06:42:11PM +0200, Jan Kara wrote:
> > Well, but if you look at __set_page_dirty_nobuffers() it is careful. It
> > does:
> > 
> > struct address_space *mapping = page_mapping(page);
> > 
> > if (!mapping) {
> > 	bail
> > }
> > ... use mapping
> > 
> > Exactly because page->mapping can become NULL under your hands if you don't
> > hold page lock. So I think you either need something similar in your
> > tracepoint or handle this in the caller.
> 
> So, account_page_dirtied() is called from two places.
> 
> __set_page_dirty() and __set_page_dirty_nobuffers().  The following is
> from the latter.
> 
> 	lock_page_memcg(page);
> 	if (!TestSetPageDirty(page)) {
> 		struct address_space *mapping = page_mapping(page);
> 		...
> 
> 		if (!mapping) {
> 			unlock_page_memcg(page);
> 			return 1;
> 		}
> 
> 		xa_lock_irqsave(&mapping->i_pages, flags);
> 		BUG_ON(page_mapping(page) != mapping);
> 		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
> 		account_page_dirtied(page, mapping);
> 		...
> 
> If I'm reading it right, it's saying that at this point if mapping
> exists after setting page dirty, it must not change while locking
> i_pages.

Correct __set_page_dirty_nobuffers() is supposed to be called serialized
with truncation either through page lock or other means. At least the
comment says so and the code looks like that.

> 
> __set_page_dirty_nobuffers() is more brief but seems to be making the
> same assumption.

I suppose you mean __set_page_dirty() here.

> 	xa_lock_irqsave(&mapping->i_pages, flags);
> 	if (page->mapping) {	/* Race with truncate? */
> 		WARN_ON_ONCE(warn && !PageUptodate(page));
> 		account_page_dirtied(page, mapping);
> 		__xa_set_mark(&mapping->i_pages, page_index(page),
> 				PAGECACHE_TAG_DIRTY);
> 	}
> 	xa_unlock_irqrestore(&mapping->i_pages, flags);
> 
> Both are clearly assuming that once i_pages is locked, mapping can't
> change.  So, inside account_page_dirtied(), mapping clearly can't
> change.  The TP in question - track_foreign_dirty - is invoked from
> mem_cgroup_track_foreign_dirty() which is only called from
> account_page_dirty(), so I'm failing to see how mapping would change
> there.

I'm not sure where we depend here on page->mapping not getting cleared. The
point is even if page->mapping is getting cleared while we work on the
page, we have 'mapping' stored locally so we just account everything
against the original mapping. 

I've researched this a bit more and commit 2d6d7f982846 "mm: protect
set_page_dirty() from ongoing truncation" introduced the idea that
__set_page_dirty_nobuffers() should be only called synchronized with
truncation. Now I know for a fact that this is not always the case (e.g.
various RDMA drivers calling set_page_dirty() without a lock or any other
protection against truncate) but let's consider this a bug in the caller of
set_page_dirty(). So in the end I agree that you're fine with relying on
page_mapping() not changing under you.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-09-02  8:38 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-29 22:47 [PATCH block/for-next] writeback: add tracepoints for cgroup foreign writebacks Tejun Heo
2019-08-30 13:43 ` Jens Axboe
2019-08-30 15:40 ` Jan Kara
2019-08-30 15:49   ` Tejun Heo
2019-08-30 16:42     ` Jan Kara
2019-08-30 17:09       ` Tejun Heo
2019-09-02  8:38         ` Jan Kara
2019-08-30 15:51   ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).