Linux-rt-users archive on lore.kernel.org
 help / color / Atom feed
From: Yang Shi <yang.shi@linaro.org>
To: gregkh@linuxfoundation.org, tj@kernel.org, lizefan@huawei.com,
	tglx@linutronix.de, rostedt@goodmis.org, bigeasy@linutronix.de
Cc: linux-kernel@vger.kernel.org, linux-rt-users@vger.kernel.org,
	linaro-kernel@lists.linaro.org, yang.shi@linaro.org
Subject: [RFC V4 PATCH] trace: writeback: replace cgroup path to cgroup ino
Date: Thu,  3 Mar 2016 01:08:57 -0800
Message-ID: <1456996137-8354-1-git-send-email-yang.shi@linaro.org> (raw)

commit 5634cc2aa9aebc77bc862992e7805469dcf83dac ("writeback: update writeback
tracepoints to report cgroup") made writeback tracepoints print out cgroup
path when CGROUP_WRITEBACK is enabled, but it may trigger the below bug on -rt
kernel since kernfs_path and kernfs_path_len are called by tracepoints, which
acquire spin lock that is sleepable on -rt kernel.

BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:930
in_atomic(): 1, irqs_disabled(): 0, pid: 625, name: kworker/u16:3
INFO: lockdep is turned off.
Preemption disabled at:[<ffffffc000374a5c>] wb_writeback+0xec/0x830

CPU: 7 PID: 625 Comm: kworker/u16:3 Not tainted 4.4.1-rt5 #20
Hardware name: Freescale Layerscape 2085a RDB Board (DT)
Workqueue: writeback wb_workfn (flush-7:0)
Call trace:
[<ffffffc00008d708>] dump_backtrace+0x0/0x200
[<ffffffc00008d92c>] show_stack+0x24/0x30
[<ffffffc0007b0f40>] dump_stack+0x88/0xa8
[<ffffffc000127d74>] ___might_sleep+0x2ec/0x300
[<ffffffc000d5d550>] rt_spin_lock+0x38/0xb8
[<ffffffc0003e0548>] kernfs_path_len+0x30/0x90
[<ffffffc00036b360>] trace_event_raw_event_writeback_work_class+0xe8/0x2e8
[<ffffffc000374f90>] wb_writeback+0x620/0x830
[<ffffffc000376224>] wb_workfn+0x61c/0x950
[<ffffffc000110adc>] process_one_work+0x3ac/0xb30
[<ffffffc0001112fc>] worker_thread+0x9c/0x7a8
[<ffffffc00011a9e8>] kthread+0x190/0x1b0
[<ffffffc000086ca0>] ret_from_fork+0x10/0x30

With unlocked kernfs_* functions, synchronize_sched() has to be called in
kernfs_rename which could be called in syscall path, but it is problematic.
So, print out cgroup ino instead of path name, which could be converted to
path name by userland.

Withouth CGROUP_WRITEBACK enabled, it just prints out root dir. But, root
dir ino vary from different filesystems, so printing out -1U to indicate
an invalid cgroup ino.

Signed-off-by: Yang Shi <yang.shi@linaro.org>
---
It should be applicable to both mainline and -rt kernel.
The change survives ftrace stress test in ltp.

v4: Acording to the discussion in v3 review, print out cgroup ino instead of
    cgroup path name since it is problematic to call synchronize_sched() in
    syscall.

v3:
  * Took Greg's comment to rename _kernfs_* and _cgroup_path to
    kernfs_*_unlocked and cgroup_path_unlocked.

v2:
  * Adopted Steven's suggestion to call synchronize_sched in kernfs_rename_ns.

 include/trace/events/writeback.h | 121 +++++++++++++++------------------------
 1 file changed, 45 insertions(+), 76 deletions(-)

diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index fff846b..73614ce 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -134,58 +134,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
 #ifdef CREATE_TRACE_POINTS
 #ifdef CONFIG_CGROUP_WRITEBACK
 
-static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
 {
-	return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
+	return wb->memcg_css->cgroup->kn->ino;
 }
 
-static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
-{
-	struct cgroup *cgrp = wb->memcg_css->cgroup;
-	char *path;
-
-	path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
-	WARN_ON_ONCE(path != buf);
-}
-
-static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
-{
-	if (wbc->wb)
-		return __trace_wb_cgroup_size(wbc->wb);
-	else
-		return 2;
-}
-
-static inline void __trace_wbc_assign_cgroup(char *buf,
-					     struct writeback_control *wbc)
+static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
 {
 	if (wbc->wb)
-		__trace_wb_assign_cgroup(buf, wbc->wb);
+		return __trace_wb_assign_cgroup(wbc->wb);
 	else
-		strcpy(buf, "/");
+		return -1U;
 }
-
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
-static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
-{
-	return 2;
-}
-
-static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
-{
-	strcpy(buf, "/");
-}
-
-static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
 {
-	return 2;
+	return -1U;
 }
 
-static inline void __trace_wbc_assign_cgroup(char *buf,
-					     struct writeback_control *wbc)
+static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
 {
-	strcpy(buf, "/");
+	return -1U;
 }
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
@@ -201,7 +171,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
 		__array(char, name, 32)
 		__field(unsigned long, ino)
 		__field(int, sync_mode)
-		__dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
+		__field(unsigned int, cgroup_ino)
 	),
 
 	TP_fast_assign(
@@ -209,14 +179,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
 			dev_name(inode_to_bdi(inode)->dev), 32);
 		__entry->ino		= inode->i_ino;
 		__entry->sync_mode	= wbc->sync_mode;
-		__trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
+		__entry->cgroup_ino	= __trace_wbc_assign_cgroup(wbc);
 	),
 
-	TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
+	TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u",
 		__entry->name,
 		__entry->ino,
 		__entry->sync_mode,
-		__get_str(cgroup)
+		__entry->cgroup_ino
 	)
 );
 
@@ -246,7 +216,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		__field(int, range_cyclic)
 		__field(int, for_background)
 		__field(int, reason)
-		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
+		__field(unsigned int, cgroup_ino)
 	),
 	TP_fast_assign(
 		strncpy(__entry->name,
@@ -258,10 +228,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		__entry->range_cyclic = work->range_cyclic;
 		__entry->for_background	= work->for_background;
 		__entry->reason = work->reason;
-		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
+		__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
 	),
 	TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
-		  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
+		  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u",
 		  __entry->name,
 		  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
 		  __entry->nr_pages,
@@ -270,7 +240,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		  __entry->range_cyclic,
 		  __entry->for_background,
 		  __print_symbolic(__entry->reason, WB_WORK_REASON),
-		  __get_str(cgroup)
+		  __entry->cgroup_ino
 	)
 );
 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
@@ -300,15 +270,15 @@ DECLARE_EVENT_CLASS(writeback_class,
 	TP_ARGS(wb),
 	TP_STRUCT__entry(
 		__array(char, name, 32)
-		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
+		__field(unsigned int, cgroup_ino)
 	),
 	TP_fast_assign(
 		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
-		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
+		__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
 	),
-	TP_printk("bdi %s: cgroup=%s",
+	TP_printk("bdi %s: cgroup_ino=%u",
 		  __entry->name,
-		  __get_str(cgroup)
+		  __entry->cgroup_ino
 	)
 );
 #define DEFINE_WRITEBACK_EVENT(name) \
@@ -347,7 +317,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__field(int, range_cyclic)
 		__field(long, range_start)
 		__field(long, range_end)
-		__dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
+		__field(unsigned int, cgroup_ino)
 	),
 
 	TP_fast_assign(
@@ -361,12 +331,12 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->range_cyclic	= wbc->range_cyclic;
 		__entry->range_start	= (long)wbc->range_start;
 		__entry->range_end	= (long)wbc->range_end;
-		__trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
+		__entry->cgroup_ino	= __trace_wbc_assign_cgroup(wbc);
 	),
 
 	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
 		"bgrd=%d reclm=%d cyclic=%d "
-		"start=0x%lx end=0x%lx cgroup=%s",
+		"start=0x%lx end=0x%lx cgroup_ino=%u",
 		__entry->name,
 		__entry->nr_to_write,
 		__entry->pages_skipped,
@@ -377,7 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->range_cyclic,
 		__entry->range_start,
 		__entry->range_end,
-		__get_str(cgroup)
+		__entry->cgroup_ino
 	)
 )
 
@@ -398,7 +368,7 @@ TRACE_EVENT(writeback_queue_io,
 		__field(long,		age)
 		__field(int,		moved)
 		__field(int,		reason)
-		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
+		__field(unsigned int,	cgroup_ino)
 	),
 	TP_fast_assign(
 		unsigned long *older_than_this = work->older_than_this;
@@ -408,15 +378,15 @@ TRACE_EVENT(writeback_queue_io,
 				  (jiffies - *older_than_this) * 1000 / HZ : -1;
 		__entry->moved	= moved;
 		__entry->reason	= work->reason;
-		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
+		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
 	),
-	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
+	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u",
 		__entry->name,
 		__entry->older,	/* older_than_this in jiffies */
 		__entry->age,	/* older_than_this in relative milliseconds */
 		__entry->moved,
 		__print_symbolic(__entry->reason, WB_WORK_REASON),
-		__get_str(cgroup)
+		__entry->cgroup_ino
 	)
 );
 
@@ -484,7 +454,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 		__field(unsigned long,	dirty_ratelimit)
 		__field(unsigned long,	task_ratelimit)
 		__field(unsigned long,	balanced_dirty_ratelimit)
-		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
+		__field(unsigned int,	cgroup_ino)
 	),
 
 	TP_fast_assign(
@@ -496,13 +466,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 		__entry->task_ratelimit	= KBps(task_ratelimit);
 		__entry->balanced_dirty_ratelimit =
 					KBps(wb->balanced_dirty_ratelimit);
-		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
+		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
 	),
 
 	TP_printk("bdi %s: "
 		  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
-		  "balanced_dirty_ratelimit=%lu cgroup=%s",
+		  "balanced_dirty_ratelimit=%lu cgroup_ino=%u",
 		  __entry->bdi,
 		  __entry->write_bw,		/* write bandwidth */
 		  __entry->avg_write_bw,	/* avg write bandwidth */
@@ -510,7 +480,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 		  __entry->dirty_ratelimit,	/* base ratelimit */
 		  __entry->task_ratelimit, /* ratelimit with position control */
 		  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
-		  __get_str(cgroup)
+		  __entry->cgroup_ino
 	)
 );
 
@@ -548,7 +518,7 @@ TRACE_EVENT(balance_dirty_pages,
 		__field(	 long,	pause)
 		__field(unsigned long,	period)
 		__field(	 long,	think)
-		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
+		__field(unsigned int,	cgroup_ino)
 	),
 
 	TP_fast_assign(
@@ -571,7 +541,7 @@ TRACE_EVENT(balance_dirty_pages,
 		__entry->period		= period * 1000 / HZ;
 		__entry->pause		= pause * 1000 / HZ;
 		__entry->paused		= (jiffies - start_time) * 1000 / HZ;
-		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
+		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
 	),
 
 
@@ -580,7 +550,7 @@ TRACE_EVENT(balance_dirty_pages,
 		  "bdi_setpoint=%lu bdi_dirty=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
 		  "dirtied=%u dirtied_pause=%u "
-		  "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
+		  "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u",
 		  __entry->bdi,
 		  __entry->limit,
 		  __entry->setpoint,
@@ -595,7 +565,7 @@ TRACE_EVENT(balance_dirty_pages,
 		  __entry->pause,	/* ms */
 		  __entry->period,	/* ms */
 		  __entry->think,	/* ms */
-		  __get_str(cgroup)
+		  __entry->cgroup_ino
 	  )
 );
 
@@ -609,8 +579,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
 		__field(unsigned long, ino)
 		__field(unsigned long, state)
 		__field(unsigned long, dirtied_when)
-		__dynamic_array(char, cgroup,
-				__trace_wb_cgroup_size(inode_to_wb(inode)))
+		__field(unsigned int, cgroup_ino)
 	),
 
 	TP_fast_assign(
@@ -619,16 +588,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
 		__entry->ino		= inode->i_ino;
 		__entry->state		= inode->i_state;
 		__entry->dirtied_when	= inode->dirtied_when;
-		__trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
+		__entry->cgroup_ino	= __trace_wb_assign_cgroup(inode_to_wb(inode));
 	),
 
-	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
+	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u",
 		  __entry->name,
 		  __entry->ino,
 		  show_inode_state(__entry->state),
 		  __entry->dirtied_when,
 		  (jiffies - __entry->dirtied_when) / HZ,
-		  __get_str(cgroup)
+		  __entry->cgroup_ino
 	)
 );
 
@@ -684,7 +653,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		__field(unsigned long, writeback_index)
 		__field(long, nr_to_write)
 		__field(unsigned long, wrote)
-		__dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
+		__field(unsigned int, cgroup_ino)
 	),
 
 	TP_fast_assign(
@@ -696,11 +665,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 		__entry->nr_to_write	= nr_to_write;
 		__entry->wrote		= nr_to_write - wbc->nr_to_write;
-		__trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
+		__entry->cgroup_ino	= __trace_wbc_assign_cgroup(wbc);
 	),
 
 	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
-		  "index=%lu to_write=%ld wrote=%lu cgroup=%s",
+		  "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u",
 		  __entry->name,
 		  __entry->ino,
 		  show_inode_state(__entry->state),
@@ -709,7 +678,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		  __entry->writeback_index,
 		  __entry->nr_to_write,
 		  __entry->wrote,
-		  __get_str(cgroup)
+		  __entry->cgroup_ino
 	)
 );
 
-- 
2.0.2

             reply index

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-03-03  9:08 Yang Shi [this message]
2016-03-04 12:52 ` Tejun Heo
2016-03-07  3:32   ` Shi, Yang
2016-03-07 14:03     ` Steven Rostedt
2016-03-29 14:30       ` Sebastian Andrzej Siewior

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1456996137-8354-1-git-send-email-yang.shi@linaro.org \
    --to=yang.shi@linaro.org \
    --cc=bigeasy@linutronix.de \
    --cc=gregkh@linuxfoundation.org \
    --cc=linaro-kernel@lists.linaro.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rt-users@vger.kernel.org \
    --cc=lizefan@huawei.com \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-rt-users archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-rt-users/0 linux-rt-users/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-rt-users linux-rt-users/ https://lore.kernel.org/linux-rt-users \
		linux-rt-users@vger.kernel.org
	public-inbox-index linux-rt-users

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-rt-users


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git