From: Byungchul Park <byungchul.park@lge.com>
To: peterz@infradead.org, mingo@kernel.org
Cc: tglx@linutronix.de, walken@google.com, boqun.feng@gmail.com,
	kirill@shutemov.name, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, iamjoonsoo.kim@lge.com,
	akpm@linux-foundation.org, npiggin@gmail.com
Subject: [PATCH v4 01/15] x86/dumpstack: Optimize save_stack_trace
Date: Fri,  9 Dec 2016 14:11:57 +0900
Message-ID: <1481260331-360-2-git-send-email-byungchul.park@lge.com>
In-Reply-To: <1481260331-360-1-git-send-email-byungchul.park@lge.com>

Currently, the x86 implementation of save_stack_trace() walks the whole
stack region word by word, regardless of trace->max_entries. However,
there is no need to keep walking once the caller's requirement has been
fulfilled, that is, once trace->nr_entries >= trace->max_entries.

I measured the overhead by printing the sched_clock() delta around each
call on my QEMU x86 machine. With trace->max_entries = 5, the latency
improved by more than 70%.
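
The numbers below can be reproduced with an ad-hoc harness along these
lines (a rough sketch only, not part of this patch; the function name and
where it gets called from are made up for illustration):

	/*
	 * Rough measurement sketch: time one save_stack_trace() call with
	 * sched_clock(). Needs <linux/kernel.h>, <linux/sched.h> and
	 * <linux/stacktrace.h>.
	 */
	static void measure_save_stack_trace(void)
	{
		unsigned long entries[5];
		struct stack_trace trace = {
			.nr_entries	= 0,
			.max_entries	= ARRAY_SIZE(entries),	/* 5, as in the test */
			.entries	= entries,
			.skip		= 0,
		};
		unsigned long long start, end;

		start = sched_clock();
		save_stack_trace(&trace);
		end = sched_clock();

		pr_info("save_stack_trace() takes %llu ns\n", end - start);
	}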

Before this patch:

[    2.329573] save_stack_trace() takes 76820 ns
[    2.329863] save_stack_trace() takes 62131 ns
[    2.330000] save_stack_trace() takes 99476 ns
[    2.329846] save_stack_trace() takes 62419 ns
[    2.330000] save_stack_trace() takes 88918 ns
[    2.330253] save_stack_trace() takes 73669 ns
[    2.330520] save_stack_trace() takes 67876 ns
[    2.330671] save_stack_trace() takes 75963 ns
[    2.330983] save_stack_trace() takes 95079 ns
[    2.330451] save_stack_trace() takes 62352 ns

After this patch:

[    2.795000] save_stack_trace() takes 21147 ns
[    2.795397] save_stack_trace() takes 20230 ns
[    2.795397] save_stack_trace() takes 31274 ns
[    2.795739] save_stack_trace() takes 19706 ns
[    2.796484] save_stack_trace() takes 20266 ns
[    2.796484] save_stack_trace() takes 20902 ns
[    2.797000] save_stack_trace() takes 38110 ns
[    2.797510] save_stack_trace() takes 20224 ns
[    2.798181] save_stack_trace() takes 20172 ns
[    2.798837] save_stack_trace() takes 20824 ns

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
---
 arch/x86/include/asm/stacktrace.h | 1 +
 arch/x86/kernel/dumpstack.c       | 4 ++++
 arch/x86/kernel/dumpstack_32.c    | 2 ++
 arch/x86/kernel/stacktrace.c      | 7 +++++++
 4 files changed, 14 insertions(+)

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 0944218..f6d0694 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -41,6 +41,7 @@ struct stacktrace_ops {
 	/* On negative return stop dumping */
 	int (*stack)(void *data, char *name);
 	walk_stack_t	walk_stack;
+	int (*end_walk)(void *data);
 };
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ef8017c..274d42a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -113,6 +113,8 @@ print_context_stack(struct task_struct *task,
 			print_ftrace_graph_addr(addr, data, ops, task, graph);
 		}
 		stack++;
+		if (ops->end_walk && ops->end_walk(data))
+			break;
 	}
 	return bp;
 }
@@ -138,6 +140,8 @@ print_context_stack_bp(struct task_struct *task,
 		frame = frame->next_frame;
 		ret_addr = &frame->return_address;
 		print_ftrace_graph_addr(addr, data, ops, task, graph);
+		if (ops->end_walk && ops->end_walk(data))
+			break;
 	}
 
 	return (unsigned long)frame;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index fef917e..762d1fd 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -69,6 +69,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 
 		bp = ops->walk_stack(task, stack, bp, ops, data,
 				     end_stack, &graph);
+		if (ops->end_walk && ops->end_walk(data))
+			break;
 
 		/* Stop if not on irq stack */
 		if (!end_stack)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 9ee98ee..a44de4d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -47,10 +47,17 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
 	return __save_stack_address(data, addr, reliable, true);
 }
 
+static int save_stack_end(void *data)
+{
+	struct stack_trace *trace = data;
+	return trace->nr_entries >= trace->max_entries;
+}
+
 static const struct stacktrace_ops save_stack_ops = {
 	.stack		= save_stack_stack,
 	.address	= save_stack_address,
 	.walk_stack	= print_context_stack,
+	.end_walk	= save_stack_end,
 };
 
 static const struct stacktrace_ops save_stack_ops_nosched = {
-- 
1.9.1
