* [PATCH] printk: Add best-effort printk() buffering.
@ 2017-04-30 13:54 Tetsuo Handa
  2017-04-30 16:11 ` Joe Perches
  2017-05-08  7:05 ` Sergey Senozhatsky
  0 siblings, 2 replies; 11+ messages in thread
From: Tetsuo Handa @ 2017-04-30 13:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: Tetsuo Handa, Ingo Molnar, Peter Zijlstra, Petr Mladek,
	Sergey Senozhatsky, Steven Rostedt

Sometimes we want to printk() multiple lines in a group without being
disturbed by concurrent printk() from interrupts and/or other threads.
For example, the mixed printk() output of multiple threads' dumps is
hard to interpret.

This patch introduces fixed-size, statically allocated buffers for
buffering printk() output for each thread/context on a best-effort basis
(i.e. up to PAGE_SIZE bytes, up to 16 concurrent printk() users).

Typical thread dumps are likely less than PAGE_SIZE bytes. If they
exceed PAGE_SIZE bytes, they will be printed as multiple groups, with
as many lines as possible grouped together.

If there happen to be more than 16 concurrent printk() users, plain
unbuffered printk() is used for the users who failed to assign a buffer.
Of course, if there really were more than 16 concurrent printk() users,
the printk() output would flood the console and the system would already
be unusable. Thus, 16 buffers should be sufficient, unless e.g. hundreds
of memory-allocating threads concurrently call warn_alloc() for reporting
allocation stalls on a large system.
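
(On 64-bit with 4KB pages, each buffer is PAGE_SIZE bytes of data plus
a 16-byte header, so the 16 static buffers cost 16 * (4096 + 16) = 65792
bytes, i.e. slightly over 64KB.)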

Three functions (get_printk_buffer(), flush_printk_buffer() and
put_printk_buffer()) are provided for printk() buffering.

  get_printk_buffer() tries to assign a buffer to the calling
  thread/context.

  flush_printk_buffer() flushes the buffer of the calling thread/context.
  flush_printk_buffer() becomes a no-op if the buffer was not assigned.

  put_printk_buffer() flushes and releases the buffer of the calling
  thread/context. put_printk_buffer() becomes a no-op if the buffer was
  not assigned.

These functions are safe to call from NMI context, hard IRQ context,
soft IRQ context, or task context. But they become a no-op when called
from NMI context, because vprintk_func() calls vprintk_nmi() rather
than vprintk_default() (where printk() buffering takes place) when
printk() is called from NMI context.

get_printk_buffer() and put_printk_buffer() can nest like
rcu_read_lock() and rcu_read_unlock(), and each put_printk_buffer()
must match a corresponding get_printk_buffer(), just as each
rcu_read_unlock() must match a corresponding rcu_read_lock().
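
For illustration, a minimal usage sketch (dump_foo() and struct foo are
made up here; the pattern mirrors what this patch does in
sched_show_task()):

	struct foo { int id; long state; };

	static void dump_foo(const struct foo *f)
	{
		get_printk_buffer();	/* try to assign a buffer to this context */
		printk(KERN_INFO "foo %d:\n", f->id);
		printk(KERN_INFO "  state=%ld\n", f->state);
		put_printk_buffer();	/* flush, then release the buffer */
	}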

This patch calls these functions from show_state_filter() as an example.
By batching small writes through the buffers, the average of 10 runs of

  # time echo t > /proc/sysrq-trigger

showed a 25% improvement for the default console (no console= argument) case

  buffered:    798.2ms
  unbuffered: 1070.4ms

and a 10% improvement for the console=ttyS0,115200n8 console=tty0 case

  buffered:    2915.5ms
  unbuffered:  3272.1ms

in my environment (VMware Player, 4CPUs, 2048MB RAM).

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/printk.h |  12 +++
 kernel/printk/printk.c | 243 +++++++++++++++++++++++++++++++++++++++++++------
 kernel/sched/core.c    |   8 +-
 lib/dump_stack.c       |   2 +
 4 files changed, 238 insertions(+), 27 deletions(-)

diff --git a/include/linux/printk.h b/include/linux/printk.h
index e10f274..da35a01 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -170,6 +170,9 @@ int printk_emit(int facility, int level,
 
 asmlinkage __printf(1, 2) __cold
 int printk(const char *fmt, ...);
+void get_printk_buffer(void);
+void flush_printk_buffer(void);
+void put_printk_buffer(void);
 
 /*
  * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
@@ -217,6 +220,15 @@ int printk(const char *s, ...)
 {
 	return 0;
 }
+static inline void get_printk_buffer(void)
+{
+}
+static inline void flush_printk_buffer(void)
+{
+}
+static inline void put_printk_buffer(void)
+{
+}
 static inline __printf(1, 2) __cold
 int printk_deferred(const char *s, ...)
 {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fb2d159..4f5663f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1693,33 +1693,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
 	return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len);
 }
 
-asmlinkage int vprintk_emit(int facility, int level,
-			    const char *dict, size_t dictlen,
-			    const char *fmt, va_list args)
+static int process_log(int facility, int level, const char *dict,
+		       size_t dictlen, char *text, size_t text_len)
 {
-	static char textbuf[LOG_LINE_MAX];
-	char *text = textbuf;
-	size_t text_len = 0;
 	enum log_flags lflags = 0;
-	unsigned long flags;
-	int printed_len = 0;
-	bool in_sched = false;
-
-	if (level == LOGLEVEL_SCHED) {
-		level = LOGLEVEL_DEFAULT;
-		in_sched = true;
-	}
-
-	boot_delay_msec(level);
-	printk_delay();
-
-	/* This stops the holder of console_sem just where we want him */
-	logbuf_lock_irqsave(flags);
-	/*
-	 * The printf needs to come first; we need the syslog
-	 * prefix which might be passed-in as a parameter.
-	 */
-	text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
 
 	/* mark and strip a trailing newline */
 	if (text_len && text[text_len-1] == '\n') {
@@ -1755,7 +1732,39 @@ asmlinkage int vprintk_emit(int facility, int level,
 	if (dict)
 		lflags |= LOG_PREFIX|LOG_NEWLINE;
 
-	printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
+	return log_output(facility, level, lflags, dict, dictlen, text,
+			  text_len);
+}
+
+asmlinkage int vprintk_emit(int facility, int level,
+			    const char *dict, size_t dictlen,
+			    const char *fmt, va_list args)
+{
+	static char textbuf[LOG_LINE_MAX];
+	char *text = textbuf;
+	size_t text_len = 0;
+	unsigned long flags;
+	int printed_len = 0;
+	bool in_sched = false;
+
+	if (level == LOGLEVEL_SCHED) {
+		level = LOGLEVEL_DEFAULT;
+		in_sched = true;
+	}
+
+	boot_delay_msec(level);
+	printk_delay();
+
+	/* This stops the holder of console_sem just where we want him */
+	logbuf_lock_irqsave(flags);
+	/*
+	 * The printf needs to come first; we need the syslog
+	 * prefix which might be passed-in as a parameter.
+	 */
+	text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+
+	printed_len = process_log(facility, level, dict, dictlen, text,
+				  text_len);
 
 	logbuf_unlock_irqrestore(flags);
 
@@ -1795,8 +1804,153 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
+#define MAX_PRINTK_BUFFERS 16
+static struct printk_buffer {
+	unsigned long context; /* printk_context() */
+	unsigned int nested;
+	unsigned int used; /* Valid bytes in buf[]. */
+	char buf[PAGE_SIZE];
+} printk_buffers[MAX_PRINTK_BUFFERS];
+
+/* Must not be called from NMI context. */
+static void __flush_printk_buffer(struct printk_buffer *ptr, bool all)
+{
+	unsigned long flags;
+
+	if (!ptr->used)
+		return;
+
+	/*
+	 * Since printk_deferred() directly calls vprintk_emit(LOGLEVEL_SCHED),
+	 * this function does not need to care about LOGLEVEL_SCHED case.
+	 * Therefore, it is safe to call console_trylock() + console_unlock().
+	 *
+	 * We don't call boot_delay_msec(level) here because level is unknown.
+	 */
+	printk_delay();
+
+	/* This stops the holder of console_sem just where we want him */
+	logbuf_lock_irqsave(flags);
+	while (1) {
+		char *text = ptr->buf;
+		unsigned int text_len = ptr->used;
+		char *cp = memchr(text, '\n', text_len);
+		char c;
+
+		if (cp++)
+			text_len = cp - text;
+		else if (all)
+			cp = text + text_len;
+		else
+			break;
+		/* printk_get_level() depends on text '\0'-terminated. */
+		c = *cp;
+		*cp = '\0';
+		process_log(0, LOGLEVEL_DEFAULT, NULL, 0, text, text_len);
+		ptr->used -= text_len;
+		if (!ptr->used)
+			break;
+		*cp = c;
+		memmove(text, text + text_len, ptr->used);
+	}
+	logbuf_unlock_irqrestore(flags);
+	/*
+	 * Try to acquire and then immediately release the console
+	 * semaphore.  The release will print out buffers and wake up
+	 * /dev/kmsg and syslog() users.
+	 */
+	if (console_trylock())
+		console_unlock();
+}
+
+static unsigned long printk_context(void)
+{
+	/*
+	 * Assume that we can use the lower 2 bits for flags, as
+	 * __mutex_owner() does.
+	 */
+	unsigned long context = (unsigned long) current;
+
+	/* Both bits set means processing NMI context. */
+	if (in_nmi())
+		context |= 3;
+	/* Only next-LSB set means processing hard IRQ context. */
+	else if (in_irq())
+		context |= 2;
+	/* Only LSB set means processing soft IRQ context. */
+	else if (in_serving_softirq())
+		context |= 1;
+	/*
+	 * Neither bit set means processing task context,
+	 * though it still might be a non-sleepable context.
+	 */
+	return context;
+}
+
+static struct printk_buffer *find_printk_buffer(void)
+{
+	const unsigned long context = printk_context();
+	int i;
+
+	/* No-op if called from NMI context. */
+	if ((context & 3) == 3)
+		return NULL;
+	for (i = 0; i < MAX_PRINTK_BUFFERS; i++)
+		if (context == printk_buffers[i].context)
+			return &printk_buffers[i];
+	return NULL;
+}
+
+void get_printk_buffer(void)
+{
+	const unsigned long context = printk_context();
+	int i;
+
+	/* No-op if called from NMI context. */
+	if ((context & 3) == 3)
+		return;
+	for (i = 0; i < MAX_PRINTK_BUFFERS; i++) {
+		struct printk_buffer *ptr = &printk_buffers[i];
+
+		if (ptr->context != context) {
+			if (ptr->context ||
+			    cmpxchg(&ptr->context, 0, context))
+				continue;
+			ptr->nested = 0;
+			ptr->used = 0;
+		} else {
+			ptr->nested++;
+		}
+		break;
+	}
+}
+EXPORT_SYMBOL(get_printk_buffer);
+
+void flush_printk_buffer(void)
+{
+	struct printk_buffer *ptr = find_printk_buffer();
+
+	if (ptr)
+		__flush_printk_buffer(ptr, true);
+}
+EXPORT_SYMBOL(flush_printk_buffer);
+
+void put_printk_buffer(void)
+{
+	struct printk_buffer *ptr = find_printk_buffer();
+
+	if (!ptr || ptr->nested--)
+		return;
+	__flush_printk_buffer(ptr, true);
+	xchg(&ptr->context, 0);
+}
+EXPORT_SYMBOL(put_printk_buffer);
+
 int vprintk_default(const char *fmt, va_list args)
 {
+	struct printk_buffer *ptr;
+	va_list args2;
+	unsigned int i;
 	int r;
 
 #ifdef CONFIG_KGDB_KDB
@@ -1806,6 +1960,43 @@ int vprintk_default(const char *fmt, va_list args)
 		return r;
 	}
 #endif
+	ptr = find_printk_buffer();
+	if (!ptr)
+		goto use_unbuffered;
+	/*
+	 * Try to store to printk_buffer first. If it fails, flush completed
+	 * lines in printk_buffer, and then try to store to printk_buffer
+	 * again. If it still fails, flush incomplete line in printk_buffer
+	 * and use unbuffered printing.
+	 *
+	 * Since printk_buffer is identified by current thread and interrupt
+	 * context and same level of context does not recurse, we don't need
+	 * logbuf_lock_irqsave()/logbuf_unlock_irqrestore() here except
+	 * __flush_printk_buffer().
+	 */
+	for (i = 0; i < 2; i++) {
+		unsigned int pos = ptr->used;
+		char *text = ptr->buf + pos;
+
+		va_copy(args2, args);
+		r = vsnprintf(text, sizeof(ptr->buf) - pos, fmt, args2);
+		va_end(args2);
+		if (r + pos < sizeof(ptr->buf)) {
+			/*
+			 * Eliminate KERN_CONT at this point because we can
+			 * concatenate incomplete lines inside printk_buffer.
+			 */
+			if (r >= 2 && printk_get_level(text) == 'c') {
+				memmove(text, text + 2, r - 2);
+				ptr->used += r - 2;
+			} else {
+				ptr->used += r;
+			}
+			return r;
+		}
+		__flush_printk_buffer(ptr, i);
+	}
+use_unbuffered:
 	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
 
 	return r;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39b9a279..54517f6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5305,6 +5305,7 @@ void sched_show_task(struct task_struct *p)
 
 	if (!try_get_task_stack(p))
 		return;
+	get_printk_buffer();
 	if (state)
 		state = __ffs(state) + 1;
 	printk(KERN_INFO "%-15.15s %c", p->comm,
@@ -5326,12 +5327,14 @@ void sched_show_task(struct task_struct *p)
 	print_worker_info(KERN_INFO, p);
 	show_stack(p, NULL);
 	put_task_stack(p);
+	put_printk_buffer();
 }
 
 void show_state_filter(unsigned long state_filter)
 {
 	struct task_struct *g, *p;
 
+	get_printk_buffer();
 #if BITS_PER_LONG == 32
 	printk(KERN_INFO
 		"  task                PC stack   pid father\n");
@@ -5350,8 +5353,10 @@ void show_state_filter(unsigned long state_filter)
 		 */
 		touch_nmi_watchdog();
 		touch_all_softlockup_watchdogs();
-		if (!state_filter || (p->state & state_filter))
+		if (!state_filter || (p->state & state_filter)) {
 			sched_show_task(p);
+			flush_printk_buffer();
+		}
 	}
 
 #ifdef CONFIG_SCHED_DEBUG
@@ -5364,6 +5369,7 @@ void show_state_filter(unsigned long state_filter)
 	 */
 	if (!state_filter)
 		debug_show_all_locks();
+	put_printk_buffer();
 }
 
 void init_idle_bootup_task(struct task_struct *idle)
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index 625375e..da0e64c 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -12,8 +12,10 @@
 
 static void __dump_stack(void)
 {
+	get_printk_buffer();
 	dump_stack_print_info(KERN_DEFAULT);
 	show_stack(NULL, NULL);
+	put_printk_buffer();
 }
 
 /**
-- 
1.8.3.1


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-04-30 13:54 [PATCH] printk: Add best-effort printk() buffering Tetsuo Handa
@ 2017-04-30 16:11 ` Joe Perches
  2017-05-03  6:21   ` Tetsuo Handa
  2017-05-08  7:05 ` Sergey Senozhatsky
  1 sibling, 1 reply; 11+ messages in thread
From: Joe Perches @ 2017-04-30 16:11 UTC (permalink / raw)
  To: Tetsuo Handa, linux-kernel
  Cc: Ingo Molnar, Peter Zijlstra, Petr Mladek, Sergey Senozhatsky,
	Steven Rostedt

On Sun, 2017-04-30 at 22:54 +0900, Tetsuo Handa wrote:
> Sometimes we want to printk() multiple lines in a group without being
> disturbed by concurrent printk() from interrupts and/or other threads.
> For example, the mixed printk() output of multiple threads' dumps is
> hard to interpret.
> 
> This patch introduces fixed-size, statically allocated buffers for
> buffering printk() output for each thread/context on a best-effort basis
> (i.e. up to PAGE_SIZE bytes, up to 16 concurrent printk() users).
[]
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
[]
> +#define MAX_PRINTK_BUFFERS 16
> +static struct printk_buffer {
> +	unsigned long context; /* printk_context() */
> +	unsigned int nested;
> +	unsigned int used; /* Valid bytes in buf[]. */
> +	char buf[PAGE_SIZE];
> +} printk_buffers[MAX_PRINTK_BUFFERS];

Perhaps these buffers could be acquired by
alloc_page rather than be static structures and
the sizeof buf[PAGE_SIZE] should be reduced by
	sizeof(unsigned long) +
	sizeof(unsigned int) +
	sizeof(unsigned int)
so that struct printk_buffers is exactly
PAGE_SIZE.


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-04-30 16:11 ` Joe Perches
@ 2017-05-03  6:21   ` Tetsuo Handa
  2017-05-03  9:30     ` Joe Perches
  0 siblings, 1 reply; 11+ messages in thread
From: Tetsuo Handa @ 2017-05-03  6:21 UTC (permalink / raw)
  To: joe, linux-kernel; +Cc: mingo, peterz, pmladek, sergey.senozhatsky, rostedt

Joe Perches wrote:
> On Sun, 2017-04-30 at 22:54 +0900, Tetsuo Handa wrote:
> > Sometimes we want to printk() multiple lines in a group without being
> > disturbed by concurrent printk() from interrupts and/or other threads.
> > For example, the mixed printk() output of multiple threads' dumps is
> > hard to interpret.
> > 
> > This patch introduces fixed-size, statically allocated buffers for
> > buffering printk() output for each thread/context on a best-effort basis
> > (i.e. up to PAGE_SIZE bytes, up to 16 concurrent printk() users).
> []
> > diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> []
> > +#define MAX_PRINTK_BUFFERS 16
> > +static struct printk_buffer {
> > +	unsigned long context; /* printk_context() */
> > +	unsigned int nested;
> > +	unsigned int used; /* Valid bytes in buf[]. */
> > +	char buf[PAGE_SIZE];
> > +} printk_buffers[MAX_PRINTK_BUFFERS];
> 
> Perhaps these buffers could be acquired by
> alloc_page rather than be static structures and
> the sizeof buf[PAGE_SIZE] should be reduced by
> 	sizeof(unsigned long) +
> 	sizeof(unsigned int) +
> 	sizeof(unsigned int)
> so that struct printk_buffers is exactly
> PAGE_SIZE.

When should the buffers be allocated? If upon boot, there will be little
difference. If the first time each buffer is needed, we introduce a risk
of failing to allocate memory using alloc_page(GFP_ATOMIC | __GFP_NOWARN)
and a risk of stack overflow during alloc_page() because printk() has to be
prepared for being called from stack-tight situation.

Also, while dynamic allocation can allow linked list of the buffer, we
will need to introduce a lock for traversing the list, which might become
more expensive than walking fixed-sized array of the buffer.
We could avoid list traversal by passing "struct printk_buffer" argument,
but since there are so many functions which expect pr_cont() behavior,
scattering "struct printk_buffer" argument is a big patch.
We could avoid list traversal by storing "struct printk_buffer[3]" inside
"struct task_struct" (for hard IRQ context, soft IRQ context, task context),
but occupying sizeof(struct printk_buffer *) * 3 bytes only for buffers
which will be used only for a moment is not smart.

Thus, I think fixed-sized statically allocated buffers is the most
reasonable choice. Using a CONFIG_ option for controlling how many pages
should be allocated for "struct printk_buffer" might make sense for systems
with little RAM.

Tetsuo Handa wrote:
> +void get_printk_buffer(void)
> +{
> +	const unsigned long context = printk_context();
> +	int i;
> +
> +	/* No-op if called from NMI context. */
> +	if ((context & 3) == 3)
> +		return;
> +	for (i = 0; i < MAX_PRINTK_BUFFERS; i++) {
> +		struct printk_buffer *ptr = &printk_buffers[i];
> +
> +		if (ptr->context != context) {
> +			if (ptr->context ||
> +			    cmpxchg(&ptr->context, 0, context))
> +				continue;
> +			ptr->nested = 0;
> +			ptr->used = 0;
> +		} else {
> +			ptr->nested++;
> +		}
> +		break;
> +	}
> +}

Oops, I over-optimized here. We should not reserve a ptr->context == 0
slot before checking the ptr->context != 0 slots, or we may race with
nested calls: a nested call may claim a second free slot before finding
the slot it already owns.

void get_printk_buffer(void)
{
	const unsigned long context = printk_context();
	struct printk_buffer *ptr;
	int i;

	/* No-op if called from NMI context. */
	if ((context & 3) == 3)
		return;
	ptr = find_printk_buffer();
	if (ptr) {
		ptr->nested++;
		return;
	}
	for (i = 0; i < MAX_PRINTK_BUFFERS; i++) {
		ptr = &printk_buffers[i];
		if (ptr->context || cmpxchg(&ptr->context, 0, context))
			continue;
		ptr->nested = 0;
		ptr->used = 0;
		break;
	}
}



>  int vprintk_default(const char *fmt, va_list args)
>  {
> +	struct printk_buffer *ptr;
> +	va_list args2;
> +	unsigned int i;
>  	int r;
>  
>  #ifdef CONFIG_KGDB_KDB
> @@ -1806,6 +1960,43 @@ int vprintk_default(const char *fmt, va_list args)
>  		return r;
>  	}
>  #endif
> +	ptr = find_printk_buffer();
> +	if (!ptr)
> +		goto use_unbuffered;
> +	/*
> +	 * Try to store to printk_buffer first. If it fails, flush completed
> +	 * lines in printk_buffer, and then try to store to printk_buffer
> +	 * again. If it still fails, flush incomplete line in printk_buffer
> +	 * and use unbuffered printing.
> +	 *
> +	 * Since printk_buffer is identified by current thread and interrupt
> +	 * context and same level of context does not recurse, we don't need
> +	 * logbuf_lock_irqsave()/logbuf_unlock_irqrestore() here except
> +	 * __flush_printk_buffer().
> +	 */

Before I continue, I'd like to confirm that this assumption is correct.
My understanding is that

  (1) task context can be preempted by soft IRQ context, hard IRQ context and NMI context.
  (2) soft IRQ context can be preempted by hard IRQ context and NMI context.
  (3) hard IRQ context can be preempted by NMI context.
  (4) A kernel-oops event can interrupt task context, soft IRQ context, hard IRQ context
      and NMI context, but the interrupted context cannot continue execution of
      vprintk_default() after returning from the kernel-oops event even if the
      kernel-oops event occurred in schedulable context and panic_on_oops == 0.

and as a result, it is guaranteed that a kernel-oops event which occurred between

> +	for (i = 0; i < 2; i++) {

                here

> +		unsigned int pos = ptr->used;
> +		char *text = ptr->buf + pos;
> +
> +		va_copy(args2, args);
> +		r = vsnprintf(text, sizeof(ptr->buf) - pos, fmt, args2);
> +		va_end(args2);
> +		if (r + pos < sizeof(ptr->buf)) {
> +			/*
> +			 * Eliminate KERN_CONT at this point because we can
> +			 * concatenate incomplete lines inside printk_buffer.
> +			 */
> +			if (r >= 2 && printk_get_level(text) == 'c') {
> +				memmove(text, text + 2, r - 2);
> +				ptr->used += r - 2;
> +			} else {
> +				ptr->used += r;
> +			}
> +			return r;
> +		}

                and here
shall not make ptr->used >= sizeof(ptr->buf) true.

> +		__flush_printk_buffer(ptr, i);
> +	}
> +use_unbuffered:
>  	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
>  
>  	return r;

Well, should vprintk_default() refrain from buffering if oops_in_progress == 1 ?
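
That would be an early bail-out at the top of the buffering path, e.g.
(a sketch only, against the patch above; oops_in_progress is the
existing global from include/linux/printk.h):

	ptr = find_printk_buffer();
	if (!ptr || oops_in_progress)
		goto use_unbuffered;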


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-05-03  6:21   ` Tetsuo Handa
@ 2017-05-03  9:30     ` Joe Perches
  0 siblings, 0 replies; 11+ messages in thread
From: Joe Perches @ 2017-05-03  9:30 UTC (permalink / raw)
  To: Tetsuo Handa, linux-kernel
  Cc: mingo, peterz, pmladek, sergey.senozhatsky, rostedt

On Wed, 2017-05-03 at 15:21 +0900, Tetsuo Handa wrote:
> Joe Perches wrote:
> > On Sun, 2017-04-30 at 22:54 +0900, Tetsuo Handa wrote:
> > > Sometimes we want to printk() multiple lines in a group without being
> > > disturbed by concurrent printk() from interrupts and/or other threads.
> > > For example, the mixed printk() output of multiple threads' dumps is
> > > hard to interpret.
> > > 
> > > This patch introduces fixed-size, statically allocated buffers for
> > > buffering printk() output for each thread/context on a best-effort basis
> > > (i.e. up to PAGE_SIZE bytes, up to 16 concurrent printk() users).
> > 
> > []
> > > diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> > 
> > []
> > > +#define MAX_PRINTK_BUFFERS 16
> > > +static struct printk_buffer {
> > > +	unsigned long context; /* printk_context() */
> > > +	unsigned int nested;
> > > +	unsigned int used; /* Valid bytes in buf[]. */
> > > +	char buf[PAGE_SIZE];
> > > +} printk_buffers[MAX_PRINTK_BUFFERS];
> > 
> > Perhaps these buffers could be acquired by
> > alloc_page rather than be static structures and
> > the sizeof buf[PAGE_SIZE] should be reduced by
> > 	sizeof(unsigned long) +
> > 	sizeof(unsigned int) +
> > 	sizeof(unsigned int)
> > so that struct printk_buffers is exactly
> > PAGE_SIZE.
> 
> When should the buffers be allocated? If upon boot, there will be little
> difference. If the first time each buffer is needed, we introduce a risk
> of failing to allocate memory using alloc_page(GFP_ATOMIC | __GFP_NOWARN)
> and a risk of stack overflow during alloc_page() because printk() has to be
> prepared for being called from stack-tight situation.

This is supposed to be best effort anyway.

> Also, while dynamic allocation can allow linked list of the buffer, we
> will need to introduce a lock for traversing the list, which might become
> more expensive than walking fixed-sized array of the buffer.

Shouldn't matter as this is supposed to be best effort
and any printk is already quite expensive and not fast-path.

> We could avoid list traversal by passing "struct printk_buffer" argument,
> but since there are so many functions which expect pr_cont() behavior,
> scattering "struct printk_buffer" argument is a big patch.

It would be possible to add a pid instead.

> Thus, I think fixed-sized statically allocated buffers is the most
> reasonable choice. Using a CONFIG_ option for controlling how many pages
> should be allocated for "struct printk_buffer" might make sense for systems
> with little RAM.

Simpler, but not better.


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-04-30 13:54 [PATCH] printk: Add best-effort printk() buffering Tetsuo Handa
  2017-04-30 16:11 ` Joe Perches
@ 2017-05-08  7:05 ` Sergey Senozhatsky
  2017-05-08 13:05   ` Tetsuo Handa
  1 sibling, 1 reply; 11+ messages in thread
From: Sergey Senozhatsky @ 2017-05-08  7:05 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Petr Mladek,
	Sergey Senozhatsky, Steven Rostedt

Hello,

sorry for the delay.

On (04/30/17 22:54), Tetsuo Handa wrote:
> Sometimes we want to printk() multiple lines in a group without being
> disturbed by concurrent printk() from interrupts and/or other threads.
> For example, the mixed printk() output of multiple threads' dumps is
> hard to interpret.

hm, it's very close to what printk-safe does [and printk-nmi, of course].
the difference is that buffered-printk does not disable local IRQs,
unlike printk-safe, which has to do it by design. so the question is,
can buffered-printk impose atomicity requirements? it seems that it can
(am I wrong?). and, if so, then can we use printk-safe instead? we can
add a new printk_buffered_begin/printk_buffered_end API, for example,
(or enter/exit) for that purpose, that would set a buffered-printk
`printk_context' bit so we can flush buffers in a "special way", not via IRQ
work, and maybe avoid message loss (printk-safe buffers are bigger in size
than proposed PAGE_SIZE buffers).
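
a rough sketch of that idea, with names invented here just to illustrate
(the real printk-safe enter/exit helpers differ in detail):

	/* hypothetical API, not part of the patch under discussion */
	void printk_buffered_begin(unsigned long *flags)
	{
		/* per-CPU buffer: no migration, no same-context reentry */
		local_irq_save(*flags);
		this_cpu_or(printk_context, PRINTK_BUFFERED_CONTEXT_MASK);
	}

	void printk_buffered_end(unsigned long *flags)
	{
		this_cpu_and(printk_context, ~PRINTK_BUFFERED_CONTEXT_MASK);
		printk_buffered_flush();	/* direct flush, not via IRQ work */
		local_irq_restore(*flags);
	}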


> This patch introduces fixed-size, statically allocated buffers for
> buffering printk() output for each thread/context on a best-effort basis
> (i.e. up to PAGE_SIZE bytes, up to 16 concurrent printk() users).

hm, 16 is rather random, it's too much for UP and probably not enough for
a 240 CPUs system. for the time being there are 3 buffered-printk users
(as far as I can see), but who knows how more will be added in the future.
each CPU can have overlapping printks from process, IRQ and NMI contexts.
for NMI we use printk-nmi buffers, so it's out of the list; but, in general,
*it seems* that we better depend on the number of CPUs the system has.
which, once again, returns us back to printk-safe...

thoughts?


[..]

> +/* Must not be called from NMI context. */
> +static void __flush_printk_buffer(struct printk_buffer *ptr, bool all)
> +{
> +	unsigned long flags;
> +
> +	if (!ptr->used)
> +		return;
> +
> +	/*
> +	 * Since printk_deferred() directly calls vprintk_emit(LOGLEVEL_SCHED),
> +	 * this function does not need to care about LOGLEVEL_SCHED case.
> +	 * Therefore, it is safe to call console_trylock() + console_unlock().
> +	 *
> +	 * We don't call boot_delay_msec(level) here because level is unknown.
> +	 */
> +	printk_delay();
> +
> +	/* This stops the holder of console_sem just where we want him */
> +	logbuf_lock_irqsave(flags);
> +	while (1) {
> +		char *text = ptr->buf;
> +		unsigned int text_len = ptr->used;
> +		char *cp = memchr(text, '\n', text_len);
> +		char c;

what guarantees that there'll always be a terminating newline?

	-ss


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-05-08  7:05 ` Sergey Senozhatsky
@ 2017-05-08 13:05   ` Tetsuo Handa
  2017-05-09  1:04     ` Sergey Senozhatsky
  0 siblings, 1 reply; 11+ messages in thread
From: Tetsuo Handa @ 2017-05-08 13:05 UTC (permalink / raw)
  To: sergey.senozhatsky.work
  Cc: linux-kernel, mingo, peterz, pmladek, sergey.senozhatsky, rostedt

Sergey Senozhatsky wrote:
> sorry for the delay.

No problem.

> 
> On (04/30/17 22:54), Tetsuo Handa wrote:
> > Sometimes we want to printk() multiple lines in a group without being
> > disturbed by concurrent printk() from interrupts and/or other threads.
> > For example, the mixed printk() output of multiple threads' dumps is
> > hard to interpret.
> 
> hm, it's very close to what printk-safe does [and printk-nmi, of course].
> the difference is that buffered-printk does not disable local IRQs,
> unlike printk-safe, which has to do it by design. so the question is,
> can buffered-printk impose atomicity requirements? it seems that it can
> (am I wrong?). and, if so, then can we use printk-safe instead? we can
> add a new printk_buffered_begin/printk_buffered_end API, for example,
> (or enter/exit) for that purpose, that would set a buffered-printk
> `printk_context' bit so we can flush buffers in a "special way", not via IRQ
> work, and maybe avoid message loss (printk-safe buffers are bigger in size
> than proposed PAGE_SIZE buffers).

printk_buffered_begin()/printk_buffered_end() corresponds to
get_printk_buffer()/put_printk_buffer().
printk_context() distinguishes atomic contexts.
flush_printk_buffer() flushes from non-NMI context.

What does atomicity requirements mean?

> 
> 
> > This patch introduces fixed-size, statically allocated buffers for
> > buffering printk() output for each thread/context on a best-effort basis
> > (i.e. up to PAGE_SIZE bytes, up to 16 concurrent printk() users).
> 
> hm, 16 is rather random, it's too much for UP and probably not enough for
> a 240 CPUs system. for the time being there are 3 buffered-printk users
> (as far as I can see), but who knows how more will be added in the future.
> each CPU can have overlapping printks from process, IRQ and NMI contexts.
> for NMI we use printk-nmi buffers, so it's out of the list; but, in general,
> *it seems* that we better depend on the number of CPUs the system has.
> which, once again, returns us back to printk-safe...
> 
> thoughts?

I can make 16 a CONFIG_ option. Would you read 201705031521.EIJ39594.MFtOVOHSFLFOJQ@I-love.SAKURA.ne.jp ?
But as long as actually writing to console devices is slow, message loss
is inevitable no matter how big buffer is used. Rather, I'd expect an API
which allows printk() users in schedulable context (e.g. kmallocwd and/or
warn_alloc() for reporting allocation stalls) to wait until written to
console devices. That will more likely to reduce message loss.

> > +	while (1) {
> > +		char *text = ptr->buf;
> > +		unsigned int text_len = ptr->used;
> > +		char *cp = memchr(text, '\n', text_len);
> > +		char c;
> 
> what guarantees that there'll always be a terminating newline?

Nothing guarantees. Why need such a guarantee?


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-05-08 13:05   ` Tetsuo Handa
@ 2017-05-09  1:04     ` Sergey Senozhatsky
  2017-05-09 11:41       ` Tetsuo Handa
  0 siblings, 1 reply; 11+ messages in thread
From: Sergey Senozhatsky @ 2017-05-09  1:04 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: sergey.senozhatsky.work, linux-kernel, mingo, peterz, pmladek,
	sergey.senozhatsky, rostedt

Hello,

On (05/08/17 22:05), Tetsuo Handa wrote:
> > On (04/30/17 22:54), Tetsuo Handa wrote:
> > > Sometimes we want to printk() multiple lines in a group without being
> > > disturbed by concurrent printk() from interrupts and/or other threads.
> > > For example, the mixed printk() output of multiple threads' dumps is
> > > hard to interpret.
> > 
> > hm, it's very close to what printk-safe does [and printk-nmi, of course].
> > the difference is that buffered-printk does not disable local IRQs,
> > unlike printk-safe, which has to do it by design. so the question is,
> > can buffered-printk impose atomicity requirements? it seems that it can
> > (am I wrong?). and, if so, then can we use printk-safe instead? we can
> > add a new printk_buffered_begin/printk_buffered_end API, for example,
> > (or enter/exit) for that purpose, that would set a buffered-printk
> > `printk_context' bit so we can flush buffers in a "special way", not via IRQ
> > work, and maybe avoid message loss (printk-safe buffers are bigger in size
> > than proposed PAGE_SIZE buffers).
> 
> printk_buffered_begin()/printk_buffered_end() corresponds to
> get_printk_buffer()/put_printk_buffer().
> printk_context() distinguishes atomic contexts.
> flush_printk_buffer() flushes from non-NMI context.
> 
> What does atomicity requirements mean?

what I meant was -- "can we sleep under printk_buffered_begin() or not".
printk-safe disables local IRQs. so what I propose is something like this

	printk-safe-enter    //disable local IRQs, use per-CPU buffer
	backtrace
	printk-safe-exit     //flush per-CPU buffer, enable local IRQs

except that 'printk-safe-enter/exit' will have new names here, say
printk-buffered-begin/end, and, probably, handle flush differently.


> > hm, 16 is rather random, it's too much for UP and probably not enough for
> > a 240 CPUs system. for the time being there are 3 buffered-printk users
> > (as far as I can see), but who knows how more will be added in the future.
> > each CPU can have overlapping printks from process, IRQ and NMI contexts.
> > for NMI we use printk-nmi buffers, so it's out of the list; but, in general,
> > *it seems* that we better depend on the number of CPUs the system has.
> > which, once again, returns us back to printk-safe...
> > 
> > thoughts?
> 
> I can make 16 a CONFIG_ option.

but still, why use additional N buffers, when we already have per-CPU
buffers? what am I missing?

> Would you read 201705031521.EIJ39594.MFtOVOHSFLFOJQ@I-love.SAKURA.ne.jp ?

sure.

> But as long as actually writing to console devices is slow, message loss
> is inevitable no matter how big buffer is used. Rather, I'd expect an API
> which allows printk() users in schedulable context (e.g. kmallocwd and/or
> warn_alloc() for reporting allocation stalls) to wait until written to
> console devices. That will more likely to reduce message loss.

hm, from a schedulable context you can do *something* like

	console_lock()
	printk()
	...
	printk()
	console_unlock()


you won't be able to console_lock() until all pending messages are
flushed. since you are in a schedulable context, you can sleep on
console_sem in console_lock(). well, just saying.


> > > +	while (1) {
> > > +		char *text = ptr->buf;
> > > +		unsigned int text_len = ptr->used;
> > > +		char *cp = memchr(text, '\n', text_len);
> > > +		char c;
> > 
> > what guarantees that there'll always be a terminating newline?
> 
> Nothing guarantees. Why need such a guarantee?

 : The memchr() and memrchr() functions return a pointer to the matching
 : byte or NULL if the character does not occur in the given memory area.


so `cp' can be NULL here?

+               if (cp++)
+                       text_len = cp - text;
+               else if (all)
+                       cp = text + text_len;
+               else
+                       break;
+               /* printk_get_level() depends on text '\0'-terminated. */
+               c = *cp;
+               *cp = '\0';
+               process_log(0, LOGLEVEL_DEFAULT, NULL, 0, text, text_len);
+               ptr->used -= text_len;
+               if (!ptr->used)
+                       break;
+               *cp = c;

	-ss


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-05-09  1:04     ` Sergey Senozhatsky
@ 2017-05-09 11:41       ` Tetsuo Handa
  2017-05-10  6:21         ` Sergey Senozhatsky
  0 siblings, 1 reply; 11+ messages in thread
From: Tetsuo Handa @ 2017-05-09 11:41 UTC (permalink / raw)
  To: sergey.senozhatsky
  Cc: sergey.senozhatsky.work, linux-kernel, mingo, peterz, pmladek, rostedt

Sergey Senozhatsky wrote:
> Hello,
> 
> On (05/08/17 22:05), Tetsuo Handa wrote:
> > > On (04/30/17 22:54), Tetsuo Handa wrote:
> > > > Sometimes we want to printk() multiple lines in a group without being
> > > > disturbed by concurrent printk() from interrupts and/or other threads.
> > > > For example, the mixed printk() output of multiple threads' dumps is
> > > > hard to interpret.
> > > 
> > > hm, it's very close to what printk-safe does [and printk-nmi, of course].
> > > the difference is that buffered-printk does not disable local IRQs,
> > > unlike printk-safe, which has to do it by design. so the question is,
> > > can buffered-printk impose atomicity requirements? it seems that it can
> > > (am I wrong?). and, if so, then can we use printk-safe instead? we can
> > > add a new printk_buffered_begin/printk_buffered_end API, for example,
> > > (or enter/exit) for that purpose, that would set a buffered-printk
> > > `printk_context' bit so we can flush buffers in a "special way", not via IRQ
> > > work, and maybe avoid message loss (printk-safe buffers are bigger in size
> > > than proposed PAGE_SIZE buffers).
> > 
> > printk_buffered_begin()/printk_buffered_end() corresponds to
> > get_printk_buffer()/put_printk_buffer().
> > printk_context() distinguishes atomic contexts.
> > flush_printk_buffer() flushes from non-NMI context.
> > 
> > What does atomicity requirements mean?
> 
> what I meant was -- "can we sleep under printk_buffered_begin() or not".
> printk-safe disables local IRQs. so what I propose is something like this
> 
> 	printk-safe-enter    //disable local IRQs, use per-CPU buffer
> 	backtrace
> 	printk-safe-exit     //flush per-CPU buffer, enable local IRQs
> 
> except that 'printk-safe-enter/exit' will have new names here, say
> printk-buffered-begin/end, and, probably, handle flush differently.

OK. Then, answer is that we are allowed to sleep after get_printk_buffer()
if get_printk_buffer() is called from schedulable context because different
printk_buffer will be assigned by get_printk_buffer() if get_printk_buffer()
is called from non-schedulable context.

> 
> 
> > > hm, 16 is rather random, it's too much for UP and probably not enough for
> > > a 240 CPUs system. for the time being there are 3 buffered-printk users
> > > (as far as I can see), but who knows how more will be added in the future.
> > > each CPU can have overlapping printks from process, IRQ and NMI contexts.
> > > for NMI we use printk-nmi buffers, so it's out of the list; but, in general,
> > > *it seems* that we better depend on the number of CPUs the system has.
> > > which, once again, returns us back to printk-safe...
> > > 
> > > thoughts?
> > 
> > I can make 16 a CONFIG_ option.
> 
> but still, why use additional N buffers, when we already have per-CPU
> buffers? what am I missing?

Per-CPU buffers need to disable preemption by disabling local hard
IRQ / soft IRQ. But printk_buffers need not to disable preemption.

> 
> > Would you read 201705031521.EIJ39594.MFtOVOHSFLFOJQ@I-love.SAKURA.ne.jp ?
> 
> sure.
> 
> > But as long as actually writing to console devices is slow, message loss
> > is inevitable no matter how big buffer is used. Rather, I'd expect an API
> > which allows printk() users in schedulable context (e.g. kmallocwd and/or
> > warn_alloc() for reporting allocation stalls) to wait until written to
> > console devices. That will more likely to reduce message loss.
> 
> hm, from a schedulable context you can do *something* like
> 
> 	console_lock()
> 	printk()
> 	...
> 	printk()
> 	console_unlock()
> 
> 
> you won't be able to console_lock() until all pending messages are
> flushed. since you are in a schedulable context, you can sleep on
> console_sem in console_lock(). well, just saying.

console_lock()/console_unlock() pair is different from what I want.

console_lock()/console_unlock() pair blocks as long as somebody else
is printk()ing. What I want is an API for

  current thread waits for N bytes to be written to console devices
  if current thread stored N bytes using printk(), but allow using some
  timeout and killable because waiting unconditionally forever is not good
  (e.g. current thread is expected to bail out soon if OOM-killed during
  waiting for N bytes to be written to console devices)

so that a kmallocwd-like thread can wait for a necessary and sufficient
period without overflowing the printk buffer. Even after we offload to a
dedicated kernel thread (e.g. "[RFC][PATCHv3 0/5] printk: introduce
printing kernel thread"), this API will be needed so as not to overflow
the printk buffer.

"Wait for N bytes to be written to console devices" could be replaced by
"wait until usage of printk buffer becomes less than XX%".

> 
> 
> > > > +	while (1) {
> > > > +		char *text = ptr->buf;
> > > > +		unsigned int text_len = ptr->used;
> > > > +		char *cp = memchr(text, '\n', text_len);
> > > > +		char c;
> > > 
> > > what guarantees that there'll always be a terminating newline?
> > 
> > Nothing guarantees. Why need such a guarantee?
> 
>  : The memchr() and memrchr() functions return a pointer to the matching
>  : byte or NULL if the character does not occur in the given memory area.
> 
> 
> so `cp' can be NULL here?
> 

Yes.

> +               if (cp++)
> +                       text_len = cp - text;
> +               else if (all)
> +                       cp = text + text_len;
> +               else
> +                       break;
> +               /* printk_get_level() depends on text '\0'-terminated. */

But `cp' cannot be NULL here.

> +               c = *cp;
> +               *cp = '\0';
> +               process_log(0, LOGLEVEL_DEFAULT, NULL, 0, text, text_len);
> +               ptr->used -= text_len;
> +               if (!ptr->used)
> +                       break;
> +               *cp = c;


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-05-09 11:41       ` Tetsuo Handa
@ 2017-05-10  6:21         ` Sergey Senozhatsky
  2017-05-10 11:27           ` Tetsuo Handa
  0 siblings, 1 reply; 11+ messages in thread
From: Sergey Senozhatsky @ 2017-05-10  6:21 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: sergey.senozhatsky, sergey.senozhatsky.work, linux-kernel, mingo,
	peterz, pmladek, rostedt

On (05/09/17 20:41), Tetsuo Handa wrote:
[..]
> > what I meant was -- "can we sleep under printk_buffered_begin() or not".
> > printk-safe disables local IRQs. so what I propose is something like this
> > 
> > 	printk-safe-enter    //disable local IRQs, use per-CPU buffer
> > 	backtrace
> > 	printk-safe-exit     //flush per-CPU buffer, enable local IRQs
> > 
> > except that 'printk-safe-enter/exit' will have new names here, say
> > printk-buffered-begin/end, and, probably, handle flush differently.
> 
> OK. Then, answer is that we are allowed to sleep after get_printk_buffer()
> if get_printk_buffer() is called from schedulable context because different
> printk_buffer will be assigned by get_printk_buffer() if get_printk_buffer()
> is called from non-schedulable context.
> 
> > 
> > 
> > > > hm, 16 is rather random, it's too much for UP and probably not enough for
> > > > a 240 CPUs system. for the time being there are 3 buffered-printk users
> > > > (as far as I can see), but who knows how more will be added in the future.
> > > > each CPU can have overlapping printks from process, IRQ and NMI contexts.
> > > > for NMI we use printk-nmi buffers, so it's out of the list; but, in general,
> > > > *it seems* that we better depend on the number of CPUs the system has.
> > > > which, once again, returns us back to printk-safe...
> > > > 
> > > > thoughts?
> > > 
> > > I can make 16 a CONFIG_ option.
> > 
> > but still, why use additional N buffers, when we already have per-CPU
> > buffers? what am I missing?
> 
> Per-CPU buffers need to disable preemption by disabling local hard
> IRQ / soft IRQ. But printk_buffers need not to disable preemption.

yes. ok. seems that I can't explain what I want.

my point is:

printk-buffered does not disable preemption and we can sleep under
printk-buffered-begin. fine. but why would you want to sleep there anyway?
you just want to print a backtrace and be done with it. and backtracing
does not sleep, afaiu, or at least it should not, because it must be
possible to dump_stack() from atomic context. so why have

	printk-buffered keeps preemption and irqs enable and uses one
	of aux buffers (if any).

instead of

	printk-buffered starts an atomic section - it disables preemption
	and local irqs, because it uses per-CPU buffer (which is always,
	and already, there).

?

[..]
> > hm, from a schedulable context you can do *something* like
> > 
> > 	console_lock()
> > 	printk()
> > 	...
> > 	printk()
> > 	console_unlock()
> > 
> > 
> > you won't be able to console_lock() until all pending messages are
> > flushed. since you are in a schedulable context, you can sleep on
> > console_sem in console_lock(). well, just saying.
> 
> console_lock()/console_unlock() pair is different from what I want.
> 
> console_lock()/console_unlock() pair blocks as long as somebody else
> is printk()ing. What I want is an API for
> 
>   current thread waits for N bytes to be written to console devices
>   if current thread stored N bytes using printk(), but allow using some
>   timeout and killable because waiting unconditionally forever is not good
>   (e.g. current thread is expected to bail out soon if OOM-killed during
>   waiting for N bytes to be written to console devices)

I assume you are talking here about a completely new API, not related to
the patch in question (because your patch does not do this). right?

	-ss


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-05-10  6:21         ` Sergey Senozhatsky
@ 2017-05-10 11:27           ` Tetsuo Handa
  2017-05-15 13:15             ` Petr Mladek
  0 siblings, 1 reply; 11+ messages in thread
From: Tetsuo Handa @ 2017-05-10 11:27 UTC (permalink / raw)
  To: sergey.senozhatsky.work
  Cc: sergey.senozhatsky, linux-kernel, mingo, peterz, pmladek, rostedt

Sergey Senozhatsky wrote:
> On (05/09/17 20:41), Tetsuo Handa wrote:
> [..]
> > > what I meant was -- "can we sleep under printk_buffered_begin() or not".
> > > printk-safe disables local IRQs. so what I propose is something like this
> > > 
> > > 	printk-safe-enter    //disable local IRQs, use per-CPU buffer
> > > 	backtrace
> > > 	printk-safe-exit     //flush per-CPU buffer, enable local IRQs
> > > 
> > > except that 'printk-safe-enter/exit' will have new names here, say
> > > printk-buffered-begin/end, and, probably, handle flush differently.
> > 
> > OK. Then, answer is that we are allowed to sleep after get_printk_buffer()
> > if get_printk_buffer() is called from schedulable context because different
> > printk_buffer will be assigned by get_printk_buffer() if get_printk_buffer()
> > is called from non-schedulable context.
> > 
> > > 
> > > 
> > > > > hm, 16 is rather random, it's too much for UP and probably not enough for
> > > > > a 240 CPUs system. for the time being there are 3 buffered-printk users
> > > > > (as far as I can see), but who knows how more will be added in the future.
> > > > > each CPU can have overlapping printks from process, IRQ and NMI contexts.
> > > > > for NMI we use printk-nmi buffers, so it's out of the list; but, in general,
> > > > > *it seems* that we better depend on the number of CPUs the system has.
> > > > > which, once again, returns us back to printk-safe...
> > > > > 
> > > > > thoughts?
> > > > 
> > > > I can make 16 a CONFIG_ option.
> > > 
> > > but still, why use additional N buffers, when we already have per-CPU
> > > buffers? what am I missing?
> > 
> > Per-CPU buffers need to disable preemption by disabling local hard
> > IRQ / soft IRQ. But printk_buffers need not to disable preemption.
> 
> yes. ok. seems that I can't explain what I want.
> 
> my point is:
> 
> printk-buffered does not disable preemption and we can sleep under
> printk-buffered-begin. fine. but why would you want to sleep there anyway?
> you just want to print a backtrace and be done with it. and backtracing
> does not sleep, afaiu, or at least it should not, because it must be
> possible to dump_stack() from atomic context. so why have
> 
> 	printk-buffered keeps preemption and irqs enable and uses one
> 	of aux buffers (if any).
> 
> instead of
> 
> 	printk-buffered starts an atomic section - it disables preemption
> 	and local irqs, because it uses per-CPU buffer (which is always,
> 	and already, there).
> 
> ?

Backtrace is just an example chunk where multiple lines are expected to be
printed together. In practice, backtraces are surrounded by preceding lines
and trailing lines. For example, an OOM killer message looks like

[   88.584836] a.out invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null),  order=0, oom_score_adj=0
[   88.590706] a.out cpuset=/ mems_allowed=0
[   88.594524] CPU: 1 PID: 2981 Comm: a.out Not tainted 4.11.0-rc7-next-20170421+ #588
[   88.599445] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
[   88.604543] Call Trace:
[   88.606779]  ? dump_stack+0x5c/0x7d
[   88.608655]  ? dump_header+0x97/0x233
[   88.610083]  ? oom_kill_process+0x223/0x3e0
[   88.611511]  ? has_capability_noaudit+0x17/0x20
[   88.613046]  ? oom_badness+0xeb/0x160
[   88.614359]  ? out_of_memory+0x10b/0x490
[   88.615592]  ? __alloc_pages_slowpath+0x701/0x8e2
[   88.616947]  ? __alloc_pages_nodemask+0x1ed/0x210
[   88.618267]  ? alloc_pages_vma+0x9f/0x220
[   88.619475]  ? __handle_mm_fault+0xc22/0x11e0
[   88.620793]  ? handle_mm_fault+0xc5/0x220
[   88.622222]  ? __do_page_fault+0x21e/0x4b0
[   88.623903]  ? do_page_fault+0x2b/0x70
[   88.625235]  ? page_fault+0x28/0x30
[   88.626392] Mem-Info:
[   88.627314] active_anon:395525 inactive_anon:2096 isolated_anon:0
[   88.627314]  active_file:60 inactive_file:56 isolated_file:0
[   88.627314]  unevictable:0 dirty:7 writeback:0 unstable:0
[   88.627314]  slab_reclaimable:2211 slab_unreclaimable:6459
[   88.627314]  mapped:411 shmem:2162 pagetables:2418 bounce:0
[   88.627314]  free:13040 free_pcp:54 free_cma:0
[   88.637107] Node 0 active_anon:1582100kB inactive_anon:8384kB active_file:240kB inactive_file:224kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:1644kB dirty:28kB writeback:0kB shmem:8648kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 1349632kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
[   88.644027] Node 0 DMA free:7176kB min:408kB low:508kB high:608kB active_anon:8696kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15988kB managed:15904kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:32kB kernel_stack:0kB pagetables:40kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
[   88.651384] lowmem_reserve[]: 0 1696 1696 1696
[   88.653015] Node 0 DMA32 free:44984kB min:44644kB low:55804kB high:66964kB active_anon:1573396kB inactive_anon:8384kB active_file:1268kB inactive_file:968kB unevictable:0kB writepending:28kB present:2080640kB managed:1756232kB mlocked:0kB slab_reclaimable:8844kB slab_unreclaimable:25804kB kernel_stack:5856kB pagetables:9632kB bounce:0kB free_pcp:248kB local_pcp:40kB free_cma:0kB
[   88.661952] lowmem_reserve[]: 0 0 0 0
[   88.663388] Node 0 DMA: 1*4kB (U) 0*8kB 0*16kB 1*32kB (U) 1*64kB (U) 1*128kB (U) 1*256kB (U) 1*512kB (M) 2*1024kB (UM) 0*2048kB 1*4096kB (M) = 7140kB
[   88.666629] Node 0 DMA32: 1041*4kB (UME) 616*8kB (UE) 157*16kB (UE) 95*32kB (UE) 59*64kB (UME) 61*128kB (ME) 27*256kB (UME) 15*512kB (ME) 4*1024kB (UME) 0*2048kB 0*4096kB = 44916kB
[   88.671430] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[   88.674182] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[   88.676448] 2299 total pagecache pages
[   88.678121] 0 pages in swap cache
[   88.679472] Swap cache stats: add 0, delete 0, find 0/0
[   88.681159] Free swap  = 0kB
[   88.682698] Total swap = 0kB
[   88.683996] 524157 pages RAM
[   88.685385] 0 pages HighMem/MovableOnly
[   88.686970] 81123 pages reserved
[   88.688221] 0 pages cma reserved
[   88.689831] 0 pages hwpoisoned
[   88.691238] Out of memory: Kill process 2982 (a.out) score 999 or sacrifice child
[   88.693403] Killed process 2982 (a.out) total-vm:4172kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB

where showing memory-related information involves heavy pr_cont() usage.

If sysctl_oom_dump_tasks is set to 1, hundreds to a few thousand lines
could be added by dump_tasks(), and the per-CPU buffers might not be
large enough (e.g. CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 is only 8KB,
isn't it?).

Surrounding oom_kill_process() with printk_safe_enter()/printk_safe_exit() is not
enough, and there is no need to disable hard IRQ during oom_kill_process().

I don't want to worry about the current thread migrating to different CPUs
during oom_kill_process(), nor about messages stored during
printk_safe_enter()/printk_safe_exit() being written to console devices out
of order. Can the printk kernel thread guarantee ordering?

Yes, if an API for waiting for per-CPU printk buffers to be written to
console devices existed, we could scatter that API around so that the
per-CPU buffers do not overflow.

> 
> [..]
> > > hm, from a schedulable context you can do *something* like
> > > 
> > > 	console_lock()
> > > 	printk()
> > > 	...
> > > 	printk()
> > > 	console_unlock()
> > > 
> > > 
> > > you won't be able to console_lock() until all pending messages are
> > > flushed. since you are in a schedulable context, you can sleep on
> > > console_sem in console_lock(). well, just saying.
> > 
> > console_lock()/console_unlock() pair is different from what I want.
> > 
> > console_lock()/console_unlock() pair blocks as long as somebody else
> > is printk()ing. What I want is an API for
> > 
> >   current thread waits for N bytes to be written to console devices
> >   if current thread stored N bytes using printk(), but allow using some
> >   timeout and killable because waiting unconditionally forever is not good
> >   (e.g. current thread is expected to bail out soon if OOM-killed during
> >   waiting for N bytes to be written to console devices)
> 
> I assume you are talking here about a completely new API, not related to
> the patch in question (because your patch does not do this). right?

Right.


* Re: [PATCH] printk: Add best-effort printk() buffering.
  2017-05-10 11:27           ` Tetsuo Handa
@ 2017-05-15 13:15             ` Petr Mladek
  0 siblings, 0 replies; 11+ messages in thread
From: Petr Mladek @ 2017-05-15 13:15 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: sergey.senozhatsky.work, sergey.senozhatsky, linux-kernel, mingo,
	peterz, rostedt

On Wed 2017-05-10 20:27:32, Tetsuo Handa wrote:
> Sergey Senozhatsky wrote:
> > On (05/09/17 20:41), Tetsuo Handa wrote:
> > [..]
> > > > what I meant was -- "can we sleep under printk_buffered_begin() or not".
> > > > printk-safe disables local IRQs. so what I propose is something like this
> > > > 
> > > > 	printk-safe-enter    //disable local IRQs, use per-CPU buffer
> > > > 	backtrace
> > > > 	printk-safe-exit     //flush per-CPU buffer, enable local IRQs
> > > > 
> > > > except that 'printk-safe-enter/exit' will have new names here, say
> > > > printk-buffered-begin/end, and, probably, handle flush differently.
> > > 
> > > OK. Then, answer is that we are allowed to sleep after get_printk_buffer()
> > > if get_printk_buffer() is called from schedulable context because different
> > > printk_buffer will be assigned by get_printk_buffer() if get_printk_buffer()
> > > is called from non-schedulable context.
> > > 
> > > > 
> > > > 
> > > > > > hm, 16 is rather random, it's too much for UP and probably not enough for
> > > > > > a 240 CPUs system. for the time being there are 3 buffered-printk users
> > > > > > (as far as I can see), but who knows how more will be added in the future.
> > > > > > each CPU can have overlapping printks from process, IRQ and NMI contexts.
> > > > > > for NMI we use printk-nmi buffers, so it's out of the list; but, in general,
> > > > > > *it seems* that we better depend on the number of CPUs the system has.
> > > > > > which, once again, returns us back to printk-safe...
> > > > > > 
> > > > > > thoughts?
> > > > > 
> > > > > I can make 16 a CONFIG_ option.
> > > > 
> > > > but still, why use additional N buffers, when we already have per-CPU
> > > > buffers? what am I missing?
> > > 
> > > Per-CPU buffers need to disable preemption by disabling local hard
> > > IRQ / soft IRQ. But printk_buffers need not to disable preemption.
> > 
> > yes. ok. seems that I can't explain what I want.
> > 
> > my point is:
> > 
> > printk-buffered does not disable preemption and we can sleep under
> > printk-buffered-begin. fine. but why would you want to sleep there anyway?
> > you just want to print a backtrace and be done with it. and backtracing
> > does not sleep, afaiu, or at least it should not, because it must be
> > possible to dump_stack() from atomic context. so why have
> > 
> > 	printk-buffered keeps preemption and irqs enable and uses one
> > 	of aux buffers (if any).
> > 
> > instead of
> > 
> > 	printk-buffered starts an atomic section - it disables preemption
> > 	and local irqs, because it uses per-CPU buffer (which is always,
> > 	and already, there).
> > 
> > ?
> 
> A backtrace is just one example of a chunk where multiple lines are expected
> to be printed together. In practice, backtraces are surrounded by preceding
> and trailing lines. For example, an OOM killer message looks like
> 
> [   88.584836] a.out invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null),  order=0, oom_score_adj=0
> [   88.590706] a.out cpuset=/ mems_allowed=0
> [   88.594524] CPU: 1 PID: 2981 Comm: a.out Not tainted 4.11.0-rc7-next-20170421+ #588
> [   88.599445] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> [   88.604543] Call Trace:
> [   88.606779]  ? dump_stack+0x5c/0x7d
> [   88.608655]  ? dump_header+0x97/0x233
> [   88.610083]  ? oom_kill_process+0x223/0x3e0
> [   88.611511]  ? has_capability_noaudit+0x17/0x20
> [   88.613046]  ? oom_badness+0xeb/0x160
> [   88.614359]  ? out_of_memory+0x10b/0x490
> [   88.615592]  ? __alloc_pages_slowpath+0x701/0x8e2
> [   88.616947]  ? __alloc_pages_nodemask+0x1ed/0x210
> [   88.618267]  ? alloc_pages_vma+0x9f/0x220
> [   88.619475]  ? __handle_mm_fault+0xc22/0x11e0
> [   88.620793]  ? handle_mm_fault+0xc5/0x220
> [   88.622222]  ? __do_page_fault+0x21e/0x4b0
> [   88.623903]  ? do_page_fault+0x2b/0x70
> [   88.625235]  ? page_fault+0x28/0x30
> [   88.626392] Mem-Info:
> [   88.627314] active_anon:395525 inactive_anon:2096 isolated_anon:0
> [   88.627314]  active_file:60 inactive_file:56 isolated_file:0
> [   88.627314]  unevictable:0 dirty:7 writeback:0 unstable:0
> [   88.627314]  slab_reclaimable:2211 slab_unreclaimable:6459
> [   88.627314]  mapped:411 shmem:2162 pagetables:2418 bounce:0
> [   88.627314]  free:13040 free_pcp:54 free_cma:0
> [   88.637107] Node 0 active_anon:1582100kB inactive_anon:8384kB active_file:240kB inactive_file:224kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:1644kB dirty:28kB writeback:0kB shmem:8648kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 1349632kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
> [   88.644027] Node 0 DMA free:7176kB min:408kB low:508kB high:608kB active_anon:8696kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15988kB managed:15904kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:32kB kernel_stack:0kB pagetables:40kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
> [   88.651384] lowmem_reserve[]: 0 1696 1696 1696
> [   88.653015] Node 0 DMA32 free:44984kB min:44644kB low:55804kB high:66964kB active_anon:1573396kB inactive_anon:8384kB active_file:1268kB inactive_file:968kB unevictable:0kB writepending:28kB present:2080640kB managed:1756232kB mlocked:0kB slab_reclaimable:8844kB slab_unreclaimable:25804kB kernel_stack:5856kB pagetables:9632kB bounce:0kB free_pcp:248kB local_pcp:40kB free_cma:0kB
> [   88.661952] lowmem_reserve[]: 0 0 0 0
> [   88.663388] Node 0 DMA: 1*4kB (U) 0*8kB 0*16kB 1*32kB (U) 1*64kB (U) 1*128kB (U) 1*256kB (U) 1*512kB (M) 2*1024kB (UM) 0*2048kB 1*4096kB (M) = 7140kB
> [   88.666629] Node 0 DMA32: 1041*4kB (UME) 616*8kB (UE) 157*16kB (UE) 95*32kB (UE) 59*64kB (UME) 61*128kB (ME) 27*256kB (UME) 15*512kB (ME) 4*1024kB (UME) 0*2048kB 0*4096kB = 44916kB
> [   88.671430] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
> [   88.674182] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
> [   88.676448] 2299 total pagecache pages
> [   88.678121] 0 pages in swap cache
> [   88.679472] Swap cache stats: add 0, delete 0, find 0/0
> [   88.681159] Free swap  = 0kB
> [   88.682698] Total swap = 0kB
> [   88.683996] 524157 pages RAM
> [   88.685385] 0 pages HighMem/MovableOnly
> [   88.686970] 81123 pages reserved
> [   88.688221] 0 pages cma reserved
> [   88.689831] 0 pages hwpoisoned
> [   88.691238] Out of memory: Kill process 2982 (a.out) score 999 or sacrifice child
> [   88.693403] Killed process 2982 (a.out) total-vm:4172kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB
> 
> where showing memory-related information makes heavy use of pr_cont().
> 
> If sysctl_oom_dump_tasks is set to 1, hundreds to a few thousand lines
> could be added by dump_tasks(), and the per-CPU buffers might not be large
> enough (e.g. CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 is only 8KB, isn't it?).

A solution might be to use separate buffers for this buffered
context, but it would be great to reuse some of the printk_safe code.

This is one of my problems with this patch. It adds a lot of
non-trivial code and buffers that are handled in a very different
way than the existing ones. It might be much more acceptable:

   + if it reuses some of the existing code for temporary
     buffering (from printk_safe.c)

   + if it is usable for continuation lines (transparently?) and
     allows removing the complex code around struct cont (immediately
     or in the middle term); see the sketch below
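
A minimal sketch of that transparent direction, assuming a struct
printk_buffer that has a char buf[] array and a used counter (all
helper names below are made up):

	/*
	 * Sketch only: a buffered printk() that could subsume struct cont.
	 * buffered_buf() would look up the buffer of the calling
	 * task/context; buffered_flush() would feed the completed lines
	 * to the main log buffer.
	 */
	static void buffered_printk(const char *fmt, ...)
	{
		struct printk_buffer *b = buffered_buf();
		va_list args;

		va_start(args, fmt);
		b->used += vscnprintf(b->buf + b->used,
				      sizeof(b->buf) - b->used, fmt, args);
		va_end(args);

		/* Complete lines go out; a trailing partial line stays. */
		if (b->used && b->buf[b->used - 1] == '\n') {
			buffered_flush(b);
			b->used = 0;
		}
	}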


There is one catch with the transparent solution. Using per-CPU
buffers might require disabling preemption. But a normal printk()
must not have such side effects. I mean that it would be quite
error prone if printk() disabled interrupts whenever it printed
text without a trailing newline.
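
To illustrate the hazard with a made-up example (the helper in the
middle is hypothetical): if the first call below implicitly disabled
interrupts until the line is terminated, everything between the two
calls would silently run with interrupts disabled:

	pr_info("checking device state: ");	/* no '\n': IRQs would go off */
	status = slow_check_that_may_sleep();	/* would now run with IRQs off */
	pr_cont("%s\n", status ? "ok" : "bad");	/* '\n': IRQs back on */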

A solution might be to use a pool of buffers that can be assigned
to a task. But we should still try to reuse the printk_safe code
for writing and flushing the buffers.
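
For instance, the pool could be claimed lock-free. A minimal sketch,
assuming a small fixed-size pool (the size and every name here are
made up):

	static struct printk_buffer {
		struct task_struct *owner;
		unsigned int used;
		char buf[PAGE_SIZE];
	} printk_buffers[16];

	/* Claim a free buffer for current; NULL means fall back to printk(). */
	static struct printk_buffer *printk_buffer_claim(void)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(printk_buffers); i++) {
			if (!printk_buffers[i].owner &&
			    cmpxchg(&printk_buffers[i].owner, NULL,
				    current) == NULL)
				return &printk_buffers[i];
		}
		return NULL;
	}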


> Surrounding oom_kill_process() with printk_safe_enter()/printk_safe_exit() is not
> enough, and there is no need to disable hard IRQ during oom_kill_process().

As you say, there is no need to disable hard IRQs during oom_kill_process().
But would it cause any harm? And, as Sergey mentioned, would you really
want to sleep with some messages buffered?

One big question is whether we want to use separate buffers in these
situations at all. Some of these reports are often printed when
the system is in a very bad state. We should try to get the messages
to the console immediately when the system is dying.


> I don't want to worry about the current thread migrating to different CPUs
> during oom_kill_process(), with the messages stored between printk_safe_enter()
> and printk_safe_exit() being written to the console devices out of order. Can
> the printk kernel thread guarantee ordering?

printk_safe_enter*() disables interrupts. As a result, the process
cannot be migrated in this context and the ordering is guaranteed.
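
In other words, a sketch using the internal printk_safe helpers from
kernel/printk/internal.h (they are internal, not a public API):

	unsigned long flags;

	printk_safe_enter_irqsave(flags);	/* IRQs off: no CPU migration */
	/* ...everything here lands in this CPU's printk-safe buffer... */
	printk_safe_exit_irqrestore(flags);	/* flushed from this CPU, in order */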


Best Regards,
Petr
