All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] tracing: don't busy wait in buffer splice
@ 2014-11-06 21:26 Rabin Vincent
  2014-11-06 21:26 ` [PATCH 2/2] tracing: don't risk busy looping " Rabin Vincent
  2014-11-07 18:21 ` [PATCH 1/2] tracing: don't busy wait " Steven Rostedt
  0 siblings, 2 replies; 5+ messages in thread
From: Rabin Vincent @ 2014-11-06 21:26 UTC (permalink / raw)
  To: Steven Rostedt, Ingo Molnar; +Cc: linux-kernel, Rabin Vincent

On a !PREEMPT kernel, attempting to use trace-cmd results in a soft
lockup:

 # trace-cmd record -e raw_syscalls:* -F false
 NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61]
 ...
 Call Trace:
  [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90
  [<ffffffff81092e25>] wait_on_pipe+0x35/0x40
  [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0
  [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0
  [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40
  [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290
  [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0
  [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80
  [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60
  [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80
  [<ffffffff8112415d>] do_splice_to+0x6d/0x90
  [<ffffffff81126971>] SyS_splice+0x7c1/0x800
  [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8

The problem is this: tracing_buffers_splice_read() calls
ring_buffer_wait() to wait for data in the ring buffers.  The buffers
are not empty so ring_buffer_wait() returns immediately.  But
tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1,
meaning it only wants to read a full page.  When the full page is not
available, tracing_buffers_splice_read() tries to wait again with
ring_buffer_wait(), which again returns immediately, and so on.

Fix this by adding a "full" argument to ring_buffer_wait() which will
make ring_buffer_wait() wait until the writer has left the reader's
page, i.e.  until full-page reads will succeed.

Signed-off-by: Rabin Vincent <rabin@rab.in>
---
 include/linux/ring_buffer.h |  2 +-
 kernel/trace/ring_buffer.c  | 81 ++++++++++++++++++++++++++++++---------------
 kernel/trace/trace.c        | 12 ++++---
 3 files changed, 62 insertions(+), 33 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 49a4d6f..e2c13cd 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
 	__ring_buffer_alloc((size), (flags), &__key);	\
 })
 
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu);
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full);
 int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
 			  struct file *filp, poll_table *poll_table);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2d75c94..8c8143b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
  * ring_buffer_wait - wait for input to the ring buffer
  * @buffer: buffer to wait on
  * @cpu: the cpu buffer to wait on
+ * @full: wait until a full page is available
  *
  * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
  * as data is added to any of the @buffer's cpu buffers. Otherwise
  * it will wait for data to be added to a specific cpu buffer.
  */
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
 {
-	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
 	DEFINE_WAIT(wait);
 	struct rb_irq_work *work;
+	int ret = 0;
 
 	/*
 	 * Depending on what the caller is waiting for, either any
@@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
 	}
 
 
-	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+	while (true) {
+		prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
 
-	/*
-	 * The events can happen in critical sections where
-	 * checking a work queue can cause deadlocks.
-	 * After adding a task to the queue, this flag is set
-	 * only to notify events to try to wake up the queue
-	 * using irq_work.
-	 *
-	 * We don't clear it even if the buffer is no longer
-	 * empty. The flag only causes the next event to run
-	 * irq_work to do the work queue wake up. The worse
-	 * that can happen if we race with !trace_empty() is that
-	 * an event will cause an irq_work to try to wake up
-	 * an empty queue.
-	 *
-	 * There's no reason to protect this flag either, as
-	 * the work queue and irq_work logic will do the necessary
-	 * synchronization for the wake ups. The only thing
-	 * that is necessary is that the wake up happens after
-	 * a task has been queued. It's OK for spurious wake ups.
-	 */
-	work->waiters_pending = true;
+		/*
+		 * The events can happen in critical sections where
+		 * checking a work queue can cause deadlocks.
+		 * After adding a task to the queue, this flag is set
+		 * only to notify events to try to wake up the queue
+		 * using irq_work.
+		 *
+		 * We don't clear it even if the buffer is no longer
+		 * empty. The flag only causes the next event to run
+		 * irq_work to do the work queue wake up. The worse
+		 * that can happen if we race with !trace_empty() is that
+		 * an event will cause an irq_work to try to wake up
+		 * an empty queue.
+		 *
+		 * There's no reason to protect this flag either, as
+		 * the work queue and irq_work logic will do the necessary
+		 * synchronization for the wake ups. The only thing
+		 * that is necessary is that the wake up happens after
+		 * a task has been queued. It's OK for spurious wake ups.
+		 */
+		work->waiters_pending = true;
+
+		if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
+			break;
+
+		if (cpu != RING_BUFFER_ALL_CPUS &&
+		    !ring_buffer_empty_cpu(buffer, cpu)) {
+			unsigned long flags;
+			bool pagebusy;
+
+			if (!full)
+				break;
+
+			raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+			pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+			raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+			if (!pagebusy)
+				break;
+		}
 
-	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
-	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
 		schedule();
 
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+	}
+
 	finish_wait(&work->waiters, &wait);
-	return 0;
+
+	return ret;
 }
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a52839..e67d0de 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 }
 #endif /* CONFIG_TRACER_MAX_TRACE */
 
-static int wait_on_pipe(struct trace_iterator *iter)
+static int wait_on_pipe(struct trace_iterator *iter, bool full)
 {
 	/* Iterators are static, they should be filled or empty */
 	if (trace_buffer_iter(iter, iter->cpu_file))
 		return 0;
 
-	return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+	return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
+				full);
 }
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -4434,7 +4435,7 @@ static int tracing_wait_pipe(struct file *filp)
 
 		mutex_unlock(&iter->mutex);
 
-		ret = wait_on_pipe(iter);
+		ret = wait_on_pipe(iter, false);
 
 		mutex_lock(&iter->mutex);
 
@@ -5372,7 +5373,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 				goto out_unlock;
 			}
 			mutex_unlock(&trace_types_lock);
-			ret = wait_on_pipe(iter);
+			ret = wait_on_pipe(iter, false);
 			mutex_lock(&trace_types_lock);
 			if (ret) {
 				size = ret;
@@ -5587,7 +5588,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			goto out;
 		}
 		mutex_unlock(&trace_types_lock);
-		ret = wait_on_pipe(iter);
+		ret = wait_on_pipe(iter, true);
 		mutex_lock(&trace_types_lock);
 		if (ret)
 			goto out;
@@ -5595,6 +5596,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			ret = -EINTR;
 			goto out;
 		}
+
 		goto again;
 	}
 
-- 
2.1.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 2/2] tracing: don't risk busy looping in buffer splice
  2014-11-06 21:26 [PATCH 1/2] tracing: don't busy wait in buffer splice Rabin Vincent
@ 2014-11-06 21:26 ` Rabin Vincent
  2014-11-07 18:21 ` [PATCH 1/2] tracing: don't busy wait " Steven Rostedt
  1 sibling, 0 replies; 5+ messages in thread
From: Rabin Vincent @ 2014-11-06 21:26 UTC (permalink / raw)
  To: Steven Rostedt, Ingo Molnar; +Cc: linux-kernel, Rabin Vincent

If the read loop in tracing_buffers_splice_read() keeps failing due to
memory allocation failures without reading even a single page then this
function will keep busy looping.

Remove the risk of that by exiting the function if memory allocation
failures are seen.

Signed-off-by: Rabin Vincent <rabin@rab.in>
---
 kernel/trace/trace.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e67d0de..e838c96 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5501,7 +5501,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	};
 	struct buffer_ref *ref;
 	int entries, size, i;
-	ssize_t ret;
+	ssize_t ret = 0;
 
 	mutex_lock(&trace_types_lock);
 
@@ -5539,13 +5539,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		int r;
 
 		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
-		if (!ref)
+		if (!ref) {
+			ret = -ENOMEM;
 			break;
+		}
 
 		ref->ref = 1;
 		ref->buffer = iter->trace_buffer->buffer;
 		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
 		if (!ref->page) {
+			ret = -ENOMEM;
 			kfree(ref);
 			break;
 		}
@@ -5583,6 +5586,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 
 	/* did we read anything? */
 	if (!spd.nr_pages) {
+		if (ret)
+			goto out;
+
 		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
 			ret = -EAGAIN;
 			goto out;
-- 
2.1.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] tracing: don't busy wait in buffer splice
  2014-11-06 21:26 [PATCH 1/2] tracing: don't busy wait in buffer splice Rabin Vincent
  2014-11-06 21:26 ` [PATCH 2/2] tracing: don't risk busy looping " Rabin Vincent
@ 2014-11-07 18:21 ` Steven Rostedt
  2014-11-08  0:04   ` Steven Rostedt
  2014-11-10 18:46   ` [PATCHv2] " Rabin Vincent
  1 sibling, 2 replies; 5+ messages in thread
From: Steven Rostedt @ 2014-11-07 18:21 UTC (permalink / raw)
  To: Rabin Vincent; +Cc: Ingo Molnar, linux-kernel

On Thu,  6 Nov 2014 22:26:06 +0100
Rabin Vincent <rabin@rab.in> wrote:

> On a !PREEMPT kernel, attempting to use trace-cmd results in a soft
> lockup:
> 
>  # trace-cmd record -e raw_syscalls:* -F false
>  NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61]
>  ...
>  Call Trace:
>   [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90
>   [<ffffffff81092e25>] wait_on_pipe+0x35/0x40
>   [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0
>   [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0
>   [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40
>   [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290
>   [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0
>   [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80
>   [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60
>   [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80
>   [<ffffffff8112415d>] do_splice_to+0x6d/0x90
>   [<ffffffff81126971>] SyS_splice+0x7c1/0x800
>   [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8
> 

Ouch. I was able to reproduce this too.

> The problem is this: tracing_buffers_splice_read() calls
> ring_buffer_wait() to wait for data in the ring buffers.  The buffers
> are not empty so ring_buffer_wait() returns immediately.  But
> tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1,
> meaning it only wants to read a full page.  When the full page is not
> available, tracing_buffers_splice_read() tries to wait again with
> ring_buffer_wait(), which again returns immediately, and so on.
> 
> Fix this by adding a "full" argument to ring_buffer_wait() which will
> make ring_buffer_wait() wait until the writer has left the reader's
> page, i.e.  until full-page reads will succeed.
> 
> Signed-off-by: Rabin Vincent <rabin@rab.in>
> ---
>  include/linux/ring_buffer.h |  2 +-
>  kernel/trace/ring_buffer.c  | 81 ++++++++++++++++++++++++++++++---------------
>  kernel/trace/trace.c        | 12 ++++---
>  3 files changed, 62 insertions(+), 33 deletions(-)
> 
> diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
> index 49a4d6f..e2c13cd 100644
> --- a/include/linux/ring_buffer.h
> +++ b/include/linux/ring_buffer.h
> @@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
>  	__ring_buffer_alloc((size), (flags), &__key);	\
>  })
>  
> -int ring_buffer_wait(struct ring_buffer *buffer, int cpu);
> +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full);
>  int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
>  			  struct file *filp, poll_table *poll_table);
>  
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 2d75c94..8c8143b 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
>   * ring_buffer_wait - wait for input to the ring buffer
>   * @buffer: buffer to wait on
>   * @cpu: the cpu buffer to wait on
> + * @full: wait until a full page is available
>   *
>   * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
>   * as data is added to any of the @buffer's cpu buffers. Otherwise
>   * it will wait for data to be added to a specific cpu buffer.
>   */
> -int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
> +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
>  {
> -	struct ring_buffer_per_cpu *cpu_buffer;
> +	struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
>  	DEFINE_WAIT(wait);
>  	struct rb_irq_work *work;
> +	int ret = 0;
>  
>  	/*
>  	 * Depending on what the caller is waiting for, either any
> @@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
>  	}
>  
>  
> -	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
> +	while (true) {
> +		prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
>  
> -	/*
> -	 * The events can happen in critical sections where
> -	 * checking a work queue can cause deadlocks.
> -	 * After adding a task to the queue, this flag is set
> -	 * only to notify events to try to wake up the queue
> -	 * using irq_work.
> -	 *
> -	 * We don't clear it even if the buffer is no longer
> -	 * empty. The flag only causes the next event to run
> -	 * irq_work to do the work queue wake up. The worse
> -	 * that can happen if we race with !trace_empty() is that
> -	 * an event will cause an irq_work to try to wake up
> -	 * an empty queue.
> -	 *
> -	 * There's no reason to protect this flag either, as
> -	 * the work queue and irq_work logic will do the necessary
> -	 * synchronization for the wake ups. The only thing
> -	 * that is necessary is that the wake up happens after
> -	 * a task has been queued. It's OK for spurious wake ups.
> -	 */
> -	work->waiters_pending = true;
> +		/*
> +		 * The events can happen in critical sections where
> +		 * checking a work queue can cause deadlocks.
> +		 * After adding a task to the queue, this flag is set
> +		 * only to notify events to try to wake up the queue
> +		 * using irq_work.
> +		 *
> +		 * We don't clear it even if the buffer is no longer
> +		 * empty. The flag only causes the next event to run
> +		 * irq_work to do the work queue wake up. The worse
> +		 * that can happen if we race with !trace_empty() is that
> +		 * an event will cause an irq_work to try to wake up
> +		 * an empty queue.
> +		 *
> +		 * There's no reason to protect this flag either, as
> +		 * the work queue and irq_work logic will do the necessary
> +		 * synchronization for the wake ups. The only thing
> +		 * that is necessary is that the wake up happens after
> +		 * a task has been queued. It's OK for spurious wake ups.
> +		 */
> +		work->waiters_pending = true;
> +
> +		if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
> +			break;

A comment should be made that states that the @full flag is only
applicable if cpu != RING_BUFFER_ALL_CPUS.



> +
> +		if (cpu != RING_BUFFER_ALL_CPUS &&
> +		    !ring_buffer_empty_cpu(buffer, cpu)) {
> +			unsigned long flags;
> +			bool pagebusy;
> +
> +			if (!full)
> +				break;
> +
> +			raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
> +			pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
> +			raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
> +
> +			if (!pagebusy)
> +				break;
> +		}
>  
> -	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
> -	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
>  		schedule();
>  
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			break;
> +		}
> +	}
> +
>  	finish_wait(&work->waiters, &wait);
> -	return 0;
> +
> +	return ret;
>  }
>  
>  /**
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 8a52839..e67d0de 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
>  }
>  #endif /* CONFIG_TRACER_MAX_TRACE */
>  
> -static int wait_on_pipe(struct trace_iterator *iter)
> +static int wait_on_pipe(struct trace_iterator *iter, bool full)
>  {
>  	/* Iterators are static, they should be filled or empty */
>  	if (trace_buffer_iter(iter, iter->cpu_file))
>  		return 0;
>  
> -	return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
> +	return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
> +				full);
>  }
>  
>  #ifdef CONFIG_FTRACE_STARTUP_TEST
> @@ -4434,7 +4435,7 @@ static int tracing_wait_pipe(struct file *filp)
>  
>  		mutex_unlock(&iter->mutex);
>  
> -		ret = wait_on_pipe(iter);
> +		ret = wait_on_pipe(iter, false);
>  
>  		mutex_lock(&iter->mutex);
>  
> @@ -5372,7 +5373,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
>  				goto out_unlock;
>  			}
>  			mutex_unlock(&trace_types_lock);
> -			ret = wait_on_pipe(iter);
> +			ret = wait_on_pipe(iter, false);
>  			mutex_lock(&trace_types_lock);
>  			if (ret) {
>  				size = ret;
> @@ -5587,7 +5588,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
>  			goto out;
>  		}
>  		mutex_unlock(&trace_types_lock);
> -		ret = wait_on_pipe(iter);
> +		ret = wait_on_pipe(iter, true);
>  		mutex_lock(&trace_types_lock);

I wonder if we can remove the signal_pending() check after this call,
as it is already done by the wait_on_pipe().

This also needs to be marked for stable. I'll start looking at how far
this goes back. I'm thinking it may go back as far as the splice code
was created.

Thanks,

-- Steve

>  		if (ret)
>  			goto out;
> @@ -5595,6 +5596,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
>  			ret = -EINTR;
>  			goto out;
>  		}
> +
>  		goto again;
>  	}
>  


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] tracing: don't busy wait in buffer splice
  2014-11-07 18:21 ` [PATCH 1/2] tracing: don't busy wait " Steven Rostedt
@ 2014-11-08  0:04   ` Steven Rostedt
  2014-11-10 18:46   ` [PATCHv2] " Rabin Vincent
  1 sibling, 0 replies; 5+ messages in thread
From: Steven Rostedt @ 2014-11-08  0:04 UTC (permalink / raw)
  To: Rabin Vincent; +Cc: Ingo Molnar, linux-kernel

On Fri, 7 Nov 2014 13:21:06 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:


> This also needs to be marked for stable. I'll start looking at how far
> this goes back. I'm thinking it may go back as far as the splice code
> was created.

Actually, it is broken back to 3.16. 3.15 and before was safe because
it was still using the mockup poll_wait_pipe() that would wake up every
1/10 of a second to see if something was in the buffer.

See commit: b1169cc69ba9 "tracing: Remove mock up poll wait function"

-- Steve

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCHv2] tracing: don't busy wait in buffer splice
  2014-11-07 18:21 ` [PATCH 1/2] tracing: don't busy wait " Steven Rostedt
  2014-11-08  0:04   ` Steven Rostedt
@ 2014-11-10 18:46   ` Rabin Vincent
  1 sibling, 0 replies; 5+ messages in thread
From: Rabin Vincent @ 2014-11-10 18:46 UTC (permalink / raw)
  To: Steven Rostedt, Ingo Molnar; +Cc: linux-kernel, Rabin Vincent

On a !PREEMPT kernel, attempting to use trace-cmd results in a soft
lockup:

 # trace-cmd record -e raw_syscalls:* -F false
 NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61]
 ...
 Call Trace:
  [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90
  [<ffffffff81092e25>] wait_on_pipe+0x35/0x40
  [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0
  [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0
  [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40
  [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290
  [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0
  [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80
  [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60
  [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80
  [<ffffffff8112415d>] do_splice_to+0x6d/0x90
  [<ffffffff81126971>] SyS_splice+0x7c1/0x800
  [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8

The problem is this: tracing_buffers_splice_read() calls
ring_buffer_wait() to wait for data in the ring buffers.  The buffers
are not empty so ring_buffer_wait() returns immediately.  But
tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1,
meaning it only wants to read a full page.  When the full page is not
available, tracing_buffers_splice_read() tries to wait again with
ring_buffer_wait(), which again returns immediately, and so on.

Fix this by adding a "full" argument to ring_buffer_wait() which will
make ring_buffer_wait() wait until the writer has left the reader's
page, i.e.  until full-page reads will succeed.

Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function")
Cc: stable@vger.kernel.org
Signed-off-by: Rabin Vincent <rabin@rab.in>
---
v2:
- add comment that says that full is only respected for cpu != RING_BUFFER_ALL_CPUS
- move external signal_pending checks into ring_buffer_wait()

 include/linux/ring_buffer.h |  2 +-
 kernel/trace/ring_buffer.c  | 81 ++++++++++++++++++++++++++++++---------------
 kernel/trace/trace.c        | 23 ++++---------
 3 files changed, 62 insertions(+), 44 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 49a4d6f..e2c13cd 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
 	__ring_buffer_alloc((size), (flags), &__key);	\
 })
 
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu);
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full);
 int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
 			  struct file *filp, poll_table *poll_table);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2d75c94..a56e07c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
  * ring_buffer_wait - wait for input to the ring buffer
  * @buffer: buffer to wait on
  * @cpu: the cpu buffer to wait on
+ * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
  *
  * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
  * as data is added to any of the @buffer's cpu buffers. Otherwise
  * it will wait for data to be added to a specific cpu buffer.
  */
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
 {
-	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
 	DEFINE_WAIT(wait);
 	struct rb_irq_work *work;
+	int ret = 0;
 
 	/*
 	 * Depending on what the caller is waiting for, either any
@@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
 	}
 
 
-	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+	while (true) {
+		prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
 
-	/*
-	 * The events can happen in critical sections where
-	 * checking a work queue can cause deadlocks.
-	 * After adding a task to the queue, this flag is set
-	 * only to notify events to try to wake up the queue
-	 * using irq_work.
-	 *
-	 * We don't clear it even if the buffer is no longer
-	 * empty. The flag only causes the next event to run
-	 * irq_work to do the work queue wake up. The worse
-	 * that can happen if we race with !trace_empty() is that
-	 * an event will cause an irq_work to try to wake up
-	 * an empty queue.
-	 *
-	 * There's no reason to protect this flag either, as
-	 * the work queue and irq_work logic will do the necessary
-	 * synchronization for the wake ups. The only thing
-	 * that is necessary is that the wake up happens after
-	 * a task has been queued. It's OK for spurious wake ups.
-	 */
-	work->waiters_pending = true;
+		/*
+		 * The events can happen in critical sections where
+		 * checking a work queue can cause deadlocks.
+		 * After adding a task to the queue, this flag is set
+		 * only to notify events to try to wake up the queue
+		 * using irq_work.
+		 *
+		 * We don't clear it even if the buffer is no longer
+		 * empty. The flag only causes the next event to run
+		 * irq_work to do the work queue wake up. The worse
+		 * that can happen if we race with !trace_empty() is that
+		 * an event will cause an irq_work to try to wake up
+		 * an empty queue.
+		 *
+		 * There's no reason to protect this flag either, as
+		 * the work queue and irq_work logic will do the necessary
+		 * synchronization for the wake ups. The only thing
+		 * that is necessary is that the wake up happens after
+		 * a task has been queued. It's OK for spurious wake ups.
+		 */
+		work->waiters_pending = true;
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
+			break;
+
+		if (cpu != RING_BUFFER_ALL_CPUS &&
+		    !ring_buffer_empty_cpu(buffer, cpu)) {
+			unsigned long flags;
+			bool pagebusy;
+
+			if (!full)
+				break;
+
+			raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+			pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+			raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+			if (!pagebusy)
+				break;
+		}
 
-	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
-	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
 		schedule();
+	}
 
 	finish_wait(&work->waiters, &wait);
-	return 0;
+
+	return ret;
 }
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a52839..1520933 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 }
 #endif /* CONFIG_TRACER_MAX_TRACE */
 
-static int wait_on_pipe(struct trace_iterator *iter)
+static int wait_on_pipe(struct trace_iterator *iter, bool full)
 {
 	/* Iterators are static, they should be filled or empty */
 	if (trace_buffer_iter(iter, iter->cpu_file))
 		return 0;
 
-	return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+	return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
+				full);
 }
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp)
 
 		mutex_unlock(&iter->mutex);
 
-		ret = wait_on_pipe(iter);
+		ret = wait_on_pipe(iter, false);
 
 		mutex_lock(&iter->mutex);
 
 		if (ret)
 			return ret;
-
-		if (signal_pending(current))
-			return -EINTR;
 	}
 
 	return 1;
@@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 				goto out_unlock;
 			}
 			mutex_unlock(&trace_types_lock);
-			ret = wait_on_pipe(iter);
+			ret = wait_on_pipe(iter, false);
 			mutex_lock(&trace_types_lock);
 			if (ret) {
 				size = ret;
 				goto out_unlock;
 			}
-			if (signal_pending(current)) {
-				size = -EINTR;
-				goto out_unlock;
-			}
 			goto again;
 		}
 		size = 0;
@@ -5587,14 +5581,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			goto out;
 		}
 		mutex_unlock(&trace_types_lock);
-		ret = wait_on_pipe(iter);
+		ret = wait_on_pipe(iter, true);
 		mutex_lock(&trace_types_lock);
 		if (ret)
 			goto out;
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			goto out;
-		}
+
 		goto again;
 	}
 
-- 
2.1.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2014-11-10 18:46 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-06 21:26 [PATCH 1/2] tracing: don't busy wait in buffer splice Rabin Vincent
2014-11-06 21:26 ` [PATCH 2/2] tracing: don't risk busy looping " Rabin Vincent
2014-11-07 18:21 ` [PATCH 1/2] tracing: don't busy wait " Steven Rostedt
2014-11-08  0:04   ` Steven Rostedt
2014-11-10 18:46   ` [PATCHv2] " Rabin Vincent

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.