On Wed, Aug 20, 2014 at 03:36:04PM +0300, Alexander Shishkin wrote: > diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c > index f5ee3669f8..3b3a915767 100644 > --- a/kernel/events/ring_buffer.c > +++ b/kernel/events/ring_buffer.c > @@ -242,6 +242,90 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) > spin_lock_init(&rb->event_lock); > } > > +void *perf_aux_output_begin(struct perf_output_handle *handle, > + struct perf_event *event) > +{ > + unsigned long aux_head, aux_tail; > + struct ring_buffer *rb; > + > + rb = ring_buffer_get(event); > + if (!rb) > + return NULL; Yeah, no need to much with ring_buffer_get() here, do as perf_output_begin()/end() and keep the RCU section over the entire output. That avoids the atomic and allows you to always use the parent event. > + > + if (!rb_has_aux(rb)) > + goto err; > + > + /* > + * Nesting is not supported for AUX area, make sure nested > + * writers are caught early > + */ > + if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) > + goto err; > + > + aux_head = local_read(&rb->aux_head); > + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); > + > + handle->rb = rb; > + handle->event = event; > + handle->head = aux_head; > + if (aux_head - aux_tail < perf_aux_size(rb)) > + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); > + else > + handle->size = 0; > + > + if (!handle->size) { > + event->pending_disable = 1; > + event->hw.state = PERF_HES_STOPPED; > + perf_output_wakeup(handle); > + local_set(&rb->aux_nest, 0); > + goto err; > + } This needs a comment on the /* A */ barrier; see the comments in perf_output_put_handle() and perf_output_begin(). I'm not sure we can use the same control dependency that we do for the normal buffers since its the hardware doing the stores, not the regular instruction stream. Please document the order in which the hardware writes vs this software setup and explain the ordering guarantees provided by the hardware wrt regular software. > + return handle->rb->aux_priv; > + > +err: > + ring_buffer_put(rb); > + handle->event = NULL; > + > + return NULL; > +} > + > +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, > + bool truncated) > +{ > + struct ring_buffer *rb = handle->rb; > + > + local_add(size, &rb->aux_head); > + > + smp_wmb(); An uncommented barrier is a bug. > + rb->user_page->aux_head = local_read(&rb->aux_head); > + > + perf_output_wakeup(handle); > + handle->event = NULL; > + > + local_set(&rb->aux_nest, 0); > + ring_buffer_put(rb); > +} Also, should perf_aux_output_end() not generate an event into the regular buffer?