LKML Archive on lore.kernel.org
 help / color / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Mark Rutland <mark.rutland@arm.com>
Cc: linux-kernel@vger.kernel.org, Ingo Molnar <mingo@redhat.com>,
	Will Deacon <will.deacon@arm.com>
Subject: Re: [PATCH] perf/ring_buffer: ensure atomicity and order of updates
Date: Mon, 14 May 2018 17:02:13 +0200
Message-ID: <20180514150213.GK12235@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20180514112815.GP12217@hirez.programming.kicks-ass.net>

On Mon, May 14, 2018 at 01:28:15PM +0200, Peter Zijlstra wrote:
> On Mon, May 14, 2018 at 12:05:33PM +0100, Mark Rutland wrote:

> > > Also note that in perf_output_put_handle(), where we write ->data_head,
> > > the store is from an 'unsigned long'. So on 32bit that will result in a
> > > zero high word. Similarly, in __perf_output_begin() we read ->data_tail
> > > into an unsigned long, which will discard the high word.
> > 
> > Ah, that's a fair point. So it's just compat userspace that this is
> > potentially borked for. ;)
> 
> Right.. #$$#@ compat. Hurmph.. not sure how to go about fixing that
> there.

How's this?

---
 include/linux/perf_event.h  | 12 ++++++++++++
 kernel/events/core.c        | 31 +++++++++++++++++++++++++++++--
 kernel/events/ring_buffer.c | 39 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99eb9a4e..7834dfb6a83b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -517,6 +517,7 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
  */
 #define PERF_EV_CAP_SOFTWARE		BIT(0)
 #define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
+#define PERF_EV_CAP_COMPAT		BIT(2)
 
 #define SWEVENT_HLIST_BITS		8
 #define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
@@ -1220,6 +1221,11 @@ static inline bool is_write_backward(struct perf_event *event)
 	return !!event->attr.write_backward;
 }
 
+static inline bool is_compat_event(struct perf_event *event)
+{
+	return event->event_caps & PERF_EV_CAP_COMPAT;
+}
+
 static inline bool has_addr_filter(struct perf_event *event)
 {
 	return event->pmu->nr_addr_filters;
@@ -1249,6 +1255,12 @@ extern int perf_output_begin_forward(struct perf_output_handle *handle,
 extern int perf_output_begin_backward(struct perf_output_handle *handle,
 				      struct perf_event *event,
 				      unsigned int size);
+extern int perf_output_begin_forward_compat(struct perf_output_handle *handle,
+				    struct perf_event *event,
+				    unsigned int size);
+extern int perf_output_begin_backward_compat(struct perf_output_handle *handle,
+				      struct perf_event *event,
+				      unsigned int size);
 
 extern void perf_output_end(struct perf_output_handle *handle);
 extern unsigned int perf_output_copy(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce359ad..0e60f35442a6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6523,6 +6523,22 @@ perf_event_output_backward(struct perf_event *event,
 	__perf_event_output(event, data, regs, perf_output_begin_backward);
 }
 
+void
+perf_event_output_forward_compat(struct perf_event *event,
+			 struct perf_sample_data *data,
+			 struct pt_regs *regs)
+{
+	__perf_event_output(event, data, regs, perf_output_begin_forward_compat);
+}
+
+void
+perf_event_output_backward_compat(struct perf_event *event,
+			   struct perf_sample_data *data,
+			   struct pt_regs *regs)
+{
+	__perf_event_output(event, data, regs, perf_output_begin_backward_compat);
+}
+
 void
 perf_event_output(struct perf_event *event,
 		  struct perf_sample_data *data,
@@ -10000,10 +10016,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		event->overflow_handler	= overflow_handler;
 		event->overflow_handler_context = context;
 	} else if (is_write_backward(event)){
-		event->overflow_handler = perf_event_output_backward;
+		if (is_compat_event(event))
+			event->overflow_handler = perf_event_output_backward_compat;
+		else
+			event->overflow_handler = perf_event_output_backward;
 		event->overflow_handler_context = NULL;
 	} else {
-		event->overflow_handler = perf_event_output_forward;
+		if (is_compat_event(event))
+			event->overflow_handler = perf_event_output_forward_compat;
+		else
+			event->overflow_handler = perf_event_output_forward;
 		event->overflow_handler_context = NULL;
 	}
 
@@ -10266,6 +10288,8 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (is_write_backward(output_event) != is_write_backward(event))
 		goto out;
 
+	if (is_compat_event(output_event) != is_compat_event(event))
+		goto out;
 	/*
 	 * If both events generate aux data, they must be on the same PMU
 	 */
@@ -10499,6 +10523,9 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_cred;
 	}
 
+	if (in_compat_syscall())
+		event->event_caps |= PERF_EV_CAP_COMPAT;
+
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 			err = -EOPNOTSUPP;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 1d8ca9ea9979..8a8ca117e5de 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -85,7 +85,10 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
 	 * See perf_output_begin().
 	 */
 	smp_wmb(); /* B, matches C */
-	rb->user_page->data_head = head;
+	/*
+	 * 'unsigned long' to 'u64' promotion will zero the high word on 32 bit.
+	 */
+	WRITE_ONCE(rb->user_page->data_head, head);
 
 	/*
 	 * Now check if we missed an update -- rely on previous implied
@@ -117,7 +120,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail,
 static int __always_inline
 __perf_output_begin(struct perf_output_handle *handle,
 		    struct perf_event *event, unsigned int size,
-		    bool backward)
+		    bool backward, bool compat)
 {
 	struct ring_buffer *rb;
 	unsigned long tail, offset, head;
@@ -158,7 +161,13 @@ __perf_output_begin(struct perf_output_handle *handle,
 	perf_output_get_handle(handle);
 
 	do {
+		/*
+		 * 'u64' to 'unsigned long' demotion looses the high word on 32 bit.
+		 */
 		tail = READ_ONCE(rb->user_page->data_tail);
+		if (compat)
+			tail &= UINT_MAX;
+
 		offset = head = local_read(&rb->head);
 		if (!rb->overwrite) {
 			if (unlikely(!ring_buffer_has_space(head, tail,
@@ -183,6 +192,13 @@ __perf_output_begin(struct perf_output_handle *handle,
 			head += size;
 		else
 			head -= size;
+
+		/*
+		 * Ensure rb->head has the high word clear for compat; this
+		 * avoids having to muck with perf_output_put_handle().
+		 */
+		if (compat)
+			head &= UINT_MAX;
 	} while (local_cmpxchg(&rb->head, offset, head) != offset);
 
 	if (backward) {
@@ -234,13 +250,25 @@ __perf_output_begin(struct perf_output_handle *handle,
 int perf_output_begin_forward(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, false);
+	return __perf_output_begin(handle, event, size, false, false);
 }
 
 int perf_output_begin_backward(struct perf_output_handle *handle,
 			       struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, true);
+	return __perf_output_begin(handle, event, size, true, false);
+}
+
+int perf_output_begin_forward_compat(struct perf_output_handle *handle,
+			     struct perf_event *event, unsigned int size)
+{
+	return __perf_output_begin(handle, event, size, false, true);
+}
+
+int perf_output_begin_backward_compat(struct perf_output_handle *handle,
+			       struct perf_event *event, unsigned int size)
+{
+	return __perf_output_begin(handle, event, size, true, true);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
@@ -248,7 +276,8 @@ int perf_output_begin(struct perf_output_handle *handle,
 {
 
 	return __perf_output_begin(handle, event, size,
-				   unlikely(is_write_backward(event)));
+				   unlikely(is_write_backward(event)),
+				   unlikely(is_compat_event(event));
 }
 
 unsigned int perf_output_copy(struct perf_output_handle *handle,

  reply index

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-05-10 13:06 Mark Rutland
2018-05-11  1:19 ` kbuild test robot
2018-05-11  1:19 ` kbuild test robot
2018-05-11 10:59 ` Mark Rutland
2018-05-11 16:22   ` Peter Zijlstra
2018-05-14 11:05     ` Mark Rutland
2018-05-14 11:28       ` Peter Zijlstra
2018-05-14 15:02         ` Peter Zijlstra [this message]
2018-05-14 15:20           ` Mark Rutland
2018-05-14 15:24             ` Peter Zijlstra
2018-05-23 16:42 ` Will Deacon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180514150213.GK12235@hirez.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mingo@redhat.com \
    --cc=will.deacon@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git
	git clone --mirror https://lore.kernel.org/lkml/9 lkml/git/9.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git