From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@redhat.com>,
	linux-kernel@vger.kernel.org, acme@redhat.com,
	kirill.shutemov@linux.intel.com, Borislav Petkov <bp@alien8.de>,
	rric@kernel.org,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>
Subject: [RFC PATCH 02/17] perf: Factor out mlock accounting
Date: Tue,  5 Sep 2017 16:30:11 +0300
Message-ID: <20170905133026.13689-3-alexander.shishkin@linux.intel.com>
In-Reply-To: <20170905133026.13689-1-alexander.shishkin@linux.intel.com>

This patch moves the ring buffer memory accounting down into the rb_alloc()
path so that its callers no longer have to worry about it. As a side effect,
it also slightly cleans up perf_mmap().

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
 kernel/events/core.c        |  67 +++-----------------
 kernel/events/internal.h    |   5 +-
 kernel/events/ring_buffer.c | 145 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 136 insertions(+), 81 deletions(-)
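
(Not part of the patch: for reviewers skimming the diff, below is a minimal
sketch of the accounting policy that ring_buffer_account() implements in
kernel/events/ring_buffer.c. The helper name and simplified signature are
illustrative only; it reuses the existing sysctl_perf_event_mlock,
RLIMIT_MEMLOCK and locked_vm/pinned_vm machinery and leaves out the
ring_buffer bookkeeping and the AUX/user-page special cases.)

	/*
	 * Illustrative sketch, not part of the patch: pages within the
	 * per-user perf_event_mlock budget are charged to user->locked_vm;
	 * anything above it is pinned on the consumer's mm and checked
	 * against RLIMIT_MEMLOCK.
	 */
	static int mlock_account_sketch(struct user_struct *user,
					struct mm_struct *mm,
					unsigned long nr_pages)
	{
		unsigned long limit, locked, pinned = 0;

		limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
		limit *= num_online_cpus();

		locked = atomic_long_read(&user->locked_vm) + nr_pages;
		if (locked > limit) {
			/* the overflow must be pinned on the consumer's mm */
			pinned = locked - limit;

			if (mm->pinned_vm + pinned >
			    (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT) &&
			    perf_paranoid_tracepoint_raw() &&
			    !capable(CAP_IPC_LOCK))
				return -EPERM;

			mm->pinned_vm += pinned;
		}

		atomic_long_add(nr_pages, &user->locked_vm);
		return 0;
	}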

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9389e27cb0..24099ed9e5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5122,6 +5122,8 @@ void ring_buffer_put(struct ring_buffer *rb)
 	if (!atomic_dec_and_test(&rb->refcount))
 		return;
 
+	ring_buffer_unaccount(rb, false);
+
 	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
@@ -5156,9 +5158,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	struct ring_buffer *rb = ring_buffer_get(event);
-	struct user_struct *mmap_user = rb->mmap_user;
-	int mmap_locked = rb->mmap_locked;
-	unsigned long size = perf_data_size(rb);
 
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5178,11 +5177,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		 */
 		perf_pmu_output_stop(event);
 
-		/* now it's safe to free the pages */
-		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
-		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
-
-		/* this has to be the last one */
+		/* now it's safe to free the pages; ought to be the last one */
 		rb_free_aux(rb);
 		WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
 
@@ -5243,19 +5238,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	}
 	rcu_read_unlock();
 
-	/*
-	 * It could be there's still a few 0-ref events on the list; they'll
-	 * get cleaned up by free_event() -- they'll also still have their
-	 * ref on the rb and will free it whenever they are done with it.
-	 *
-	 * Aside from that, this buffer is 'fully' detached and unmapped,
-	 * undo the VM accounting.
-	 */
-
-	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-	vma->vm_mm->pinned_vm -= mmap_locked;
-	free_uid(mmap_user);
-
 out_put:
 	ring_buffer_put(rb); /* could be last */
 }
@@ -5270,13 +5252,9 @@ static const struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_event *event = file->private_data;
-	unsigned long user_locked, user_lock_limit;
-	struct user_struct *user = current_user();
-	unsigned long locked, lock_limit;
 	struct ring_buffer *rb = NULL;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	long user_extra = 0, extra = 0;
 	int ret = 0, flags = 0;
 
 	/*
@@ -5347,7 +5325,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		}
 
 		atomic_set(&rb->aux_mmap_count, 1);
-		user_extra = nr_pages;
 
 		goto accounting;
 	}
@@ -5384,49 +5361,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	user_extra = nr_pages + 1;
-
 accounting:
-	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
-
-	/*
-	 * Increase the limit linearly with more CPUs:
-	 */
-	user_lock_limit *= num_online_cpus();
-
-	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
-
-	if (user_locked > user_lock_limit)
-		extra = user_locked - user_lock_limit;
-
-	lock_limit = rlimit(RLIMIT_MEMLOCK);
-	lock_limit >>= PAGE_SHIFT;
-	locked = vma->vm_mm->pinned_vm + extra;
-
-	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
-		!capable(CAP_IPC_LOCK)) {
-		ret = -EPERM;
-		goto unlock;
-	}
-
 	WARN_ON(!rb && event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
 		flags |= RING_BUFFER_WRITABLE;
 
 	if (!rb) {
-		rb = rb_alloc(nr_pages,
+		rb = rb_alloc(vma->vm_mm, nr_pages,
 			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
 			      event->cpu, flags);
 
-		if (!rb) {
-			ret = -ENOMEM;
+		if (IS_ERR_OR_NULL(rb)) {
+			ret = PTR_ERR(rb);
+			rb = NULL;
 			goto unlock;
 		}
 
 		atomic_set(&rb->mmap_count, 1);
-		rb->mmap_user = get_current_user();
-		rb->mmap_locked = extra;
 
 		ring_buffer_attach(event, rb);
 
@@ -5435,15 +5387,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	} else {
 		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
 				   event->attr.aux_watermark, flags);
-		if (!ret)
-			rb->aux_mmap_locked = extra;
 	}
 
 unlock:
 	if (!ret) {
-		atomic_long_add(user_extra, &user->locked_vm);
-		vma->vm_mm->pinned_vm += extra;
-
 		atomic_inc(&event->mmap_count);
 	} else if (rb) {
 		atomic_dec(&rb->mmap_count);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 843e970473..3e603c45eb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -36,6 +36,7 @@ struct ring_buffer {
 	atomic_t			mmap_count;
 	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
+	struct mm_struct		*mmap_mapping;
 
 	/* AUX area */
 	long				aux_head;
@@ -56,6 +57,7 @@ struct ring_buffer {
 };
 
 extern void rb_free(struct ring_buffer *rb);
+extern void ring_buffer_unaccount(struct ring_buffer *rb, bool aux);
 
 static inline void rb_free_rcu(struct rcu_head *rcu_head)
 {
@@ -74,7 +76,8 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
 }
 
 extern struct ring_buffer *
-rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+rb_alloc(struct mm_struct *mm, int nr_pages, long watermark, int cpu,
+	 int flags);
 extern void perf_event_wakeup(struct perf_event *event);
 extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 			pgoff_t pgoff, int nr_pages, long watermark, int flags);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12..d36f169cae 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -505,6 +505,88 @@ void *perf_get_aux(struct perf_output_handle *handle)
 	return handle->rb->aux_priv;
 }
 
+/*
+ * Check if the current user can afford @nr_pages, considering the
+ * perf_event_mlock sysctl and their mlock limit. If the former is exceeded,
+ * pin the remainder on their mm, if the latter is not sufficient either,
+ * error out. Otherwise, keep track of the pages used in the ring_buffer so
+ * that the accounting can be undone when the pages are freed.
+ */
+static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
+			       unsigned long nr_pages, bool aux)
+{
+	unsigned long total, limit, pinned;
+
+	if (!mm)
+		mm = rb->mmap_mapping;
+
+	rb->mmap_user = current_user();
+
+	limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+	/*
+	 * Increase the limit linearly with more CPUs:
+	 */
+	limit *= num_online_cpus();
+
+	total = atomic_long_read(&rb->mmap_user->locked_vm) + nr_pages;
+
+	pinned = 0;
+	if (total > limit) {
+		/*
+		 * Everything that's over the sysctl_perf_event_mlock
+		 * limit needs to be accounted to the consumer's mm.
+		 */
+		if (!mm)
+			return -EPERM;
+
+		pinned = total - limit;
+
+		limit = rlimit(RLIMIT_MEMLOCK);
+		limit >>= PAGE_SHIFT;
+		total = mm->pinned_vm + pinned;
+
+		if ((total > limit) && perf_paranoid_tracepoint_raw() &&
+		    !capable(CAP_IPC_LOCK)) {
+			return -EPERM;
+		}
+
+		if (aux)
+			rb->aux_mmap_locked = pinned;
+		else
+			rb->mmap_locked = pinned;
+
+		mm->pinned_vm += pinned;
+	}
+
+	if (!rb->mmap_mapping)
+		rb->mmap_mapping = mm;
+
+	/* account for user page */
+	if (!aux)
+		nr_pages++;
+
+	rb->mmap_user = get_current_user();
+	atomic_long_add(nr_pages, &rb->mmap_user->locked_vm);
+
+	return 0;
+}
+
+/*
+ * Undo the mlock pages accounting done in ring_buffer_account().
+ */
+void ring_buffer_unaccount(struct ring_buffer *rb, bool aux)
+{
+	unsigned long nr_pages = aux ? rb->aux_nr_pages : rb->nr_pages + 1;
+	unsigned long pinned = aux ? rb->aux_mmap_locked : rb->mmap_locked;
+
+	atomic_long_sub(nr_pages, &rb->mmap_user->locked_vm);
+	if (rb->mmap_mapping)
+		rb->mmap_mapping->pinned_vm -= pinned;
+
+	free_uid(rb->mmap_user);
+}
+
 #define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
 static struct page *rb_alloc_aux_page(int node, int order)
@@ -574,11 +656,16 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 {
 	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
 	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
-	int ret = -ENOMEM, max_order = 0;
+	int ret, max_order = 0;
 
 	if (!has_aux(event))
 		return -EOPNOTSUPP;
 
+	ret = ring_buffer_account(rb, NULL, nr_pages, true);
+	if (ret)
+		return ret;
+
+	ret = -ENOMEM;
 	if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
 		/*
 		 * We need to start with the max_order that fits in nr_pages,
@@ -593,7 +680,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 		if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
 		    !overwrite) {
 			if (!max_order)
-				return -EINVAL;
+				goto out;
 
 			max_order--;
 		}
@@ -654,18 +741,23 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 		rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
 
 out:
-	if (!ret)
+	if (!ret) {
 		rb->aux_pgoff = pgoff;
-	else
+	} else {
+		ring_buffer_unaccount(rb, true);
 		__rb_free_aux(rb);
+	}
 
 	return ret;
 }
 
 void rb_free_aux(struct ring_buffer *rb)
 {
-	if (atomic_dec_and_test(&rb->aux_refcount))
+	if (atomic_dec_and_test(&rb->aux_refcount)) {
+		ring_buffer_unaccount(rb, true);
+
 		__rb_free_aux(rb);
+	}
 }
 
 #ifndef CONFIG_PERF_USE_VMALLOC
@@ -699,22 +791,25 @@ static void *perf_mmap_alloc_page(int cpu)
 	return page_address(page);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+			     int cpu, int flags)
 {
+	unsigned long size = offsetof(struct ring_buffer, data_pages[nr_pages]);
 	struct ring_buffer *rb;
-	unsigned long size;
-	int i;
-
-	size = sizeof(struct ring_buffer);
-	size += nr_pages * sizeof(void *);
+	int i, ret = -ENOMEM;
 
 	rb = kzalloc(size, GFP_KERNEL);
 	if (!rb)
 		goto fail;
 
+	ret = ring_buffer_account(rb, mm, nr_pages, false);
+	if (ret)
+		goto fail_free_rb;
+
+	ret = -ENOMEM;
 	rb->user_page = perf_mmap_alloc_page(cpu);
 	if (!rb->user_page)
-		goto fail_user_page;
+		goto fail_unaccount;
 
 	for (i = 0; i < nr_pages; i++) {
 		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
@@ -734,11 +829,14 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 
 	free_page((unsigned long)rb->user_page);
 
-fail_user_page:
+fail_unaccount:
+	ring_buffer_unaccount(rb, false);
+
+fail_free_rb:
 	kfree(rb);
 
 fail:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 static void perf_mmap_free_page(unsigned long addr)
@@ -805,19 +903,23 @@ void rb_free(struct ring_buffer *rb)
 	schedule_work(&rb->work);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+			     int cpu, int flags)
 {
+	unsigned long size = offsetof(struct ring_buffer, data_pages[1]);
 	struct ring_buffer *rb;
-	unsigned long size;
 	void *all_buf;
-
-	size = sizeof(struct ring_buffer);
-	size += sizeof(void *);
+	int ret = -ENOMEM;
 
 	rb = kzalloc(size, GFP_KERNEL);
 	if (!rb)
 		goto fail;
 
+	ret = ring_buffer_account(rb, mm, nr_pages, false);
+	if (ret)
+		goto fail_free;
+
+	ret = -ENOMEM;
 	INIT_WORK(&rb->work, rb_free_work);
 
 	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
@@ -836,10 +938,13 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 	return rb;
 
 fail_all_buf:
+	ring_buffer_unaccount(rb, false);
+
+fail_free:
 	kfree(rb);
 
 fail:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 #endif
-- 
2.14.1
