From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@redhat.com>,
	linux-kernel@vger.kernel.org, acme@redhat.com,
	kirill.shutemov@linux.intel.com, Borislav Petkov <bp@alien8.de>,
	rric@kernel.org,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>
Subject: [RFC PATCH 02/17] perf: Factor out mlock accounting
Date: Tue,  5 Sep 2017 16:30:11 +0300
Message-ID: <20170905133026.13689-3-alexander.shishkin@linux.intel.com>
In-Reply-To: <20170905133026.13689-1-alexander.shishkin@linux.intel.com>

This patch moves the ring buffer memory accounting down into the rb_alloc()
path so that its callers no longer have to worry about it. As a side effect,
it also slightly cleans up perf_mmap().

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
 kernel/events/core.c        |  67 +++-----------------
 kernel/events/internal.h    |   5 +-
 kernel/events/ring_buffer.c | 145 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 136 insertions(+), 81 deletions(-)
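
(Not part of the patch: for reviewers skimming the diff, below is a minimal
sketch of the accounting policy that ring_buffer_account() implements in
kernel/events/ring_buffer.c. The helper name and simplified signature are
illustrative only; it reuses the existing sysctl_perf_event_mlock,
RLIMIT_MEMLOCK and locked_vm/pinned_vm machinery and leaves out the
ring_buffer bookkeeping and the AUX/user-page special cases.)

	/*
	 * Illustrative sketch, not part of the patch: pages within the
	 * per-user perf_event_mlock budget are charged to user->locked_vm;
	 * anything above it is pinned on the consumer's mm and checked
	 * against RLIMIT_MEMLOCK.
	 */
	static int mlock_account_sketch(struct user_struct *user,
					struct mm_struct *mm,
					unsigned long nr_pages)
	{
		unsigned long limit, locked, pinned = 0;

		limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
		limit *= num_online_cpus();

		locked = atomic_long_read(&user->locked_vm) + nr_pages;
		if (locked > limit) {
			/* the overflow must be pinned on the consumer's mm */
			pinned = locked - limit;

			if (mm->pinned_vm + pinned >
			    (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT) &&
			    perf_paranoid_tracepoint_raw() &&
			    !capable(CAP_IPC_LOCK))
				return -EPERM;

			mm->pinned_vm += pinned;
		}

		atomic_long_add(nr_pages, &user->locked_vm);
		return 0;
	}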

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9389e27cb0..24099ed9e5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5122,6 +5122,8 @@ void ring_buffer_put(struct ring_buffer *rb)
 	if (!atomic_dec_and_test(&rb->refcount))
 		return;
 
+	ring_buffer_unaccount(rb, false);
+
 	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
@@ -5156,9 +5158,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	struct ring_buffer *rb = ring_buffer_get(event);
-	struct user_struct *mmap_user = rb->mmap_user;
-	int mmap_locked = rb->mmap_locked;
-	unsigned long size = perf_data_size(rb);
 
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5178,11 +5177,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		 */
 		perf_pmu_output_stop(event);
 
-		/* now it's safe to free the pages */
-		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
-		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
-
-		/* this has to be the last one */
+		/* now it's safe to free the pages; ought to be the last one */
 		rb_free_aux(rb);
 		WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
 
@@ -5243,19 +5238,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	}
 	rcu_read_unlock();
 
-	/*
-	 * It could be there's still a few 0-ref events on the list; they'll
-	 * get cleaned up by free_event() -- they'll also still have their
-	 * ref on the rb and will free it whenever they are done with it.
-	 *
-	 * Aside from that, this buffer is 'fully' detached and unmapped,
-	 * undo the VM accounting.
-	 */
-
-	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-	vma->vm_mm->pinned_vm -= mmap_locked;
-	free_uid(mmap_user);
-
 out_put:
 	ring_buffer_put(rb); /* could be last */
 }
@@ -5270,13 +5252,9 @@ static const struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_event *event = file->private_data;
-	unsigned long user_locked, user_lock_limit;
-	struct user_struct *user = current_user();
-	unsigned long locked, lock_limit;
 	struct ring_buffer *rb = NULL;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	long user_extra = 0, extra = 0;
 	int ret = 0, flags = 0;
 
 	/*
@@ -5347,7 +5325,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		}
 
 		atomic_set(&rb->aux_mmap_count, 1);
-		user_extra = nr_pages;
 
 		goto accounting;
 	}
@@ -5384,49 +5361,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	user_extra = nr_pages + 1;
-
 accounting:
-	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
-
-	/*
-	 * Increase the limit linearly with more CPUs:
-	 */
-	user_lock_limit *= num_online_cpus();
-
-	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
-
-	if (user_locked > user_lock_limit)
-		extra = user_locked - user_lock_limit;
-
-	lock_limit = rlimit(RLIMIT_MEMLOCK);
-	lock_limit >>= PAGE_SHIFT;
-	locked = vma->vm_mm->pinned_vm + extra;
-
-	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
-		!capable(CAP_IPC_LOCK)) {
-		ret = -EPERM;
-		goto unlock;
-	}
-
 	WARN_ON(!rb && event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
 		flags |= RING_BUFFER_WRITABLE;
 
 	if (!rb) {
-		rb = rb_alloc(nr_pages,
+		rb = rb_alloc(vma->vm_mm, nr_pages,
 			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
 			      event->cpu, flags);
 
-		if (!rb) {
-			ret = -ENOMEM;
+		if (IS_ERR_OR_NULL(rb)) {
+			ret = PTR_ERR(rb);
+			rb = NULL;
 			goto unlock;
 		}
 
 		atomic_set(&rb->mmap_count, 1);
-		rb->mmap_user = get_current_user();
-		rb->mmap_locked = extra;
 
 		ring_buffer_attach(event, rb);
 
@@ -5435,15 +5387,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	} else {
 		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
 				   event->attr.aux_watermark, flags);
-		if (!ret)
-			rb->aux_mmap_locked = extra;
 	}
 
 unlock:
 	if (!ret) {
-		atomic_long_add(user_extra, &user->locked_vm);
-		vma->vm_mm->pinned_vm += extra;
-
 		atomic_inc(&event->mmap_count);
 	} else if (rb) {
 		atomic_dec(&rb->mmap_count);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 843e970473..3e603c45eb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -36,6 +36,7 @@ struct ring_buffer {
 	atomic_t			mmap_count;
 	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
+	struct mm_struct		*mmap_mapping;
 
 	/* AUX area */
 	long				aux_head;
@@ -56,6 +57,7 @@ struct ring_buffer {
 };
 
 extern void rb_free(struct ring_buffer *rb);
+extern void ring_buffer_unaccount(struct ring_buffer *rb, bool aux);
 
 static inline void rb_free_rcu(struct rcu_head *rcu_head)
 {
@@ -74,7 +76,8 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
 }
 
 extern struct ring_buffer *
-rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+rb_alloc(struct mm_struct *mm, int nr_pages, long watermark, int cpu,
+	 int flags);
 extern void perf_event_wakeup(struct perf_event *event);
 extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 			pgoff_t pgoff, int nr_pages, long watermark, int flags);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12..d36f169cae 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -505,6 +505,88 @@ void *perf_get_aux(struct perf_output_handle *handle)
 	return handle->rb->aux_priv;
 }
 
+/*
+ * Check if the current user can afford @nr_pages, considering the
+ * perf_event_mlock sysctl and their mlock limit. If the former is exceeded,
+ * pin the remainder on their mm, if the latter is not sufficient either,
+ * error out. Otherwise, keep track of the pages used in the ring_buffer so
+ * that the accounting can be undone when the pages are freed.
+ */
+static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
+			       unsigned long nr_pages, bool aux)
+{
+	unsigned long total, limit, pinned;
+
+	if (!mm)
+		mm = rb->mmap_mapping;
+
+	rb->mmap_user = current_user();
+
+	limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+	/*
+	 * Increase the limit linearly with more CPUs:
+	 */
+	limit *= num_online_cpus();
+
+	total = atomic_long_read(&rb->mmap_user->locked_vm) + nr_pages;
+
+	pinned = 0;
+	if (total > limit) {
+		/*
+		 * Everything that's over the sysctl_perf_event_mlock
+		 * limit needs to be accounted to the consumer's mm.
+		 */
+		if (!mm)
+			return -EPERM;
+
+		pinned = total - limit;
+
+		limit = rlimit(RLIMIT_MEMLOCK);
+		limit >>= PAGE_SHIFT;
+		total = mm->pinned_vm + pinned;
+
+		if ((total > limit) && perf_paranoid_tracepoint_raw() &&
+		    !capable(CAP_IPC_LOCK)) {
+			return -EPERM;
+		}
+
+		if (aux)
+			rb->aux_mmap_locked = pinned;
+		else
+			rb->mmap_locked = pinned;
+
+		mm->pinned_vm += pinned;
+	}
+
+	if (!rb->mmap_mapping)
+		rb->mmap_mapping = mm;
+
+	/* account for user page */
+	if (!aux)
+		nr_pages++;
+
+	rb->mmap_user = get_current_user();
+	atomic_long_add(nr_pages, &rb->mmap_user->locked_vm);
+
+	return 0;
+}
+
+/*
+ * Undo the mlock pages accounting done in ring_buffer_account().
+ */
+void ring_buffer_unaccount(struct ring_buffer *rb, bool aux)
+{
+	unsigned long nr_pages = aux ? rb->aux_nr_pages : rb->nr_pages + 1;
+	unsigned long pinned = aux ? rb->aux_mmap_locked : rb->mmap_locked;
+
+	atomic_long_sub(nr_pages, &rb->mmap_user->locked_vm);
+	if (rb->mmap_mapping)
+		rb->mmap_mapping->pinned_vm -= pinned;
+
+	free_uid(rb->mmap_user);
+}
+
 #define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
 static struct page *rb_alloc_aux_page(int node, int order)
@@ -574,11 +656,16 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 {
 	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
 	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
-	int ret = -ENOMEM, max_order = 0;
+	int ret, max_order = 0;
 
 	if (!has_aux(event))
 		return -EOPNOTSUPP;
 
+	ret = ring_buffer_account(rb, NULL, nr_pages, true);
+	if (ret)
+		return ret;
+
+	ret = -ENOMEM;
 	if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
 		/*
 		 * We need to start with the max_order that fits in nr_pages,
@@ -593,7 +680,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 		if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
 		    !overwrite) {
 			if (!max_order)
-				return -EINVAL;
+				goto out;
 
 			max_order--;
 		}
@@ -654,18 +741,23 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 		rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
 
 out:
-	if (!ret)
+	if (!ret) {
 		rb->aux_pgoff = pgoff;
-	else
+	} else {
+		ring_buffer_unaccount(rb, true);
 		__rb_free_aux(rb);
+	}
 
 	return ret;
 }
 
 void rb_free_aux(struct ring_buffer *rb)
 {
-	if (atomic_dec_and_test(&rb->aux_refcount))
+	if (atomic_dec_and_test(&rb->aux_refcount)) {
+		ring_buffer_unaccount(rb, true);
+
 		__rb_free_aux(rb);
+	}
 }
 
 #ifndef CONFIG_PERF_USE_VMALLOC
@@ -699,22 +791,25 @@ static void *perf_mmap_alloc_page(int cpu)
 	return page_address(page);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+			     int cpu, int flags)
 {
+	unsigned long size = offsetof(struct ring_buffer, data_pages[nr_pages]);
 	struct ring_buffer *rb;
-	unsigned long size;
-	int i;
-
-	size = sizeof(struct ring_buffer);
-	size += nr_pages * sizeof(void *);
+	int i, ret = -ENOMEM;
 
 	rb = kzalloc(size, GFP_KERNEL);
 	if (!rb)
 		goto fail;
 
+	ret = ring_buffer_account(rb, mm, nr_pages, false);
+	if (ret)
+		goto fail_free_rb;
+
+	ret = -ENOMEM;
 	rb->user_page = perf_mmap_alloc_page(cpu);
 	if (!rb->user_page)
-		goto fail_user_page;
+		goto fail_unaccount;
 
 	for (i = 0; i < nr_pages; i++) {
 		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
@@ -734,11 +829,14 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 
 	free_page((unsigned long)rb->user_page);
 
-fail_user_page:
+fail_unaccount:
+	ring_buffer_unaccount(rb, false);
+
+fail_free_rb:
 	kfree(rb);
 
 fail:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 static void perf_mmap_free_page(unsigned long addr)
@@ -805,19 +903,23 @@ void rb_free(struct ring_buffer *rb)
 	schedule_work(&rb->work);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+			     int cpu, int flags)
 {
+	unsigned long size = offsetof(struct ring_buffer, data_pages[1]);
 	struct ring_buffer *rb;
-	unsigned long size;
 	void *all_buf;
-
-	size = sizeof(struct ring_buffer);
-	size += sizeof(void *);
+	int ret = -ENOMEM;
 
 	rb = kzalloc(size, GFP_KERNEL);
 	if (!rb)
 		goto fail;
 
+	ret = ring_buffer_account(rb, mm, nr_pages, false);
+	if (ret)
+		goto fail_free;
+
+	ret = -ENOMEM;
 	INIT_WORK(&rb->work, rb_free_work);
 
 	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
@@ -836,10 +938,13 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 	return rb;
 
 fail_all_buf:
+	ring_buffer_unaccount(rb, false);
+
+fail_free:
 	kfree(rb);
 
 fail:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 #endif
-- 
2.14.1
