All of lore.kernel.org
 help / color / mirror / Atom feed
From: Beau Belgrave <beaub@linux.microsoft.com>
To: rostedt@goodmis.org, mhiramat@kernel.org,
	mathieu.desnoyers@efficios.com, dcook@linux.microsoft.com,
	alanau@linux.microsoft.com, brauner@kernel.org,
	akpm@linux-foundation.org
Cc: linux-trace-devel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH v6 04/11] tracing/user_events: Fixup enable faults asyncly
Date: Wed,  4 Jan 2023 13:42:23 -0800	[thread overview]
Message-ID: <20230104214230.26349-5-beaub@linux.microsoft.com> (raw)
In-Reply-To: <20230104214230.26349-1-beaub@linux.microsoft.com>

When events are enabled within the various tracing facilities, such as
ftrace/perf, the event_mutex is held. As events are enabled pages are
accessed. We do not want page faults to occur under this lock. Instead
queue the fault to a workqueue to be handled in a process context safe
way without the lock.

The enable address is marked faulting while the async fault-in occurs.
This ensures that we don't attempt to fault-in more than is necessary.
Once the page has been faulted in, an address write is re-attempted.
If the page couldn't fault-in, then we wait until the next time the
event is enabled to prevent any potential infinite loops.

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
---
 kernel/trace/trace_events_user.c | 120 +++++++++++++++++++++++++++++--
 1 file changed, 114 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index e2b3e42d256d..83f6832f891d 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -99,9 +99,23 @@ struct user_event_enabler {
 /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
 #define ENABLE_VAL_BIT_MASK 0x3F
 
+/* Bit 6 is for faulting status of enablement */
+#define ENABLE_VAL_FAULTING_BIT 6
+
 /* Only duplicate the bit value */
 #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
 
+#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values)
+
+/* Used for asynchronous faulting in of pages */
+struct user_event_enabler_fault {
+	struct work_struct work;
+	struct user_event_mm *mm;
+	struct user_event_enabler *enabler;
+};
+
+static struct kmem_cache *fault_cache;
+
 /* Global list of memory descriptors using user_events */
 static LIST_HEAD(user_event_mms);
 static DEFINE_SPINLOCK(user_event_mms_lock);
@@ -263,7 +277,85 @@ static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr)
 }
 
 static int user_event_enabler_write(struct user_event_mm *mm,
-				    struct user_event_enabler *enabler)
+				    struct user_event_enabler *enabler,
+				    bool fixup_fault);
+
+static void user_event_enabler_fault_fixup(struct work_struct *work)
+{
+	struct user_event_enabler_fault *fault = container_of(
+		work, struct user_event_enabler_fault, work);
+	struct user_event_enabler *enabler = fault->enabler;
+	struct user_event_mm *mm = fault->mm;
+	unsigned long uaddr = enabler->addr;
+	int ret;
+
+	ret = user_event_mm_fault_in(mm, uaddr);
+
+	if (ret && ret != -ENOENT) {
+		struct user_event *user = enabler->event;
+
+		pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
+			mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
+	}
+
+	/* Prevent state changes from racing */
+	mutex_lock(&event_mutex);
+
+	/*
+	 * If we managed to get the page, re-issue the write. We do not
+	 * want to get into a possible infinite loop, which is why we only
+	 * attempt again directly if the page came in. If we couldn't get
+	 * the page here, then we will try again the next time the event is
+	 * enabled/disabled.
+	 */
+	clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+	if (!ret) {
+		mmap_read_lock(mm->mm);
+		user_event_enabler_write(mm, enabler, true);
+		mmap_read_unlock(mm->mm);
+	}
+
+	mutex_unlock(&event_mutex);
+
+	/* In all cases we no longer need the mm or fault */
+	user_event_mm_put(mm);
+	kmem_cache_free(fault_cache, fault);
+}
+
+static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
+					   struct user_event_enabler *enabler)
+{
+	struct user_event_enabler_fault *fault;
+
+	fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+
+	if (!fault)
+		return false;
+
+	INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
+	fault->mm = user_event_mm_get(mm);
+	fault->enabler = enabler;
+
+	/* Don't try to queue in again while we have a pending fault */
+	set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+	if (!schedule_work(&fault->work)) {
+		/* Allow another attempt later */
+		clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+		user_event_mm_put(mm);
+		kmem_cache_free(fault_cache, fault);
+
+		return false;
+	}
+
+	return true;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+				    struct user_event_enabler *enabler,
+				    bool fixup_fault)
 {
 	unsigned long uaddr = enabler->addr;
 	unsigned long *ptr;
@@ -278,11 +370,19 @@ static int user_event_enabler_write(struct user_event_mm *mm,
 	if (refcount_read(&mm->tasks) == 0)
 		return -ENOENT;
 
+	if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))))
+		return -EBUSY;
+
 	ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
 				    &page, NULL, NULL);
 
-	if (ret <= 0) {
-		pr_warn("user_events: Enable write failed\n");
+	if (unlikely(ret <= 0)) {
+		if (!fixup_fault)
+			return -EFAULT;
+
+		if (!user_event_enabler_queue_fault(mm, enabler))
+			pr_warn("user_events: Unable to queue fault handler\n");
+
 		return -EFAULT;
 	}
 
@@ -314,7 +414,7 @@ static void user_event_enabler_update(struct user_event *user)
 
 		list_for_each_entry_rcu(enabler, &mm->enablers, link)
 			if (enabler->event == user)
-				user_event_enabler_write(mm, enabler);
+				user_event_enabler_write(mm, enabler, true);
 
 		rcu_read_unlock();
 		mmap_read_unlock(mm->mm);
@@ -562,7 +662,7 @@ static struct user_event_enabler
 
 	/* Attempt to reflect the current state within the process */
 	mmap_read_lock(user_mm->mm);
-	*write_result = user_event_enabler_write(user_mm, enabler);
+	*write_result = user_event_enabler_write(user_mm, enabler, false);
 	mmap_read_unlock(user_mm->mm);
 
 	/*
@@ -2197,16 +2297,24 @@ static int __init trace_events_user_init(void)
 {
 	int ret;
 
+	fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
+
+	if (!fault_cache)
+		return -ENOMEM;
+
 	init_group = user_event_group_create(&init_user_ns);
 
-	if (!init_group)
+	if (!init_group) {
+		kmem_cache_destroy(fault_cache);
 		return -ENOMEM;
+	}
 
 	ret = create_user_tracefs();
 
 	if (ret) {
 		pr_warn("user_events could not register with tracefs\n");
 		user_event_group_destroy(init_group);
+		kmem_cache_destroy(fault_cache);
 		init_group = NULL;
 		return ret;
 	}
-- 
2.25.1


  parent reply	other threads:[~2023-01-04 21:42 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-04 21:42 [PATCH v6 00/11] tracing/user_events: Remote write ABI Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 01/11] tracing/user_events: Split header into uapi and kernel Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 02/11] tracing/user_events: Track fork/exec/exit for mm lifetime Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 03/11] tracing/user_events: Use remote writes for event enablement Beau Belgrave
2023-01-04 21:42 ` Beau Belgrave [this message]
2023-01-04 21:42 ` [PATCH v6 05/11] tracing/user_events: Add ioctl for disabling addresses Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 06/11] tracing/user_events: Update self-tests to write ABI Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 07/11] tracing/user_events: Add ABI self-test Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 08/11] tracing/user_events: Use write ABI in example Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 09/11] tracing/user_events: Update documentation for ABI Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 10/11] tracing/user_events: Charge event allocs to cgroups Beau Belgrave
2023-01-04 21:42 ` [PATCH v6 11/11] tracing/user_events: Limit global user_event count Beau Belgrave

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230104214230.26349-5-beaub@linux.microsoft.com \
    --to=beaub@linux.microsoft.com \
    --cc=akpm@linux-foundation.org \
    --cc=alanau@linux.microsoft.com \
    --cc=brauner@kernel.org \
    --cc=dcook@linux.microsoft.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-devel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mhiramat@kernel.org \
    --cc=rostedt@goodmis.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.