From: Matthew Wilcox <willy@infradead.org>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>, Will Deacon <will@kernel.org>,
	Waiman Long <longman@redhat.com>,
	linux-kernel@vger.kernel.org,
	Suren Baghdasaryan <surenb@google.com>,
	"Liam R. Howlett" <liam.howlett@oracle.com>,
	"Paul E. McKenney" <paulmck@kernel.org>
Subject: [RFC] Sleep waiting for an rwsem to be unlocked
Date: Tue, 9 Jan 2024 17:12:06 +0000
Message-ID: <ZZ1+ZicgN8dZ3zj3@casper.infradead.org>

The problem we're trying to solve is a lock-free walk of
/proc/$pid/maps.  If the process is modifying its VMAs at the same time
the reader is walking them, the reader can see garbage.  For page
faults, we handle this by taking the mmap_lock for read and retrying
the page fault (which excludes any further modifications).
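
For reference, the fallback on the page-fault side looks roughly like
this (a paraphrase of the mm/ code, not an exact quote):

	vma = lock_vma_under_rcu(mm, address);	/* lock-free attempt */
	if (!vma) {
		/* fall back: excludes concurrent modifications */
		mmap_read_lock(mm);
		vma = find_vma(mm, address);
		/* ... retry the fault under the lock ... */
		mmap_read_unlock(mm);
	}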

We don't want to take that approach for the maps file.  The monitoring
task may be running at a much lower priority, so taking the mmap_lock
for read can block it for a long time.  The obvious answer is to do
some kind of backoff+sleep.  But we already have a wait queue, so why
not use it?
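
For the maps reader, the intended pattern would be something like the
sketch below; vmas_changed() is a made-up name standing in for whatever
consistency check the lockless walk ends up using:

	for (;;) {
		/* speculative, lock-free walk of the VMA tree ... */
		if (!vmas_changed(mm))
			break;		/* walk was consistent; done */
		/* a writer raced with us; sleep until it has finished */
		if (rwsem_wait_killable(&mm->mmap_lock))
			return -EINTR;
		/* retry; note that we never excluded new writers */
	}

Unlike down_read(), the waiter never actually holds the lock, so it
cannot be preempted while holding it and stall tasks that want to take
the lock for write.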

I haven't done the rwbase version; this is just a demonstration of what
we could do.  It's also untested other than by compilation.  It might
well be missing something.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/rwsem.h  |   6 +++
 kernel/locking/rwsem.c | 104 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 4f1c18992f76..e7bf9dfc471a 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -250,6 +250,12 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
  */
 extern void downgrade_write(struct rw_semaphore *sem);
 
+/*
+ * wait for the current writer to finish
+ */
+void rwsem_wait(struct rw_semaphore *sem);
+int __must_check rwsem_wait_killable(struct rw_semaphore *sem);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
  * nested locking. NOTE: rwsems are not allowed to recurse
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2340b6d90ec6..7c8096c5586f 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -332,7 +332,8 @@ EXPORT_SYMBOL(__init_rwsem);
 
 enum rwsem_waiter_type {
 	RWSEM_WAITING_FOR_WRITE,
-	RWSEM_WAITING_FOR_READ
+	RWSEM_WAITING_FOR_READ,
+	RWSEM_WAITING_FOR_RELEASE,
 };
 
 struct rwsem_waiter {
@@ -511,7 +512,8 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
 			continue;
 
-		woken++;
+		if (waiter->type == RWSEM_WAITING_FOR_READ)
+			woken++;
 		list_move_tail(&waiter->list, &wlist);
 
 		/*
@@ -1401,6 +1403,67 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 	preempt_enable();
 }
 
+static inline int __wait_read_common(struct rw_semaphore *sem, int state)
+{
+	int ret = 0;
+	long adjustment = 0;
+	struct rwsem_waiter waiter;
+	DEFINE_WAKE_Q(wake_q);
+
+	waiter.task = current;
+	waiter.type = RWSEM_WAITING_FOR_RELEASE;
+	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+	waiter.handoff_set = false;
+
+	preempt_disable();
+	raw_spin_lock_irq(&sem->wait_lock);
+	if (list_empty(&sem->wait_list)) {
+		if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
+			/* Provide lock ACQUIRE */
+			smp_acquire__after_ctrl_dep();
+			raw_spin_unlock_irq(&sem->wait_lock);
+			goto done;
+		}
+		adjustment = RWSEM_FLAG_WAITERS;
+	}
+	rwsem_add_waiter(sem, &waiter);
+	if (adjustment) {
+		long count = atomic_long_add_return(adjustment, &sem->count);
+		rwsem_cond_wake_waiter(sem, count, &wake_q);
+	}
+	raw_spin_unlock_irq(&sem->wait_lock);
+
+	if (!wake_q_empty(&wake_q))
+		wake_up_q(&wake_q);
+
+	for (;;) {
+		set_current_state(state);
+		if (!smp_load_acquire(&waiter.task)) {
+			/* Matches rwsem_mark_wake()'s smp_store_release(). */
+			break;
+		}
+		if (signal_pending_state(state, current)) {
+			raw_spin_lock_irq(&sem->wait_lock);
+			if (waiter.task)
+				goto out_nolock;
+			raw_spin_unlock_irq(&sem->wait_lock);
+			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
+			break;
+		}
+		schedule_preempt_disabled();
+	}
+
+	__set_current_state(TASK_RUNNING);
+done:
+	preempt_enable();
+	return ret;
+out_nolock:
+	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
+	__set_current_state(TASK_RUNNING);
+	ret = -EINTR;
+	goto done;
+}
+
 #else /* !CONFIG_PREEMPT_RT */
 
 #define RT_MUTEX_BUILD_MUTEX
@@ -1500,6 +1563,11 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 	rwbase_write_downgrade(&sem->rwbase);
 }
 
+static inline int __wait_read_common(struct rw_semaphore *sem, int state)
+{
+	return rwbase_wait_lock(&sem->rwbase, state);
+}
+
 /* Debug stubs for the common API */
 #define DEBUG_RWSEMS_WARN_ON(c, sem)
 
@@ -1643,6 +1711,38 @@ void downgrade_write(struct rw_semaphore *sem)
 }
 EXPORT_SYMBOL(downgrade_write);
 
+/**
+ * rwsem_wait_killable - Wait for current write lock holder to release lock
+ * @sem: The semaphore to wait on.
+ *
+ * This is equivalent to calling down_read(); up_read(), but avoids the
+ * possibility that the thread will be preempted while holding the lock,
+ * causing threads that want to take the lock for write to block.  The
+ * intended use case is for lockless readers who notice an inconsistent
+ * state and want to wait for the current writer to finish.
+ */
+int rwsem_wait_killable(struct rw_semaphore *sem)
+{
+	might_sleep();
+
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+	rwsem_release(&sem->dep_map, _RET_IP_);
+
+	return __wait_read_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_wait_killable);
+
+void rwsem_wait(struct rw_semaphore *sem)
+{
+	might_sleep();
+
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+	rwsem_release(&sem->dep_map, _RET_IP_);
+
+	__wait_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(rwsem_wait);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 void down_read_nested(struct rw_semaphore *sem, int subclass)
-- 
2.43.0

