All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] locking/percpu-rwsem: use this_cpu_{inc|dec}() for read_count
@ 2020-09-15 14:07 Hou Tao
  2020-09-15 15:06 ` peterz
  2020-09-18  8:36 ` [tip: locking/urgent] locking/percpu-rwsem: Use this_cpu_{inc,dec}() " tip-bot2 for Hou Tao
  0 siblings, 2 replies; 33+ messages in thread
From: Hou Tao @ 2020-09-15 14:07 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Oleg Nesterov, Will Deacon
  Cc: Dennis Zhou, Tejun Heo, Christoph Lameter, linux-kernel,
	linux-fsdevel, houtao1

On aarch64, __this_cpu_inc() is neither IRQ-safe nor atomic. So when
percpu_up_read() is invoked in IRQ context (e.g. aio completion) and
interrupts a task on the same CPU that is in the middle of
percpu_down_read(), the decrement of read_count may be lost and the
final value of read_count on that CPU will be wrong, as shown below:

  CPU 0          CPU 0

  io_submit_one
  __sb_start_write
  percpu_down_read
  __this_cpu_inc
  // there is already an in-flight IO, so
  // reading *raw_cpu_ptr(&pcp) returns 1;
  // the += is half complete when it is interrupted
  *raw_cpu_ptr(&pcp) += 1

  		nvme_irq
  		nvme_complete_cqes
  		blk_mq_complete_request
  		nvme_pci_complete_rq
  		nvme_complete_rq
  		blk_mq_end_request
  		blk_update_request
  		bio_endio
  		dio_bio_end_aio
  		aio_complete_rw
  		__sb_end_write
  		percpu_up_read
  		*raw_cpu_ptr(&pcp) -= 1
  		// *raw_cpu_ptr(&pcp) is 0

  // the decrement is overwritten by the increment
  *raw_cpu_ptr(&pcp) += 1
  // the final value is 1 + 1 = 2 instead of 1

Fix it by using the IRQ-safe helpers this_cpu_inc() and this_cpu_dec()
for operations on read_count.

Another plausible fix is to declare that percpu-rwsem must NOT be
used in IRQ context and to convert all users that may currently
do so.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/percpu-rwsem.h  | 8 ++++----
 kernel/locking/percpu-rwsem.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5e033fe1ff4e9..5fda40f97fe91 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -60,7 +60,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 	 * anything we did within this RCU-sched read-size critical section.
 	 */
 	if (likely(rcu_sync_is_idle(&sem->rss)))
-		__this_cpu_inc(*sem->read_count);
+		this_cpu_inc(*sem->read_count);
 	else
 		__percpu_down_read(sem, false); /* Unconditional memory barrier */
 	/*
@@ -79,7 +79,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	 * Same as in percpu_down_read().
 	 */
 	if (likely(rcu_sync_is_idle(&sem->rss)))
-		__this_cpu_inc(*sem->read_count);
+		this_cpu_inc(*sem->read_count);
 	else
 		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
 	preempt_enable();
@@ -103,7 +103,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 	 * Same as in percpu_down_read().
 	 */
 	if (likely(rcu_sync_is_idle(&sem->rss))) {
-		__this_cpu_dec(*sem->read_count);
+		this_cpu_dec(*sem->read_count);
 	} else {
 		/*
 		 * slowpath; reader will only ever wake a single blocked
@@ -115,7 +115,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 		 * aggregate zero, as that is the only time it matters) they
 		 * will also see our critical section.
 		 */
-		__this_cpu_dec(*sem->read_count);
+		this_cpu_dec(*sem->read_count);
 		rcuwait_wake_up(&sem->writer);
 	}
 	preempt_enable();
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 8bbafe3e5203d..70a32a576f3f2 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
-	__this_cpu_inc(*sem->read_count);
+	this_cpu_inc(*sem->read_count);
 
 	/*
 	 * Due to having preemption disabled the decrement happens on
@@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	if (likely(!atomic_read_acquire(&sem->block)))
 		return true;
 
-	__this_cpu_dec(*sem->read_count);
+	this_cpu_dec(*sem->read_count);
 
 	/* Prod writer to re-evaluate readers_active_check() */
 	rcuwait_wake_up(&sem->writer);
-- 
2.25.0.4.g0ad7144999


^ permalink raw reply related	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2020-09-29 18:07 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-15 14:07 [RFC PATCH] locking/percpu-rwsem: use this_cpu_{inc|dec}() for read_count Hou Tao
2020-09-15 15:06 ` peterz
2020-09-15 15:31   ` Oleg Nesterov
2020-09-15 15:51     ` peterz
2020-09-15 16:03       ` peterz
2020-09-15 16:11         ` Will Deacon
2020-09-15 18:11           ` peterz
2020-09-16  8:20             ` Will Deacon
2020-09-15 16:47         ` Oleg Nesterov
2020-09-16 12:32         ` Hou Tao
2020-09-16 12:51           ` peterz
2020-09-17  8:48           ` Will Deacon
2020-09-24 11:55             ` Hou Tao
2020-09-29 17:49               ` Will Deacon
2020-09-29 18:07                 ` Ard Biesheuvel
2020-09-17 10:51           ` Boaz Harrosh
2020-09-17 12:01             ` Oleg Nesterov
2020-09-17 12:48               ` Matthew Wilcox
2020-09-17 13:22                 ` peterz
2020-09-17 13:34                 ` Oleg Nesterov
2020-09-17 13:46                 ` Boaz Harrosh
2020-09-17 14:46                   ` Christoph Hellwig
2020-09-18  9:07               ` Jan Kara
2020-09-18 10:01                 ` peterz
2020-09-18 10:04                   ` peterz
2020-09-18 10:07                     ` peterz
2020-09-18 10:12                   ` peterz
2020-09-18 10:48                     ` Oleg Nesterov
2020-09-18 11:03                       ` peterz
2020-09-18 13:09                         ` Oleg Nesterov
2020-09-18 13:26                           ` Jan Kara
2020-09-20 23:49                             ` Dave Chinner
2020-09-18  8:36 ` [tip: locking/urgent] locking/percpu-rwsem: Use this_cpu_{inc,dec}() " tip-bot2 for Hou Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.