From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 5.8 11/70] io_wq: Make io_wqe::lock a raw_spinlock_t
Date: Sat, 31 Oct 2020 12:35:43 +0100
Message-ID: <20201031113500.044462247@linuxfoundation.org>
In-Reply-To: <20201031113459.481803250@linuxfoundation.org>

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

commit 95da84659226d75698a1ab958be0af21d9cc2a9c upstream.

During a context switch the scheduler invokes wq_worker_sleeping() with
preemption disabled. Disabling preemption is needed because it protects
access to `worker->sleeping'. As an optimisation it also avoids invoking
schedule() from within the schedule path as part of a possible wakeup
(hence the preempt_enable_no_resched() afterwards).
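
For reference, the scheduler-side hook described above looks roughly
like this in kernels of this era (a paraphrased sketch, not part of this
patch; the exact code lives in kernel/sched/core.c and may differ in
detail):

	static inline void sched_submit_work(struct task_struct *tsk)
	{
		if (!tsk->state)
			return;
		/*
		 * Workqueue and io-wq workers are notified from inside the
		 * schedule path. Preemption is disabled so that a possible
		 * wakeup does not recurse into schedule().
		 */
		if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
			preempt_disable();
			if (tsk->flags & PF_WQ_WORKER)
				wq_worker_sleeping(tsk);
			else
				io_wq_worker_sleeping(tsk);
			preempt_enable_no_resched();
		}

		if (tsk_is_pi_blocked(tsk))
			return;

		/* Flush plugged IO before sleeping to avoid deadlocks. */
		if (blk_needs_flush_plug(tsk))
			blk_schedule_flush_plug(tsk);
	}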

The io-wq has been added to the mix in the same preemption-disabled
section. This breaks on PREEMPT_RT because io_wq_worker_sleeping()
acquires a spinlock_t, which is a sleeping lock there. Also, within
schedule() the spinlock_t would have to be acquired after the
tsk_is_pi_blocked() check, otherwise the task blocks on the sleeping
lock again while scheduling out.
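
The core of the problem is the lock type semantics: on PREEMPT_RT a
spinlock_t is backed by an rt_mutex and may sleep, while a
raw_spinlock_t always spins. A minimal illustration of the broken
pattern (illustration only, not code from this patch):

	preempt_disable();
	spin_lock_irq(&wqe->lock);	/* a sleeping lock on PREEMPT_RT:
					 * triggers "BUG: sleeping function
					 * called from invalid context" */
	io_wqe_dec_running(wqe, worker);
	spin_unlock_irq(&wqe->lock);
	preempt_enable_no_resched();

With the conversion below the lock keeps real spinlock semantics on
PREEMPT_RT and the same sequence is valid:

	preempt_disable();
	raw_spin_lock_irq(&wqe->lock);	/* never sleeps */
	io_wqe_dec_running(wqe, worker);
	raw_spin_unlock_irq(&wqe->lock);
	preempt_enable_no_resched();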

While playing with `io_uring-bench' I didn't notice a significant
latency spike after converting io_wqe::lock to a raw_spinlock_t. The
latency was more or less the same.

In order to keep the spinlock_t, the io_wq_worker_sleeping() invocation
would have to be moved after the tsk_is_pi_blocked() check, which would
introduce a branch instruction into the hot path.

The lock is used to maintain the `work_list' and the locked section
wakes up at most one task. Should io_wqe_cancel_pending_work() cause
latency spikes while searching for a specific item, it would need to
drop the lock during the iteration.
revert_creds() is also invoked under the lock. According to debugging,
cred::non_rcu is 0 here. If it were not, the call would have to be moved
outside of the locked section because put_cred_rcu()->free_uid()
acquires a sleeping lock.

Convert io_wqe::lock to a raw_spinlock_t.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/io-wq.c |   52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -88,7 +88,7 @@ enum {
  */
 struct io_wqe {
 	struct {
-		spinlock_t lock;
+		raw_spinlock_t lock;
 		struct io_wq_work_list work_list;
 		unsigned long hash_map;
 		unsigned flags;
@@ -149,7 +149,7 @@ static bool __io_worker_unuse(struct io_
 
 	if (current->files != worker->restore_files) {
 		__acquire(&wqe->lock);
-		spin_unlock_irq(&wqe->lock);
+		raw_spin_unlock_irq(&wqe->lock);
 		dropped_lock = true;
 
 		task_lock(current);
@@ -168,7 +168,7 @@ static bool __io_worker_unuse(struct io_
 	if (worker->mm) {
 		if (!dropped_lock) {
 			__acquire(&wqe->lock);
-			spin_unlock_irq(&wqe->lock);
+			raw_spin_unlock_irq(&wqe->lock);
 			dropped_lock = true;
 		}
 		__set_current_state(TASK_RUNNING);
@@ -222,17 +222,17 @@ static void io_worker_exit(struct io_wor
 	worker->flags = 0;
 	preempt_enable();
 
-	spin_lock_irq(&wqe->lock);
+	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
 	if (__io_worker_unuse(wqe, worker)) {
 		__release(&wqe->lock);
-		spin_lock_irq(&wqe->lock);
+		raw_spin_lock_irq(&wqe->lock);
 	}
 	acct->nr_workers--;
 	nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
 			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
-	spin_unlock_irq(&wqe->lock);
+	raw_spin_unlock_irq(&wqe->lock);
 
 	/* all workers gone, wq exit can proceed */
 	if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
@@ -508,7 +508,7 @@ get_next:
 		else if (!wq_list_empty(&wqe->work_list))
 			wqe->flags |= IO_WQE_FLAG_STALLED;
 
-		spin_unlock_irq(&wqe->lock);
+		raw_spin_unlock_irq(&wqe->lock);
 		if (!work)
 			break;
 		io_assign_current_work(worker, work);
@@ -543,7 +543,7 @@ get_next:
 				io_wqe_enqueue(wqe, linked);
 
 			if (hash != -1U && !next_hashed) {
-				spin_lock_irq(&wqe->lock);
+				raw_spin_lock_irq(&wqe->lock);
 				wqe->hash_map &= ~BIT_ULL(hash);
 				wqe->flags &= ~IO_WQE_FLAG_STALLED;
 				/* dependent work is not hashed */
@@ -551,11 +551,11 @@ get_next:
 				/* skip unnecessary unlock-lock wqe->lock */
 				if (!work)
 					goto get_next;
-				spin_unlock_irq(&wqe->lock);
+				raw_spin_unlock_irq(&wqe->lock);
 			}
 		} while (work);
 
-		spin_lock_irq(&wqe->lock);
+		raw_spin_lock_irq(&wqe->lock);
 	} while (1);
 }
 
@@ -570,7 +570,7 @@ static int io_wqe_worker(void *data)
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 loop:
-		spin_lock_irq(&wqe->lock);
+		raw_spin_lock_irq(&wqe->lock);
 		if (io_wqe_run_queue(wqe)) {
 			__set_current_state(TASK_RUNNING);
 			io_worker_handle_work(worker);
@@ -581,7 +581,7 @@ loop:
 			__release(&wqe->lock);
 			goto loop;
 		}
-		spin_unlock_irq(&wqe->lock);
+		raw_spin_unlock_irq(&wqe->lock);
 		if (signal_pending(current))
 			flush_signals(current);
 		if (schedule_timeout(WORKER_IDLE_TIMEOUT))
@@ -593,11 +593,11 @@ loop:
 	}
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
-		spin_lock_irq(&wqe->lock);
+		raw_spin_lock_irq(&wqe->lock);
 		if (!wq_list_empty(&wqe->work_list))
 			io_worker_handle_work(worker);
 		else
-			spin_unlock_irq(&wqe->lock);
+			raw_spin_unlock_irq(&wqe->lock);
 	}
 
 	io_worker_exit(worker);
@@ -637,9 +637,9 @@ void io_wq_worker_sleeping(struct task_s
 
 	worker->flags &= ~IO_WORKER_F_RUNNING;
 
-	spin_lock_irq(&wqe->lock);
+	raw_spin_lock_irq(&wqe->lock);
 	io_wqe_dec_running(wqe, worker);
-	spin_unlock_irq(&wqe->lock);
+	raw_spin_unlock_irq(&wqe->lock);
 }
 
 static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -663,7 +663,7 @@ static bool create_io_worker(struct io_w
 		return false;
 	}
 
-	spin_lock_irq(&wqe->lock);
+	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
 	worker->flags |= IO_WORKER_F_FREE;
@@ -672,7 +672,7 @@ static bool create_io_worker(struct io_w
 	if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
 		worker->flags |= IO_WORKER_F_FIXED;
 	acct->nr_workers++;
-	spin_unlock_irq(&wqe->lock);
+	raw_spin_unlock_irq(&wqe->lock);
 
 	if (index == IO_WQ_ACCT_UNBOUND)
 		atomic_inc(&wq->user->processes);
@@ -727,12 +727,12 @@ static int io_wq_manager(void *data)
 			if (!node_online(node))
 				continue;
 
-			spin_lock_irq(&wqe->lock);
+			raw_spin_lock_irq(&wqe->lock);
 			if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
 				fork_worker[IO_WQ_ACCT_BOUND] = true;
 			if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
 				fork_worker[IO_WQ_ACCT_UNBOUND] = true;
-			spin_unlock_irq(&wqe->lock);
+			raw_spin_unlock_irq(&wqe->lock);
 			if (fork_worker[IO_WQ_ACCT_BOUND])
 				create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
 			if (fork_worker[IO_WQ_ACCT_UNBOUND])
@@ -829,10 +829,10 @@ static void io_wqe_enqueue(struct io_wqe
 	}
 
 	work_flags = work->flags;
-	spin_lock_irqsave(&wqe->lock, flags);
+	raw_spin_lock_irqsave(&wqe->lock, flags);
 	io_wqe_insert_work(wqe, work);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
-	spin_unlock_irqrestore(&wqe->lock, flags);
+	raw_spin_unlock_irqrestore(&wqe->lock, flags);
 
 	if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
 	    !atomic_read(&acct->nr_running))
@@ -959,13 +959,13 @@ static void io_wqe_cancel_pending_work(s
 	unsigned long flags;
 
 retry:
-	spin_lock_irqsave(&wqe->lock, flags);
+	raw_spin_lock_irqsave(&wqe->lock, flags);
 	wq_list_for_each(node, prev, &wqe->work_list) {
 		work = container_of(node, struct io_wq_work, list);
 		if (!match->fn(work, match->data))
 			continue;
 		io_wqe_remove_pending(wqe, work, prev);
-		spin_unlock_irqrestore(&wqe->lock, flags);
+		raw_spin_unlock_irqrestore(&wqe->lock, flags);
 		io_run_cancel(work, wqe);
 		match->nr_pending++;
 		if (!match->cancel_all)
@@ -974,7 +974,7 @@ retry:
 		/* not safe to continue after unlock */
 		goto retry;
 	}
-	spin_unlock_irqrestore(&wqe->lock, flags);
+	raw_spin_unlock_irqrestore(&wqe->lock, flags);
 }
 
 static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1082,7 +1082,7 @@ struct io_wq *io_wq_create(unsigned boun
 		}
 		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
 		wqe->wq = wq;
-		spin_lock_init(&wqe->lock);
+		raw_spin_lock_init(&wqe->lock);
 		INIT_WQ_LIST(&wqe->work_list);
 		INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
 		INIT_LIST_HEAD(&wqe->all_list);



