linux-rt-users.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: Unfair qspinlocks on ARM64 without LSE atomics => 3ms delay in interrupt handling
@ 2023-04-26 12:03 Bouska, Zdenek
  2023-04-26 21:29 ` Thomas Gleixner
  0 siblings, 1 reply; 13+ messages in thread
From: Bouska, Zdenek @ 2023-04-26 12:03 UTC (permalink / raw)
  To: Will Deacon, Catalin Marinas
  Cc: Thomas Gleixner, linux-arm-kernel, linux-kernel, Kiszka, Jan,
	linux-rt-users, Nishanth Menon, Puranjay Mohan

Hello,

The following patch is my current approach for fixing this issue. I introduced
big_cpu_relax(), which uses Will's implementation [1] on ARM64 without
LSE atomics and the original cpu_relax() on any other CPU.

Does anyone have a better idea of how to solve this issue properly?

[1] https://lore.kernel.org/lkml/20170728092831.GA24839@arm.com/

Zdenek Bouska

--
Siemens, s.r.o
Siemens Advanta Development

diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 3918f2a67970..f3861ab9f541 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -367,6 +367,23 @@ static inline void spin_lock_prefetch(const void *ptr)
 		     "nop") : : "p" (ptr));
 }
 
+void armv8_big_cpu_relax(unsigned long pc);
+
+static inline void _big_cpu_relax(void)
+{
+	armv8_big_cpu_relax(_THIS_IP_);
+}
+
+#define ARCH_HAS_BIG_CPU_RELAX
+static inline void big_cpu_relax(void)
+{
+	if (system_uses_lse_atomics()) {
+		cpu_relax();
+	} else {
+		_big_cpu_relax();
+	}
+}
+
 extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
 extern void __init minsigstksz_setup(void);
 
diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index 5b7890139bc2..3f4fd24bd4b2 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -67,3 +67,29 @@ void __ndelay(unsigned long nsecs)
 	__const_udelay(nsecs * 0x5UL); /* 2**32 / 1000000000 (rounded up) */
 }
 EXPORT_SYMBOL(__ndelay);
+
+static DEFINE_PER_CPU(u64, __cpu_relax_data);
+
+#define CPU_RELAX_WFE_THRESHOLD	10000
+void armv8_big_cpu_relax(unsigned long pc)
+{
+	u64 new, old = raw_cpu_read(__cpu_relax_data);
+	u32 old_pc, new_pc;
+	bool wfe = false;
+
+	old_pc = (u32)old;
+	new = new_pc = (u32)pc;
+
+	if (old_pc == new_pc) {
+		if ((old >> 32) > CPU_RELAX_WFE_THRESHOLD) {
+			asm volatile("sevl; wfe; wfe\n" ::: "memory");
+			wfe = true;
+		} else {
+			new = old + (1UL << 32);
+		}
+	}
+
+	if (this_cpu_cmpxchg(__cpu_relax_data, old, new) == old && !wfe)
+		asm volatile("yield" ::: "memory");
+}
+EXPORT_SYMBOL(armv8_big_cpu_relax);
diff --git a/include/linux/processor.h b/include/linux/processor.h
index dc78bdc7079a..3dc5e3fcb400 100644
--- a/include/linux/processor.h
+++ b/include/linux/processor.h
@@ -59,4 +59,8 @@ do {								\
 
 #endif
 
+#ifndef ARCH_HAS_BIG_CPU_RELAX
+#define big_cpu_relax() cpu_relax()
+#endif
+
 #endif /* _LINUX_PROCESSOR_H */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8ce75495e04f..cc8445de1006 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -21,6 +21,7 @@
 #include <linux/sched/isolation.h>
 #include <uapi/linux/sched/types.h>
 #include <linux/task_work.h>
+#include <linux/processor.h>
 
 #include "internals.h"
 
@@ -1101,7 +1102,7 @@ static void irq_finalize_oneshot(struct irq_desc *desc,
 	if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
 		raw_spin_unlock_irq(&desc->lock);
 		chip_bus_sync_unlock(desc);
-		cpu_relax();
+		big_cpu_relax();
 		goto again;
 	}
 

^ permalink raw reply related	[flat|nested] 13+ messages in thread
* Unfair qspinlocks on ARM64 without LSE atomics => 3ms delay in interrupt handling
@ 2023-03-24  8:43 Bouska, Zdenek
  2023-03-24 17:01 ` Catalin Marinas
  0 siblings, 1 reply; 13+ messages in thread
From: Bouska, Zdenek @ 2023-03-24  8:43 UTC (permalink / raw)
  To: Thomas Gleixner, Will Deacon, linux-arm-kernel, linux-kernel
  Cc: Kiszka, Jan, Catalin Marinas, linux-rt-users, Nishanth Menon,
	Puranjay Mohan

[-- Attachment #1: Type: text/plain, Size: 6114 bytes --]

Hello,

I have seen ~3 ms delay in interrupt handling on ARM64.

I have traced it down to raw_spin_lock() call in handle_irq_event() in
kernel/irq/handle.c:

irqreturn_t handle_irq_event(struct irq_desc *desc)
{
    irqreturn_t ret;

    desc->istate &= ~IRQS_PENDING;
    irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
    raw_spin_unlock(&desc->lock);

    ret = handle_irq_event_percpu(desc);

--> raw_spin_lock(&desc->lock);
    irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
    return ret;
}

It took ~3 ms for this raw_spin_lock() to lock.

During this time irq_finalize_oneshot() from kernel/irq/manage.c locks and
unlocks the same raw spin lock more than 1000 times:

static void irq_finalize_oneshot(struct irq_desc *desc,
                 struct irqaction *action)
{
    if (!(desc->istate & IRQS_ONESHOT) ||
        action->handler == irq_forced_secondary_handler)
        return;
again:
    chip_bus_lock(desc);
--> raw_spin_lock_irq(&desc->lock);

    /*
     * Implausible though it may be we need to protect us against
     * the following scenario:
     *
     * The thread is faster done than the hard interrupt handler
     * on the other CPU. If we unmask the irq line then the
     * interrupt can come in again and masks the line, leaves due
     * to IRQS_INPROGRESS and the irq line is masked forever.
     *
     * This also serializes the state of shared oneshot handlers
     * versus "desc->threads_oneshot |= action->thread_mask;" in
     * irq_wake_thread(). See the comment there which explains the
     * serialization.
     */
    if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
-->     raw_spin_unlock_irq(&desc->lock);
        chip_bus_sync_unlock(desc);
        cpu_relax();
        goto again;
    }
...

I have created a workaround for this problem by calling cpu_relax() 50
times after 100 failed tries. See attached patch
3ms_tx_delay_workaround.patch.

I have created custom kernel module with 2 threads, one similar to
irq_finalize_oneshot(), second similar to handle_irq_event(). I have used
latest Linux 6.3-rc3 with no added patches and I confirmed that even there
qspinlocks are not fair on my ARM64 board.

I copied qspinlocks code to the module twice and I have put traces only to
one thread, the one which takes several ms to lock and is originally
called from handle_irq_event(). I have found out that the
queued_fetch_set_pending_acquire() takes those 3 ms to finish. On ARM64
queued_fetch_set_pending_acquire() is implemented as
atomic_fetch_or_acquire().

I have found out that my CPU doesn't support LSE atomic instructions and it
looks like atomic operations could be quite slow there. The assembler code in
arch/arm64/include/asm/atomic_ll_sc.h has a loop inside:

#define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint) \
static __always_inline int                      \
__ll_sc_atomic_fetch_##op##name(int i, atomic_t *v)         \
{                                   \
    unsigned long tmp;                      \
    int val, result;                        \
                                    \
    asm volatile("// atomic_fetch_" #op #name "\n"          \
    "   prfm    pstl1strm, %3\n"                \
    "1: ld" #acq "xr    %w0, %3\n"              \
    "   " #asm_op " %w1, %w0, %w4\n"            \
    "   st" #rel "xr    %w2, %w1, %3\n"             \
--> "   cbnz    %w2, 1b\n"                  \
    "   " #mb                           \
    : "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)   \
    : __stringify(constraint) "r" (i)               \
    : cl);                              \
                                    \
    return result;                          \
}

Most importantly, these atomic operations seem to make one CPU dominate
the cache line so that the other is unable to take the lock. And that is
problematic in combination with the retry loop in irq_finalize_oneshot().

To confirm this, I have created a small userspace program, which just calls
__ll_sc_atomic_fetch_or_acquire() from two threads. See the attached
unfair_arm64_asm_atomic_ll_sc_demonstration.tar.gz. Below you can see
that it took 16 ms for one atomic operation.

# ./contested
load thread started
evaluation thread started
new max duration: 6420 ns
new max duration: 9355 ns
new max duration: 22240 ns
new max duration: 23180 ns
new max duration: 70465 ns
new max duration: 77860 ns
new max duration: 83100 ns
new max duration: 105115 ns
new max duration: 127695 ns
new max duration: 128840 ns
new max duration: 1265595 ns
new max duration: 3713430 ns
new max duration: 3750810 ns
new max duration: 7996020 ns
new max duration: 7998890 ns
new max duration: 7999340 ns
new max duration: 7999490 ns
new max duration: 12000210 ns
new max duration: 15999700 ns
new max duration: 16000000 ns
new max duration: 16000030 ns

So I confirmed that atomic operations from
arch/arm64/include/asm/atomic_ll_sc.h can be quite slow when they are
contested from second CPU.

Do you think that it is possible to create fair qspinlock implementation
on top of atomic instructions supported by ARM64 version 8 (no LSE atomic
instructions) without compromising performance in the uncontested case?
For example ARM64 could have custom queued_fetch_set_pending_acquire
implementation same as x86 has in arch/x86/include/asm/qspinlock.h. Is the
retry loop in irq_finalize_oneshot() ok together with the current ARM64
cpu_relax() implementation for processor with no LSE atomic instructions?

I reproduced the real life scenario of TX delay only in ICSSG network
driver (not yet merged to mainline) [1], it was with kernel 5.10 with
patches, CONFIG_PREEMPT_RT and custom ICSSG firmware on Texas Instruments
AM65x IDK [2] with ARM Cortex A53. This custom setup comes with high
interrupt load.

[1] https://lore.kernel.org/all/20220406094358.7895-1-p-mohan@ti.com/
[2] https://www.ti.com/tool/TMDX654IDKEVM

With best regards,
Zdenek Bouska

--
Siemens, s.r.o
Siemens Advanta Development

[-- Attachment #2: unfair_arm64_asm_atomic_ll_sc_demonstration.tar.gz --]
[-- Type: application/x-gzip, Size: 3522 bytes --]

[-- Attachment #3: 3ms_tx_delay_workaround.patch --]
[-- Type: application/octet-stream, Size: 1853 bytes --]

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8ce75495e04f..1f976f36cd56 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1067,51 +1067,59 @@ static int irq_wait_for_interrupt(struct irqaction *action)
 		}
 		schedule();
 	}
 }
 
 /*
  * Oneshot interrupts keep the irq line masked until the threaded
  * handler finished. unmask if the interrupt has not been disabled and
  * is marked MASKED.
  */
 static void irq_finalize_oneshot(struct irq_desc *desc,
 				 struct irqaction *action)
 {
+	int i;
+	int64_t again_count = 0;
 	if (!(desc->istate & IRQS_ONESHOT) ||
 	    action->handler == irq_forced_secondary_handler)
 		return;
 again:
 	chip_bus_lock(desc);
 	raw_spin_lock_irq(&desc->lock);
 
 	/*
 	 * Implausible though it may be we need to protect us against
 	 * the following scenario:
 	 *
 	 * The thread is faster done than the hard interrupt handler
 	 * on the other CPU. If we unmask the irq line then the
 	 * interrupt can come in again and masks the line, leaves due
 	 * to IRQS_INPROGRESS and the irq line is masked forever.
 	 *
 	 * This also serializes the state of shared oneshot handlers
 	 * versus "desc->threads_oneshot |= action->thread_mask;" in
 	 * irq_wake_thread(). See the comment there which explains the
 	 * serialization.
 	 */
 	if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
 		raw_spin_unlock_irq(&desc->lock);
 		chip_bus_sync_unlock(desc);
 		cpu_relax();
+		++again_count;
+		if (again_count > 100) {
+			for (i=0; i < 50; ++i) {
+				cpu_relax();
+			}
+		}
 		goto again;
 	}
 
 	/*
 	 * Now check again, whether the thread should run. Otherwise
 	 * we would clear the threads_oneshot bit of this thread which
 	 * was just set.
 	 */
 	if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
 		goto out_unlock;
 
 	desc->threads_oneshot &= ~action->thread_mask;
 

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2023-04-28  7:37 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-26 12:03 Unfair qspinlocks on ARM64 without LSE atomics => 3ms delay in interrupt handling Bouska, Zdenek
2023-04-26 21:29 ` Thomas Gleixner
2023-04-27  9:38   ` Bouska, Zdenek
2023-04-27 10:06     ` Will Deacon
2023-04-27 13:14   ` Jan Kiszka
2023-04-27 13:45     ` Kurt Kanzenbach
2023-04-28  7:30       ` Sebastian Andrzej Siewior
2023-04-28  7:37         ` Kurt Kanzenbach
  -- strict thread matches above, loose matches on Subject: below --
2023-03-24  8:43 Bouska, Zdenek
2023-03-24 17:01 ` Catalin Marinas
2023-03-24 18:09   ` Will Deacon
2023-03-28  9:39     ` Bouska, Zdenek
2023-03-27  5:44   ` Bouska, Zdenek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).