All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
To: LKML <linux-kernel@vger.kernel.org>, X86 Kernel <x86@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	iommu@lists.linux.dev, Thomas Gleixner <tglx@linutronix.de>,
	"Lu Baolu" <baolu.lu@linux.intel.com>,
	kvm@vger.kernel.org, Dave Hansen <dave.hansen@intel.com>,
	Joerg Roedel <joro@8bytes.org>, "H. Peter Anvin" <hpa@zytor.com>,
	"Borislav Petkov" <bp@alien8.de>,
	"Ingo Molnar" <mingo@redhat.com>
Cc: Paul Luse <paul.e.luse@intel.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Jens Axboe <axboe@kernel.dk>, Raj Ashok <ashok.raj@intel.com>,
	"Tian, Kevin" <kevin.tian@intel.com>,
	maz@kernel.org, seanjc@google.com,
	"Robin Murphy" <robin.murphy@arm.com>,
	jim.harris@samsung.com, a.manzanares@samsung.com,
	"Bjorn Helgaas" <helgaas@kernel.org>,
	guang.zeng@intel.com, robert.hoo.linux@gmail.com,
	oliver.sang@intel.com, acme@kernel.org,
	Jacob Pan <jacob.jun.pan@linux.intel.com>
Subject: [PATCH v3  08/12] x86/irq: Install posted MSI notification handler
Date: Tue, 23 Apr 2024 10:41:10 -0700	[thread overview]
Message-ID: <20240423174114.526704-9-jacob.jun.pan@linux.intel.com> (raw)
In-Reply-To: <20240423174114.526704-1-jacob.jun.pan@linux.intel.com>

All MSI vectors are multiplexed into a single notification vector when
posted MSI is enabled. It is the responsibility of the notification
vector handler to demultiplex MSI vectors. In this handler, for each
pending bit, MSI vector handlers are dispatched without IDT delivery.

For example, the interrupt flow will change as follows:
(3 MSIs of different vectors arrive in a a high frequency burst)

BEFORE:
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()

AFTER:
interrupt /* Posted MSI notification vector */
    irq_enter()
	atomic_xchg(PIR)
	handler()
	handler()
	handler()
	pi_clear_on()
    apic_eoi()
    irq_exit()
        process_softirq()

Except for the leading MSI, CPU notifications are skipped/coalesced.

For MSIs arrive at a low frequency, the demultiplexing loop does not
wait for more interrupts to coalesce. Therefore, there's no additional
latency other than the processing time.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>

---
v3:
   - Add support for FRED
   - Fix double eoi if a vector is invalid (Kevin)
v2:
   - Delete extra inline attribute
   - Fix pir pointer in xchg (Zeng Guang)
---
 arch/x86/entry/entry_fred.c     |   2 +
 arch/x86/include/asm/hardirq.h  |   3 +
 arch/x86/include/asm/idtentry.h |   6 ++
 arch/x86/kernel/idt.c           |   3 +
 arch/x86/kernel/irq.c           | 125 +++++++++++++++++++++++++++++++-
 5 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index ac120cbdaaf2..c4a22e3d4a1e 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
 	SYSVEC(POSTED_INTR_VECTOR,		kvm_posted_intr_ipi),
 	SYSVEC(POSTED_INTR_WAKEUP_VECTOR,	kvm_posted_intr_wakeup_ipi),
 	SYSVEC(POSTED_INTR_NESTED_VECTOR,	kvm_posted_intr_nested_ipi),
+
+	SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR,	posted_msi_notification),
 };
 
 static bool fred_setup_done __initdata;
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index e7ab594b3a7a..c67fa6ad098a 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -44,6 +44,9 @@ typedef struct {
 	unsigned int irq_hv_reenlightenment_count;
 	unsigned int hyperv_stimer0_count;
 #endif
+#ifdef CONFIG_X86_POSTED_MSI
+	unsigned int posted_msi_notification_count;
+#endif
 } ____cacheline_aligned irq_cpustat_t;
 
 DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 749c7411d2f1..d4f24499b256 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR,	sysvec_kvm_posted_intr_nested
 # define fred_sysvec_kvm_posted_intr_nested_ipi		NULL
 #endif
 
+# ifdef CONFIG_X86_POSTED_MSI
+DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR,	sysvec_posted_msi_notification);
+#else
+# define fred_sysvec_posted_msi_notification		NULL
+# endif
+
 #if IS_ENABLED(CONFIG_HYPERV)
 DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR,	sysvec_hyperv_callback);
 DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR,	sysvec_hyperv_reenlightenment);
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index fc37c8d83daf..f445bec516a0 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = {
 # endif
 	INTG(SPURIOUS_APIC_VECTOR,		asm_sysvec_spurious_apic_interrupt),
 	INTG(ERROR_APIC_VECTOR,			asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+	INTG(POSTED_MSI_NOTIFICATION_VECTOR,	asm_sysvec_posted_msi_notification),
+# endif
 #endif
 };
 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d652b0481899..578e4f6a5080 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -183,6 +183,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ",
 			   irq_stats(j)->kvm_posted_intr_wakeup_ipis);
 	seq_puts(p, "  Posted-interrupt wakeup event\n");
+#endif
+#ifdef CONFIG_X86_POSTED_MSI
+	seq_printf(p, "%*s: ", prec, "PMN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->posted_msi_notification_count);
+	seq_puts(p, "  Posted MSI notification event\n");
 #endif
 	return 0;
 }
@@ -242,16 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc,
 		__handle_irq(desc, regs);
 }
 
-static __always_inline void call_irq_handler(int vector, struct pt_regs *regs)
+static __always_inline int call_irq_handler(int vector, struct pt_regs *regs)
 {
 	struct irq_desc *desc;
+	int ret = 0;
 
 	desc = __this_cpu_read(vector_irq[vector]);
 	if (likely(!IS_ERR_OR_NULL(desc))) {
 		handle_irq(desc, regs);
 	} else {
-		apic_eoi();
-
+		ret = -EINVAL;
 		if (desc == VECTOR_UNUSED) {
 			pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
 					     __func__, smp_processor_id(),
@@ -260,6 +267,8 @@ static __always_inline void call_irq_handler(int vector, struct pt_regs *regs)
 			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 		}
 	}
+
+	return ret;
 }
 
 /*
@@ -273,7 +282,9 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
 	/* entry code tells RCU that we're not quiescent.  Check it. */
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
 
-	call_irq_handler(vector, regs);
+	if (unlikely(call_irq_handler(vector, regs)))
+		apic_eoi();
+
 	set_irq_regs(old_regs);
 }
 
@@ -361,6 +372,112 @@ void intel_posted_msi_init(void)
 	destination = x2apic_enabled() ? apic_id : apic_id << 8;
 	this_cpu_write(posted_msi_pi_desc.ndst, destination);
 }
+
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ *   accessed by both CPU and IOMMU.
+ * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
+ *   for checking and clearing posted interrupt request (PIR), a 256 bit field
+ *   within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ *   line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
+ *   cache line. The cache line states after each operation are as follows:
+ *   CPU		IOMMU			PID Cache line state
+ *   ---------------------------------------------------------------
+ *...read64					exclusive
+ *...lock xchg64				modified
+ *...			post/atomic swap	invalid
+ *...-------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * to dispatch interrupt handlers.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ *		read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ *		read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+{
+	int i, vec = FIRST_EXTERNAL_VECTOR;
+	unsigned long pir_copy[4];
+	bool handled = false;
+
+	for (i = 0; i < 4; i++)
+		pir_copy[i] = pir[i];
+
+	for (i = 0; i < 4; i++) {
+		if (!pir_copy[i])
+			continue;
+
+		pir_copy[i] = arch_xchg(&pir[i], 0);
+		handled = true;
+	}
+
+	if (handled) {
+		for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+			call_irq_handler(vec, regs);
+	}
+
+	return handled;
+}
+
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workload.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
+
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	struct pi_desc *pid;
+	int i = 0;
+
+	pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+	inc_irq_stat(posted_msi_notification_count);
+	irq_enter();
+
+	/*
+	 * Max coalescing count includes the extra round of handle_pending_pir
+	 * after clearing the outstanding notification bit. Hence, at most
+	 * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
+	 */
+	while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+		if (!handle_pending_pir(pid->pir64, regs))
+			break;
+	}
+
+	/*
+	 * Clear outstanding notification bit to allow new IRQ notifications,
+	 * do this last to maximize the window of interrupt coalescing.
+	 */
+	pi_clear_on(pid);
+
+	/*
+	 * There could be a race of PI notification and the clearing of ON bit,
+	 * process PIR bits one last time such that handling the new interrupts
+	 * are not delayed until the next IRQ.
+	 */
+	handle_pending_pir(pid->pir64, regs);
+
+	apic_eoi();
+	irq_exit();
+	set_irq_regs(old_regs);
+}
 #endif /* X86_POSTED_MSI */
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
2.25.1


  parent reply	other threads:[~2024-04-23 17:36 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-23 17:41 [PATCH v3 00/12] Coalesced Interrupt Delivery with posted MSI Jacob Pan
2024-04-23 17:41 ` [PATCH v3 01/12] KVM: VMX: Move posted interrupt descriptor out of vmx code Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] KVM: VMX: Move posted interrupt descriptor out of VMX code tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 02/12] x86/irq: Unionize PID.PIR for 64bit access w/o casting Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] " tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 03/12] x86/irq: Remove bitfields in posted interrupt descriptor Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] " tip-bot2 for Jacob Pan
2024-05-01  7:29   ` [PATCH v3 03/12] " Oliver Sang
2024-04-23 17:41 ` [PATCH v3 04/12] x86/irq: Add a Kconfig option for posted MSI Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] " tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 05/12] x86/irq: Reserve a per CPU IDT vector for posted MSIs Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] " tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 06/12] x86/irq: Set up per host CPU posted interrupt descriptors Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] " tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 07/12] x86/irq: Factor out calling ISR from common_interrupt Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] x86/irq: Factor out handler invocation from common_interrupt() tip-bot2 for Jacob Pan
2024-04-23 17:41 ` Jacob Pan [this message]
2024-04-30 13:28   ` [tip: x86/irq] x86/irq: Install posted MSI notification handler tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 09/12] x86/irq: Factor out common code for checking pending interrupts Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] " tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 10/12] x86/irq: Extend checks for pending vectors to posted interrupts Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] " tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 11/12] iommu/vt-d: Make posted MSI an opt-in cmdline option Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] iommu/vt-d: Make posted MSI an opt-in command line option tip-bot2 for Jacob Pan
2024-04-23 17:41 ` [PATCH v3 12/12] iommu/vt-d: Enable posted mode for device MSIs Jacob Pan
2024-04-30 13:28   ` [tip: x86/irq] " tip-bot2 for Jacob Pan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240423174114.526704-9-jacob.jun.pan@linux.intel.com \
    --to=jacob.jun.pan@linux.intel.com \
    --cc=a.manzanares@samsung.com \
    --cc=acme@kernel.org \
    --cc=ashok.raj@intel.com \
    --cc=axboe@kernel.dk \
    --cc=baolu.lu@linux.intel.com \
    --cc=bp@alien8.de \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@intel.com \
    --cc=guang.zeng@intel.com \
    --cc=helgaas@kernel.org \
    --cc=hpa@zytor.com \
    --cc=iommu@lists.linux.dev \
    --cc=jim.harris@samsung.com \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=maz@kernel.org \
    --cc=mingo@redhat.com \
    --cc=oliver.sang@intel.com \
    --cc=paul.e.luse@intel.com \
    --cc=peterz@infradead.org \
    --cc=robert.hoo.linux@gmail.com \
    --cc=robin.murphy@arm.com \
    --cc=seanjc@google.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.