* [PATCH 1/4][RFC v2] x86/apic: Extend the definition for vector_irq
2017-09-01 5:03 [PATCH 0/4][RFC v2] x86/irq: Spread vectors on different CPUs Chen Yu
@ 2017-09-01 5:03 ` Chen Yu
2017-09-01 5:04 ` [PATCH 2/4][RFC v2] x86/apic: Record the number of vectors assigned on a CPU Chen Yu
` (2 subsequent siblings)
3 siblings, 0 replies; 21+ messages in thread
From: Chen Yu @ 2017-09-01 5:03 UTC (permalink / raw)
To: x86
Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Rui Zhang,
linux-kernel, Chen Yu, Rafael J. Wysocki, Len Brown,
Dan Williams
Introduce a variable inside the vector_irq, to record the number
of vectors assigned per CPU. This is to prepare for the vector
spreading work.
No functional change.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
arch/x86/include/asm/hw_irq.h | 8 ++++++--
arch/x86/kernel/apic/vector.c | 25 +++++++++++++------------
arch/x86/kernel/irq.c | 18 +++++++++---------
arch/x86/kernel/irqinit.c | 9 +++++----
arch/x86/lguest/boot.c | 2 +-
5 files changed, 34 insertions(+), 28 deletions(-)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index d6dbafb..b2243fe 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -179,8 +179,12 @@ extern char irq_entries_start[];
#define VECTOR_UNUSED NULL
#define VECTOR_RETRIGGERED ((void *)~0UL)
-typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+struct vector_irq_info {
+ struct irq_desc *desc[NR_VECTORS];
+ int alloc;
+};
+
+DECLARE_PER_CPU(struct vector_irq_info, vector_irq);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b3af457..2ce1021 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -179,7 +179,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
goto next;
for_each_cpu(new_cpu, vector_searchmask) {
- if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
+ if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu).desc[vector]))
goto next;
}
/* Found one! */
@@ -189,7 +189,8 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
if (d->cfg.vector)
cpumask_copy(d->old_domain, d->domain);
for_each_cpu(new_cpu, vector_searchmask)
- per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
+ per_cpu(vector_irq, new_cpu).desc[vector] = irq_to_desc(irq);
+
goto update;
next_cpu:
@@ -268,7 +269,7 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
vector = data->cfg.vector;
for_each_cpu_and(cpu, data->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ per_cpu(vector_irq, cpu).desc[vector] = VECTOR_UNUSED;
data->cfg.vector = 0;
cpumask_clear(data->domain);
@@ -285,9 +286,9 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
- if (per_cpu(vector_irq, cpu)[vector] != desc)
+ if (per_cpu(vector_irq, cpu).desc[vector] != desc)
continue;
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ per_cpu(vector_irq, cpu).desc[vector] = VECTOR_UNUSED;
break;
}
}
@@ -481,17 +482,17 @@ static void __setup_vector_irq(int cpu)
if (!data || !cpumask_test_cpu(cpu, data->domain))
continue;
vector = data->cfg.vector;
- per_cpu(vector_irq, cpu)[vector] = desc;
+ per_cpu(vector_irq, cpu).desc[vector] = desc;
}
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
- desc = per_cpu(vector_irq, cpu)[vector];
+ desc = per_cpu(vector_irq, cpu).desc[vector];
if (IS_ERR_OR_NULL(desc))
continue;
data = apic_chip_data(irq_desc_get_irq_data(desc));
if (!cpumask_test_cpu(cpu, data->domain))
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ per_cpu(vector_irq, cpu).desc[vector] = VECTOR_UNUSED;
}
}
@@ -511,7 +512,7 @@ void setup_vector_irq(int cpu)
* legacy vector to irq mapping:
*/
for (irq = 0; irq < nr_legacy_irqs(); irq++)
- per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
+ per_cpu(vector_irq, cpu).desc[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
__setup_vector_irq(cpu);
}
@@ -596,7 +597,7 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
unsigned int irr;
retry:
- desc = __this_cpu_read(vector_irq[vector]);
+ desc = __this_cpu_read(vector_irq.desc[vector]);
if (IS_ERR_OR_NULL(desc))
continue;
@@ -647,7 +648,7 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
goto unlock;
}
- __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ __this_cpu_write(vector_irq.desc[vector], VECTOR_UNUSED);
cpumask_clear_cpu(me, data->old_domain);
unlock:
raw_spin_unlock(&desc->lock);
@@ -781,7 +782,7 @@ void irq_force_complete_move(struct irq_desc *desc)
* descriptor set in their vector array. Clean it up.
*/
for_each_cpu(cpu, data->old_domain)
- per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
+ per_cpu(vector_irq, cpu).desc[cfg->old_vector] = VECTOR_UNUSED;
/* Cleanup the left overs of the (half finished) move */
cpumask_clear(data->old_domain);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 4ed0aba..ae11e86 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -239,7 +239,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* entering_irq() tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
- desc = __this_cpu_read(vector_irq[vector]);
+ desc = __this_cpu_read(vector_irq.desc[vector]);
if (!handle_irq(desc, regs)) {
ack_APIC_irq();
@@ -249,7 +249,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
__func__, smp_processor_id(),
vector);
} else {
- __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ __this_cpu_write(vector_irq.desc[vector], VECTOR_UNUSED);
}
}
@@ -375,7 +375,7 @@ int check_irq_vectors_for_cpu_disable(void)
this_count = 0;
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- desc = __this_cpu_read(vector_irq[vector]);
+ desc = __this_cpu_read(vector_irq.desc[vector]);
if (IS_ERR_OR_NULL(desc))
continue;
/*
@@ -433,7 +433,7 @@ int check_irq_vectors_for_cpu_disable(void)
for (vector = FIRST_EXTERNAL_VECTOR;
vector < first_system_vector; vector++) {
if (!test_bit(vector, used_vectors) &&
- IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) {
+ IS_ERR_OR_NULL(per_cpu(vector_irq, cpu).desc[vector])) {
if (++count == this_count)
return 0;
}
@@ -475,24 +475,24 @@ void fixup_irqs(void)
* nothing else will touch it.
*/
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
+ if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq.desc[vector])))
continue;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
if (irr & (1 << (vector % 32))) {
- desc = __this_cpu_read(vector_irq[vector]);
+ desc = __this_cpu_read(vector_irq.desc[vector]);
raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
if (chip->irq_retrigger) {
chip->irq_retrigger(data);
- __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ __this_cpu_write(vector_irq.desc[vector], VECTOR_RETRIGGERED);
}
raw_spin_unlock(&desc->lock);
}
- if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
- __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ if (__this_cpu_read(vector_irq.desc[vector]) != VECTOR_RETRIGGERED)
+ __this_cpu_write(vector_irq.desc[vector], VECTOR_UNUSED);
}
}
#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c7fd185..734b54f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,8 +51,9 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
+DEFINE_PER_CPU(struct vector_irq_info, vector_irq) = {
+ .desc[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
+ .alloc = 0,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +61,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
int cpu;
for_each_online_cpu(cpu) {
- if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
+ if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu).desc[vector]))
return 1;
}
@@ -94,7 +95,7 @@ void __init init_IRQ(void)
* irq's migrate etc.
*/
for (i = 0; i < nr_legacy_irqs(); i++)
- per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
+ per_cpu(vector_irq, 0).desc[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
x86_init.irqs.intr_init();
}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9947269..e80758a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -846,7 +846,7 @@ static int lguest_setup_irq(unsigned int irq)
/* Some systems map "vectors" to interrupts weirdly. Not us! */
desc = irq_to_desc(irq);
- __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
+ __this_cpu_write(vector_irq.desc[FIRST_EXTERNAL_VECTOR + irq], desc);
return 0;
}
--
2.7.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH 2/4][RFC v2] x86/apic: Record the number of vectors assigned on a CPU
2017-09-01 5:03 [PATCH 0/4][RFC v2] x86/irq: Spread vectors on different CPUs Chen Yu
2017-09-01 5:03 ` [PATCH 1/4][RFC v2] x86/apic: Extend the definition for vector_irq Chen Yu
@ 2017-09-01 5:04 ` Chen Yu
2017-09-01 5:04 ` [PATCH 3/4] x86/apic: Introduce the per vector cpumask array Chen Yu
2017-09-01 5:04 ` [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU Chen Yu
3 siblings, 0 replies; 21+ messages in thread
From: Chen Yu @ 2017-09-01 5:04 UTC (permalink / raw)
To: x86
Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Rui Zhang,
linux-kernel, Chen Yu, Rafael J. Wysocki, Len Brown,
Dan Williams
Update the number of vectors assigned on each CPU during
vector allocation/free process. This is to prepare for
the vector spreading work that, we can find out the CPU
with least vectors assigned.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
arch/x86/include/asm/hw_irq.h | 8 ++++++++
arch/x86/kernel/apic/vector.c | 45 ++++++++++++++++++++++++++++++++++++++++++-
arch/x86/kernel/irq.c | 5 ++++-
arch/x86/kernel/irqinit.c | 1 +
arch/x86/lguest/boot.c | 1 +
5 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b2243fe..d1b3c61 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -151,6 +151,10 @@ extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
extern void lock_vector_lock(void);
extern void unlock_vector_lock(void);
extern void setup_vector_irq(int cpu);
+extern void inc_vector_alloc(const struct cpumask *mask,
+ int count);
+extern void dec_vector_alloc(const struct cpumask *mask,
+ int count);
#ifdef CONFIG_SMP
extern void send_cleanup_vector(struct irq_cfg *);
extern void irq_complete_move(struct irq_cfg *cfg);
@@ -163,6 +167,10 @@ extern void apic_ack_edge(struct irq_data *data);
#else /* CONFIG_X86_LOCAL_APIC */
static inline void lock_vector_lock(void) {}
static inline void unlock_vector_lock(void) {}
+static inline void inc_vector_alloc(const struct cpumask *mask,
+ int count) {}
+static inline void dec_vector_alloc(const struct cpumask *mask,
+ int count) {}
#endif /* CONFIG_X86_LOCAL_APIC */
/* Statistics */
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 2ce1021..4ff84c0 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -50,6 +50,36 @@ void unlock_vector_lock(void)
raw_spin_unlock(&vector_lock);
}
+static void update_vectors_alloc(const struct cpumask *mask,
+ int count, bool add)
+{
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ int cur_alloc = per_cpu(vector_irq, cpu).alloc;
+
+ /* Update the number of vectors assigned on this CPU. */
+ if (add && (cur_alloc + count <= NR_VECTORS))
+ per_cpu(vector_irq, cpu).alloc += count;
+ else if (!add && cur_alloc >= count)
+ per_cpu(vector_irq, cpu).alloc -= count;
+ else
+ continue;
+ }
+}
+
+void inc_vector_alloc(const struct cpumask *mask,
+ int count)
+{
+ update_vectors_alloc(mask, count, true);
+}
+
+void dec_vector_alloc(const struct cpumask *mask,
+ int count)
+{
+ update_vectors_alloc(mask, count, false);
+}
+
static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
{
if (!irq_data)
@@ -191,6 +221,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
for_each_cpu(new_cpu, vector_searchmask)
per_cpu(vector_irq, new_cpu).desc[vector] = irq_to_desc(irq);
+ inc_vector_alloc(vector_searchmask, 1);
goto update;
next_cpu:
@@ -263,6 +294,7 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
struct irq_desc *desc;
int cpu, vector;
+ struct cpumask mask;
if (!data->cfg.vector)
return;
@@ -271,6 +303,9 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
for_each_cpu_and(cpu, data->domain, cpu_online_mask)
per_cpu(vector_irq, cpu).desc[vector] = VECTOR_UNUSED;
+ cpumask_and(&mask, data->domain, cpu_online_mask);
+ dec_vector_alloc(&mask, 1);
+
data->cfg.vector = 0;
cpumask_clear(data->domain);
@@ -289,6 +324,7 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
if (per_cpu(vector_irq, cpu).desc[vector] != desc)
continue;
per_cpu(vector_irq, cpu).desc[vector] = VECTOR_UNUSED;
+ dec_vector_alloc(cpumask_of(cpu), 1);
break;
}
}
@@ -483,6 +519,7 @@ static void __setup_vector_irq(int cpu)
continue;
vector = data->cfg.vector;
per_cpu(vector_irq, cpu).desc[vector] = desc;
+ inc_vector_alloc(cpumask_of(cpu), 1);
}
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
@@ -491,8 +528,10 @@ static void __setup_vector_irq(int cpu)
continue;
data = apic_chip_data(irq_desc_get_irq_data(desc));
- if (!cpumask_test_cpu(cpu, data->domain))
+ if (!cpumask_test_cpu(cpu, data->domain)) {
per_cpu(vector_irq, cpu).desc[vector] = VECTOR_UNUSED;
+ dec_vector_alloc(cpumask_of(cpu), 1);
+ }
}
}
@@ -514,6 +553,7 @@ void setup_vector_irq(int cpu)
for (irq = 0; irq < nr_legacy_irqs(); irq++)
per_cpu(vector_irq, cpu).desc[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
+ inc_vector_alloc(cpumask_of(cpu), irq);
__setup_vector_irq(cpu);
}
@@ -649,6 +689,7 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
goto unlock;
}
__this_cpu_write(vector_irq.desc[vector], VECTOR_UNUSED);
+ dec_vector_alloc(cpumask_of(me), 1);
cpumask_clear_cpu(me, data->old_domain);
unlock:
raw_spin_unlock(&desc->lock);
@@ -784,6 +825,8 @@ void irq_force_complete_move(struct irq_desc *desc)
for_each_cpu(cpu, data->old_domain)
per_cpu(vector_irq, cpu).desc[cfg->old_vector] = VECTOR_UNUSED;
+ dec_vector_alloc(data->old_domain, 1);
+
/* Cleanup the left overs of the (half finished) move */
cpumask_clear(data->old_domain);
data->move_in_progress = 0;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index ae11e86..67c01b8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -250,6 +250,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
vector);
} else {
__this_cpu_write(vector_irq.desc[vector], VECTOR_UNUSED);
+ inc_vector_alloc(cpumask_of(smp_processor_id()), 1);
}
}
@@ -491,8 +492,10 @@ void fixup_irqs(void)
}
raw_spin_unlock(&desc->lock);
}
- if (__this_cpu_read(vector_irq.desc[vector]) != VECTOR_RETRIGGERED)
+ if (__this_cpu_read(vector_irq.desc[vector]) != VECTOR_RETRIGGERED) {
__this_cpu_write(vector_irq.desc[vector], VECTOR_UNUSED);
+ dec_vector_alloc(cpumask_of(smp_processor_id()), 1);
+ }
}
}
#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 734b54f..dd618c1 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -97,6 +97,7 @@ void __init init_IRQ(void)
for (i = 0; i < nr_legacy_irqs(); i++)
per_cpu(vector_irq, 0).desc[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
+ inc_vector_alloc(cpumask_of(1), i);
x86_init.irqs.intr_init();
}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index e80758a..0696354 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -847,6 +847,7 @@ static int lguest_setup_irq(unsigned int irq)
/* Some systems map "vectors" to interrupts weirdly. Not us! */
desc = irq_to_desc(irq);
__this_cpu_write(vector_irq.desc[FIRST_EXTERNAL_VECTOR + irq], desc);
+ inc_vector_alloc(cpumask_of(smp_processor_id()), 1);
return 0;
}
--
2.7.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH 3/4] x86/apic: Introduce the per vector cpumask array
2017-09-01 5:03 [PATCH 0/4][RFC v2] x86/irq: Spread vectors on different CPUs Chen Yu
2017-09-01 5:03 ` [PATCH 1/4][RFC v2] x86/apic: Extend the definition for vector_irq Chen Yu
2017-09-01 5:04 ` [PATCH 2/4][RFC v2] x86/apic: Record the number of vectors assigned on a CPU Chen Yu
@ 2017-09-01 5:04 ` Chen Yu
2017-09-01 5:04 ` [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU Chen Yu
3 siblings, 0 replies; 21+ messages in thread
From: Chen Yu @ 2017-09-01 5:04 UTC (permalink / raw)
To: x86
Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Rui Zhang,
linux-kernel, Chen Yu, Rafael J. Wysocki, Len Brown,
Dan Williams
This array is a bitmap for sorting the number of vectors
assigned on each CPU, thus to quickly retrieve the CPU
which have assigned the least amount of vectors, then
choose that CPU as a hint to spread the vectors among
multiple CPUs.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
arch/x86/kernel/apic/vector.c | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 4ff84c0..b60cc66 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -32,6 +32,16 @@ struct irq_domain *x86_vector_domain;
EXPORT_SYMBOL_GPL(x86_vector_domain);
static DEFINE_RAW_SPINLOCK(vector_lock);
static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask;
+/*
+ * vectors_alloc_mask[i] records the CPUs which have assigned i vectors each.
+ * For example, let vectors_alloc_mask[3] = cpumask_or(cpumask_of(8), cpumask_of(9))
+ * which means that:
+ * On CPU8 and CPU9, the number of vectors been assigned is 3.
+ * This bitmap can be used to quickly find out the CPUs that have allocated the least
+ * number of vectors, by checking from vectors_alloc_mask[0] to vectors_alloc_mask[255]
+ * until a non-zero value is found. These CPUs are chosen for vector spreading.
+ */
+static cpumask_var_t vectors_alloc_mask[NR_VECTORS];
static struct irq_chip lapic_controller;
#ifdef CONFIG_X86_IO_APIC
static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
@@ -65,6 +75,9 @@ static void update_vectors_alloc(const struct cpumask *mask,
per_cpu(vector_irq, cpu).alloc -= count;
else
continue;
+ /* Update the position for this CPU. */
+ cpumask_clear_cpu(cpu, vectors_alloc_mask[cur_alloc]);
+ cpumask_set_cpu(cpu, vectors_alloc_mask[per_cpu(vector_irq, cpu).alloc]);
}
}
@@ -479,6 +492,25 @@ static void __init init_legacy_irqs(void)
static inline void init_legacy_irqs(void) { }
#endif
+static int __init alloc_assigned_vectors_mask(void)
+{
+ int i, ret;
+
+ for (i = 0; i < NR_VECTORS; i++) {
+ ret = alloc_cpumask_var(&vectors_alloc_mask[i], GFP_KERNEL);
+ if (!ret)
+ goto free_mask;
+ }
+ /* Initially all the CPUs have 0 vector assigned. */
+ cpumask_setall(vectors_alloc_mask[0]);
+ return 0;
+
+ free_mask:
+ while (i--)
+ free_cpumask_var(vectors_alloc_mask[i]);
+ return -ENOMEM;
+}
+
int __init arch_early_irq_init(void)
{
struct fwnode_handle *fn;
@@ -499,6 +531,7 @@ int __init arch_early_irq_init(void)
BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL));
+ BUG_ON(alloc_assigned_vectors_mask());
return arch_early_ioapic_init();
}
--
2.7.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-01 5:03 [PATCH 0/4][RFC v2] x86/irq: Spread vectors on different CPUs Chen Yu
` (2 preceding siblings ...)
2017-09-01 5:04 ` [PATCH 3/4] x86/apic: Introduce the per vector cpumask array Chen Yu
@ 2017-09-01 5:04 ` Chen Yu
2017-09-03 18:17 ` Thomas Gleixner
3 siblings, 1 reply; 21+ messages in thread
From: Chen Yu @ 2017-09-01 5:04 UTC (permalink / raw)
To: x86
Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Rui Zhang,
linux-kernel, Chen Yu, Rafael J. Wysocki, Len Brown,
Dan Williams
This is the major logic to spread the vectors on different CPUs.
The main idea is to choose the 'idlest' CPU which has assigned
the least number of vectors as the candidate/hint for the vector
allocation domain, in the hope that the vector allocation domain
could leverage this hint to generate corresponding cpumask.
One of the requirements to do this vector spreading work comes from the
following hibernation problem found on a 16 cores server:
CPU 31 disable failed: CPU has 62 vectors assigned and there
are only 0 available.
2 issues were found after investigation:
1. The network driver has declared many vector resources via
pci_enable_msix_range(), say, this driver might likely want
to reserve 6 per logical CPU, then there would be 192 of them.
2. Besides, most of the vectors reserved by this driver are assigned
on CPU0 due to the current code strategy, so there would be
insufficient slots left on CPU0 to receive any migrated IRQs
during CPU offline.
In theory after the commit c5cb83bb337c ("genirq/cpuhotplug: Handle
managed IRQs on CPU hotplug") the issue 1 has been solved with the
managed interrupt mechanism for multi queue devices, which no longer
migrate these interrupts. It simply shuts them down and restarts
them when the CPU comes back online.
However, according to the implementation of this network driver,
the commit mentioned above does not fully fix the issue 1.
Here's the framework of the network driver:
step 1. Reserve enough irq vectors and corresponding IRQs.
step 2. If the network is activated, invoke request_irq() to
register the handler.
step 3. Invoke set_affinity() to spread the IRQs onto different
CPUs, thus to spread the vectors too.
The problem is, if the network cable is not connected, step 2
and 3 will not get invoked, thus the IRQ vectors will not spread
on different CPUs and will still be allocated on CPU0. As a result
the CPUs will still get the offline failure.
Previously there were some discussion in the thread [1] about the
vector spread, and here are the suggestion from Thomas:
Q1:
Rewrite that allocation code from scratch, use per cpu bitmaps,
so we can do fast search over masks and keep track of
the vectors which are used/free per cpu.
A1:
per cpu bitmap was onced considered but this might not be
as fast as the solution proposed here. That is, if we introduce the
per cpu bitmap, we need to count the number of vectors assigned on
each CPUs by cpumask_weight() and sort them in order to get the
CPUs who have the least number of vectors assigned. By contrast,
if we introduce the bitmaps for each vector number, we can get the
'idle' CPU at the cost of nearly O(1).
One scenario of this patch can be illustrated below:
a) After initialization, vector_mask[0] is set with {CPU0...CPU31},
other vector_mask[i] remain zero, which means that, CPU0...CPU31
have zero vectors assigned.
b) During driver probe, CPU0 is chosen to be assigned with one vector.
Thus vector_mask[0] becomes {CPU1...CPU31}, and vector_mask[1] becomes
{CPU0}.
c) The next time the system tries to assign another vector, it searches from
vector[0], because it is non-empty, CPU1 is chosen to place
the vector. Thus vector_mask[0] becomes {CPU2...CPU31}, and vector_mask[1]
becomes {CPU0, CPU1}.
d) and so on, the vectors assignment will spread on different CPUs.
Q2:
"respect the allocation domains"
A2:
This solution provides a CPU as the param for the vector allocation domains,
in the hope that the domains can leverage this hint to generate the cpumask
with less vectors assigned. But yes, the CPU we select upfront gets fed into
apic->vector_allocation_domain() might just say no because the CPU
does not belong to it, but anyway this should somehow spread the vectors,
although I can not find out a graceful solution to this.
BTW, the vector allocation domain is default_vector_allocation_domain()
in our case, thus it works.
Q3:
"set a preference for the node on which the interrupt was allocated"
A3:
This can be achieved by further optimization in the following way:
a) Also record the number of vectors used per node.
b) Each time invoking __assign_irq_vector, adjust the input mask
by cpumask_and(mask, mask, node_mask_with_least_vectors())
[1] https://marc.info/?l=linux-kernel&m=149867663002202
or
[1] https://patchwork.kernel.org/patch/9725227/
Thanks for your time.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
arch/x86/kernel/apic/vector.c | 26 ++++++++++++++++++++++++--
1 file changed, 24 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b60cc66..c7f0e8b 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -145,6 +145,28 @@ static void free_apic_chip_data(struct apic_chip_data *data)
}
}
+static int pick_next_cpu_hint(const struct cpumask *mask)
+{
+ int i;
+ struct cpumask search, tmp;
+
+ cpumask_and(&search, mask, cpu_online_mask);
+ /*
+ * Search in the order from vectors_alloc_mask[0] to
+ * vectors_alloc_mask[255], try to find an intersection
+ * between the mask and vectors_alloc_mask[],
+ * return the first CPU in this intersection, thus
+ * this CPU has the least number of vectors allocated.
+ */
+ for (i = 0; i < NR_VECTORS; i++) {
+ cpumask_and(&tmp, &search, vectors_alloc_mask[i]);
+ if (!cpumask_empty(&tmp))
+ return cpumask_first(&tmp);
+ }
+ /* This should not happen... */
+ return cpumask_first_and(mask, cpu_online_mask);
+}
+
static int __assign_irq_vector(int irq, struct apic_chip_data *d,
const struct cpumask *mask,
struct irq_data *irqdata)
@@ -175,7 +197,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
/* Only try and allocate irqs on cpus that are present */
cpumask_clear(d->old_domain);
cpumask_clear(searched_cpumask);
- cpu = cpumask_first_and(mask, cpu_online_mask);
+ cpu = pick_next_cpu_hint(mask);
while (cpu < nr_cpu_ids) {
int new_cpu, offset;
@@ -247,7 +269,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
*/
cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask);
cpumask_andnot(vector_cpumask, mask, searched_cpumask);
- cpu = cpumask_first_and(vector_cpumask, cpu_online_mask);
+ cpu = pick_next_cpu_hint(vector_cpumask);
continue;
}
return -ENOSPC;
--
2.7.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-01 5:04 ` [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU Chen Yu
@ 2017-09-03 18:17 ` Thomas Gleixner
2017-09-03 19:18 ` RFD: x86: Sanitize the vector allocator Thomas Gleixner
` (2 more replies)
0 siblings, 3 replies; 21+ messages in thread
From: Thomas Gleixner @ 2017-09-03 18:17 UTC (permalink / raw)
To: Chen Yu
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra
On Fri, 1 Sep 2017, Chen Yu wrote:
> This is the major logic to spread the vectors on different CPUs.
> The main idea is to choose the 'idlest' CPU which has assigned
> the least number of vectors as the candidate/hint for the vector
> allocation domain, in the hope that the vector allocation domain
> could leverage this hint to generate corresponding cpumask.
>
> One of the requirements to do this vector spreading work comes from the
> following hibernation problem found on a 16 cores server:
>
> CPU 31 disable failed: CPU has 62 vectors assigned and there
> are only 0 available.
>
> 2 issues were found after investigation:
>
> 1. The network driver has declared many vector resources via
> pci_enable_msix_range(), say, this driver might likely want
> to reserve 6 per logical CPU, then there would be 192 of them.
> 2. Besides, most of the vectors reserved by this driver are assigned
> on CPU0 due to the current code strategy, so there would be
> insufficient slots left on CPU0 to receive any migrated IRQs
> during CPU offline.
>
> In theory after the commit c5cb83bb337c ("genirq/cpuhotplug: Handle
> managed IRQs on CPU hotplug") the issue 1 has been solved with the
> managed interrupt mechanism for multi queue devices, which no longer
> migrate these interrupts. It simply shuts them down and restarts
> them when the CPU comes back online.
>
> However, according to the implementation of this network driver,
> the commit mentioned above does not fully fix the issue 1.
> Here's the framework of the network driver:
>
> step 1. Reserve enough irq vectors and corresponding IRQs.
> step 2. If the network is activated, invoke request_irq() to
> register the handler.
> step 3. Invoke set_affinity() to spread the IRQs onto different
> CPUs, thus to spread the vectors too.
>
> The problem is, if the network cable is not connected, step 2
> and 3 will not get invoked, thus the IRQ vectors will not spread
> on different CPUs and will still be allocated on CPU0. As a result
> the CPUs will still get the offline failure.
So the proper solution for this network driver is to switch to managed
interrupts instead of trying to work around it in some other place. It's
using the wrong mechanism - end of story.
Why are you insisting on implementing a band aid for this particular driver
instead of fixing the underlying problem of that driver which requires to
have 32 queues and interrupts open even if only a single CPU is online?
> Previously there were some discussion in the thread [1] about the
> vector spread, and here are the suggestion from Thomas:
>
> Q1:
> Rewrite that allocation code from scratch, use per cpu bitmaps,
> so we can do fast search over masks and keep track of
> the vectors which are used/free per cpu.
> A1:
> per cpu bitmap was once considered but this might not be
> as fast as the solution proposed here. That is, if we introduce the
> per cpu bitmap, we need to count the number of vectors assigned on
> each CPUs by cpumask_weight() and sort them in order to get the
> CPUs who have the least number of vectors assigned. By contrast,
> if we introduce the bitmaps for each vector number, we can get the
> 'idle' CPU at the cost of nearly O(1).
The weight accounting with the cpumasks is an orthogonal issue to the per
cpu vector bitmaps.
Once you selected a CPU the current code still loops in circles instead of
just searching a bitmap.
And if the required affinity mask needs more than one CPU then this is
still not giving you the right answer. You assign perhaps a vector to the
least busy CPU, but the other CPUs which happen to be in the mask are going
to be randomly selected.
I'm not against making the vector allocation better, but certainly not by
adding yet more duct tape to something which is well known as one of the
dumbest search algorithms on the planet with a worst case of
nvectors * nr_online_cpus * nr_online_cpus_in_affinity_mask
and your mechanism nests another loop of potentially NR_VECTORS into
that. Which is pointless as the actual assignable vector space is
smaller. While x86 has 256 vectors the assignable vector space is way
smaller and non-continuous:
Vectors 0- 31 are reserved for CPU traps and exceptions
Vectors 239-255 are reserved by the kernel for IPIs, local timer etc.
Vector 32 can be reserved by the kernel for the reboot interrupt
Vector 96 can be reserved by the kernel for INT80
So the actually usable array size is between 205 and 207.
Aside of that the code is incorrect vs. the percpu accounting. Remove the
limit in the update function and see what happens. While the limit is
necessary in general, it should at least warn and yell whenever the
accounting goes out of bounds. And if that happens, then the whole thing is
completely useless simply because the numbers are just wrong.
> One scenario of this patch can be illustrated below:
>
> a) After initialization, vector_mask[0] is set with {CPU0...CPU31},
> other vector_mask[i] remain zero, which means that, CPU0...CPU31
> have zero vectors assigned.
> b) During driver probe, CPU0 is chosen to be assigned with one vector.
> Thus vector_mask[0] becomes {CPU1...CPU31}, and vector_mask[1] becomes
> {CPU0}.
> c) The next time the system tries to assign another vector, it searches from
> vector[0], because it is non-empty, CPU1 is chosen to place
> the vector. Thus vector_mask[0] becomes {CPU2...CPU31}, and vector_mask[1]
> becomes {CPU0, CPU1}.
> d) and so on, the vectors assignment will spread on different CPUs.
So this works for your particular network driver scenario, which is the
wrong example as this driver just needs to be reworked to not expose that
issue.
The reality of affinity requirements is not that simple. It's very much
device dependent and making it mandatory can have negative side effects.
> Q2:
> "respect the allocation domains"
> A2:
> This solution provides a CPU as the param for the vector allocation domains,
Solution? This is a crude hack which "solves" one particular problem in the
wrong way.
> in the hope that the domains can leverage this hint to generate the cpumask
Hope is never a good engineering principle.
> with less vectors assigned. But yes, the CPU we select upfront gets fed into
> apic->vector_allocation_domain() might just say no because the CPU
> does not belong to it, but anyway this should somehow spread the vectors,
> although I can not find out a graceful solution to this.
> BTW, the vector allocation domain is default_vector_allocation_domain()
> in our case, thus it works.
Now try that with the logical delivery modes which allow multi CPU
assignments and the cluster mode thereof.
> Q3:
> "set a preference for the node on which the interrupt was allocated"
> A3:
> This can be achieved by further optimization in the following way:
> a) Also record the number of vectors used per node.
> b) Each time invoking __assign_irq_vector, adjust the input mask
> by cpumask_and(mask, mask, node_mask_with_least_vectors())
I'm not looking forward to that heuristics and the next loop inside the
loops of loops.
So no, this is not going to happen. The current implementation sucks and
needs a rewrite.
I already started to look into that, but started at the low level end of
it. See the IDT cleanup series in tip:x86/apic. There is much more voodoo
in the vector management code to clean up before we can actually talk about
making the assignment algorithm smarter.
I'll send out a separate mail raising a few points to discuss before we are
actually starting to look into magic spreading functionality.
Thanks,
tglx
^ permalink raw reply [flat|nested] 21+ messages in thread
* RFD: x86: Sanitize the vector allocator
2017-09-03 18:17 ` Thomas Gleixner
@ 2017-09-03 19:18 ` Thomas Gleixner
2017-09-05 22:57 ` [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU Thomas Gleixner
2017-09-06 4:13 ` Yu Chen
2 siblings, 0 replies; 21+ messages in thread
From: Thomas Gleixner @ 2017-09-03 19:18 UTC (permalink / raw)
To: Chen Yu
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra
[-- Attachment #1: Type: text/plain, Size: 6609 bytes --]
The vector allocator of x86 is a pretty stupid linear search algorithm with
a worst case of
nr_vectors * nr_online_cpus * nr_cpus_in_affinity mask
It has some other magic properties and really wants to be replaced by
something smarter.
That needs quite some cleanup of the vector management code outside of the
allocator, which I started to work on with the cleanup of the IDT
management which is headed for 4.14. I have some other things in the
pipeline which eliminate quite some duct tape in that area, but I ran into
a couple of interesting things:
1) Multi CPU affinities
This is only available when the APIC is using logical destination
mode. With physical destination mode there is already a restriction to a
single CPU target.
The multi CPU affinity is biased towards the CPU with the lowest APIC ID
in the destination bitfield. Only if that APIC is busy (ISR not empty)
then the next APIC gets it.
A full kernel build on a SKL 4 CPU desktop machine with affinity set to
CPU0-3 shows that more than 90 percent of the AHCI interrupts end up on
CPU0.
Aside of that the same cold build (right after boot) is about 2% faster
when the AHCI interrupt is only affine to CPU0.
I did some experiments on all my machines which have logical destination
mode with various workloads and the results are similar. The
distribution of interrupts on the CPUs varies with the workloads, but
the vast majority always ends up on CPU0
I've not found a case where the multi CPU affinity is superior. I might
have the wrong workloads and the wrong machines, but it would be
extremely helpful just to get rid of this and use single CPU affinities
only. That'd simplify the allocator along with the various APIC
implementations.
2) The 'priority level' spreading magic
The comment in __assign_irq_vector says:
* NOTE! The local APIC isn't very good at handling
* multiple interrupts at the same interrupt level.
* As the interrupt level is determined by taking the
* vector number and shifting that right by 4, we
* want to spread these out a bit so that they don't
* all fall in the same interrupt level.
After doing some palaeontological research I found the following in the
PPro Developer Manual Volume 3:
"7.4.2. Valid Interrupts
The local and I/O APICs support 240 distinct vectors in the range of 16
to 255. Interrupt priority is implied by its vector, according to the
following relationship: priority = vector / 16
One is the lowest priority and 15 is the highest. Vectors 16 through
31 are reserved for exclusive use by the processor. The remaining
vectors are for general use. The processor’s local APIC includes an
in-service entry and a holding entry for each priority level. To avoid
losing interrupts, software should allocate no more than 2 interrupt
vectors per priority."
The current SDM tells nothing about that, instead it states:
"If more than one interrupt is generated with the same vector number,
the local APIC can set the bit for the vector both in the IRR and the
ISR. This means that for the Pentium 4 and Intel Xeon processors, the
IRR and ISR can queue two interrupts for each interrupt vector: one
in the IRR and one in the ISR. Any additional interrupts issued for
the same interrupt vector are collapsed into the single bit in the
IRR.
For the P6 family and Pentium processors, the IRR and ISR registers
can queue no more than two interrupts per interrupt vector and will
reject other interrupts that are received within the same vector."
Which means, that on P6/Pentium the APIC will reject a new message and
tell the sender to retry, which increases the load on the APIC bus and
nothing more.
There is no affirmative answer from Intel on that, but I think it's sane
to remove that:
1) I've looked through a bunch of other operating systems and none of
them bothers to implement this or mentions this at all.
2) The current allocator has no enforcement for this and especially the
legacy interrupts, which are the main source of interrupts on these
P6 and older systems, are allocated linearly in the same priority
level and just work.
3) The current machines have no problem with that at all as I verified
with some experiments.
4) AMD at least confirmed that such an issue is unknown.
5) P6 and older are dinosaurs almost 20 years EOL, so we really should
not worry about that anymore.
So this can be eliminated, which makes the allocation mechanism way
simpler.
Some other issues which are not in the way of cleanups and replacements,
but need to be looked at as well:
1) Automated affinity assignment
This only helps when the underlying device requests it and has the
matching queues per CPU. That's what the managed interrupt affinity
mechanism was made for.
In other cases the automated assignment can have really bad effects.
On the same SKL as above I made the AHCI interrupt affine to CPU3 only
which makes the kernel build slower by whopping 10% than having it
affine on CPU0. Interestingly enough irqbalanced end up with the wrong
decision as well.
So we need to be very careful about that. It depends on the device and
the driver how good 'random' placement works.
That means we need hinting from the drivers about their preferred
allocation scheme. If we don't have that then we should for now default
to the current scheme which puts the interrupt on the node on which the
device is.
2) Vector waste
All 16 legacy interrupt vectors are populated at boot and stay there
forever whether they are used or not. On most modern machines that's 10+
vectors wasted for nothing. If the APIC uses logical destination mode
that means these vectors are per default allocated on up to 8 CPUs or in
the case of clustered X2APIC on all CPUs in a cluster.
It'd be worthwhile to allocate these legacy vectors dynamically when
they are actually used. That might fail, but that's the same on devices
which use MSI etc. For legacy systems this is a non issue as there are
plenty of vectors available. On modern machines the 4-5 really used
legacy vectors are requested early during the boot process and should
not end up in a fully exhausted vector space.
Nothing urgent, but worthwhile to fix I think.
Thoughts?
Thanks,
tglx
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-03 18:17 ` Thomas Gleixner
2017-09-03 19:18 ` RFD: x86: Sanitize the vector allocator Thomas Gleixner
@ 2017-09-05 22:57 ` Thomas Gleixner
2017-09-06 4:34 ` Yu Chen
2017-09-06 4:13 ` Yu Chen
2 siblings, 1 reply; 21+ messages in thread
From: Thomas Gleixner @ 2017-09-05 22:57 UTC (permalink / raw)
To: Chen Yu
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra
On Sun, 3 Sep 2017, Thomas Gleixner wrote:
> On Fri, 1 Sep 2017, Chen Yu wrote:
>
> > This is the major logic to spread the vectors on different CPUs.
> > The main idea is to choose the 'idlest' CPU which has assigned
> > the least number of vectors as the candidate/hint for the vector
> > allocation domain, in the hope that the vector allocation domain
> > could leverage this hint to generate corresponding cpumask.
> >
> > One of the requirements to do this vector spreading work comes from the
> > following hibernation problem found on a 16 cores server:
> >
> > CPU 31 disable failed: CPU has 62 vectors assigned and there
> > are only 0 available.
Thinking more about this, this makes no sense whatsoever.
The total number of interrupts on a system is the same whether they are
all on CPU 0 or evenly spread over all CPUs.
As this machine is using physical destination mode, the number of vectors
used is the same as the number of interrupts, except for the case where a
move of an interrupt is in progress and the interrupt which cleans up the
old vector has not yet arrived. Lets ignore that for now.
The available vector space is 204 per CPU on such a system.
256 - SYSTEM[0-31, 32, 128, 239-255] - LEGACY[50] = 204
> > CPU 31 disable failed: CPU has 62 vectors assigned and there
> > are only 0 available.
CPU31 is the last AP going offline (CPU0 is still online).
It wants to move 62 vectors to CPU0, but it can't because CPU0 has 0
available vectors. That means CPU0 has 204 vectors used. I doubt that, but
what I doubt even more is that this interrupt spreading helps in any way.
Assumed that we have a total of 204 + 62 = 266 device interrupt vectors in
use and they are evenly spread over 32 CPUs, so each CPU has either 8 or
nine vectors. Fine.
Now if you unplug all CPUs except CPU0 starting from CPU1 up to CPU31 then
at the point where CPU31 is about to be unplugged, CPU0 holds 133 vectors
and CPU31 holds 133 vectors as well - assumed that the spread is exactly
even.
I have a hard time to figure out how the 133 vectors on CPU31 are now
magically fitting in the empty space on CPU0, which is 204 - 133 = 71. In
my limited understanding of math 133 is greater than 71, but your patch
might make that magically be wrong.
Can you please provide detailed information about how many device
interrupts are actually in use/allocated on that system?
Please enable CONFIG_GENERIC_IRQ_DEBUGFS and provide the output of
# cat /sys/kernel/debug/irq/domains/*
and
# ls /sys/kernel/debug/irq/irqs
Thanks,
tglx
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-05 22:57 ` [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU Thomas Gleixner
@ 2017-09-06 4:34 ` Yu Chen
2017-09-06 8:03 ` Thomas Gleixner
0 siblings, 1 reply; 21+ messages in thread
From: Yu Chen @ 2017-09-06 4:34 UTC (permalink / raw)
To: Thomas Gleixner
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra
On Wed, Sep 06, 2017 at 12:57:41AM +0200, Thomas Gleixner wrote:
> On Sun, 3 Sep 2017, Thomas Gleixner wrote:
>
> > On Fri, 1 Sep 2017, Chen Yu wrote:
> >
> > > This is the major logic to spread the vectors on different CPUs.
> > > The main idea is to choose the 'idlest' CPU which has assigned
> > > the least number of vectors as the candidate/hint for the vector
> > > allocation domain, in the hope that the vector allocation domain
> > > could leverage this hint to generate corresponding cpumask.
> > >
> > > One of the requirements to do this vector spreading work comes from the
> > > following hibernation problem found on a 16 cores server:
> > >
> > > CPU 31 disable failed: CPU has 62 vectors assigned and there
> > > are only 0 available.
>
> Thinking more about this, this makes no sense whatsoever.
>
> The total number of interrupts on a system is the same whether they are
> all on CPU 0 or evenly spread over all CPUs.
>
> As this machine is using physical destination mode, the number of vectors
> used is the same as the number of interrupts, except for the case where a
> move of an interrupt is in progress and the interrupt which cleans up the
> old vector has not yet arrived. Lets ignore that for now.
>
> The available vector space is 204 per CPU on such a system.
>
> 256 - SYSTEM[0-31, 32, 128, 239-255] - LEGACY[50] = 204
>
> > > CPU 31 disable failed: CPU has 62 vectors assigned and there
> > > are only 0 available.
>
> CPU31 is the last AP going offline (CPU0 is still online).
>
> It wants to move 62 vectors to CPU0, but it can't because CPU0 has 0
> available vectors. That means CPU0 has 204 vectors used. I doubt that, but
> what I doubt even more is that this interrupt spreading helps in any way.
>
> Assumed that we have a total of 204 + 62 = 266 device interrupt vectors in
> use and they are evenly spread over 32 CPUs, so each CPU has either 8 or
> nine vectors. Fine.
>
> Now if you unplug all CPUs except CPU0 starting from CPU1 up to CPU31 then
> at the point where CPU31 is about to be unplugged, CPU0 holds 133 vectors
> and CPU31 holds 133 vectors as well - assumed that the spread is exactly
> even.
>
> I have a hard time to figure out how the 133 vectors on CPU31 are now
> magically fitting in the empty space on CPU0, which is 204 - 133 = 71. In
> my limited understanding of math 133 is greater than 71, but your patch
> might make that magically be wrong.
>
The problem is reproduced when the network cable is not plugged in,
because this driver looks like this:
step 1. Reserved enough irq vectors and corresponding IRQs.
step 2. If the network is activated, invoke request_irq() to
register the handler.
step 3. Invoke set_affinity() to spread the IRQs onto different
CPUs, thus to spread the vectors too.
Here's my understanding for why spreading vectors might help for this
special case:
As step 2 will not get invoked, the IRQs of this driver
has not been enabled, thus in migrate_one_irq() this IRQ
will not be considered because there is a check of
irqd_is_started(d), thus there should only be 8 vectors
allocated by this driver on CPU0, and 8 vectors left on
CPU31, and the 8 vectors on CPU31 will not be migrated
to CPU0 neither, so there is room for other 'valid' vectors
to be migrated to CPU0.
> Can you please provide detailed information about how many device
> interrupts are actually in use/allocated on that system?
>
> Please enable CONFIG_GENERIC_IRQ_DEBUGFS and provide the output of
>
> # cat /sys/kernel/debug/irq/domains/*
>
> and
>
> # ls /sys/kernel/debug/irq/irqs
>
Ok, here's the information after system bootup on top
of 4.13:
# cat /sys/kernel/debug/irq/domains/*
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-0
size: 24
mapped: 16
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-1
size: 8
mapped: 2
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-2
size: 8
mapped: 0
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-3
size: 8
mapped: 0
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-4
size: 8
mapped: 5
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: PCI-HT
size: 0
mapped: 0
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: PCI-MSI-2
size: 0
mapped: 365
flags: 0x00000051
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
# ls /sys/kernel/debug/irq/irqs
ls /sys/kernel/debug/irq/irqs
0 10 11 13 142 184 217 259 292 31 33 337 339
340 342 344 346 348 350 352 354 356 358 360 362
364 366 368 370 372 374 376 378 380 382 384 386
388 390 392 394 4 6 7 9 1 109 12 14 15 2
24 26 3 32 335 338 34 341 343 345 347 349
351 353 355 357 359 361 363 365 367 369 371 373
375 377 379 381 383 385 387 389 391 393 395 5
67 8
BTW, do we have sysfs to display how much vectors used on each CPUs?
Thanks,
Yu
> Thanks,
>
> tglx
>
>
>
>
>
>
>
>
>
>
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-06 4:34 ` Yu Chen
@ 2017-09-06 8:03 ` Thomas Gleixner
2017-09-07 2:52 ` Yu Chen
0 siblings, 1 reply; 21+ messages in thread
From: Thomas Gleixner @ 2017-09-06 8:03 UTC (permalink / raw)
To: Yu Chen
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra
On Wed, 6 Sep 2017, Yu Chen wrote:
> On Wed, Sep 06, 2017 at 12:57:41AM +0200, Thomas Gleixner wrote:
> > I have a hard time to figure out how the 133 vectors on CPU31 are now
> > magically fitting in the empty space on CPU0, which is 204 - 133 = 71. In
> > my limited understanding of math 133 is greater than 71, but your patch
> > might make that magically be wrong.
> >
> The problem is reproduced when the network cable is not plugged in,
> because this driver looks like this:
>
> step 1. Reserved enough irq vectors and corresponding IRQs.
> step 2. If the network is activated, invoke request_irq() to
> register the handler.
> step 3. Invoke set_affinity() to spread the IRQs onto different
> CPUs, thus to spread the vectors too.
>
> Here's my understanding for why spreading vectors might help for this
> special case:
> As step 2 will not get invoked, the IRQs of this driver
> has not been enabled, thus in migrate_one_irq() this IRQ
> will not be considered because there is a check of
> irqd_is_started(d), thus there should only be 8 vectors
> allocated by this driver on CPU0, and 8 vectors left on
> CPU31, and the 8 vectors on CPU31 will not be migrated
> to CPU0 neither, so there is room for other 'valid' vectors
> to be migrated to CPU0.
Can you please spare me repeating your theories, as long as you don't have
hard facts to back them up? The network cable is changing the symptoms,
but the underlying root cause is definitely something different.
> # cat /sys/kernel/debug/irq/domains/*
> name: VECTOR
> size: 0
> mapped: 388
> flags: 0x00000041
So we have 388 vectors mapped in total. And those are just device vectors
because system vectors are not accounted there.
> name: IO-APIC-0
> size: 24
> mapped: 16
That's the legacy space
> name: IO-APIC-1
> size: 8
> mapped: 2
> name: IO-APIC-2
> size: 8
> mapped: 0
> name: IO-APIC-3
> size: 8
> mapped: 0
> name: IO-APIC-4
> size: 8
> mapped: 5
And a few GSIs: Total GSIs = 16 + 2 + 5 = 23
> name: PCI-MSI-2
> size: 0
> mapped: 365
Plus 365 PCI-MSI vectors allocated.
> flags: 0x00000051
> parent: VECTOR
> name: VECTOR
> size: 0
> mapped: 388
Which nicely sums up to 388
> # ls /sys/kernel/debug/irq/irqs
> ls /sys/kernel/debug/irq/irqs
> 0 10 11 13 142 184 217 259 292 31 33 337 339
> 340 342 344 346 348 350 352 354 356 358 360 362
> 364 366 368 370 372 374 376 378 380 382 384 386
> 388 390 392 394 4 6 7 9 1 109 12 14 15 2
> 24 26 3 32 335 338 34 341 343 345 347 349
> 351 353 355 357 359 361 363 365 367 369 371 373
> 375 377 379 381 383 385 387 389 391 393 395 5
> 67 8
That are all interrupts which are active. That's a total of 89. Can you
explain where the delta of 299 vectors comes from?
299 allocated, vector mapped, but unused interrupts?
That's where your problem is, not in the vector spreading. You have a
massive leak.
> BTW, do we have sysfs to display how much vectors used on each CPUs?
Not yet.
Can you please apply the debug patch below, boot the machine and right
after login provide the output of
# cat /sys/kernel/debug/tracing/trace
Thanks,
tglx
8<-------------------
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -372,6 +372,9 @@ int msi_domain_alloc_irqs(struct irq_dom
return ret;
}
+ trace_printk("dev: %s nvec %d virq %d\n",
+ dev_name(dev), desc->nvec_used, virq);
+
for (i = 0; i < desc->nvec_used; i++)
irq_set_msi_desc_off(virq, i, desc);
}
@@ -419,6 +422,8 @@ void msi_domain_free_irqs(struct irq_dom
* entry. If that's the case, don't do anything.
*/
if (desc->irq) {
+ trace_printk("dev: %s nvec %d virq %d\n",
+ dev_name(dev), desc->nvec_used, desc->irq);
irq_domain_free_irqs(desc->irq, desc->nvec_used);
desc->irq = 0;
}
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-06 8:03 ` Thomas Gleixner
@ 2017-09-07 2:52 ` Yu Chen
2017-09-07 5:54 ` Thomas Gleixner
0 siblings, 1 reply; 21+ messages in thread
From: Yu Chen @ 2017-09-07 2:52 UTC (permalink / raw)
To: Thomas Gleixner
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra
On Wed, Sep 06, 2017 at 10:03:58AM +0200, Thomas Gleixner wrote:
> On Wed, 6 Sep 2017, Yu Chen wrote:
> > On Wed, Sep 06, 2017 at 12:57:41AM +0200, Thomas Gleixner wrote:
> > > I have a hard time to figure out how the 133 vectors on CPU31 are now
> > > magically fitting in the empty space on CPU0, which is 204 - 133 = 71. In
> > > my limited understanding of math 133 is greater than 71, but your patch
> > > might make that magically be wrong.
> > >
> > The problem is reproduced when the network cable is not plugged in,
> > because this driver looks like this:
> >
> > step 1. Reserved enough irq vectors and corresponding IRQs.
> > step 2. If the network is activated, invoke request_irq() to
> > register the handler.
> > step 3. Invoke set_affinity() to spread the IRQs onto different
> > CPUs, thus to spread the vectors too.
> >
> > Here's my understanding for why spreading vectors might help for this
> > special case:
> > As step 2 will not get invoked, the IRQs of this driver
> > has not been enabled, thus in migrate_one_irq() this IRQ
> > will not be considered because there is a check of
> > irqd_is_started(d), thus there should only be 8 vectors
> > allocated by this driver on CPU0, and 8 vectors left on
> > CPU31, and the 8 vectors on CPU31 will not be migrated
> > to CPU0 neither, so there is room for other 'valid' vectors
> > to be migrated to CPU0.
>
> Can you please spare me repeating your theories, as long as you don't have
> hard facts to back them up? The network cable is changing the symptoms,
> but the underlying root cause is definitely something different.
>
> > # cat /sys/kernel/debug/irq/domains/*
> > name: VECTOR
> > size: 0
> > mapped: 388
> > flags: 0x00000041
>
> So we have 388 vectors mapped in total. And those are just device vectors
> because system vectors are not accounted there.
>
> > name: IO-APIC-0
> > size: 24
> > mapped: 16
>
> That's the legacy space
>
> > name: IO-APIC-1
> > size: 8
> > mapped: 2
>
> > name: IO-APIC-2
> > size: 8
> > mapped: 0
>
> > name: IO-APIC-3
> > size: 8
> > mapped: 0
>
> > name: IO-APIC-4
> > size: 8
> > mapped: 5
>
> And a few GSIs: Total GSIs = 16 + 2 + 5 = 23
>
> > name: PCI-MSI-2
> > size: 0
> > mapped: 365
>
> Plus 365 PCI-MSI vectors allocated.
>
> > flags: 0x00000051
> > parent: VECTOR
> > name: VECTOR
> > size: 0
> > mapped: 388
>
> Which nicely sums up to 388
>
> > # ls /sys/kernel/debug/irq/irqs
> > ls /sys/kernel/debug/irq/irqs
> > 0 10 11 13 142 184 217 259 292 31 33 337 339
> > 340 342 344 346 348 350 352 354 356 358 360 362
> > 364 366 368 370 372 374 376 378 380 382 384 386
> > 388 390 392 394 4 6 7 9 1 109 12 14 15 2
> > 24 26 3 32 335 338 34 341 343 345 347 349
> > 351 353 355 357 359 361 363 365 367 369 371 373
> > 375 377 379 381 383 385 387 389 391 393 395 5
> > 67 8
>
> That are all interrupts which are active. That's a total of 89. Can you
> explain where the delta of 299 vectors comes from?
>
> 299 allocated, vector mapped, but unused interrupts?
>
> That's where your problem is, not in the vector spreading. You have a
> massive leak.
>
> > BTW, do we have sysfs to display how much vectors used on each CPUs?
>
> Not yet.
>
> Can you please apply the debug patch below, boot the machine and right
> after login provide the output of
>
> # cat /sys/kernel/debug/tracing/trace
>
Ok, I've tested on top of 4.13, here're the results:
# cat /sys/kernel/debug/tracing/trace
# tracer: nop
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
kworker/0:0-3 [000] .... 7.373461: msi_domain_alloc_irqs: dev: 0000:00:1c.0 nvec 1 virq 24
kworker/0:0-3 [000] .... 7.373666: msi_domain_alloc_irqs: dev: 0000:b2:00.0 nvec 1 virq 26
kworker/0:0-3 [000] .... 7.578980: msi_domain_alloc_irqs: dev: 0000:00:11.5 nvec 1 virq 31
kworker/0:0-3 [000] .... 7.676938: msi_domain_alloc_irqs: dev: 0000:00:17.0 nvec 1 virq 32
kworker/0:0-3 [000] .... 8.017664: msi_domain_alloc_irqs: dev: 0000:00:14.0 nvec 1 virq 33
kworker/0:2-303 [000] .... 9.135467: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 34
kworker/0:2-303 [000] .... 9.135476: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 35
kworker/0:2-303 [000] .... 9.135484: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 36
kworker/0:2-303 [000] .... 9.135492: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 37
kworker/0:2-303 [000] .... 9.135499: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 38
kworker/0:2-303 [000] .... 9.135508: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 39
kworker/0:2-303 [000] .... 9.135516: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 40
kworker/0:2-303 [000] .... 9.135526: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 41
kworker/0:2-303 [000] .... 9.135534: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 42
kworker/0:2-303 [000] .... 9.135542: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 43
kworker/0:2-303 [000] .... 9.135550: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 44
kworker/0:2-303 [000] .... 9.135557: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 45
kworker/0:2-303 [000] .... 9.135564: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 46
kworker/0:2-303 [000] .... 9.135572: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 47
kworker/0:2-303 [000] .... 9.135579: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 48
kworker/0:2-303 [000] .... 9.135587: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 49
kworker/0:2-303 [000] .... 9.135594: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 50
kworker/0:2-303 [000] .... 9.135601: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 51
kworker/0:2-303 [000] .... 9.135608: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 52
kworker/0:2-303 [000] .... 9.135616: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 53
kworker/0:2-303 [000] .... 9.135623: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 54
kworker/0:2-303 [000] .... 9.135631: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 55
kworker/0:2-303 [000] .... 9.135638: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 56
kworker/0:2-303 [000] .... 9.135645: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 57
kworker/0:2-303 [000] .... 9.135653: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 58
kworker/0:2-303 [000] .... 9.135660: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 59
kworker/0:2-303 [000] .... 9.135668: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 60
kworker/0:2-303 [000] .... 9.135676: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 61
kworker/0:2-303 [000] .... 9.135683: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 62
kworker/0:2-303 [000] .... 9.135691: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 63
kworker/0:2-303 [000] .... 9.135699: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 64
kworker/0:2-303 [000] .... 9.135707: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 65
kworker/0:2-303 [000] .... 9.135714: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 66
kworker/0:2-303 [000] .... 9.135721: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 67
kworker/0:2-303 [000] .... 9.135729: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 68
kworker/0:2-303 [000] .... 9.135736: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 69
kworker/0:2-303 [000] .... 9.135744: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 70
kworker/0:2-303 [000] .... 9.135751: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 71
kworker/0:2-303 [000] .... 9.135758: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 72
kworker/0:2-303 [000] .... 9.135766: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 73
kworker/0:2-303 [000] .... 9.135773: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 74
kworker/0:2-303 [000] .... 9.135780: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 75
kworker/0:2-303 [000] .... 9.135788: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 76
kworker/0:2-303 [000] .... 9.135795: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 77
kworker/0:2-303 [000] .... 9.135803: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 78
kworker/0:2-303 [000] .... 9.135810: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 79
kworker/0:2-303 [000] .... 9.135818: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 80
kworker/0:2-303 [000] .... 9.135826: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 81
kworker/0:2-303 [000] .... 9.135833: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 82
kworker/0:2-303 [000] .... 9.135841: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 83
kworker/0:2-303 [000] .... 9.135848: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 84
kworker/0:2-303 [000] .... 9.135856: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 85
kworker/0:2-303 [000] .... 9.135863: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 86
kworker/0:2-303 [000] .... 9.135871: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 87
kworker/0:2-303 [000] .... 9.135878: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 88
kworker/0:2-303 [000] .... 9.135885: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 89
kworker/0:2-303 [000] .... 9.135893: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 90
kworker/0:2-303 [000] .... 9.135900: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 91
kworker/0:2-303 [000] .... 9.135908: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 92
kworker/0:2-303 [000] .... 9.135915: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 93
kworker/0:2-303 [000] .... 9.135923: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 94
kworker/0:2-303 [000] .... 9.135930: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 95
kworker/0:2-303 [000] .... 9.135937: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 96
kworker/0:2-303 [000] .... 9.135944: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 97
kworker/0:2-303 [000] .... 9.135951: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 98
kworker/0:2-303 [000] .... 9.135958: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 99
kworker/0:2-303 [000] .... 9.135964: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 100
kworker/0:2-303 [000] .... 9.135971: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 101
kworker/0:2-303 [000] .... 9.135978: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 102
kworker/0:2-303 [000] .... 9.135985: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 103
kworker/0:2-303 [000] .... 9.135991: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 104
kworker/0:2-303 [000] .... 9.135998: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 105
kworker/0:2-303 [000] .... 9.136005: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 106
kworker/0:2-303 [000] .... 9.136012: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 107
kworker/0:2-303 [000] .... 9.136019: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 108
kworker/0:2-303 [000] .... 9.599173: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 109
kworker/0:2-303 [000] .... 9.599181: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 110
kworker/0:2-303 [000] .... 9.599189: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 111
kworker/0:2-303 [000] .... 9.599196: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 112
kworker/0:2-303 [000] .... 9.599203: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 113
kworker/0:2-303 [000] .... 9.599210: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 114
kworker/0:2-303 [000] .... 9.599217: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 115
kworker/0:2-303 [000] .... 9.599224: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 116
kworker/0:2-303 [000] .... 9.599231: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 117
kworker/0:2-303 [000] .... 9.599239: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 118
kworker/0:2-303 [000] .... 9.599246: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 119
kworker/0:2-303 [000] .... 9.599253: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 120
kworker/0:2-303 [000] .... 9.599259: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 121
kworker/0:2-303 [000] .... 9.599266: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 122
kworker/0:2-303 [000] .... 9.599273: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 123
kworker/0:2-303 [000] .... 9.599279: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 124
kworker/0:2-303 [000] .... 9.599286: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 125
kworker/0:2-303 [000] .... 9.599293: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 126
kworker/0:2-303 [000] .... 9.599300: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 127
kworker/0:2-303 [000] .... 9.599307: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 128
kworker/0:2-303 [000] .... 9.599314: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 129
kworker/0:2-303 [000] .... 9.599321: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 130
kworker/0:2-303 [000] .... 9.599327: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 131
kworker/0:2-303 [000] .... 9.599334: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 132
kworker/0:2-303 [000] .... 9.599341: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 133
kworker/0:2-303 [000] .... 9.599349: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 134
kworker/0:2-303 [000] .... 9.599356: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 135
kworker/0:2-303 [000] .... 9.599363: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 136
kworker/0:2-303 [000] .... 9.599369: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 137
kworker/0:2-303 [000] .... 9.599376: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 138
kworker/0:2-303 [000] .... 9.599382: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 139
kworker/0:2-303 [000] .... 9.599389: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 140
kworker/0:2-303 [000] .... 9.599395: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 141
kworker/0:2-303 [000] .... 9.599402: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 142
kworker/0:2-303 [000] .... 9.599409: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 143
kworker/0:2-303 [000] .... 9.599416: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 144
kworker/0:2-303 [000] .... 9.599422: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 145
kworker/0:2-303 [000] .... 9.599428: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 146
kworker/0:2-303 [000] .... 9.599437: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 147
kworker/0:2-303 [000] .... 9.599444: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 148
kworker/0:2-303 [000] .... 9.599450: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 149
kworker/0:2-303 [000] .... 9.599456: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 150
kworker/0:2-303 [000] .... 9.599463: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 151
kworker/0:2-303 [000] .... 9.599470: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 152
kworker/0:2-303 [000] .... 9.599476: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 153
kworker/0:2-303 [000] .... 9.599482: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 154
kworker/0:2-303 [000] .... 9.599488: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 155
kworker/0:2-303 [000] .... 9.599494: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 156
kworker/0:2-303 [000] .... 9.599501: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 157
kworker/0:2-303 [000] .... 9.599507: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 158
kworker/0:2-303 [000] .... 9.599513: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 159
kworker/0:2-303 [000] .... 9.599519: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 160
kworker/0:2-303 [000] .... 9.599527: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 161
kworker/0:2-303 [000] .... 9.599534: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 162
kworker/0:2-303 [000] .... 9.599541: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 163
kworker/0:2-303 [000] .... 9.599547: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 164
kworker/0:2-303 [000] .... 9.599553: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 165
kworker/0:2-303 [000] .... 9.599560: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 166
kworker/0:2-303 [000] .... 9.599567: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 167
kworker/0:2-303 [000] .... 9.599573: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 168
kworker/0:2-303 [000] .... 9.599579: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 169
kworker/0:2-303 [000] .... 9.599585: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 170
kworker/0:2-303 [000] .... 9.599591: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 171
kworker/0:2-303 [000] .... 9.599597: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 172
kworker/0:2-303 [000] .... 9.599604: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 173
kworker/0:2-303 [000] .... 9.599610: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 174
kworker/0:2-303 [000] .... 9.599616: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 175
kworker/0:2-303 [000] .... 9.599622: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 176
kworker/0:2-303 [000] .... 9.599628: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 177
kworker/0:2-303 [000] .... 9.599634: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 178
kworker/0:2-303 [000] .... 9.599640: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 179
kworker/0:2-303 [000] .... 9.599646: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 180
kworker/0:2-303 [000] .... 9.599653: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 181
kworker/0:2-303 [000] .... 9.599660: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 182
kworker/0:2-303 [000] .... 9.599666: msi_domain_alloc_irqs: dev: 0000:bb:00.1 nvec 1 virq 183
kworker/0:2-303 [000] .... 9.699902: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 184
kworker/0:2-303 [000] .... 9.699910: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 185
kworker/0:2-303 [000] .... 9.699917: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 186
kworker/0:2-303 [000] .... 9.699924: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 187
kworker/0:2-303 [000] .... 9.699931: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 188
kworker/0:2-303 [000] .... 9.699939: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 189
kworker/0:2-303 [000] .... 9.699946: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 190
kworker/0:2-303 [000] .... 9.699953: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 191
kworker/0:2-303 [000] .... 9.699960: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 192
kworker/0:2-303 [000] .... 9.699967: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 193
kworker/0:2-303 [000] .... 9.699975: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 194
kworker/0:2-303 [000] .... 9.699983: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 195
kworker/0:2-303 [000] .... 9.699989: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 196
kworker/0:2-303 [000] .... 9.699996: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 197
kworker/0:2-303 [000] .... 9.700003: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 198
kworker/0:2-303 [000] .... 9.700012: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 199
kworker/0:2-303 [000] .... 9.700019: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 200
kworker/0:2-303 [000] .... 9.700026: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 201
kworker/0:2-303 [000] .... 9.700032: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 202
kworker/0:2-303 [000] .... 9.700039: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 203
kworker/0:2-303 [000] .... 9.700047: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 204
kworker/0:2-303 [000] .... 9.700053: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 205
kworker/0:2-303 [000] .... 9.700060: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 206
kworker/0:2-303 [000] .... 9.700067: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 207
kworker/0:2-303 [000] .... 9.700074: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 208
kworker/0:2-303 [000] .... 9.700081: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 209
kworker/0:2-303 [000] .... 9.700088: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 210
kworker/0:2-303 [000] .... 9.700095: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 211
kworker/0:2-303 [000] .... 9.700103: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 212
kworker/0:2-303 [000] .... 9.700112: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 213
kworker/0:2-303 [000] .... 9.700121: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 214
kworker/0:2-303 [000] .... 9.700129: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 215
kworker/0:2-303 [000] .... 9.700137: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 216
kworker/0:2-303 [000] .... 9.700145: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 217
kworker/0:2-303 [000] .... 9.700154: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 218
kworker/0:2-303 [000] .... 9.700163: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 219
kworker/0:2-303 [000] .... 9.700172: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 220
kworker/0:2-303 [000] .... 9.700180: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 221
kworker/0:2-303 [000] .... 9.700189: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 222
kworker/0:2-303 [000] .... 9.700200: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 223
kworker/0:2-303 [000] .... 9.700209: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 224
kworker/0:2-303 [000] .... 9.700217: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 225
kworker/0:2-303 [000] .... 9.700225: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 226
kworker/0:2-303 [000] .... 9.700235: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 227
kworker/0:2-303 [000] .... 9.700244: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 228
kworker/0:2-303 [000] .... 9.700253: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 229
kworker/0:2-303 [000] .... 9.700261: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 230
kworker/0:2-303 [000] .... 9.700269: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 231
kworker/0:2-303 [000] .... 9.700278: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 232
kworker/0:2-303 [000] .... 9.700287: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 233
kworker/0:2-303 [000] .... 9.700295: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 234
kworker/0:2-303 [000] .... 9.700304: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 235
kworker/0:2-303 [000] .... 9.700313: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 236
kworker/0:2-303 [000] .... 9.700321: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 237
kworker/0:2-303 [000] .... 9.700330: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 238
kworker/0:2-303 [000] .... 9.700339: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 239
kworker/0:2-303 [000] .... 9.700347: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 240
kworker/0:2-303 [000] .... 9.700355: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 241
kworker/0:2-303 [000] .... 9.700363: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 242
kworker/0:2-303 [000] .... 9.700372: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 243
kworker/0:2-303 [000] .... 9.700380: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 244
kworker/0:2-303 [000] .... 9.700390: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 245
kworker/0:2-303 [000] .... 9.700398: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 246
kworker/0:2-303 [000] .... 9.700406: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 247
kworker/0:2-303 [000] .... 9.700415: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 248
kworker/0:2-303 [000] .... 9.700424: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 249
kworker/0:2-303 [000] .... 9.700432: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 250
kworker/0:2-303 [000] .... 9.700440: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 251
kworker/0:2-303 [000] .... 9.700449: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 252
kworker/0:2-303 [000] .... 9.700458: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 253
kworker/0:2-303 [000] .... 9.700466: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 254
kworker/0:2-303 [000] .... 9.700474: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 255
kworker/0:2-303 [000] .... 9.700483: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 256
kworker/0:2-303 [000] .... 9.700492: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 257
kworker/0:2-303 [000] .... 9.700500: msi_domain_alloc_irqs: dev: 0000:bb:00.2 nvec 1 virq 258
kworker/0:2-303 [000] .... 9.761556: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 259
kworker/0:2-303 [000] .... 9.761570: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 260
kworker/0:2-303 [000] .... 9.761583: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 261
kworker/0:2-303 [000] .... 9.761595: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 262
kworker/0:2-303 [000] .... 9.761608: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 263
kworker/0:2-303 [000] .... 9.761619: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 264
kworker/0:2-303 [000] .... 9.761629: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 265
kworker/0:2-303 [000] .... 9.761639: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 266
kworker/0:2-303 [000] .... 9.761649: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 267
kworker/0:2-303 [000] .... 9.761658: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 268
kworker/0:2-303 [000] .... 9.761669: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 269
kworker/0:2-303 [000] .... 9.761678: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 270
kworker/0:2-303 [000] .... 9.761688: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 271
kworker/0:2-303 [000] .... 9.761697: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 272
kworker/0:2-303 [000] .... 9.761707: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 273
kworker/0:2-303 [000] .... 9.761717: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 274
kworker/0:2-303 [000] .... 9.761726: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 275
kworker/0:2-303 [000] .... 9.761736: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 276
kworker/0:2-303 [000] .... 9.761745: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 277
kworker/0:2-303 [000] .... 9.761756: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 278
kworker/0:2-303 [000] .... 9.761765: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 279
kworker/0:2-303 [000] .... 9.761775: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 280
kworker/0:2-303 [000] .... 9.761784: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 281
kworker/0:2-303 [000] .... 9.761793: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 282
kworker/0:2-303 [000] .... 9.761803: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 283
kworker/0:2-303 [000] .... 9.761812: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 284
kworker/0:2-303 [000] .... 9.761822: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 285
kworker/0:2-303 [000] .... 9.761831: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 286
kworker/0:2-303 [000] .... 9.761840: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 287
kworker/0:2-303 [000] .... 9.761854: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 288
kworker/0:2-303 [000] .... 9.761863: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 289
kworker/0:2-303 [000] .... 9.761872: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 290
kworker/0:2-303 [000] .... 9.761881: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 291
kworker/0:2-303 [000] .... 9.761892: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 292
kworker/0:2-303 [000] .... 9.761903: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 293
kworker/0:2-303 [000] .... 9.761913: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 294
kworker/0:2-303 [000] .... 9.761923: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 295
kworker/0:2-303 [000] .... 9.761932: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 296
kworker/0:2-303 [000] .... 9.761941: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 297
kworker/0:2-303 [000] .... 9.761951: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 298
kworker/0:2-303 [000] .... 9.761961: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 299
kworker/0:2-303 [000] .... 9.761970: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 300
kworker/0:2-303 [000] .... 9.761979: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 301
kworker/0:2-303 [000] .... 9.761988: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 302
kworker/0:2-303 [000] .... 9.761998: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 303
kworker/0:2-303 [000] .... 9.762008: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 304
kworker/0:2-303 [000] .... 9.762017: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 305
kworker/0:2-303 [000] .... 9.762027: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 306
kworker/0:2-303 [000] .... 9.762037: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 307
kworker/0:2-303 [000] .... 9.762046: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 308
kworker/0:2-303 [000] .... 9.762055: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 309
kworker/0:2-303 [000] .... 9.762065: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 310
kworker/0:2-303 [000] .... 9.762074: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 311
kworker/0:2-303 [000] .... 9.762085: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 312
kworker/0:2-303 [000] .... 9.762094: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 313
kworker/0:2-303 [000] .... 9.762104: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 314
kworker/0:2-303 [000] .... 9.762113: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 315
kworker/0:2-303 [000] .... 9.762122: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 316
kworker/0:2-303 [000] .... 9.762133: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 317
kworker/0:2-303 [000] .... 9.762142: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 318
kworker/0:2-303 [000] .... 9.762151: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 319
kworker/0:2-303 [000] .... 9.762162: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 320
kworker/0:2-303 [000] .... 9.762171: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 321
kworker/0:2-303 [000] .... 9.762182: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 322
kworker/0:2-303 [000] .... 9.762191: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 323
kworker/0:2-303 [000] .... 9.762200: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 324
kworker/0:2-303 [000] .... 9.762209: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 325
kworker/0:2-303 [000] .... 9.762220: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 326
kworker/0:2-303 [000] .... 9.762231: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 327
kworker/0:2-303 [000] .... 9.762240: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 328
kworker/0:2-303 [000] .... 9.762249: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 329
kworker/0:2-303 [000] .... 9.762258: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 330
kworker/0:2-303 [000] .... 9.762268: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 331
kworker/0:2-303 [000] .... 9.762278: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 332
kworker/0:2-303 [000] .... 9.762288: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 333
kworker/0:0-3 [000] .... 12.026100: msi_domain_alloc_irqs: dev: 0000:00:04.0 nvec 1 virq 335
kworker/0:0-3 [000] .... 12.143303: msi_domain_alloc_irqs: dev: 0000:00:04.1 nvec 1 virq 337
kworker/0:0-3 [000] .... 12.158686: msi_domain_alloc_irqs: dev: 0000:00:04.2 nvec 1 virq 338
kworker/0:0-3 [000] .... 12.168628: msi_domain_alloc_irqs: dev: 0000:00:04.3 nvec 1 virq 339
kworker/0:0-3 [000] .... 12.176980: msi_domain_alloc_irqs: dev: 0000:00:04.4 nvec 1 virq 340
kworker/0:0-3 [000] .... 12.185280: msi_domain_alloc_irqs: dev: 0000:00:04.5 nvec 1 virq 341
kworker/0:0-3 [000] .... 12.204748: msi_domain_alloc_irqs: dev: 0000:00:04.6 nvec 1 virq 342
kworker/0:0-3 [000] .... 12.214549: msi_domain_alloc_irqs: dev: 0000:00:04.7 nvec 1 virq 343
kworker/0:1-169 [000] .... 12.440642: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 344
kworker/0:1-169 [000] .... 12.440664: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 345
kworker/0:1-169 [000] .... 12.440685: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 346
kworker/0:1-169 [000] .... 12.440704: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 347
kworker/0:1-169 [000] .... 12.440724: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 348
kworker/0:1-169 [000] .... 12.440742: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 349
kworker/0:1-169 [000] .... 12.440761: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 350
kworker/0:1-169 [000] .... 12.440780: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 351
kworker/0:1-169 [000] .... 12.440802: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 352
kworker/0:1-169 [000] .... 12.440822: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 353
kworker/0:1-169 [000] .... 12.440841: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 354
kworker/0:1-169 [000] .... 12.440860: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 355
kworker/0:1-169 [000] .... 12.440878: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 356
kworker/0:1-169 [000] .... 12.440897: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 357
kworker/0:1-169 [000] .... 12.440917: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 358
kworker/0:1-169 [000] .... 12.440936: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 359
kworker/0:1-169 [000] .... 12.440954: msi_domain_alloc_irqs: dev: 0000:b5:00.0 nvec 1 virq 360
kworker/0:1-169 [000] .... 13.251009: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 361
kworker/0:1-169 [000] .... 13.251033: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 362
kworker/0:1-169 [000] .... 13.251055: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 363
kworker/0:1-169 [000] .... 13.251076: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 364
kworker/0:1-169 [000] .... 13.251100: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 365
kworker/0:1-169 [000] .... 13.251120: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 366
kworker/0:1-169 [000] .... 13.251142: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 367
kworker/0:1-169 [000] .... 13.251161: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 368
kworker/0:1-169 [000] .... 13.251182: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 369
kworker/0:1-169 [000] .... 13.251204: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 370
kworker/0:1-169 [000] .... 13.251224: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 371
kworker/0:1-169 [000] .... 13.251263: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 372
kworker/0:1-169 [000] .... 13.251288: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 373
kworker/0:1-169 [000] .... 13.251308: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 374
kworker/0:1-169 [000] .... 13.251330: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 375
kworker/0:1-169 [000] .... 13.251349: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 376
kworker/0:1-169 [000] .... 13.251374: msi_domain_alloc_irqs: dev: 0000:b7:00.0 nvec 1 virq 377
kworker/0:1-169 [000] .... 14.043690: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 378
kworker/0:1-169 [000] .... 14.043721: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 379
kworker/0:1-169 [000] .... 14.043745: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 380
kworker/0:1-169 [000] .... 14.043767: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 381
kworker/0:1-169 [000] .... 14.043793: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 382
kworker/0:1-169 [000] .... 14.043817: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 383
kworker/0:1-169 [000] .... 14.043840: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 384
kworker/0:1-169 [000] .... 14.043862: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 385
kworker/0:1-169 [000] .... 14.043884: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 386
kworker/0:1-169 [000] .... 14.043909: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 387
kworker/0:1-169 [000] .... 14.043932: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 388
kworker/0:1-169 [000] .... 14.043953: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 389
kworker/0:1-169 [000] .... 14.043975: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 390
kworker/0:1-169 [000] .... 14.043997: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 391
kworker/0:1-169 [000] .... 14.044021: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 392
kworker/0:1-169 [000] .... 14.044043: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 393
kworker/0:1-169 [000] .... 14.044065: msi_domain_alloc_irqs: dev: 0000:b9:00.0 nvec 1 virq 394
kworker/0:1-169 [000] .... 5181.743530: msi_domain_alloc_irqs: dev: 0000:00:1f.6 nvec 1 virq 395
ip-2037 [029] .... 5182.263982: msi_domain_alloc_irqs: dev: 0000:00:1f.6 nvec 1 virq 395
ip-2037 [029] .... 5182.367473: msi_domain_alloc_irqs: dev: 0000:00:1f.6 nvec 1 virq 395
bb:00.[0-3] Ethernet controller: Intel Corporation Device 37d0 (rev 03)
-+-[0000:b2]-+-00.0-[b3-bc]----00.0-[b4-bc]--+-00.0-[b5-b6]----00.0
| | +-01.0-[b7-b8]----00.0
| | +-02.0-[b9-ba]----00.0
| | \-03.0-[bb-bc]--+-00.0
| | +-00.1
| | +-00.2
| | \-00.3
and they are using the i40e driver; the vectors should be reserved by:
i40e_probe() ->
i40e_init_interrupt_scheme() ->
i40e_init_msix() ->
i40e_reserve_msix_vectors() ->
pci_enable_msix_range()
# ls /sys/kernel/debug/irq/irqs
0 10 11 13 142 184 217 259 292 31 33
337 339 340 342 344 346 348 350 352 354 356
358 360 362 364 366 368 370 372 374 376 378
380 382 384 386 388 390 392 394 4 6 7 9
1 109 12 14 15 2 24 26 3 32 335
338 34 341 343 345 347 349 351 353 355 357
359 361 363 365 367 369 371 373 375 377 379
381 383 385 387 389 391 393 395 5 67 8
# cat /sys/kernel/debug/irq/domains/*
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-0
size: 24
mapped: 16
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-1
size: 8
mapped: 2
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-2
size: 8
mapped: 0
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-3
size: 8
mapped: 0
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: IO-APIC-4
size: 8
mapped: 5
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: PCI-HT
size: 0
mapped: 0
flags: 0x00000041
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: PCI-MSI-2
size: 0
mapped: 365
flags: 0x00000051
parent: VECTOR
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
name: VECTOR
size: 0
mapped: 388
flags: 0x00000041
Thanks,
Yu
> Thanks,
>
> tglx
>
> 8<-------------------
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -372,6 +372,9 @@ int msi_domain_alloc_irqs(struct irq_dom
> return ret;
> }
>
> + trace_printk("dev: %s nvec %d virq %d\n",
> + dev_name(dev), desc->nvec_used, virq);
> +
> for (i = 0; i < desc->nvec_used; i++)
> irq_set_msi_desc_off(virq, i, desc);
> }
> @@ -419,6 +422,8 @@ void msi_domain_free_irqs(struct irq_dom
> * entry. If that's the case, don't do anything.
> */
> if (desc->irq) {
> + trace_printk("dev: %s nvec %d virq %d\n",
> + dev_name(dev), desc->nvec_used, desc->irq);
> irq_domain_free_irqs(desc->irq, desc->nvec_used);
> desc->irq = 0;
> }
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-07 2:52 ` Yu Chen
@ 2017-09-07 5:54 ` Thomas Gleixner
2017-09-07 8:34 ` Yu Chen
0 siblings, 1 reply; 21+ messages in thread
From: Thomas Gleixner @ 2017-09-07 5:54 UTC (permalink / raw)
To: Yu Chen
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra, Jeff Kirsher
On Thu, 7 Sep 2017, Yu Chen wrote:
> On Wed, Sep 06, 2017 at 10:03:58AM +0200, Thomas Gleixner wrote:
> > Can you please apply the debug patch below, boot the machine and right
> > after login provide the output of
> >
> > # cat /sys/kernel/debug/tracing/trace
> >
> kworker/0:2-303 [000] .... 9.135467: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 34
> kworker/0:2-303 [000] .... 9.135476: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 35
> kworker/0:2-303 [000] .... 9.135484: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 36
<SNIP>
> kworker/0:2-303 [000] .... 9.762268: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 331
> kworker/0:2-303 [000] .... 9.762278: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 332
> kworker/0:2-303 [000] .... 9.762288: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 333
That's 300 vectors.
> bb:00.[0-3] Ethernet controller: Intel Corporation Device 37d0 (rev 03)
>
> -+-[0000:b2]-+-00.0-[b3-bc]----00.0-[b4-bc]--+-00.0-[b5-b6]----00.0
> | | +-01.0-[b7-b8]----00.0
> | | +-02.0-[b9-ba]----00.0
> | | \-03.0-[bb-bc]--+-00.0
> | | +-00.1
> | | +-00.2
> | | \-00.3
>
> and they are using i40e driver, the vectors should be reserved by:
> i40e_probe() ->
> i40e_init_interrupt_scheme() ->
> i40e_init_msix() ->
> i40e_reserve_msix_vectors() ->
> pci_enable_msix_range()
>
> # ls /sys/kernel/debug/irq/irqs
> 0 10 11 13 142 184 217 259 292 31 33
> 337 339 340 342 344 346 348 350 352 354 356
> 358 360 362 364 366 368 370 372 374 376 378
> 380 382 384 386 388 390 392 394 4 6 7 9
> 1 109 12 14 15 2 24 26 3 32 335
> 338 34 341 343 345 347 349 351 353 355 357
> 359 361 363 365 367 369 371 373 375 377 379
> 381 383 385 387 389 391 393 395 5 67 8
Out of these 300 interrupts exactly 8 randomly selected ones are actively
used. And the other 292 interrupts are just there because it might need
them in the future when the 32 CPU machine gets magically upgraded to 4096
cores at runtime?
Can the i40e people @intel please fix this waste of resources and sanitize
their interrupt allocation scheme?
Please switch it over to managed interrupts so the affinity spreading
happens in a sane way and the interrupts are properly managed on CPU
hotplug.
Thanks,
tglx
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-07 5:54 ` Thomas Gleixner
@ 2017-09-07 8:34 ` Yu Chen
2017-09-07 9:45 ` Thomas Gleixner
0 siblings, 1 reply; 21+ messages in thread
From: Yu Chen @ 2017-09-07 8:34 UTC (permalink / raw)
To: Thomas Gleixner
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra, Jeff Kirsher
On Thu, Sep 07, 2017 at 07:54:09AM +0200, Thomas Gleixner wrote:
> On Thu, 7 Sep 2017, Yu Chen wrote:
> > On Wed, Sep 06, 2017 at 10:03:58AM +0200, Thomas Gleixner wrote:
> > > Can you please apply the debug patch below, boot the machine and right
> > > after login provide the output of
> > >
> > > # cat /sys/kernel/debug/tracing/trace
> > >
> > kworker/0:2-303 [000] .... 9.135467: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 34
> > kworker/0:2-303 [000] .... 9.135476: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 35
> > kworker/0:2-303 [000] .... 9.135484: msi_domain_alloc_irqs: dev: 0000:bb:00.0 nvec 1 virq 36
>
> <SNIP>
>
> > kworker/0:2-303 [000] .... 9.762268: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 331
> > kworker/0:2-303 [000] .... 9.762278: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 332
> > kworker/0:2-303 [000] .... 9.762288: msi_domain_alloc_irqs: dev: 0000:bb:00.3 nvec 1 virq 333
>
> That's 300 vectors.
>
> > bb:00.[0-3] Ethernet controller: Intel Corporation Device 37d0 (rev 03)
> >
> > -+-[0000:b2]-+-00.0-[b3-bc]----00.0-[b4-bc]--+-00.0-[b5-b6]----00.0
> > | | +-01.0-[b7-b8]----00.0
> > | | +-02.0-[b9-ba]----00.0
> > | | \-03.0-[bb-bc]--+-00.0
> > | | +-00.1
> > | | +-00.2
> > | | \-00.3
> >
> > and they are using i40e driver, the vectors should be reserved by:
> > i40e_probe() ->
> > i40e_init_interrupt_scheme() ->
> > i40e_init_msix() ->
> > i40e_reserve_msix_vectors() ->
> > pci_enable_msix_range()
> >
> > # ls /sys/kernel/debug/irq/irqs
> > 0 10 11 13 142 184 217 259 292 31 33
> > 337 339 340 342 344 346 348 350 352 354 356
> > 358 360 362 364 366 368 370 372 374 376 378
> > 380 382 384 386 388 390 392 394 4 6 7 9
> > 1 109 12 14 15 2 24 26 3 32 335
> > 338 34 341 343 345 347 349 351 353 355 357
> > 359 361 363 365 367 369 371 373 375 377 379
> > 381 383 385 387 389 391 393 395 5 67 8
>
> Out of these 300 interrupts exactly 8 randomly selected ones are actively
> used. And the other 292 interrupts are just there because it might need
> them in the future when the 32 CPU machine gets magically upgraded to 4096
> cores at runtime?
>
Hmm, the 292 vectors remain disabled because the network devices have
not been enabled (say, ifconfig up has not been invoked), so request_irq()
does not get invoked for these vectors? I have the impression that once
I borrowed some fiber cables to connect the platform, the number of active
IRQs from i40e rose a lot, although I don't have these expensive cables
now...
> Can the i40e people @intel please fix this waste of resources and sanitize
> their interrupt allocation scheme?
>
> Please switch it over to managed interrupts so the affinity spreading
> happens in a sane way and the interrupts are properly managed on CPU
> hotplug.
Ok, I think currently in i40e driver the reservation of vectors
leverages pci_enable_msix_range() and did not provide the affinity
hint to the low-level IRQ system, thus managed interrupts are not enabled
there (although later in the i40e driver we use irq_set_affinity_hint() to
spread the IRQs)
Thanks,
Yu
>
> Thanks,
>
> tglx
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-07 8:34 ` Yu Chen
@ 2017-09-07 9:45 ` Thomas Gleixner
0 siblings, 0 replies; 21+ messages in thread
From: Thomas Gleixner @ 2017-09-07 9:45 UTC (permalink / raw)
To: Yu Chen
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra, Jeff Kirsher
On Thu, 7 Sep 2017, Yu Chen wrote:
> On Thu, Sep 07, 2017 at 07:54:09AM +0200, Thomas Gleixner wrote:
> > Please switch it over to managed interrupts so the affinity spreading
> > happens in a sane way and the interrupts are properly managed on CPU
> > hotplug.
> Ok, I think currently in i40e driver the reservation of vectors
> leverages pci_enable_msix_range() and did not provide the affinity
> hint to the low-level IRQ system, thus managed interrupts are not enabled
> there (although later in the i40e driver we use irq_set_affinity_hint() to
> spread the IRQs)
The affinity hint has nothing to do with that. It's a hint which tells user
space irqbalanced what the desired placement of the interrupt should
be. That was never used for spreading the affinity automatically in the
kernel and will never be used to do so. It was a design failure from the
very beginning and should be eliminated ASAP.
The general problem here is the way how the whole MSI(X) machinery works in
the kernel.
pci_enable_msix()
allocate_interrupts()
allocate_irqdescs()
allocate_resources()
allocate_DMAR_entries()
allocate_vectors()
initialize_MSI_entries()
The reason for this is historical. Drivers expect, that request_irq()
works, when they allocated the required resources upfront.
Of course this could be changed, but there are issues with that:
1) The driver must ensure that it does not enable any of the internal
interrupt delivery mechanisms in the device before request_irq() has
succeeded.
That needs auditing drivers all over the place or we just ignore that
and leave everyone puzzled why things suddenly stop to work.
2) Reservation accounting
When no vectors are allocated, we still need to make reservations so
we can tell a driver that the vector space is exhausted when it
invokes pci_enable_msix(). But how do we size the reservation space?
Based on nr_possible_cpus(), nr_online_cpus() or some other
heuristics?
Sure, we can just ignore that and resort to overcommitment and fail
request_irq() when resources are not available, which brings us back
to #1
But resorting to overcommitment does not make the cpu hotplug problem
magically go away. If queues and interrupts are used, then the non managed
variants are going to break affinities and move stuff to the still online
CPUs, which is going to fail.
Managed irqs just work because the driver stops the queue and the interrupt
(which can even stay requested) is shut down and 'kept' on the outgoing
CPU. If the CPU comes back then the vector is reestablished and the
interrupt started up on the fly. Stuff just works.....
Thanks,
tglx
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-03 18:17 ` Thomas Gleixner
2017-09-03 19:18 ` RFD: x86: Sanitize the vector allocator Thomas Gleixner
2017-09-05 22:57 ` [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU Thomas Gleixner
@ 2017-09-06 4:13 ` Yu Chen
2017-09-06 6:15 ` Christoph Hellwig
2 siblings, 1 reply; 21+ messages in thread
From: Yu Chen @ 2017-09-06 4:13 UTC (permalink / raw)
To: Thomas Gleixner
Cc: x86, Ingo Molnar, H. Peter Anvin, Rui Zhang, LKML,
Rafael J. Wysocki, Len Brown, Dan Williams, Christoph Hellwig,
Peter Zijlstra
Thanks for looking at this,
(please bear with my slow response, as I need to check the related code
before replying.)
On Sun, Sep 03, 2017 at 08:17:04PM +0200, Thomas Gleixner wrote:
> On Fri, 1 Sep 2017, Chen Yu wrote:
>
> > This is the major logic to spread the vectors on different CPUs.
> > The main idea is to choose the 'idlest' CPU which has assigned
> > the least number of vectors as the candidate/hint for the vector
> > allocation domain, in the hope that the vector allocation domain
> > could leverage this hint to generate corresponding cpumask.
> >
> > One of the requirements to do this vector spreading work comes from the
> > following hibernation problem found on a 16 cores server:
> >
> > CPU 31 disable failed: CPU has 62 vectors assigned and there
> > are only 0 available.
> >
> > 2 issues were found after investigation:
> >
> > 1. The network driver has declared many vector resources via
> > pci_enable_msix_range(), say, this driver might likely want
> > to reserve 6 per logical CPU, then there would be 192 of them.
> > 2. Besides, most of the vectors reserved by this driver are assigned
> > on CPU0 due to the current code strategy, so there would be
> > insufficient slots left on CPU0 to receive any migrated IRQs
> > during CPU offline.
> >
> > In theory after the commit c5cb83bb337c ("genirq/cpuhotplug: Handle
> > managed IRQs on CPU hotplug") the issue 1 has been solved with the
> > managed interrupt mechanism for multi queue devices, which no longer
> > migrate these interrupts. It simply shuts them down and restarts
> > them when the CPU comes back online.
> >
> > However, according to the implementation of this network driver,
> > the commit mentioned above does not fully fix the issue 1.
> > Here's the framework of the network driver:
> >
> > step 1. Reserved enough irq vectors and corresponding IRQs.
> > step 2. If the network is activated, invoke request_irq() to
> > register the handler.
> > step 3. Invoke set_affinity() to spread the IRQs onto different
> > CPUs, thus to spread the vectors too.
> >
> > The problem is, if the network cable is not connected, step 2
> > and 3 will not get invoked, thus the IRQ vectors will not spread
> > on different CPUs and will still be allocated on CPU0. As a result
> > the CPUs will still get the offline failure.
>
> So the proper solution for this network driver is to switch to managed
> interrupts instead of trying to work around it in some other place. It's
> using the wrong mechanism - end of story.
>
> Why are you insisting on implementing a band aid for this particular driver
> instead of fixing the underlying problem of that driver which requires to
> have 32 queues and interrupts open even if only a single CPU is online?
>
I agree, the driver could be rewritten, but it might take some time, so
meanwhile I'm looking at also other possible optimization.
> > Previously there were some discussion in the thread [1] about the
> > vector spread, and here are the suggestion from Thomas:
> >
> > Q1:
> > Rewrite that allocation code from scratch, use per cpu bitmaps,
> > so we can do fast search over masks and keep track of
> > the vectors which are used/free per cpu.
> > A1:
> > per cpu bitmap was once considered but this might not be
> > as fast as the solution proposed here. That is, if we introduce the
> > per cpu bitmap, we need to count the number of vectors assigned on
> > each CPUs by cpumask_weight() and sort them in order to get the
> > CPUs who have the least number of vectors assigned. By contrast,
> > if we introduce the bitmaps for each vector number, we can get the
> > 'idle' CPU at the cost of nearly O(1).
>
> The weight accounting with the cpumasks is an orthogonal issue to the per
> cpu vector bitmaps.
>
> Once you selected a CPU the current code still loops in circles instead of
> just searching a bitmap.
Yes, I agree.
>
> And if the required affinity mask needs more than one CPU then this is
> still not giving you the right answer. You assign perhaps a vector to the
> least busy CPU, but the other CPUs which happen to be in the mask are going
> to be randomly selected.
Yes, for a multi-CPU cpumask, it might depend on how the vector domain behaves.
>
> I'm not against making the vector allocation better, but certainly not by
> adding yet more duct tape to something which is well known as one of the
> dumbest search algorithms on the planet with a worst case of
>
> nvectors * nr_online_cpus * nr_online_cpus_in_affinity_mask
>
> and your mechanism nests another loop of potentially NR_VECTORS into
> that. Which is pointless as the actual assignable vector space is
> smaller. While x86 has 256 vectors the assignable vector space is way
> smaller and non-contiguous:
>
> Vectors 0- 31 are reserved for CPU traps and exceptions
> Vectors 239-255 are reserved by the kernel for IPIs, local timer etc.
> Vector 32 can be reserved by the kernel for the reboot interrupt
> Vector 96 can be reserved by the kernel for INT80
>
> So the actually usable array size is between 205 and 207.
>
> Aside of that the code is incorrect vs. the percpu accounting. Remove the
> limit in the update function and see what happens. While the limit is
> necessary in general, it should at least warn and yell whenever the
> accounting goes out of bounds. And if that happens, then the whole thing is
> completely useless simply because the numbers are just wrong.
>
Ok.
> > One scenario of this patch can be illustrated below:
> >
> > a) After initialization, vector_mask[0] is set with {CPU0...CPU31},
> > other vector_mask[i] remain zero, which means that, CPU0...CPU31
> > have zero vectors assigned.
> > b) During driver probe, CPU0 is chosen to be assigned with one vector.
> > Thus vector_mask[0] becomes {CPU1...CPU31}, and vector_mask[1] becomes
> > {CPU0}.
> > c) The next time the system tries to assign another vector, it searches from
> > vector[0], because it is non-empty, CPU1 is chosen to place
> > the vector. Thus vector_mask[0] becomes {CPU2...CPU31}, and vector_mask[1]
> > becomes {CPU0, CPU1}.
> > d) and so on, the vectors assignment will spread on different CPUs.
>
> So this works for your particular network driver scenario, which is the
> wrong example as this driver just needs to be reworked to not expose that
> issue.
>
> The reality of affinity requirements is not that simple. It's very much
> device dependent and making it mandatory can have negative side effects.
>
Yes, this patch just 'cure' a special case.
> > Q2:
> > "respect the allocation domains"
> > A2:
> > This solution provides a CPU as the param for the vector allocation domains,
>
> Solution? This is a crude hack which "solves" one particular problem in the
> wrong way.
>
> > in the hope that the domains can leverage this hint to generate the cpumask
>
> Hope is never a good engineering principle.
>
> > with less vectors assigned. But yes, the CPU we select upfront gets fed into
> > apic->vector_allocation_domain() might just say no because the CPU
> > does not belong to it, but anyway this should somehow spread the vectors,
> > although I can not find out a graceful solution to this.
> > BTW, the vector allocation domain is default_vector_allocation_domain()
> > in our case, thus it works.
>
> Now try that with the logical delivery modes which allow multi CPU
> assignements and the cluster mode thereof.
Yes, I find it a little hard to reason about the vector domain, as it looks
independent of the current vector allocation process.
>
> > Q3:
> > "set a preference for the node on which the interrupt was allocated"
> > A3:
> > This can be achieved by further optimization in the following way:
> > a) Also record the number of vectors used per node.
> > b) Each time invoking __assign_irq_vector, adjust the input mask
> > by cpumask_and(mask, mask, node_mask_with_least_vectors())
>
> I'm not looking forward to that heuristics and the next loop inside the
> loops of loops.
>
> So no, this is not going to happen. The current implementation sucks and
> needs a rewrite.
>
> I already started to look into that, but started at the low level end of
> it. See the IDT cleanup series in tip:x86/apic. There is much more vooddo
> in the vector management code to clean up before we can actually talk about
> making the assignment algorithm smarter.
Ok, I'm pulling from tip:x86/apic.
>
> I'll send out a separate mail raising a few points to discuss before we are
> actually starting to look into magic spreading functionality.
>
For the vector calculation question you asked I'll reply to that thread.
Thanks,
Yu
> Thanks,
>
> tglx
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-06 4:13 ` Yu Chen
@ 2017-09-06 6:15 ` Christoph Hellwig
2017-09-06 17:46 ` Dan Williams
0 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2017-09-06 6:15 UTC (permalink / raw)
To: Yu Chen
Cc: Thomas Gleixner, x86, Ingo Molnar, H. Peter Anvin, Rui Zhang,
LKML, Rafael J. Wysocki, Len Brown, Dan Williams,
Christoph Hellwig, Peter Zijlstra
On Wed, Sep 06, 2017 at 12:13:38PM +0800, Yu Chen wrote:
> I agree, the driver could be rewritten, but it might take some time, so
> meanwhile I'm looking at also other possible optimization.
Which driver are we talking about anyway? Let's start looking at it
and fix the issue there.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-06 6:15 ` Christoph Hellwig
@ 2017-09-06 17:46 ` Dan Williams
2017-09-07 2:57 ` Yu Chen
2017-09-07 5:59 ` Thomas Gleixner
0 siblings, 2 replies; 21+ messages in thread
From: Dan Williams @ 2017-09-06 17:46 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Yu Chen, Thomas Gleixner, X86 ML, Ingo Molnar, H. Peter Anvin,
Rui Zhang, LKML, Rafael J. Wysocki, Len Brown, Peter Zijlstra
On Tue, Sep 5, 2017 at 11:15 PM, Christoph Hellwig <hch@lst.de> wrote:
> On Wed, Sep 06, 2017 at 12:13:38PM +0800, Yu Chen wrote:
>> I agree, the driver could be rewritten, but it might take some time, so
>> meanwhile I'm looking at also other possible optimization.
>
> Which driver are we talking about anyway? Let's start looking at it
> and fix the issue there.
As far as I understand, it's already fixed there:
commit 7c9ae7f053e9e896c24fd23595ba369a5fe322e1
Author: Carolyn Wyborny <carolyn.wyborny@intel.com>
Date: Tue Jun 20 15:16:53 2017 -0700
i40e: Fix for trace found with S4 state
This patch fixes a problem found in systems when entering
S4 state. This patch fixes the problem by ensuring that
the misc vector's IRQ is disabled as well. Without this
patch a stack trace can be seen upon entering S4 state.
However this seems like something that should be handled generically
in the irq-core especially since commit c5cb83bb337c
"genirq/cpuhotplug: Handle managed IRQs on CPU hotplug" was headed in
that direction. It's otherwise non-obvious when a driver needs to
release and re-acquire interrupts or be reworked to use managed
interrupts.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-06 17:46 ` Dan Williams
@ 2017-09-07 2:57 ` Yu Chen
2017-09-07 5:59 ` Thomas Gleixner
1 sibling, 0 replies; 21+ messages in thread
From: Yu Chen @ 2017-09-07 2:57 UTC (permalink / raw)
To: Dan Williams
Cc: Christoph Hellwig, Thomas Gleixner, X86 ML, Ingo Molnar,
H. Peter Anvin, Rui Zhang, LKML, Rafael J. Wysocki, Len Brown,
Peter Zijlstra
On Wed, Sep 06, 2017 at 10:46:17AM -0700, Dan Williams wrote:
> On Tue, Sep 5, 2017 at 11:15 PM, Christoph Hellwig <hch@lst.de> wrote:
> > On Wed, Sep 06, 2017 at 12:13:38PM +0800, Yu Chen wrote:
> >> I agree, the driver could be rewritten, but it might take some time, so
> >> meanwhile I'm looking at also other possible optimization.
> >
> > Which driver are we talking about anyway? Let's start looking at it
> > and fix the issue there.
>
> As far as I understand, it's already fixed there:
>
> commit 7c9ae7f053e9e896c24fd23595ba369a5fe322e1
> Author: Carolyn Wyborny <carolyn.wyborny@intel.com>
> Date: Tue Jun 20 15:16:53 2017 -0700
>
> i40e: Fix for trace found with S4 state
>
> This patch fixes a problem found in systems when entering
> S4 state. This patch fixes the problem by ensuring that
> the misc vector's IRQ is disabled as well. Without this
> patch a stack trace can be seen upon entering S4 state.
>
> However this seems like something that should be handled generically
> in the irq-core especially since commit c5cb83bb337c
> "genirq/cpuhotplug: Handle managed IRQs on CPU hotplug" was headed in
> that direction. It's otherwise non-obvious when a driver needs to
> release and re-acquire interrupts or be reworked to use managed
> interrupts.
Yes, thanks for the explanation! I did not notice this patch had
been merged already.
I'm using the normal CPU hotplug to reproduce the issue:
#!/bin/bash
n=1
while [ $n -le 31 ]
do
echo 0 > /sys/devices/system/cpu/cpu${n}/online
n=$(( n+1 ))
done
Thanks,
Yu
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-06 17:46 ` Dan Williams
2017-09-07 2:57 ` Yu Chen
@ 2017-09-07 5:59 ` Thomas Gleixner
2017-09-07 6:23 ` Dan Williams
1 sibling, 1 reply; 21+ messages in thread
From: Thomas Gleixner @ 2017-09-07 5:59 UTC (permalink / raw)
To: Dan Williams
Cc: Christoph Hellwig, Yu Chen, X86 ML, Ingo Molnar, H. Peter Anvin,
Rui Zhang, LKML, Rafael J. Wysocki, Len Brown, Peter Zijlstra,
Jeff Kirsher
On Wed, 6 Sep 2017, Dan Williams wrote:
> On Tue, Sep 5, 2017 at 11:15 PM, Christoph Hellwig <hch@lst.de> wrote:
> > On Wed, Sep 06, 2017 at 12:13:38PM +0800, Yu Chen wrote:
> >> I agree, the driver could be rewritten, but it might take some time, so
> >> meanwhile I'm looking at also other possible optimization.
> >
> > Which driver are we talking about anyway? Let's start looking at it
> > and fix the issue there.
>
> As far as I understand, it's already fixed there:
>
> commit 7c9ae7f053e9e896c24fd23595ba369a5fe322e1
-ENOSUCHCOMMIT
> Author: Carolyn Wyborny <carolyn.wyborny@intel.com>
> Date: Tue Jun 20 15:16:53 2017 -0700
>
> i40e: Fix for trace found with S4 state
>
> This patch fixes a problem found in systems when entering
> S4 state. This patch fixes the problem by ensuring that
> the misc vector's IRQ is disabled as well. Without this
> patch a stack trace can be seen upon entering S4 state.
>
> However this seems like something that should be handled generically
> in the irq-core especially since commit c5cb83bb337c
> "genirq/cpuhotplug: Handle managed IRQs on CPU hotplug" was headed in
> that direction. It's otherwise non-obvious when a driver needs to
> release and re-acquire interrupts or be reworked to use managed
> interrupts.
There are two problems here:
1) The driver allocates 300 interrupts and uses exactly 8 randomly chosen
ones.
2) It's not using the managed affinity mechanics, so the interrupts cannot
be sanely handled by the kernel, neither affinity wise nor at hotplug
time.
Thanks,
tglx
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-07 5:59 ` Thomas Gleixner
@ 2017-09-07 6:23 ` Dan Williams
2017-09-07 6:59 ` Thomas Gleixner
0 siblings, 1 reply; 21+ messages in thread
From: Dan Williams @ 2017-09-07 6:23 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Christoph Hellwig, Yu Chen, X86 ML, Ingo Molnar, H. Peter Anvin,
Rui Zhang, LKML, Rafael J. Wysocki, Len Brown, Peter Zijlstra,
Jeff Kirsher
On Wed, Sep 6, 2017 at 10:59 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Wed, 6 Sep 2017, Dan Williams wrote:
>
>> On Tue, Sep 5, 2017 at 11:15 PM, Christoph Hellwig <hch@lst.de> wrote:
>> > On Wed, Sep 06, 2017 at 12:13:38PM +0800, Yu Chen wrote:
>> >> I agree, the driver could be rewritten, but it might take some time, so
>> >> meanwhile I'm looking at also other possible optimization.
>> >
>> > Which driver are we talking about anyway? Let's start looking at it
>> > and fix the issue there.
>>
>> As far as I understand, it's already fixed there:
>>
>> commit 7c9ae7f053e9e896c24fd23595ba369a5fe322e1
>
> -ENOSUCHCOMMIT
Sorry, that's still pending in -next.
>> Author: Carolyn Wyborny <carolyn.wyborny@intel.com>
>> Date: Tue Jun 20 15:16:53 2017 -0700
>>
>> i40e: Fix for trace found with S4 state
>>
>> This patch fixes a problem found in systems when entering
>> S4 state. This patch fixes the problem by ensuring that
>> the misc vector's IRQ is disabled as well. Without this
>> patch a stack trace can be seen upon entering S4 state.
>>
>> However this seems like something that should be handled generically
>> in the irq-core especially since commit c5cb83bb337c
>> "genirq/cpuhotplug: Handle managed IRQs on CPU hotplug" was headed in
>> that direction. It's otherwise non-obvious when a driver needs to
>> release and re-acquire interrupts or be reworked to use managed
>> interrupts.
>
> There are two problems here:
>
> 1) The driver allocates 300 interrupts and uses exactly 8 randomly chosen
> ones.
>
> 2) It's not using the managed affinity mechanics, so the interrupts cannot
> be sanely handled by the kernel, neither affinity wise nor at hotplug
> time.
Ok, this driver is an obvious candidate, but is there a general
guideline of when a driver must use affinity management? Should we be
emitting a message when a driver exceeds a certain threshold of
unmanaged interrupts to flag this in the future?
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 4/4][RFC v2] x86/apic: Spread the vectors by choosing the idlest CPU
2017-09-07 6:23 ` Dan Williams
@ 2017-09-07 6:59 ` Thomas Gleixner
0 siblings, 0 replies; 21+ messages in thread
From: Thomas Gleixner @ 2017-09-07 6:59 UTC (permalink / raw)
To: Dan Williams
Cc: Christoph Hellwig, Yu Chen, X86 ML, Ingo Molnar, H. Peter Anvin,
Rui Zhang, LKML, Rafael J. Wysocki, Len Brown, Peter Zijlstra,
Jeff Kirsher
On Wed, 6 Sep 2017, Dan Williams wrote:
> On Wed, Sep 6, 2017 at 10:59 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >> commit 7c9ae7f053e9e896c24fd23595ba369a5fe322e1
> >
> > -ENOSUCHCOMMIT
>
> Sorry, that's still pending in -next.
Ok.
> >> Author: Carolyn Wyborny <carolyn.wyborny@intel.com>
> >> Date: Tue Jun 20 15:16:53 2017 -0700
> >>
> >> i40e: Fix for trace found with S4 state
> >>
> >> This patch fixes a problem found in systems when entering
> >> S4 state. This patch fixes the problem by ensuring that
> >> the misc vector's IRQ is disabled as well. Without this
> >> patch a stack trace can be seen upon entering S4 state.
Btw. This changelog is pretty useless.....
> >> However this seems like something that should be handled generically
> >> in the irq-core especially since commit c5cb83bb337c
> >> "genirq/cpuhotplug: Handle managed IRQs on CPU hotplug" was headed in
> >> that direction. It's otherwise non-obvious when a driver needs to
> >> release and re-acquire interrupts or be reworked to use managed
> >> interrupts.
> >
> > There are two problems here:
> >
> > 1) The driver allocates 300 interrupts and uses exactly 8 randomly chosen
> > ones.
> >
> > 2) It's not using the managed affinity mechanics, so the interrupts cannot
> > be sanely handled by the kernel, neither affinity wise nor at hotplug
> > time.
>
> Ok, this driver is an obvious candidate, but is there a general
> guideline of when a driver must use affinity management? Should we be
> emitting a message when a driver exceeds a certain threshold of
> unmanaged interrupts to flag this in the future?
I guess everything which uses multi queues and therefor allocates 8+
vectors is something which falls into that category.
Aside of that, drivers should be sane in terms of allocations. Allocating
metric tons of interrupts for nothing is not really a sign of a proper
thought out resource management.
Thanks,
tglx
^ permalink raw reply [flat|nested] 21+ messages in thread