* [PATCH 0/3] KVM: arm/arm64: Optimize lpi translation cache performance
@ 2019-07-24  9:04 Xiangyou Xie
From: Xiangyou Xie @ 2019-07-24  9:04 UTC
  To: marc.zyngier; +Cc: kvmarm, linux-arm-kernel, kvm

Hello,

These patches are based on Marc Zyngier's branch:
https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/its-translation-cache

The series is as follows:
(1) Introduce multiple LPI translation caches to reduce lock contention
(2) Remove the unnecessary vgic_its_invalidate_cache() on the MOVI path
(3) Introduce vgic_cpu->pending and vgic_cpu->lowest_priority, reducing
    vgic_cpu.ap_list_lock contention

Best Regards,

Xiangyou

Xiangyou Xie (3):
  KVM: arm/arm64: vgic-its: Introduce multiple LPI translation caches
  KVM: arm/arm64: vgic-its: Do not execute invalidate MSI-LPI
    translation cache on movi command
  KVM: arm/arm64: vgic: introduce vgic_cpu pending status and
    lowest_priority

 include/kvm/arm_vgic.h        |  18 +++-
 virt/kvm/arm/vgic/vgic-init.c |   6 +-
 virt/kvm/arm/vgic/vgic-its.c  | 201 ++++++++++++++++++++++++------------------
 virt/kvm/arm/vgic/vgic.c      |  40 +++++----
 4 files changed, 160 insertions(+), 105 deletions(-)

-- 
1.8.3.1




* [PATCH 1/3] KVM: arm/arm64: vgic-its: Introduce multiple LPI translation caches
From: Xiangyou Xie @ 2019-07-24  9:04 UTC
  To: marc.zyngier; +Cc: kvmarm, linux-arm-kernel, kvm

Because dist->lpi_list_lock is a per-VM lock, it becomes a performance
bottleneck when a virtual machine is configured with multiple virtual
NIC devices that receive network packets at the same time.

This patch increases the number of LPI translation caches to eight and
hashes the ID of the CPU executing irqfd_wakeup to choose which
translation cache to use.

Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
---
 include/kvm/arm_vgic.h        |  13 ++-
 virt/kvm/arm/vgic/vgic-init.c |   6 +-
 virt/kvm/arm/vgic/vgic-its.c  | 199 +++++++++++++++++++++++++-----------------
 3 files changed, 133 insertions(+), 85 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index eef69b5..ce372a0 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -33,6 +33,9 @@
 #define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \
 			 (irq) <= VGIC_MAX_SPI)
 
+/* The number of LPI translation cache lists */
+#define LPI_TRANS_CACHES_NUM 8
+
 enum vgic_type {
 	VGIC_V2,		/* Good ol' GICv2 */
 	VGIC_V3,		/* New fancy GICv3 */
@@ -162,6 +165,12 @@ struct vgic_io_device {
 	struct kvm_io_device dev;
 };
 
+struct its_translation_cache {
+	/* LPI translation cache */
+	struct list_head        lpi_cache;
+	raw_spinlock_t          lpi_cache_lock;
+};
+
 struct vgic_its {
 	/* The base address of the ITS control register frame */
 	gpa_t			vgic_its_base;
@@ -249,8 +258,8 @@ struct vgic_dist {
 	struct list_head	lpi_list_head;
 	int			lpi_list_count;
 
-	/* LPI translation cache */
-	struct list_head	lpi_translation_cache;
+	/* LPI translation cache array */
+	struct its_translation_cache lpi_translation_cache[LPI_TRANS_CACHES_NUM];
 	u32			lpi_pcpu_cache_size;
 
 	/* used by vgic-debug */
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 80127ca..6060dbe 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -51,9 +51,13 @@
 void kvm_vgic_early_init(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
+	int i;
 
 	INIT_LIST_HEAD(&dist->lpi_list_head);
-	INIT_LIST_HEAD(&dist->lpi_translation_cache);
+	for (i = 0; i < LPI_TRANS_CACHES_NUM; i++) {
+		INIT_LIST_HEAD(&dist->lpi_translation_cache[i].lpi_cache);
+		raw_spin_lock_init(&dist->lpi_translation_cache[i].lpi_cache_lock);
+	}
 	raw_spin_lock_init(&dist->lpi_list_lock);
 }
 
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 5f2ad74..792d90b 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -535,13 +535,21 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
 	return 0;
 }
 
+/* Default is 16 cached LPIs per vcpu */
+#define LPI_DEFAULT_PCPU_CACHE_SIZE	16
+
 static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
 					       phys_addr_t db,
-					       u32 devid, u32 eventid)
+					       u32 devid, u32 eventid,
+					       int cacheid)
 {
 	struct vgic_translation_cache_entry *cte;
+	struct vgic_irq *irq = NULL;
+	int pos = 0;
 
-	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+	list_for_each_entry(cte,
+			    &dist->lpi_translation_cache[cacheid].lpi_cache,
+			    entry) {
 		/*
 		 * If we hit a NULL entry, there is nothing after this
 		 * point.
@@ -549,21 +557,26 @@ static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
 		if (!cte->irq)
 			break;
 
-		if (cte->db != db || cte->devid != devid ||
-		    cte->eventid != eventid)
-			continue;
-
-		/*
-		 * Move this entry to the head, as it is the most
-		 * recently used.
-		 */
-		if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
-			list_move(&cte->entry, &dist->lpi_translation_cache);
-
-		return cte->irq;
+		pos++;
+
+		if (cte->devid == devid &&
+		    cte->eventid == eventid &&
+		    cte->db == db) {
+			/*
+			 * Move this entry to the head of the LRU list only if
+			 * it sits beyond twice LPI_DEFAULT_PCPU_CACHE_SIZE,
+			 * since recently used entries stay near the head.
+			 */
+			if (pos > LPI_DEFAULT_PCPU_CACHE_SIZE * 2)
+				list_move(&cte->entry,
+				    &dist->lpi_translation_cache[cacheid].lpi_cache);
+
+			irq = cte->irq;
+			break;
+		}
 	}
 
-	return NULL;
+	return irq;
 }
 
 static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
@@ -571,11 +584,15 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_irq *irq;
-	unsigned long flags;
+	int cpu;
+	int cacheid;
 
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	irq = __vgic_its_check_cache(dist, db, devid, eventid);
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+	cpu = smp_processor_id();
+	cacheid = cpu % LPI_TRANS_CACHES_NUM;
+
+	raw_spin_lock(&dist->lpi_translation_cache[cacheid].lpi_cache_lock);
+	irq = __vgic_its_check_cache(dist, db, devid, eventid, cacheid);
+	raw_spin_unlock(&dist->lpi_translation_cache[cacheid].lpi_cache_lock);
 
 	return irq;
 }
@@ -588,49 +605,55 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 	struct vgic_translation_cache_entry *cte;
 	unsigned long flags;
 	phys_addr_t db;
+	int cacheid;
 
 	/* Do not cache a directly injected interrupt */
 	if (irq->hw)
 		return;
 
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	if (unlikely(list_empty(&dist->lpi_translation_cache)))
-		goto out;
-
-	/*
-	 * We could have raced with another CPU caching the same
-	 * translation behind our back, so let's check it is not in
-	 * already
-	 */
-	db = its->vgic_its_base + GITS_TRANSLATER;
-	if (__vgic_its_check_cache(dist, db, devid, eventid))
-		goto out;
-
-	/* Always reuse the last entry (LRU policy) */
-	cte = list_last_entry(&dist->lpi_translation_cache,
-			      typeof(*cte), entry);
+	for (cacheid = 0; cacheid < LPI_TRANS_CACHES_NUM; cacheid++) {
+		raw_spin_lock_irqsave(&dist->lpi_translation_cache[cacheid].lpi_cache_lock, flags);
+		if (unlikely(list_empty(&dist->lpi_translation_cache[cacheid].lpi_cache))) {
+			raw_spin_unlock_irqrestore(&dist->lpi_translation_cache[cacheid].lpi_cache_lock, flags);
+			break;
+		}
 
-	/*
-	 * Caching the translation implies having an extra reference
-	 * to the interrupt, so drop the potential reference on what
-	 * was in the cache, and increment it on the new interrupt.
-	 */
-	if (cte->irq)
-		__vgic_put_lpi_locked(kvm, cte->irq);
+		/*
+		 * We could have raced with another CPU caching the same
+		 * translation behind our back, so let's check it is not in
+		 * already
+		 */
+		db = its->vgic_its_base + GITS_TRANSLATER;
+		if (__vgic_its_check_cache(dist, db, devid, eventid, cacheid)) {
+			raw_spin_unlock_irqrestore(&dist->lpi_translation_cache[cacheid].lpi_cache_lock, flags);
+			continue;
+		}
 
-	vgic_get_irq_kref(irq);
+		/* Always reuse the last entry (LRU policy) */
+		cte = list_last_entry(&dist->lpi_translation_cache[cacheid].lpi_cache,
+				      typeof(*cte), entry);
 
-	cte->db		= db;
-	cte->devid	= devid;
-	cte->eventid	= eventid;
-	cte->irq	= irq;
+		/*
+		 * Caching the translation implies having an extra reference
+		 * to the interrupt, so drop the potential reference on what
+		 * was in the cache, and increment it on the new interrupt.
+		 */
+		if (cte->irq) {
+			raw_spin_lock(&dist->lpi_list_lock);
+			__vgic_put_lpi_locked(kvm, cte->irq);
+			raw_spin_unlock(&dist->lpi_list_lock);
+		}
+		vgic_get_irq_kref(irq);
 
-	/* Move the new translation to the head of the list */
-	list_move(&cte->entry, &dist->lpi_translation_cache);
+		cte->db		= db;
+		cte->devid	= devid;
+		cte->eventid	= eventid;
+		cte->irq	= irq;
 
-out:
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+		/* Move the new translation to the head of the list */
+		list_move(&cte->entry, &dist->lpi_translation_cache[cacheid].lpi_cache);
+		raw_spin_unlock_irqrestore(&dist->lpi_translation_cache[cacheid].lpi_cache_lock, flags);
+	}
 }
 
 void vgic_its_invalidate_cache(struct kvm *kvm)
@@ -638,22 +661,25 @@ void vgic_its_invalidate_cache(struct kvm *kvm)
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_translation_cache_entry *cte;
 	unsigned long flags;
+	int i;
 
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
-		/*
-		 * If we hit a NULL entry, there is nothing after this
-		 * point.
-		 */
-		if (!cte->irq)
-			break;
-
-		__vgic_put_lpi_locked(kvm, cte->irq);
-		cte->irq = NULL;
+	for (i = 0; i < LPI_TRANS_CACHES_NUM; i++) {
+		raw_spin_lock_irqsave(&dist->lpi_translation_cache[i].lpi_cache_lock, flags);
+		list_for_each_entry(cte, &dist->lpi_translation_cache[i].lpi_cache, entry) {
+			/*
+			 * If we hit a NULL entry, there is nothing after this
+			 * point.
+			 */
+			if (!cte->irq)
+				break;
+
+			raw_spin_lock(&dist->lpi_list_lock);
+			__vgic_put_lpi_locked(kvm, cte->irq);
+			raw_spin_unlock(&dist->lpi_list_lock);
+			cte->irq = NULL;
+		}
+		raw_spin_unlock_irqrestore(&dist->lpi_translation_cache[i].lpi_cache_lock, flags);
 	}
-
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 }
 
 int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
@@ -1821,16 +1847,18 @@ static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its,
 	return ret;
 }
 
-/* Default is 16 cached LPIs per vcpu */
-#define LPI_DEFAULT_PCPU_CACHE_SIZE	16
-
 void vgic_lpi_translation_cache_init(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	unsigned int sz = dist->lpi_pcpu_cache_size;
 	int i;
+	int cacheid;
 
-	if (!list_empty(&dist->lpi_translation_cache))
+	/*
+	 * If the first cache list has been populated, all cache lists
+	 * have been initialized.
+	 */
+	if (!list_empty(&dist->lpi_translation_cache[0].lpi_cache))
 		return;
 
 	if (!sz)
@@ -1838,16 +1866,17 @@ void vgic_lpi_translation_cache_init(struct kvm *kvm)
 
 	sz *= atomic_read(&kvm->online_vcpus);
 
-	for (i = 0; i < sz; i++) {
-		struct vgic_translation_cache_entry *cte;
+	for (cacheid = 0; cacheid < LPI_TRANS_CACHES_NUM; cacheid++) {
+		for (i = 0; i < sz; i++) {
+			struct vgic_translation_cache_entry *cte;
 
-		/* An allocation failure is not fatal */
-		cte = kzalloc(sizeof(*cte), GFP_KERNEL);
-		if (WARN_ON(!cte))
-			break;
+			cte = kzalloc(sizeof(*cte), GFP_KERNEL);
+			if (WARN_ON(!cte))
+				break;
 
-		INIT_LIST_HEAD(&cte->entry);
-		list_add(&cte->entry, &dist->lpi_translation_cache);
+			INIT_LIST_HEAD(&cte->entry);
+			list_add(&cte->entry, &dist->lpi_translation_cache[cacheid].lpi_cache);
+		}
 	}
 }
 
@@ -1855,13 +1884,19 @@ void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_translation_cache_entry *cte, *tmp;
+	unsigned long flags;
+	int cacheid;
 
 	vgic_its_invalidate_cache(kvm);
 
-	list_for_each_entry_safe(cte, tmp,
-				 &dist->lpi_translation_cache, entry) {
-		list_del(&cte->entry);
-		kfree(cte);
+	for (cacheid = 0; cacheid < LPI_TRANS_CACHES_NUM; cacheid++) {
+		raw_spin_lock_irqsave(&dist->lpi_translation_cache[cacheid].lpi_cache_lock, flags);
+		list_for_each_entry_safe(cte, tmp,
+					 &dist->lpi_translation_cache[cacheid].lpi_cache, entry) {
+			list_del(&cte->entry);
+			kfree(cte);
+		}
+		raw_spin_unlock_irqrestore(&dist->lpi_translation_cache[cacheid].lpi_cache_lock, flags);
 	}
 }
 
-- 
1.8.3.1




* [PATCH 2/3] KVM: arm/arm64: vgic-its: Do not execute invalidate MSI-LPI translation cache on movi command
From: Xiangyou Xie @ 2019-07-24  9:04 UTC
  To: marc.zyngier; +Cc: kvmarm, linux-arm-kernel, kvm

It is not necessary to invalidate the LPI translation cache when the
virtual machine issues a MOVI command to adjust the affinity of an
interrupt. Irqbalance adjusts interrupt affinity at short intervals to
achieve interrupt load balancing, but this does not affect the contents
of the LPI translation cache.
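
For reference, a translation cache entry (as defined in the translation
cache series this is based on) maps a (db, devid, eventid) tuple to an
irq. MOVI retargets the irq through update_affinity(), but it never
changes the tuple-to-irq mapping itself:

	struct vgic_translation_cache_entry {
		struct list_head	entry;
		phys_addr_t		db;
		u32			devid;
		u32			eventid;
		struct vgic_irq		*irq;
	};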

Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 792d90b..66e93ab 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -907,8 +907,6 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
 	ite->collection = collection;
 	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
 
-	vgic_its_invalidate_cache(kvm);
-
 	return update_affinity(ite->irq, vcpu);
 }
 
-- 
1.8.3.1




* [PATCH 3/3] KVM: arm/arm64: vgic: introduce vgic_cpu pending status and lowest_priority
From: Xiangyou Xie @ 2019-07-24  9:04 UTC
  To: marc.zyngier; +Cc: kvmarm, linux-arm-kernel, kvm

During halt polling, vgic_cpu->ap_list_lock is frequently acquired and
released (kvm_vcpu_check_block -> kvm_arch_vcpu_runnable ->
kvm_vgic_vcpu_pending_irq). This hurts the performance of virtual
interrupt injection, because vgic_queue_irq_unlock also attempts to take
vgic_cpu->ap_list_lock to add an irq to the vgic_cpu ap_list.

With the irq pending state and the minimum priority introduced by this
patch, kvm_vgic_vcpu_pending_irq no longer needs to traverse the
vgic_cpu ap_list; it only checks the cached pending state and priority.
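
The bookkeeping is identical at every point that adds an irq to an
ap_list; as a sketch, it amounts to the following hypothetical helper,
which the patch open-codes in vgic_queue_irq_unlock() and
vgic_prune_ap_list():

	static void vgic_cpu_note_irq(struct vgic_cpu *vgic_cpu,
				      struct vgic_irq *irq)
	{
		if (irq->active)
			return;
		vgic_cpu->pending = true;
		if (vgic_cpu->lowest_priority > irq->priority)
			vgic_cpu->lowest_priority = irq->priority;
	}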

Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
---
 include/kvm/arm_vgic.h   |  5 +++++
 virt/kvm/arm/vgic/vgic.c | 40 ++++++++++++++++++++++------------------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ce372a0..636db29 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -337,6 +337,11 @@ struct vgic_cpu {
 
 	/* Cache guest interrupt ID bits */
 	u32 num_id_bits;
+
+	/* Minimum priority value among all irqs on the ap_list */
+	u8 lowest_priority;
+	/* Irq pending flag */
+	bool pending;
 };
 
 extern struct static_key_false vgic_v2_cpuif_trap;
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index deb8471..767dfe0 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -398,6 +398,12 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
 	 * now in the ap_list.
 	 */
 	vgic_get_irq_kref(irq);
+
+	if (!irq->active) {
+		vcpu->arch.vgic_cpu.pending = true;
+		if (vcpu->arch.vgic_cpu.lowest_priority > irq->priority)
+			vcpu->arch.vgic_cpu.lowest_priority = irq->priority;
+	}
 	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
 	irq->vcpu = vcpu;
 
@@ -618,6 +624,9 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
 retry:
 	raw_spin_lock(&vgic_cpu->ap_list_lock);
 
+	vgic_cpu->lowest_priority = U8_MAX;
+	vgic_cpu->pending = false;
+
 	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
 		struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
 		bool target_vcpu_needs_kick = false;
@@ -649,6 +658,11 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
 		}
 
 		if (target_vcpu == vcpu) {
+			if (!irq->active) {
+				vgic_cpu->pending = true;
+				if (vgic_cpu->lowest_priority > irq->priority)
+					vgic_cpu->lowest_priority = irq->priority;
+			}
 			/* We're on the right CPU */
 			raw_spin_unlock(&irq->irq_lock);
 			continue;
@@ -690,6 +704,11 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
 
 			list_del(&irq->ap_list);
 			irq->vcpu = target_vcpu;
+			if (!irq->active) {
+				new_cpu->pending = true;
+				if (new_cpu->lowest_priority > irq->priority)
+					new_cpu->lowest_priority = irq->priority;
+			}
 			list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
 			target_vcpu_needs_kick = true;
 		}
@@ -930,9 +949,6 @@ void kvm_vgic_put(struct kvm_vcpu *vcpu)
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq;
-	bool pending = false;
-	unsigned long flags;
 	struct vgic_vmcr vmcr;
 
 	if (!vcpu->kvm->arch.vgic.enabled)
@@ -943,22 +959,10 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
 
 	vgic_get_vmcr(vcpu, &vmcr);
 
-	raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
-
-	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-		raw_spin_lock(&irq->irq_lock);
-		pending = irq_is_pending(irq) && irq->enabled &&
-			  !irq->active &&
-			  irq->priority < vmcr.pmr;
-		raw_spin_unlock(&irq->irq_lock);
-
-		if (pending)
-			break;
-	}
-
-	raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+	if (vgic_cpu->pending && vgic_cpu->lowest_priority < vmcr.pmr)
+		return true;
 
-	return pending;
+	return false;
 }
 
 void vgic_kick_vcpus(struct kvm *kvm)
-- 
1.8.3.1




* Re: [PATCH 1/3] KVM: arm/arm64: vgic-its: Introduce multiple LPI translation caches
From: Marc Zyngier @ 2019-07-24 11:09 UTC
  To: Xiangyou Xie; +Cc: kvmarm, linux-arm-kernel, kvm

Hi Xiangyou,

On 24/07/2019 10:04, Xiangyou Xie wrote:
> Because dist->lpi_list_lock is a per-VM lock, it becomes a performance
> bottleneck when a virtual machine is configured with multiple virtual
> NIC devices that receive network packets at the same time.

I'm sorry, but you'll have to show me some real numbers before I
consider any of this. There is a reason why the original series still
isn't in mainline, and that's because people don't post any numbers.
Adding more patches is not going to change that small fact.

> This patch increases the number of LPI translation caches to eight
> and hashes the ID of the CPU executing irqfd_wakeup to choose which
> translation cache to use.

So you've now switched to a per-cache lock, meaning that the rest of the
ITS code can manipulate the lpi_list without synchronization with
the caches. Have you worked out all the possible races? Also, how does
this new lock class fit in the whole locking hierarchy?

If you want something that is actually scalable, do it the right way.
Use a better data structure than a list, switch to using RCU rather than
the current locking strategy. But your current approach looks quite fragile.
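
As a purely illustrative sketch of that direction (untested; the xarray
field, the key helper and the try-get variant of the irq kref are all
hypothetical), the lookup path could then be lockless:

	rcu_read_lock();
	irq = xa_load(&dist->lpi_xa, cache_key(db, devid, eventid));
	if (irq && !vgic_try_get_irq_kref(irq))
		irq = NULL;
	rcu_read_unlock();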

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...


* Re: [PATCH 2/3] KVM: arm/arm64: vgic-its: Do not execute invalidate MSI-LPI translation cache on movi command
From: Marc Zyngier @ 2019-07-24 11:20 UTC
  To: Xiangyou Xie; +Cc: kvmarm, linux-arm-kernel, kvm

On 24/07/2019 10:04, Xiangyou Xie wrote:
> It is not necessary to invalidate the LPI translation cache when the
> virtual machine issues a MOVI command to adjust the affinity of an
> interrupt. Irqbalance adjusts interrupt affinity at short intervals to
> achieve interrupt load balancing, but this does not affect the contents
> of the LPI translation cache.

What does irqbalance have to do with it? We're dealing with the GIC
architecture here, not with userspace.

If the guest issues a MOVI command to a RD where GICR_CTLR.EnableLPI is
0, and that we use an existing cached translation, we are going to make
the interrupt pending for that RD. This is direct violation of the
architecture, which says:

"LPI support is disabled. Any doorbell interrupt generated as a result
of a write to a virtual LPI register must be discarded, and any ITS
translation requests or commands involving LPIs in this Redistributor
are ignored."

So the interrupt cannot be made pending. No IFs, no BUTs. If you really
want to optimize it, check that the target RD is actually enabled and
only invalidate in this particular case.
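
Something along these lines in vgic_its_cmd_handle_movi() would do
(sketch only, assuming the existing lpis_enabled flag in struct
vgic_cpu):

	/* Only a target RD with LPIs disabled requires an invalidation */
	if (!vcpu->arch.vgic_cpu.lpis_enabled)
		vgic_its_invalidate_cache(kvm);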

Your guest would have to have a rate of MOVI that is comparable to that
of the interrupts for it to show on any radar though...

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...


* Re: [PATCH 3/3] KVM: arm/arm64: vgic: introduce vgic_cpu pending status and lowest_priority
From: Marc Zyngier @ 2019-07-24 11:39 UTC
  To: Xiangyou Xie; +Cc: kvmarm, linux-arm-kernel, kvm

On 24/07/2019 10:04, Xiangyou Xie wrote:
> During halt polling, vgic_cpu->ap_list_lock is frequently acquired and
> released (kvm_vcpu_check_block -> kvm_arch_vcpu_runnable ->
> kvm_vgic_vcpu_pending_irq). This hurts the performance of virtual
> interrupt injection, because vgic_queue_irq_unlock also attempts to take
> vgic_cpu->ap_list_lock to add an irq to the vgic_cpu ap_list.

Numbers. Give me numbers. Please.

> 
> With the irq pending state and the minimum priority introduced by this
> patch, kvm_vgic_vcpu_pending_irq no longer needs to traverse the
> vgic_cpu ap_list; it only checks the cached pending state and priority.
> 
> Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
> ---
>  include/kvm/arm_vgic.h   |  5 +++++
>  virt/kvm/arm/vgic/vgic.c | 40 ++++++++++++++++++++++------------------
>  2 files changed, 27 insertions(+), 18 deletions(-)
> 
> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> index ce372a0..636db29 100644
> --- a/include/kvm/arm_vgic.h
> +++ b/include/kvm/arm_vgic.h
> @@ -337,6 +337,11 @@ struct vgic_cpu {
>  
>  	/* Cache guest interrupt ID bits */
>  	u32 num_id_bits;
> +
> +	/* Minimum priority value among all irqs on the ap_list */
> +	u8 lowest_priority;

In all IRQs? That are in every possible state?

> +	/* Irq pending flag */
> +	bool pending;

What does pending mean here? Strictly pending? or covering the other
states of an interrupt (Active, Active+Pending)?

>  };
>  
>  extern struct static_key_false vgic_v2_cpuif_trap;
> diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
> index deb8471..767dfe0 100644
> --- a/virt/kvm/arm/vgic/vgic.c
> +++ b/virt/kvm/arm/vgic/vgic.c
> @@ -398,6 +398,12 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
>  	 * now in the ap_list.
>  	 */
>  	vgic_get_irq_kref(irq);
> +
> +	if (!irq->active) {

Why not active? What if the interrupt is Active+Pending? What is the
rational for this? This applies to the whole of this patch.

> +		vcpu->arch.vgic_cpu.pending = true;
> +		if (vcpu->arch.vgic_cpu.lowest_priority > irq->priority)
> +			vcpu->arch.vgic_cpu.lowest_priority = irq->priority;
> +	}
>  	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
>  	irq->vcpu = vcpu;
>  
> @@ -618,6 +624,9 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
>  retry:
>  	raw_spin_lock(&vgic_cpu->ap_list_lock);
>  
> +	vgic_cpu->lowest_priority = U8_MAX;
> +	vgic_cpu->pending = false;
> +
>  	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
>  		struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
>  		bool target_vcpu_needs_kick = false;
> @@ -649,6 +658,11 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
>  		}
>  
>  		if (target_vcpu == vcpu) {
> +			if (!irq->active) {
> +				vgic_cpu->pending = true;
> +				if (vgic_cpu->lowest_priority > irq->priority)
> +					vgic_cpu->lowest_priority = irq->priority;
> +			}
>  			/* We're on the right CPU */
>  			raw_spin_unlock(&irq->irq_lock);
>  			continue;
> @@ -690,6 +704,11 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
>  
>  			list_del(&irq->ap_list);
>  			irq->vcpu = target_vcpu;
> +			if (!irq->active) {
> +				new_cpu->pending = true;
> +				if (new_cpu->lowest_priority > irq->priority)
> +					new_cpu->lowest_priority = irq->priority;
> +			}
>  			list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
>  			target_vcpu_needs_kick = true;
>  		}
> @@ -930,9 +949,6 @@ void kvm_vgic_put(struct kvm_vcpu *vcpu)
>  int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
>  {
>  	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> -	struct vgic_irq *irq;
> -	bool pending = false;
> -	unsigned long flags;
>  	struct vgic_vmcr vmcr;
>  
>  	if (!vcpu->kvm->arch.vgic.enabled)
> @@ -943,22 +959,10 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
>  
>  	vgic_get_vmcr(vcpu, &vmcr);
>  
> -	raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
> -
> -	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
> -		raw_spin_lock(&irq->irq_lock);
> -		pending = irq_is_pending(irq) && irq->enabled &&
> -			  !irq->active &&
> -			  irq->priority < vmcr.pmr;
> -		raw_spin_unlock(&irq->irq_lock);
> -
> -		if (pending)
> -			break;
> -	}
> -
> -	raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
> +	if (vgic_cpu->pending && vgic_cpu->lowest_priority < vmcr.pmr)
> +		return true;

And here we go. You've dropped the lock, and yet are evaluating two
unrelated fields that could be changed by a parallel injection or the
vcpu entering/exiting the guest.

I'm sure you get better performance. I'm also pretty sure this is
completely unsafe.

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...


* Re: [PATCH 1/3] KVM: arm/arm64: vgic-its: Introduce multiple LPI translation caches
From: Xiangyou Xie @ 2019-07-26 12:35 UTC
  To: Marc Zyngier; +Cc: kvmarm, linux-arm-kernel, kvm

Hi Marc,

Sorry, the test data was not posted in the previous email.

I tested the impact on virtual interrupt injection latency:
Run the iperf command to send UDP packets to the VM:
	iperf -c $IP -u -b 40m -l 64 -t 6000&
The VM just receives UDP traffic. When multiple NICs are configured,
each NIC receives the above iperf UDP traffic; this reflects the
performance impact of shared resource contention, such as locking.

We observe the latency of virtual interrupt injection: the time spent
in irqfd_wakeup, irqfd_inject, and the kworker context switch. Less is
better.
The less the better.

The ITS translation cache greatly reduces interrupt injection latency
compared to the kworker thread, because it eliminates the wakeup and
the attendant scheduling delay:

            kworker    ITS translation cache    improved
  1 NIC     6.692 us        1.766 us              73.6%
 10 NICs    7.536 us        2.574 us              65.8%


Increasing the number of translation caches reduces lock contention,
so concurrent injection of multiple interrupts performs better:

            ITS translation cache    with patch    improved
  1 NIC          1.766 us             1.694 us        4.1%
 10 NICs         2.574 us             1.848 us       28.2%

Regards,

Xiangyou

On 2019/7/24 19:09, Marc Zyngier wrote:
> Hi Xiangyou,
> 
> On 24/07/2019 10:04, Xiangyou Xie wrote:
>> Because dist->lpi_list_lock is a per-VM lock, it becomes a performance
>> bottleneck when a virtual machine is configured with multiple virtual
>> NIC devices that receive network packets at the same time.
> 
> I'm sorry, but you'll have to show me some real numbers before I
> consider any of this. There is a reason why the original series still
> isn't in mainline, and that's because people don't post any numbers.
> Adding more patches is not going to change that small fact.
> 
>> This patch increases the number of LPI translation caches to eight
>> and hashes the ID of the CPU executing irqfd_wakeup to choose which
>> translation cache to use.
> 
> So you've now switched to a per-cache lock, meaning that the rest of the
> ITS code can manipulate the lpi_list without synchronization with
> the caches. Have you worked out all the possible races? Also, how does
> this new lock class fit in the whole locking hierarchy?
> 
> If you want something that is actually scalable, do it the right way.
> Use a better data structure than a list, switch to using RCU rather than
> the current locking strategy. But your current approach looks quite fragile.
> 
> Thanks,
> 
> 	M.
> 



* Re: [PATCH 1/3] KVM: arm/arm64: vgic-its: Introduce multiple LPI translation caches
From: Marc Zyngier @ 2019-07-26 13:02 UTC
  To: Xiangyou Xie; +Cc: kvmarm, linux-arm-kernel, kvm

On Fri, 26 Jul 2019 13:35:20 +0100,
Xiangyou Xie <xiexiangyou@huawei.com> wrote:
> 
> Hi Marc,
> 
> Sorry, the test data was not posted in the previous email.
> 
> I tested the impact on virtual interrupt injection latency:
> Run the iperf command to send UDP packets to the VM:
> 	iperf -c $IP -u -b 40m -l 64 -t 6000&
> The VM just receives UDP traffic. When multiple NICs are configured,
> each NIC receives the above iperf UDP traffic; this reflects the
> performance impact of shared resource contention, such as locking.
> 
> We observe the latency of virtual interrupt injection: the time spent
> in irqfd_wakeup, irqfd_inject, and the kworker context switch. Less
> is better.
> 
> The ITS translation cache greatly reduces interrupt injection latency
> compared to the kworker thread, because it eliminates the wakeup and
> the attendant scheduling delay:
> 
>             kworker    ITS translation cache    improved
>   1 NIC     6.692 us        1.766 us              73.6%
>  10 NICs    7.536 us        2.574 us              65.8%

OK, that's pretty interesting. It would have been good to post such
data in reply to the ITS cache series.

> 
> Increasing the number of translation caches reduces lock contention,
> so concurrent injection of multiple interrupts performs better:
> 
>             ITS translation cache    with patch    improved
>   1 NIC          1.766 us             1.694 us        4.1%
>  10 NICs         2.574 us             1.848 us       28.2%


That's sort of interesting too, but it doesn't answer any of the
questions I had in response to your patch: How do you ensure mutual
exclusion with the LPI list being concurrently updated? How does the
new locking fit in the current locking scheme?

Thanks,

	M.

> Regards,
> 
> Xiangyou
> 
> On 2019/7/24 19:09, Marc Zyngier wrote:
> > Hi Xiangyou,
> > 
> > On 24/07/2019 10:04, Xiangyou Xie wrote:
> >> Because dist->lpi_list_lock is a per-VM lock, it becomes a performance
> >> bottleneck when a virtual machine is configured with multiple virtual
> >> NIC devices that receive network packets at the same time.
> > 
> > I'm sorry, but you'll have to show me some real numbers before I
> > consider any of this. There is a reason why the original series still
> > isn't in mainline, and that's because people don't post any numbers.
> > Adding more patches is not going to change that small fact.
> > 
> >> This patch increases the number of LPI translation caches to eight
> >> and hashes the ID of the CPU executing irqfd_wakeup to choose which
> >> translation cache to use.
> > 
> > So you've now switched to a per-cache lock, meaning that the rest of the
> > ITS code can manipulate the lpi_list without synchronization with
> > the caches. Have you worked out all the possible races? Also, how does
> > this new lock class fit in the whole locking hierarchy?
> > 
> > If you want something that is actually scalable, do it the right way.
> > Use a better data structure than a list, switch to using RCU rather than
> > the current locking strategy. But your current approach looks quite fragile.
> > 
> > Thanks,
> > 
> > 	M.
> > 
> 

-- 
Jazz is not dead, it just smells funny.


* Re: [PATCH 1/3] KVM: arm/arm64: vgic-its: Introduce multiple LPI translation caches
From: Xiangyou Xie @ 2019-07-27  6:13 UTC
  To: Marc Zyngier; +Cc: kvmarm, linux-arm-kernel, kvm

Hi Marc,

When a new LPI is added to lpi_list, the contents of the caches are
not updated immediately; they are only updated once the irq is
injected.

An irq's reference count is increased when the irq is inserted into a
cache, and decreased when it is deleted from that cache. Only when the
irq has been removed from all caches can its reference count drop to
zero, allowing it to be removed from lpi_list.

So I think the irq reference count guarantees consistency between
lpi_list and the caches; is that right?
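
In code terms, the protocol relies on the existing helpers:

	/* Caching a translation takes a reference on the irq ... */
	vgic_get_irq_kref(irq);

	/* ... and eviction or invalidation drops it */
	raw_spin_lock(&dist->lpi_list_lock);
	__vgic_put_lpi_locked(kvm, cte->irq);
	raw_spin_unlock(&dist->lpi_list_lock);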

When multiple CPUs inject different LPIs at the same time, the
interrupts may end up ordered differently in the different caches, and
some interrupts may even be evicted from one cache but not another.
For example, given the cache contents at a certain moment:
cache 0: c->b->a
cache 1: a->b->c

then injecting two interrupts "d,e" simultaneously may produce:
cache 0: e->d->c
cache 1: d->e->a

But this is not a problem, as long as the irq that is looked up is
correct.

In addition, the locking order is:
kvm->lock (mutex)
  its->cmd_lock (mutex)
    its->its_lock (mutex)
      vgic_cpu->ap_list_lock
        cache->lpi_cache_lock  (new)
          kvm->lpi_list_lock
            vgic_irq->irq_lock

Thanks,

Xiangyou.

On 2019/7/26 21:02, Marc Zyngier wrote:
> On Fri, 26 Jul 2019 13:35:20 +0100,
> Xiangyou Xie <xiexiangyou@huawei.com> wrote:
>>
>> Hi Marc,
>>
>> Sorry, the test data was not posted in the previous email.
>>
>> I tested the impact on virtual interrupt injection latency:
>> Run the iperf command to send UDP packets to the VM:
>> 	iperf -c $IP -u -b 40m -l 64 -t 6000&
>> The VM just receives UDP traffic. When multiple NICs are configured,
>> each NIC receives the above iperf UDP traffic; this reflects the
>> performance impact of shared resource contention, such as locking.
>>
>> We observe the latency of virtual interrupt injection: the time spent
>> in irqfd_wakeup, irqfd_inject, and the kworker context switch. Less
>> is better.
>>
>> The ITS translation cache greatly reduces interrupt injection latency
>> compared to the kworker thread, because it eliminates the wakeup and
>> the attendant scheduling delay:
>>
>>             kworker    ITS translation cache    improved
>>   1 NIC     6.692 us        1.766 us              73.6%
>>  10 NICs    7.536 us        2.574 us              65.8%
> 
> OK, that's pretty interesting. It would have been good to post such
> data in reply to the ITS cache series.
> 
>>
>> Increasing the number of translation caches reduces lock contention,
>> so concurrent injection of multiple interrupts performs better:
>>
>>             ITS translation cache    with patch    improved
>>   1 NIC          1.766 us             1.694 us        4.1%
>>  10 NICs         2.574 us             1.848 us       28.2%
> 
> 
> That's sort of interesting too, but it doesn't answer any of the
> questions I had in response to your patch: How do you ensure mutual
> exclusion with the LPI list being concurrently updated? How does the
> new locking fit in the current locking scheme?
> 
> Thanks,
> 
> 	M.
> 
>> Regards,
>>
>> Xiangyou
>>
>> On 2019/7/24 19:09, Marc Zyngier wrote:
>>> Hi Xiangyou,
>>>
>>> On 24/07/2019 10:04, Xiangyou Xie wrote:
>>>> Because dist->lpi_list_lock is a per-VM lock, it becomes a performance
>>>> bottleneck when a virtual machine is configured with multiple virtual
>>>> NIC devices that receive network packets at the same time.
>>>
>>> I'm sorry, but you'll have to show me some real numbers before I
>>> consider any of this. There is a reason why the original series still
>>> isn't in mainline, and that's because people don't post any numbers.
>>> Adding more patches is not going to change that small fact.
>>>
>>>> This patch increases the number of LPI translation caches to eight
>>>> and hashes the ID of the CPU executing irqfd_wakeup to choose which
>>>> translation cache to use.
>>>
>>> So you've now switched to a per-cache lock, meaning that the rest of the
>>> ITS code can manipulate the lpi_list without synchronization with
>>> the caches. Have you worked out all the possible races? Also, how does
>>> this new lock class fit in the whole locking hierarchy?
>>>
>>> If you want something that is actually scalable, do it the right way.
>>> Use a better data structure than a list, switch to using RCU rather than
>>> the current locking strategy. But your current approach looks quite fragile.
>>>
>>> Thanks,
>>>
>>> 	M.
>>>
>>
> 


