All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alexandru Elisei <alexandru.elisei@arm.com>
To: kvm@vger.kernel.org
Cc: will@kernel.org, julien.thierry.kdev@gmail.com,
	andre.przywara@arm.com, sami.mujawar@arm.com,
	lorenzo.pieralisi@arm.com, maz@kernel.org
Subject: [PATCH v4 kvmtool 01/12] ioport: mmio: Use a mutex and reference counting for locking
Date: Thu, 14 May 2020 16:38:18 +0100	[thread overview]
Message-ID: <1589470709-4104-2-git-send-email-alexandru.elisei@arm.com> (raw)
In-Reply-To: <1589470709-4104-1-git-send-email-alexandru.elisei@arm.com>

kvmtool uses brlock for protecting accesses to the ioport and mmio
red-black trees. brlock allows concurrent reads, but only one writer, which
is assumed not to be a VCPU thread (for more information see commit
0b907ed2eaec ("kvm tools: Add a brlock)). This is done by issuing a
compiler barrier on read and pausing the entire virtual machine on writes.
When KVM_BRLOCK_DEBUG is defined, brlock uses instead a pthread read/write
lock.

When we will implement reassignable BARs, the mmio or ioport mapping will
be done as a result of a VCPU mmio access. When brlock is a pthread
read/write lock, it means that we will try to acquire a write lock with the
read lock already held by the same VCPU and we will deadlock. When it's
not, a VCPU will have to call kvm__pause, which means the virtual machine
will stay paused forever.

Let's avoid all this by using a mutex and reference counting the red-black
tree entries. This way we can guarantee that we won't unregister a node
that another thread is currently using for emulation.

Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
---
 include/kvm/ioport.h          |   2 +
 include/kvm/rbtree-interval.h |   4 +-
 ioport.c                      |  85 ++++++++++++++++++++++-----------
 mmio.c                        | 107 ++++++++++++++++++++++++++++++++----------
 4 files changed, 143 insertions(+), 55 deletions(-)

diff --git a/include/kvm/ioport.h b/include/kvm/ioport.h
index 62a719327e3f..039633f76bdd 100644
--- a/include/kvm/ioport.h
+++ b/include/kvm/ioport.h
@@ -22,6 +22,8 @@ struct ioport {
 	struct ioport_operations	*ops;
 	void				*priv;
 	struct device_header		dev_hdr;
+	u32				refcount;
+	bool				remove;
 };
 
 struct ioport_operations {
diff --git a/include/kvm/rbtree-interval.h b/include/kvm/rbtree-interval.h
index 730eb5e8551d..17cd3b5f3199 100644
--- a/include/kvm/rbtree-interval.h
+++ b/include/kvm/rbtree-interval.h
@@ -6,7 +6,9 @@
 
 #define RB_INT_INIT(l, h) \
 	(struct rb_int_node){.low = l, .high = h}
-#define rb_int(n) rb_entry(n, struct rb_int_node, node)
+#define rb_int(n)	rb_entry(n, struct rb_int_node, node)
+#define rb_int_start(n)	((n)->low)
+#define rb_int_end(n)	((n)->low + (n)->high - 1)
 
 struct rb_int_node {
 	struct rb_node	node;
diff --git a/ioport.c b/ioport.c
index d9f2e8ea3c3b..844a832d25a4 100644
--- a/ioport.c
+++ b/ioport.c
@@ -2,7 +2,6 @@
 
 #include "kvm/kvm.h"
 #include "kvm/util.h"
-#include "kvm/brlock.h"
 #include "kvm/rbtree-interval.h"
 #include "kvm/mutex.h"
 
@@ -16,6 +15,8 @@
 
 #define ioport_node(n) rb_entry(n, struct ioport, node)
 
+static DEFINE_MUTEX(ioport_lock);
+
 static struct rb_root		ioport_tree = RB_ROOT;
 
 static struct ioport *ioport_search(struct rb_root *root, u64 addr)
@@ -39,6 +40,36 @@ static void ioport_remove(struct rb_root *root, struct ioport *data)
 	rb_int_erase(root, &data->node);
 }
 
+static struct ioport *ioport_get(struct rb_root *root, u64 addr)
+{
+	struct ioport *ioport;
+
+	mutex_lock(&ioport_lock);
+	ioport = ioport_search(root, addr);
+	if (ioport)
+		ioport->refcount++;
+	mutex_unlock(&ioport_lock);
+
+	return ioport;
+}
+
+/* Called with ioport_lock held. */
+static void ioport_unregister(struct rb_root *root, struct ioport *data)
+{
+	device__unregister(&data->dev_hdr);
+	ioport_remove(root, data);
+	free(data);
+}
+
+static void ioport_put(struct rb_root *root, struct ioport *data)
+{
+	mutex_lock(&ioport_lock);
+	data->refcount--;
+	if (data->remove && data->refcount == 0)
+		ioport_unregister(root, data);
+	mutex_unlock(&ioport_lock);
+}
+
 #ifdef CONFIG_HAS_LIBFDT
 static void generate_ioport_fdt_node(void *fdt,
 				     struct device_header *dev_hdr,
@@ -80,16 +111,22 @@ int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, i
 			.bus_type	= DEVICE_BUS_IOPORT,
 			.data		= generate_ioport_fdt_node,
 		},
+		/*
+		 * Start from 0 because ioport__unregister() doesn't decrement
+		 * the reference count.
+		 */
+		.refcount	= 0,
+		.remove		= false,
 	};
 
-	br_write_lock(kvm);
+	mutex_lock(&ioport_lock);
 	r = ioport_insert(&ioport_tree, entry);
 	if (r < 0)
 		goto out_free;
 	r = device__register(&entry->dev_hdr);
 	if (r < 0)
 		goto out_remove;
-	br_write_unlock(kvm);
+	mutex_unlock(&ioport_lock);
 
 	return port;
 
@@ -97,33 +134,28 @@ out_remove:
 	ioport_remove(&ioport_tree, entry);
 out_free:
 	free(entry);
-	br_write_unlock(kvm);
+	mutex_unlock(&ioport_lock);
 	return r;
 }
 
 int ioport__unregister(struct kvm *kvm, u16 port)
 {
 	struct ioport *entry;
-	int r;
-
-	br_write_lock(kvm);
 
-	r = -ENOENT;
+	mutex_lock(&ioport_lock);
 	entry = ioport_search(&ioport_tree, port);
-	if (!entry)
-		goto done;
-
-	device__unregister(&entry->dev_hdr);
-	ioport_remove(&ioport_tree, entry);
-
-	free(entry);
-
-	r = 0;
-
-done:
-	br_write_unlock(kvm);
+	if (!entry) {
+		mutex_unlock(&ioport_lock);
+		return -ENOENT;
+	}
+	/* The same reasoning from kvm__deregister_mmio() applies. */
+	if (entry->refcount == 0)
+		ioport_unregister(&ioport_tree, entry);
+	else
+		entry->remove = true;
+	mutex_unlock(&ioport_lock);
 
-	return r;
+	return 0;
 }
 
 static void ioport__unregister_all(void)
@@ -136,9 +168,7 @@ static void ioport__unregister_all(void)
 	while (rb) {
 		rb_node = rb_int(rb);
 		entry = ioport_node(rb_node);
-		device__unregister(&entry->dev_hdr);
-		ioport_remove(&ioport_tree, entry);
-		free(entry);
+		ioport_unregister(&ioport_tree, entry);
 		rb = rb_first(&ioport_tree);
 	}
 }
@@ -164,8 +194,7 @@ bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction,
 	void *ptr = data;
 	struct kvm *kvm = vcpu->kvm;
 
-	br_read_lock(kvm);
-	entry = ioport_search(&ioport_tree, port);
+	entry = ioport_get(&ioport_tree, port);
 	if (!entry)
 		goto out;
 
@@ -180,9 +209,9 @@ bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction,
 		ptr += size;
 	}
 
-out:
-	br_read_unlock(kvm);
+	ioport_put(&ioport_tree, entry);
 
+out:
 	if (ret)
 		return true;
 
diff --git a/mmio.c b/mmio.c
index 61e1d47a587d..cd141cd30750 100644
--- a/mmio.c
+++ b/mmio.c
@@ -1,7 +1,7 @@
 #include "kvm/kvm.h"
 #include "kvm/kvm-cpu.h"
 #include "kvm/rbtree-interval.h"
-#include "kvm/brlock.h"
+#include "kvm/mutex.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -15,10 +15,14 @@
 
 #define mmio_node(n) rb_entry(n, struct mmio_mapping, node)
 
+static DEFINE_MUTEX(mmio_lock);
+
 struct mmio_mapping {
 	struct rb_int_node	node;
 	void			(*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr);
 	void			*ptr;
+	u32			refcount;
+	bool			remove;
 };
 
 static struct rb_root mmio_tree = RB_ROOT;
@@ -51,6 +55,11 @@ static int mmio_insert(struct rb_root *root, struct mmio_mapping *data)
 	return rb_int_insert(root, &data->node);
 }
 
+static void mmio_remove(struct rb_root *root, struct mmio_mapping *data)
+{
+	rb_int_erase(root, &data->node);
+}
+
 static const char *to_direction(u8 is_write)
 {
 	if (is_write)
@@ -59,6 +68,41 @@ static const char *to_direction(u8 is_write)
 	return "read";
 }
 
+static struct mmio_mapping *mmio_get(struct rb_root *root, u64 phys_addr, u32 len)
+{
+	struct mmio_mapping *mmio;
+
+	mutex_lock(&mmio_lock);
+	mmio = mmio_search(root, phys_addr, len);
+	if (mmio)
+		mmio->refcount++;
+	mutex_unlock(&mmio_lock);
+
+	return mmio;
+}
+
+/* Called with mmio_lock held. */
+static void mmio_deregister(struct kvm *kvm, struct rb_root *root, struct mmio_mapping *mmio)
+{
+	struct kvm_coalesced_mmio_zone zone = (struct kvm_coalesced_mmio_zone) {
+		.addr	= rb_int_start(&mmio->node),
+		.size	= 1,
+	};
+	ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+
+	mmio_remove(root, mmio);
+	free(mmio);
+}
+
+static void mmio_put(struct kvm *kvm, struct rb_root *root, struct mmio_mapping *mmio)
+{
+	mutex_lock(&mmio_lock);
+	mmio->refcount--;
+	if (mmio->remove && mmio->refcount == 0)
+		mmio_deregister(kvm, root, mmio);
+	mutex_unlock(&mmio_lock);
+}
+
 int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
 		       void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
 			void *ptr)
@@ -72,9 +116,15 @@ int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool c
 		return -ENOMEM;
 
 	*mmio = (struct mmio_mapping) {
-		.node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
-		.mmio_fn = mmio_fn,
-		.ptr	= ptr,
+		.node		= RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
+		.mmio_fn	= mmio_fn,
+		.ptr		= ptr,
+		/*
+		 * Start from 0 because kvm__deregister_mmio() doesn't decrement
+		 * the reference count.
+		 */
+		.refcount	= 0,
+		.remove		= false,
 	};
 
 	if (coalesce) {
@@ -88,9 +138,9 @@ int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool c
 			return -errno;
 		}
 	}
-	br_write_lock(kvm);
+	mutex_lock(&mmio_lock);
 	ret = mmio_insert(&mmio_tree, mmio);
-	br_write_unlock(kvm);
+	mutex_unlock(&mmio_lock);
 
 	return ret;
 }
@@ -98,25 +148,30 @@ int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool c
 bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr)
 {
 	struct mmio_mapping *mmio;
-	struct kvm_coalesced_mmio_zone zone;
 
-	br_write_lock(kvm);
+	mutex_lock(&mmio_lock);
 	mmio = mmio_search_single(&mmio_tree, phys_addr);
 	if (mmio == NULL) {
-		br_write_unlock(kvm);
+		mutex_unlock(&mmio_lock);
 		return false;
 	}
+	/*
+	 * The PCI emulation code calls this function when memory access is
+	 * disabled for a device, or when a BAR has a new address assigned. PCI
+	 * emulation doesn't use any locks and as a result we can end up in a
+	 * situation where we have called mmio_get() to do emulation on one VCPU
+	 * thread (let's call it VCPU0), and several other VCPU threads have
+	 * called kvm__deregister_mmio(). In this case, if we decrement refcount
+	 * kvm__deregister_mmio() (either directly, or by calling mmio_put()),
+	 * refcount will reach 0 and we will free the mmio node before VCPU0 has
+	 * called mmio_put(). This will trigger use-after-free errors on VCPU0.
+	 */
+	if (mmio->refcount == 0)
+		mmio_deregister(kvm, &mmio_tree, mmio);
+	else
+		mmio->remove = true;
+	mutex_unlock(&mmio_lock);
 
-	zone = (struct kvm_coalesced_mmio_zone) {
-		.addr	= phys_addr,
-		.size	= 1,
-	};
-	ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
-
-	rb_int_erase(&mmio_tree, &mmio->node);
-	br_write_unlock(kvm);
-
-	free(mmio);
 	return true;
 }
 
@@ -124,18 +179,18 @@ bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u
 {
 	struct mmio_mapping *mmio;
 
-	br_read_lock(vcpu->kvm);
-	mmio = mmio_search(&mmio_tree, phys_addr, len);
-
-	if (mmio)
-		mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
-	else {
+	mmio = mmio_get(&mmio_tree, phys_addr, len);
+	if (!mmio) {
 		if (vcpu->kvm->cfg.mmio_debug)
 			fprintf(stderr,	"Warning: Ignoring MMIO %s at %016llx (length %u)\n",
 				to_direction(is_write),
 				(unsigned long long)phys_addr, len);
+		goto out;
 	}
-	br_read_unlock(vcpu->kvm);
 
+	mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
+	mmio_put(vcpu->kvm, &mmio_tree, mmio);
+
+out:
 	return true;
 }
-- 
2.7.4


  reply	other threads:[~2020-05-14 15:38 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-14 15:38 [PATCH v4 kvmtool 00/12] Add reassignable BARs Alexandru Elisei
2020-05-14 15:38 ` Alexandru Elisei [this message]
2020-05-15 10:13   ` [PATCH v4 kvmtool 01/12] ioport: mmio: Use a mutex and reference counting for locking André Przywara
2020-05-15 13:18     ` Alexandru Elisei
2020-05-14 15:38 ` [PATCH v4 kvmtool 02/12] pci: Add helpers for BAR values and memory/IO space access Alexandru Elisei
2020-05-14 15:38 ` [PATCH v4 kvmtool 03/12] virtio/pci: Get emulated region address from BARs Alexandru Elisei
2020-05-14 15:38 ` [PATCH v4 kvmtool 04/12] vfio: Reserve ioports when configuring the BAR Alexandru Elisei
2020-05-14 15:38 ` [PATCH v4 kvmtool 05/12] pci: Limit configuration transaction size to 32 bits Alexandru Elisei
2020-05-14 15:38 ` [PATCH v4 kvmtool 06/12] vfio/pci: Don't write configuration value twice Alexandru Elisei
2020-05-14 16:55   ` André Przywara
2020-05-14 15:38 ` [PATCH v4 kvmtool 07/12] Don't allow more than one framebuffers Alexandru Elisei
2020-05-14 16:56   ` André Przywara
2020-05-14 15:38 ` [PATCH v4 kvmtool 08/12] pci: Implement callbacks for toggling BAR emulation Alexandru Elisei
2020-05-14 16:56   ` André Przywara
2020-05-14 15:38 ` [PATCH v4 kvmtool 09/12] pci: Toggle BAR I/O and memory space emulation Alexandru Elisei
2020-05-14 15:38 ` [PATCH v4 kvmtool 10/12] pci: Implement reassignable BARs Alexandru Elisei
2020-05-14 16:56   ` André Przywara
2020-05-15 13:25     ` Alexandru Elisei
2020-05-14 15:38 ` [PATCH v4 kvmtool 11/12] arm/fdt: Remove 'linux,pci-probe-only' property Alexandru Elisei
2020-05-14 15:38 ` [PATCH v4 kvmtool 12/12] vfio: Trap MMIO access to BAR addresses which aren't page aligned Alexandru Elisei
2020-05-15 15:38 ` [PATCH v4 kvmtool 00/12] Add reassignable BARs André Przywara
2020-05-19 16:46 ` Will Deacon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1589470709-4104-2-git-send-email-alexandru.elisei@arm.com \
    --to=alexandru.elisei@arm.com \
    --cc=andre.przywara@arm.com \
    --cc=julien.thierry.kdev@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=lorenzo.pieralisi@arm.com \
    --cc=maz@kernel.org \
    --cc=sami.mujawar@arm.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.