All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alex Williamson <alex.williamson@redhat.com>
To: alex.williamson@redhat.com
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: [PATCH v2] vfio-pci: Disallow device from using NoSnoop transactions
Date: Thu, 10 Oct 2013 16:36:22 -0600	[thread overview]
Message-ID: <20131010223220.6379.52257.stgit@bling.home> (raw)

NoSnoop is a PCIe attribute that allows devices to issue transactions
that bypass cache.  If devices are allowed to do this then the
hypervisor must emulate instructions like WBINVD or else drivers in
the guest have no way to synchronize RAM for the device.  Instead of
forcing WBINVD when a device is assigned, we can instead prevent the
device from using NoSnoop, making all transactions coherent.  The
danger here is whether we can expect devices to fully support this
bit, but this is an improvement over neglecting the problem.

This fixes the Code 43 error seen on Windows guests with Intel host
systems when trying to assign Nvidia VGA devices.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

v2: I'm counting the RFC as v1
 - Move clearing NoSnoop Enable to post reset in case the device
   re-enables during reset.  This bit is part of what the kernel
   will save/restore around reset, but I don't think we want to
   rely on that obscure dependency for this.

 hw/misc/vfio.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index a2d5283..65100cf 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -163,6 +163,7 @@ typedef struct VFIODevice {
     VFIOINTx intx;
     unsigned int config_size;
     uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
+    uint8_t *read_only_config_bits; /* Bits not guest modifiable to VFIO */
     off_t config_offset; /* Offset of config space region within device fd */
     unsigned int rom_size;
     off_t rom_offset; /* Offset of ROM region within device fd */
@@ -183,6 +184,7 @@ typedef struct VFIODevice {
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
     int32_t bootindex;
     uint8_t pm_cap;
+    uint8_t pcie_cap;
     bool reset_works;
     bool has_vga;
     bool pci_aer;
@@ -1968,13 +1970,20 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                   uint32_t val, int len)
 {
     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
-    uint32_t val_le = cpu_to_le32(val);
+    uint32_t ro_bits = 0, ro_val, val_le;
 
     DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, addr, val, len);
 
-    /* Write everything to VFIO, let it filter out what we can't write */
+    /*
+     * Read-only bits should be cleared in pdev->wmask and set to the
+     * value we want for hardware in pdev->config.  ro_bits is already le.
+     */
+    memcpy(&ro_bits, vdev->read_only_config_bits + addr, len);
+    ro_val = pci_default_read_config(pdev, addr, len);
+    val_le = (cpu_to_le32(ro_val) & ro_bits) | (cpu_to_le32(val) & ~ro_bits);
+
     if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
         error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
                      __func__, vdev->host.domain, vdev->host.bus,
@@ -2552,6 +2561,64 @@ static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
 }
 
+/*
+ * The hypervisor needs to disable NoSnoop+ on the device.  NoSnoop
+ * transactions will bypass processor cache creating a coherency problem
+ * between cache and memory.  Instructions like WBINVD on x86 are intended
+ * to allow drivers to have a synchronization mechanism, but KVM doesn't
+ * emulate them.  We therefore prevent the device from using NoSnoop via
+ * the PCIe device control register.  If we can't clear the bit, abort
+ * since we can't guarantee coherency.
+ *
+ * NB - Devices like graphics may re-enable this through PCI config space
+ * backdoors.  We rely on hardware to honor the clearing of this bit.
+ */
+static void vfio_pcie_reset(VFIODevice *vdev)
+{
+    ssize_t ret;
+    uint16_t devctl;
+
+    if (!vdev->pcie_cap) {
+        return;
+    }
+
+    ret = pread(vdev->fd, &devctl, sizeof(devctl),
+                vdev->config_offset + vdev->pcie_cap + PCI_EXP_DEVCTL);
+    if (ret != sizeof(devctl)) {
+        hw_error("%s: %04x:%02x:%02x.%x "
+                 "Failed to read PCIe device control register\n", __func__,
+                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                 vdev->host.function);
+    }
+
+    devctl = le16_to_cpu(devctl);
+
+    if (!(devctl & PCI_EXP_DEVCTL_NOSNOOP_EN)) {
+        return;
+    }
+
+    devctl = cpu_to_le16(devctl & ~PCI_EXP_DEVCTL_NOSNOOP_EN);
+
+    ret = pwrite(vdev->fd, &devctl, sizeof(devctl),
+                 vdev->config_offset + vdev->pcie_cap + PCI_EXP_DEVCTL);
+    if (ret != sizeof(devctl)) {
+        hw_error("%s: %04x:%02x:%02x.%x "
+                 "Failed to write PCIe device control register\n", __func__,
+                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                 vdev->host.function);
+    }
+
+    ret = pread(vdev->fd, &devctl, sizeof(devctl),
+                vdev->config_offset + vdev->pcie_cap + PCI_EXP_DEVCTL);
+    if (ret != sizeof(devctl) ||
+        le16_to_cpu(devctl) & PCI_EXP_DEVCTL_NOSNOOP_EN) {
+        hw_error("%s: %04x:%02x:%02x.%x "
+                 "Failed to verify PCIe NoSnoop disable\n", __func__,
+                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                 vdev->host.function);
+    }
+}
+
 static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
 {
     uint16_t flags;
@@ -2569,6 +2636,17 @@ static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
         return -EINVAL;
     }
 
+    vdev->pcie_cap = pos;
+
+    /*
+     * PCIe NoSnoop is cleared, fixed, and read-only to the guest.  See
+     * vfio_pcie_reset() for details.
+     */
+    vfio_add_emulated_word(vdev, pos + PCI_EXP_DEVCTL, 0,
+                           PCI_EXP_DEVCTL_NOSNOOP_EN);
+    vfio_set_word_bits(vdev->read_only_config_bits + pos + PCI_EXP_DEVCTL,
+                       PCI_EXP_DEVCTL_NOSNOOP_EN, PCI_EXP_DEVCTL_NOSNOOP_EN);
+
     if (!pci_bus_is_express(vdev->pdev.bus)) {
         /*
          * Use express capability as-is on PCI bus.  It doesn't make much
@@ -2807,6 +2885,7 @@ static void vfio_pci_pre_reset(VFIODevice *vdev)
 static void vfio_pci_post_reset(VFIODevice *vdev)
 {
     vfio_enable_intx(vdev);
+    vfio_pcie_reset(vdev);
 }
 
 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
@@ -3559,6 +3638,7 @@ static int vfio_initfn(PCIDevice *pdev)
 
     /* vfio emulates a lot for us, but some bits need extra love */
     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
+    vdev->read_only_config_bits = g_malloc0(vdev->config_size);
 
     /* QEMU can choose to expose the ROM or not */
     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
@@ -3621,6 +3701,7 @@ out_teardown:
     vfio_unmap_bars(vdev);
 out_put:
     g_free(vdev->emulated_config_bits);
+    g_free(vdev->read_only_config_bits);
     vfio_put_device(vdev);
     vfio_put_group(group);
     return ret;
@@ -3640,6 +3721,7 @@ static void vfio_exitfn(PCIDevice *pdev)
     vfio_teardown_msi(vdev);
     vfio_unmap_bars(vdev);
     g_free(vdev->emulated_config_bits);
+    g_free(vdev->read_only_config_bits);
     g_free(vdev->rom);
     vfio_put_device(vdev);
     vfio_put_group(group);


WARNING: multiple messages have this Message-ID (diff)
From: Alex Williamson <alex.williamson@redhat.com>
To: alex.williamson@redhat.com
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: [Qemu-devel] [PATCH v2] vfio-pci: Disallow device from using NoSnoop transactions
Date: Thu, 10 Oct 2013 16:36:22 -0600	[thread overview]
Message-ID: <20131010223220.6379.52257.stgit@bling.home> (raw)

NoSnoop is a PCIe attribute that allows devices to issue transactions
that bypass cache.  If devices are allowed to do this then the
hypervisor must emulate instructions like WBINVD or else drivers in
the guest have no way to synchronize RAM for the device.  Instead of
forcing WBINVD when a device is assigned, we can instead prevent the
device from using NoSnoop, making all transactions coherent.  The
danger here is whether we can expect devices to fully support this
bit, but this is an improvement over neglecting the problem.

This fixes the Code 43 error seen on Windows guests with Intel host
systems when trying to assign Nvidia VGA devices.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

v2: I'm counting the RFC as v1
 - Move clearing NoSnoop Enable to post reset in case the device
   re-enables during reset.  This bit is part of what the kernel
   will save/restore around reset, but I don't think we want to
   rely on that obscure dependency for this.

 hw/misc/vfio.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index a2d5283..65100cf 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -163,6 +163,7 @@ typedef struct VFIODevice {
     VFIOINTx intx;
     unsigned int config_size;
     uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
+    uint8_t *read_only_config_bits; /* Bits not guest modifiable to VFIO */
     off_t config_offset; /* Offset of config space region within device fd */
     unsigned int rom_size;
     off_t rom_offset; /* Offset of ROM region within device fd */
@@ -183,6 +184,7 @@ typedef struct VFIODevice {
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
     int32_t bootindex;
     uint8_t pm_cap;
+    uint8_t pcie_cap;
     bool reset_works;
     bool has_vga;
     bool pci_aer;
@@ -1968,13 +1970,20 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                   uint32_t val, int len)
 {
     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
-    uint32_t val_le = cpu_to_le32(val);
+    uint32_t ro_bits = 0, ro_val, val_le;
 
     DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, addr, val, len);
 
-    /* Write everything to VFIO, let it filter out what we can't write */
+    /*
+     * Read-only bits should be cleared in pdev->wmask and set to the
+     * value we want for hardware in pdev->config.  ro_bits is already le.
+     */
+    memcpy(&ro_bits, vdev->read_only_config_bits + addr, len);
+    ro_val = pci_default_read_config(pdev, addr, len);
+    val_le = (cpu_to_le32(ro_val) & ro_bits) | (cpu_to_le32(val) & ~ro_bits);
+
     if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
         error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
                      __func__, vdev->host.domain, vdev->host.bus,
@@ -2552,6 +2561,64 @@ static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
 }
 
+/*
+ * The hypervisor needs to disable NoSnoop+ on the device.  NoSnoop
+ * transactions will bypass processor cache creating a coherency problem
+ * between cache and memory.  Instructions like WBINVD on x86 are intended
+ * to allow drivers to have a synchronization mechanism, but KVM doesn't
+ * emulate them.  We therefore prevent the device from using NoSnoop via
+ * the PCIe device control register.  If we can't clear the bit, abort
+ * since we can't guarantee coherency.
+ *
+ * NB - Devices like graphics may re-enable this through PCI config space
+ * backdoors.  We rely on hardware to honor the clearing of this bit.
+ */
+static void vfio_pcie_reset(VFIODevice *vdev)
+{
+    ssize_t ret;
+    uint16_t devctl;
+
+    if (!vdev->pcie_cap) {
+        return;
+    }
+
+    ret = pread(vdev->fd, &devctl, sizeof(devctl),
+                vdev->config_offset + vdev->pcie_cap + PCI_EXP_DEVCTL);
+    if (ret != sizeof(devctl)) {
+        hw_error("%s: %04x:%02x:%02x.%x "
+                 "Failed to read PCIe device control register\n", __func__,
+                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                 vdev->host.function);
+    }
+
+    devctl = le16_to_cpu(devctl);
+
+    if (!(devctl & PCI_EXP_DEVCTL_NOSNOOP_EN)) {
+        return;
+    }
+
+    devctl = cpu_to_le16(devctl & ~PCI_EXP_DEVCTL_NOSNOOP_EN);
+
+    ret = pwrite(vdev->fd, &devctl, sizeof(devctl),
+                 vdev->config_offset + vdev->pcie_cap + PCI_EXP_DEVCTL);
+    if (ret != sizeof(devctl)) {
+        hw_error("%s: %04x:%02x:%02x.%x "
+                 "Failed to write PCIe device control register\n", __func__,
+                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                 vdev->host.function);
+    }
+
+    ret = pread(vdev->fd, &devctl, sizeof(devctl),
+                vdev->config_offset + vdev->pcie_cap + PCI_EXP_DEVCTL);
+    if (ret != sizeof(devctl) ||
+        le16_to_cpu(devctl) & PCI_EXP_DEVCTL_NOSNOOP_EN) {
+        hw_error("%s: %04x:%02x:%02x.%x "
+                 "Failed to verify PCIe NoSnoop disable\n", __func__,
+                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                 vdev->host.function);
+    }
+}
+
 static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
 {
     uint16_t flags;
@@ -2569,6 +2636,17 @@ static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
         return -EINVAL;
     }
 
+    vdev->pcie_cap = pos;
+
+    /*
+     * PCIe NoSnoop is cleared, fixed, and read-only to the guest.  See
+     * vfio_pcie_reset() for details.
+     */
+    vfio_add_emulated_word(vdev, pos + PCI_EXP_DEVCTL, 0,
+                           PCI_EXP_DEVCTL_NOSNOOP_EN);
+    vfio_set_word_bits(vdev->read_only_config_bits + pos + PCI_EXP_DEVCTL,
+                       PCI_EXP_DEVCTL_NOSNOOP_EN, PCI_EXP_DEVCTL_NOSNOOP_EN);
+
     if (!pci_bus_is_express(vdev->pdev.bus)) {
         /*
          * Use express capability as-is on PCI bus.  It doesn't make much
@@ -2807,6 +2885,7 @@ static void vfio_pci_pre_reset(VFIODevice *vdev)
 static void vfio_pci_post_reset(VFIODevice *vdev)
 {
     vfio_enable_intx(vdev);
+    vfio_pcie_reset(vdev);
 }
 
 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
@@ -3559,6 +3638,7 @@ static int vfio_initfn(PCIDevice *pdev)
 
     /* vfio emulates a lot for us, but some bits need extra love */
     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
+    vdev->read_only_config_bits = g_malloc0(vdev->config_size);
 
     /* QEMU can choose to expose the ROM or not */
     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
@@ -3621,6 +3701,7 @@ out_teardown:
     vfio_unmap_bars(vdev);
 out_put:
     g_free(vdev->emulated_config_bits);
+    g_free(vdev->read_only_config_bits);
     vfio_put_device(vdev);
     vfio_put_group(group);
     return ret;
@@ -3640,6 +3721,7 @@ static void vfio_exitfn(PCIDevice *pdev)
     vfio_teardown_msi(vdev);
     vfio_unmap_bars(vdev);
     g_free(vdev->emulated_config_bits);
+    g_free(vdev->read_only_config_bits);
     g_free(vdev->rom);
     vfio_put_device(vdev);
     vfio_put_group(group);

             reply	other threads:[~2013-10-10 22:36 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-10-10 22:36 Alex Williamson [this message]
2013-10-10 22:36 ` [Qemu-devel] [PATCH v2] vfio-pci: Disallow device from using NoSnoop transactions Alex Williamson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20131010223220.6379.52257.stgit@bling.home \
    --to=alex.williamson@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.