From mboxrd@z Thu Jan 1 00:00:00 1970 From: Auger Eric Subject: Re: [RFC PATCH 3/5] vfio/quirks: Automatic ioeventfd enabling for NVIDIA BAR0 quirks Date: Thu, 8 Feb 2018 12:10:02 +0100 Message-ID: <983af1c6-a57a-43d8-2237-88e0d736905f@redhat.com> References: <20180207001615.1156.10547.stgit@gimli.home> <20180207002632.1156.53770.stgit@gimli.home> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 7bit Cc: kvm@vger.kernel.org To: Alex Williamson , qemu-devel@nongnu.org Return-path: Received: from mx3-rdu2.redhat.com ([66.187.233.73]:36698 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1750806AbeBHLKJ (ORCPT ); Thu, 8 Feb 2018 06:10:09 -0500 In-Reply-To: <20180207002632.1156.53770.stgit@gimli.home> Sender: kvm-owner@vger.kernel.org List-ID: Hi Alex, On 07/02/18 01:26, Alex Williamson wrote: > Record data writes that come through the NVIDIA BAR0 quirk, if we get > enough in a row that we're only passing through, automatically enable > an ioeventfd for it. The primary target for this is the MSI-ACK > that NVIDIA uses to allow the MSI interrupt to re-trigger, which is a > 4-byte write, data value 0x0 to offset 0x704 into the quirk, 0x88704 > into BAR0 MMIO space. For an interrupt latency sensitive micro- > benchmark, this takes us from 83% of performance versus disabling the > quirk entirely (which GeForce cannot do), to to almost 90%. > > Signed-off-by: Alex Williamson > --- > hw/vfio/pci-quirks.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++- > hw/vfio/pci.h | 2 + > 2 files changed, 89 insertions(+), 2 deletions(-) > > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c > index e4cf4ea2dd9c..e739efe601b1 100644lg > --- a/hw/vfio/pci-quirks.c > +++ b/hw/vfio/pci-quirks.c > @@ -203,6 +203,7 @@ typedef struct VFIOConfigMirrorQuirk { > uint32_t offset; > uint8_t bar; > MemoryRegion *mem; > + uint8_t data[]; Do you foresee other usages of data besides the LastDataSet? > } VFIOConfigMirrorQuirk; > > static uint64_t vfio_generic_quirk_mirror_read(void *opaque, > @@ -297,6 +298,50 @@ static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd) > g_free(ioeventfd); > } > add a comment? user handler in case kvm ioeventfd setup failed? > +static void vfio_ioeventfd_handler(void *opaque) > +{ > + VFIOIOEventFD *ioeventfd = opaque; > + > + if (event_notifier_test_and_clear(&ioeventfd->e)) { > + vfio_region_write(ioeventfd->region, ioeventfd->region_addr, > + ioeventfd->data, ioeventfd->size); > + } > +} > + > +static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev, > + MemoryRegion *mr, hwaddr addr, > + unsigned size, uint64_t data, > + VFIORegion *region, > + hwaddr region_addr) > +{ > + VFIOIOEventFD *ioeventfd = g_malloc0(sizeof(*ioeventfd)); > + > + if (event_notifier_init(&ioeventfd->e, 0)) { > + g_free(ioeventfd); > + return NULL; > + } > + > + ioeventfd->mr = mr; > + ioeventfd->addr = addr; > + ioeventfd->size = size; > + ioeventfd->match_data = true; > + ioeventfd->data = data; > + ioeventfd->region = region; > + ioeventfd->region_addr = region_addr; I found difficult to follow the different addr semantic. I understand region_add is the offset % bar and addr is the offset % mirror region. Maybe more explicit names would help (region = bar_region and region_addr = bar_offset) > + > + qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), > + vfio_ioeventfd_handler, NULL, ioeventfd); > + memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, > + ioeventfd->size, ioeventfd->match_data, > + ioeventfd->data, &ioeventfd->e); > + > + info_report("Enabled automatic ioeventfd acceleration for %s region %d, " > + "offset 0x%"HWADDR_PRIx", data 0x%"PRIx64", size %u", > + vdev->vbasedev.name, region->nr, region_addr, data, size); > + > + return ioeventfd; > +} > + > static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) > { > VFIOQuirk *quirk; > @@ -732,6 +777,13 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) > trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name); > } > > +typedef struct LastDataSet { > + hwaddr addr; > + uint64_t data; > + unsigned size; > + int count; > +} LastDataSet; > + > /* > * Finally, BAR0 itself. We want to redirect any accesses to either > * 0x1800 or 0x88000 through the PCI config space access functions. > @@ -742,6 +794,7 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, > VFIOConfigMirrorQuirk *mirror = opaque; > VFIOPCIDevice *vdev = mirror->vdev; > PCIDevice *pdev = &vdev->pdev; > + LastDataSet *last = (LastDataSet *)&mirror->data; > > vfio_generic_quirk_mirror_write(opaque, addr, data, size); > > @@ -756,6 +809,38 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, > addr + mirror->offset, data, size); > trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name); > } > + > + /* > + * Automatically add an ioeventfd to handle any repeated write with the > + * same data and size above the standard PCI config space header. This is > + * primarily expected to accelerate the MSI-ACK behavior, such as noted > + * above. Current hardware/drivers should trigger an ioeventfd at config > + * offset 0x704 (region offset 0x88704), with data 0x0, size 4. > + */ > + if (addr > PCI_STD_HEADER_SIZEOF) { > + if (addr != last->addr || data != last->data || size != last->size) { > + last->addr = addr; > + last->data = data; > + last->size = size; > + last->count = 1; > + } else if (++last->count > 10) { So here is the naive question about the "10" choice and also the fact this needs to be consecutive accesses. I guess you observed this works but at first sight this is not obvious to me. Does anyone check potential overlaps between ioeventfd's [addr, addr + size -1]? > + VFIOIOEventFD *ioeventfd; > + > + ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size, data, > + &vdev->bars[mirror->bar].region, > + mirror->offset + addr); > + if (ioeventfd) { > + VFIOQuirk *quirk; > + > + QLIST_FOREACH(quirk, &vdev->bars[mirror->bar].quirks, next) { > + if (quirk->data == mirror) { > + QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next); > + break; > + } > + } > + } > + } > + } Thanks Eric > } > > static const MemoryRegionOps vfio_nvidia_mirror_quirk = { > @@ -776,7 +861,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) > } > > quirk = vfio_quirk_alloc(1); > - mirror = quirk->data = g_malloc0(sizeof(*mirror)); > + mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); > mirror->mem = quirk->mem; > mirror->vdev = vdev; > mirror->offset = 0x88000; > @@ -794,7 +879,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) > /* The 0x1800 offset mirror only seems to get used by legacy VGA */ > if (vdev->vga) { > quirk = vfio_quirk_alloc(1); > - mirror = quirk->data = g_malloc0(sizeof(*mirror)); > + mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); > mirror->mem = quirk->mem; > mirror->vdev = vdev; > mirror->offset = 0x1800; > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index 146065c2f715..ec53b9935725 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -32,6 +32,8 @@ typedef struct VFIOIOEventFD { > bool match_data; > uint64_t data; > EventNotifier e; > + VFIORegion *region; > + hwaddr region_addr; > } VFIOIOEventFD; > > typedef struct VFIOQuirk { > From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:46871) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ejk61-0006Iz-O6 for qemu-devel@nongnu.org; Thu, 08 Feb 2018 06:10:31 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1ejk5y-0005IM-33 for qemu-devel@nongnu.org; Thu, 08 Feb 2018 06:10:29 -0500 Received: from mx3-rdu2.redhat.com ([66.187.233.73]:60240 helo=mx1.redhat.com) by eggs.gnu.org with esmtps (TLS1.0:DHE_RSA_AES_256_CBC_SHA1:32) (Exim 4.71) (envelope-from ) id 1ejk5x-0005F9-P1 for qemu-devel@nongnu.org; Thu, 08 Feb 2018 06:10:25 -0500 References: <20180207001615.1156.10547.stgit@gimli.home> <20180207002632.1156.53770.stgit@gimli.home> From: Auger Eric Message-ID: <983af1c6-a57a-43d8-2237-88e0d736905f@redhat.com> Date: Thu, 8 Feb 2018 12:10:02 +0100 MIME-Version: 1.0 In-Reply-To: <20180207002632.1156.53770.stgit@gimli.home> Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [RFC PATCH 3/5] vfio/quirks: Automatic ioeventfd enabling for NVIDIA BAR0 quirks List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Alex Williamson , qemu-devel@nongnu.org Cc: kvm@vger.kernel.org Hi Alex, On 07/02/18 01:26, Alex Williamson wrote: > Record data writes that come through the NVIDIA BAR0 quirk, if we get > enough in a row that we're only passing through, automatically enable > an ioeventfd for it. The primary target for this is the MSI-ACK > that NVIDIA uses to allow the MSI interrupt to re-trigger, which is a > 4-byte write, data value 0x0 to offset 0x704 into the quirk, 0x88704 > into BAR0 MMIO space. For an interrupt latency sensitive micro- > benchmark, this takes us from 83% of performance versus disabling the > quirk entirely (which GeForce cannot do), to to almost 90%. > > Signed-off-by: Alex Williamson > --- > hw/vfio/pci-quirks.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++- > hw/vfio/pci.h | 2 + > 2 files changed, 89 insertions(+), 2 deletions(-) > > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c > index e4cf4ea2dd9c..e739efe601b1 100644lg > --- a/hw/vfio/pci-quirks.c > +++ b/hw/vfio/pci-quirks.c > @@ -203,6 +203,7 @@ typedef struct VFIOConfigMirrorQuirk { > uint32_t offset; > uint8_t bar; > MemoryRegion *mem; > + uint8_t data[]; Do you foresee other usages of data besides the LastDataSet? > } VFIOConfigMirrorQuirk; > > static uint64_t vfio_generic_quirk_mirror_read(void *opaque, > @@ -297,6 +298,50 @@ static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd) > g_free(ioeventfd); > } > add a comment? user handler in case kvm ioeventfd setup failed? > +static void vfio_ioeventfd_handler(void *opaque) > +{ > + VFIOIOEventFD *ioeventfd = opaque; > + > + if (event_notifier_test_and_clear(&ioeventfd->e)) { > + vfio_region_write(ioeventfd->region, ioeventfd->region_addr, > + ioeventfd->data, ioeventfd->size); > + } > +} > + > +static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev, > + MemoryRegion *mr, hwaddr addr, > + unsigned size, uint64_t data, > + VFIORegion *region, > + hwaddr region_addr) > +{ > + VFIOIOEventFD *ioeventfd = g_malloc0(sizeof(*ioeventfd)); > + > + if (event_notifier_init(&ioeventfd->e, 0)) { > + g_free(ioeventfd); > + return NULL; > + } > + > + ioeventfd->mr = mr; > + ioeventfd->addr = addr; > + ioeventfd->size = size; > + ioeventfd->match_data = true; > + ioeventfd->data = data; > + ioeventfd->region = region; > + ioeventfd->region_addr = region_addr; I found difficult to follow the different addr semantic. I understand region_add is the offset % bar and addr is the offset % mirror region. Maybe more explicit names would help (region = bar_region and region_addr = bar_offset) > + > + qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), > + vfio_ioeventfd_handler, NULL, ioeventfd); > + memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, > + ioeventfd->size, ioeventfd->match_data, > + ioeventfd->data, &ioeventfd->e); > + > + info_report("Enabled automatic ioeventfd acceleration for %s region %d, " > + "offset 0x%"HWADDR_PRIx", data 0x%"PRIx64", size %u", > + vdev->vbasedev.name, region->nr, region_addr, data, size); > + > + return ioeventfd; > +} > + > static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) > { > VFIOQuirk *quirk; > @@ -732,6 +777,13 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) > trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name); > } > > +typedef struct LastDataSet { > + hwaddr addr; > + uint64_t data; > + unsigned size; > + int count; > +} LastDataSet; > + > /* > * Finally, BAR0 itself. We want to redirect any accesses to either > * 0x1800 or 0x88000 through the PCI config space access functions. > @@ -742,6 +794,7 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, > VFIOConfigMirrorQuirk *mirror = opaque; > VFIOPCIDevice *vdev = mirror->vdev; > PCIDevice *pdev = &vdev->pdev; > + LastDataSet *last = (LastDataSet *)&mirror->data; > > vfio_generic_quirk_mirror_write(opaque, addr, data, size); > > @@ -756,6 +809,38 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, > addr + mirror->offset, data, size); > trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name); > } > + > + /* > + * Automatically add an ioeventfd to handle any repeated write with the > + * same data and size above the standard PCI config space header. This is > + * primarily expected to accelerate the MSI-ACK behavior, such as noted > + * above. Current hardware/drivers should trigger an ioeventfd at config > + * offset 0x704 (region offset 0x88704), with data 0x0, size 4. > + */ > + if (addr > PCI_STD_HEADER_SIZEOF) { > + if (addr != last->addr || data != last->data || size != last->size) { > + last->addr = addr; > + last->data = data; > + last->size = size; > + last->count = 1; > + } else if (++last->count > 10) { So here is the naive question about the "10" choice and also the fact this needs to be consecutive accesses. I guess you observed this works but at first sight this is not obvious to me. Does anyone check potential overlaps between ioeventfd's [addr, addr + size -1]? > + VFIOIOEventFD *ioeventfd; > + > + ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size, data, > + &vdev->bars[mirror->bar].region, > + mirror->offset + addr); > + if (ioeventfd) { > + VFIOQuirk *quirk; > + > + QLIST_FOREACH(quirk, &vdev->bars[mirror->bar].quirks, next) { > + if (quirk->data == mirror) { > + QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next); > + break; > + } > + } > + } > + } > + } Thanks Eric > } > > static const MemoryRegionOps vfio_nvidia_mirror_quirk = { > @@ -776,7 +861,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) > } > > quirk = vfio_quirk_alloc(1); > - mirror = quirk->data = g_malloc0(sizeof(*mirror)); > + mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); > mirror->mem = quirk->mem; > mirror->vdev = vdev; > mirror->offset = 0x88000; > @@ -794,7 +879,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) > /* The 0x1800 offset mirror only seems to get used by legacy VGA */ > if (vdev->vga) { > quirk = vfio_quirk_alloc(1); > - mirror = quirk->data = g_malloc0(sizeof(*mirror)); > + mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); > mirror->mem = quirk->mem; > mirror->vdev = vdev; > mirror->offset = 0x1800; > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index 146065c2f715..ec53b9935725 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -32,6 +32,8 @@ typedef struct VFIOIOEventFD { > bool match_data; > uint64_t data; > EventNotifier e; > + VFIORegion *region; > + hwaddr region_addr; > } VFIOIOEventFD; > > typedef struct VFIOQuirk { >