All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] Device Assignment fixes
@ 2009-12-15 18:30 Alexander Graf
  2009-12-15 18:30 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf
                   ` (2 more replies)
  0 siblings, 3 replies; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:30 UTC (permalink / raw)
  To: kvm list; +Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright

While trying out I stumbled across several issues in the device assignment
code.

This set addresses the most major ones. Namely allowing passthrough of non page
aligned BAR region (like on lpfc) and telling users what to do when their
device is in use.

Alexander Graf (3):
  Enable non page boundary BAR device assignment
  Split off sysfs id retrieval
  Inform users about busy device assignment attempt

 hw/device-assignment.c |  238 ++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 209 insertions(+), 29 deletions(-)


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 18:30 [PATCH 0/3] Device Assignment fixes Alexander Graf
@ 2009-12-15 18:30 ` Alexander Graf
  2009-12-15 18:38   ` Michael S. Tsirkin
  2009-12-15 18:30 ` [PATCH 2/3] Split off sysfs id retrieval Alexander Graf
  2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
  2 siblings, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:30 UTC (permalink / raw)
  To: kvm list; +Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright

While trying to get device passthrough working with an emulex hba, kvm
refused to pass it through because it has a BAR of 256 bytes:

        Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
        Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
        Region 4: I/O ports at b100 [size=256]

Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
physical to virtual addresses, we can still take the old MMIO callback route.

So let's add a second code path that allows for size & 0xFFF != 0 sized regions
by looping it through userspace.

I verified that it works by passing through an e1000 with this additional patch
applied and the card acted the same way it did without this patch:

             map_func = assigned_dev_iomem_map;
-            if (cur_region->size & 0xFFF) {
+            if (i != PCI_ROM_SLOT){
                 fprintf(stderr, "PCI region %d at address 0x%llx "

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - don't use map_func function pointer
  - use the same code for mmap on fast and slow path

v2 -> v3:

  - address mst's comments
---
 hw/device-assignment.c |  117 +++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 13a86bb..000fa61 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
     return value;
 }
 
+static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
+    *out = val;
+}
+
+static CPUWriteMemoryFunc * const slow_bar_write[] = {
+    &slow_bar_writeb,
+    &slow_bar_writew,
+    &slow_bar_writel
+};
+
+static CPUReadMemoryFunc * const slow_bar_read[] = {
+    &slow_bar_readb,
+    &slow_bar_readw,
+    &slow_bar_readl
+};
+
+static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
+                                        pcibus_t e_phys, pcibus_t e_size,
+                                        int type)
+{
+    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
+    int m;
+
+    DEBUG("slow map\n");
+    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
+    cpu_register_physical_memory(e_phys, e_size, m);
+
+    /* MSI-X MMIO page */
+    if ((e_size > 0) &&
+        real_region->base_addr <= r_dev->msix_table_addr &&
+        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
+        int offset = r_dev->msix_table_addr - real_region->base_addr;
+
+        cpu_register_physical_memory(e_phys + offset,
+                TARGET_PAGE_SIZE, r_dev->mmio_index);
+    }
+}
+
 static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
                                    pcibus_t e_phys, pcibus_t e_size, int type)
 {
@@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
 
         /* handle memory io regions */
         if (cur_region->type & IORESOURCE_MEM) {
+            int slow_map = 0;
             int t = cur_region->type & IORESOURCE_PREFETCH
                 ? PCI_BASE_ADDRESS_MEM_PREFETCH
                 : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
             if (cur_region->size & 0xFFF) {
-                fprintf(stderr, "Unable to assign device: PCI region %d "
-                        "at address 0x%llx has size 0x%x, "
-                        " which is not a multiple of 4K\n",
+                fprintf(stderr, "PCI region %d at address 0x%llx "
+                        "has size 0x%x, which is not a multiple of 4K. "
+                        "You might experience some performance hit due to that.\n",
                         i, (unsigned long long)cur_region->base_addr,
                         cur_region->size);
+                slow_map = 1;
+            }
+
+            if (slow_map && (i == PCI_ROM_SLOT)) {
+                fprintf(stderr, "ROM not aligned - can't continue\n");
                 return -1;
             }
 
@@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
             } else {
                 pci_dev->v_addrs[i].u.r_virtbase =
                     mmap(NULL,
-                         (cur_region->size + 0xFFF) & 0xFFFFF000,
+                         cur_region->size,
                          PROT_WRITE | PROT_READ, MAP_SHARED,
                          cur_region->resource_fd, (off_t) 0);
             }
@@ -434,7 +540,8 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
 
             pci_register_bar((PCIDevice *) pci_dev, i,
                              cur_region->size, t,
-                             assigned_dev_iomem_map);
+                             slow_map ? assigned_dev_iomem_map_slow
+                                      : assigned_dev_iomem_map);
             continue;
         }
         /* handle port io regions */
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/3] Split off sysfs id retrieval
  2009-12-15 18:30 [PATCH 0/3] Device Assignment fixes Alexander Graf
  2009-12-15 18:30 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf
@ 2009-12-15 18:30 ` Alexander Graf
  2009-12-15 18:39   ` Michael S. Tsirkin
  2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
  2 siblings, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:30 UTC (permalink / raw)
  To: kvm list; +Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright

To retreive device and vendor ID from a PCI device, we need to read a
sysfs file. That code is currently hand written at least two times,
the later patch introducing two more calls.

So let's move that out to a function.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/device-assignment.c |   66 ++++++++++++++++++++++++++++++++----------------
 1 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 000fa61..566671c 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -562,14 +562,46 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
     return 0;
 }
 
+static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
+{
+    FILE *f;
+    char name[128];
+    long id;
+
+    snprintf(name, sizeof(name), "%s%s", devpath, idname);
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return -1;
+    }
+    if (fscanf(f, "%li\n", &id) == 1) {
+        *val = id;
+    } else {
+        return -1;
+    }
+    fclose(f);
+
+    return 0;
+}
+
+static int get_real_vendor_id(const char *devpath, uint16_t *val)
+{
+    return get_real_id(devpath, "vendor", val);
+}
+
+static int get_real_device_id(const char *devpath, uint16_t *val)
+{
+    return get_real_id(devpath, "device", val);
+}
+
 static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
                            uint8_t r_dev, uint8_t r_func)
 {
     char dir[128], name[128];
-    int fd, r = 0;
+    int fd, r = 0, v;
     FILE *f;
     unsigned long long start, end, size, flags;
-    unsigned long id;
+    uint16_t id;
     struct stat statbuf;
     PCIRegion *rp;
     PCIDevRegions *dev = &pci_dev->real_device;
@@ -635,31 +667,21 @@ again:
 
     fclose(f);
 
-    /* read and fill device ID */
-    snprintf(name, sizeof(name), "%svendor", dir);
-    f = fopen(name, "r");
-    if (f == NULL) {
-        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+    /* read and fill vendor ID */
+    v = get_real_vendor_id(dir, &id);
+    if (v) {
         return 1;
     }
-    if (fscanf(f, "%li\n", &id) == 1) {
-	pci_dev->dev.config[0] = id & 0xff;
-	pci_dev->dev.config[1] = (id & 0xff00) >> 8;
-    }
-    fclose(f);
+    pci_dev->dev.config[0] = id & 0xff;
+    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
 
-    /* read and fill vendor ID */
-    snprintf(name, sizeof(name), "%sdevice", dir);
-    f = fopen(name, "r");
-    if (f == NULL) {
-        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+    /* read and fill device ID */
+    v = get_real_device_id(dir, &id);
+    if (v) {
         return 1;
     }
-    if (fscanf(f, "%li\n", &id) == 1) {
-	pci_dev->dev.config[2] = id & 0xff;
-	pci_dev->dev.config[3] = (id & 0xff00) >> 8;
-    }
-    fclose(f);
+    pci_dev->dev.config[2] = id & 0xff;
+    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
 
     /* dealing with virtual function device */
     snprintf(name, sizeof(name), "%sphysfn/", dir);
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 3/3] Inform users about busy device assignment attempt
  2009-12-15 18:30 [PATCH 0/3] Device Assignment fixes Alexander Graf
  2009-12-15 18:30 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf
  2009-12-15 18:30 ` [PATCH 2/3] Split off sysfs id retrieval Alexander Graf
@ 2009-12-15 18:30 ` Alexander Graf
  2009-12-15 18:41   ` Michael S. Tsirkin
  2009-12-15 20:28   ` Chris Wright
  2 siblings, 2 replies; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:30 UTC (permalink / raw)
  To: kvm list
  Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright,
	Daniel P. Berrange

When using -pcidevice on a device that is already in use by a kernel driver
all the user gets is the following (very useful) information:

  Failed to assign device "04:00.0" : Device or resource busy
  Failed to deassign device "04:00.0" : Invalid argument
  Error initializing device pci-assign

Since I usually prefer to have my computer do the thinking for me, I figured
it might be a good idea to check and see if a device is actually used by a
driver. If so, tell the user.

So with this patch applied you get the following output:

  Failed to assign device "04:00.0" : Device or resource busy
  *** The driver 'igb' is occupying your device 04:00.0.
  ***
  *** You can try the following commands to free it:
  ***
  *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
  *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
  *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
  *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
  ***
  Failed to deassign device "04:00.0" : Invalid argument
  Error initializing device pci-assign

That should keep people like me from doing the most obvious misuses :-).

CC: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - add more helpful guidance thanks to Daniel Berrange

v2 -> v3:

  - clear name variable before using it, thus 0-terminating the string
  - fix region numbers
  - use correct unbind/bind names

v3 -> v4:

  - split id retrieval part
  - mst comments
---
 hw/device-assignment.c |   55 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 566671c..d7a03c9 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -756,6 +756,54 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
     return (uint32_t)bus << 8 | (uint32_t)devfn;
 }
 
+static void assign_failed_examine(AssignedDevice *dev)
+{
+    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
+    uint16_t vendor_id, device_id;
+    int r;
+
+    /* XXX implement multidomain */
+    sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
+             dev->host.bus, dev->host.dev, dev->host.func);
+
+    sprintf(name, "%sdriver", dir);
+
+    r = readlink(name, driver, sizeof(driver));
+    if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
+        goto fail;
+    }
+
+    ns++;
+
+    if (get_real_vendor_id(dir, &vendor_id) ||
+        get_real_device_id(dir, &device_id)) {
+        goto fail;
+    }
+
+    fprintf(stderr, "*** The driver '%s' is occupying your device "
+                    "%02x:%02x.%x.\n",
+            ns, dev->host.bus, dev->host.dev, dev->host.func);
+    fprintf(stderr, "***\n");
+    fprintf(stderr, "*** You can try the following commands to free it:\n");
+    fprintf(stderr, "***\n");
+    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
+                    "new_id\n", vendor_id, device_id);
+    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
+                    "%s/unbind\n",
+            dev->host.bus, dev->host.dev, dev->host.func, ns);
+    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
+                    "pci-stub/bind\n",
+            dev->host.bus, dev->host.dev, dev->host.func);
+    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
+                    "/remove_id\n", vendor_id, device_id);
+    fprintf(stderr, "***\n");
+
+    return;
+
+fail:
+    fprintf(stderr, "Couldn't find out why.\n");
+}
+
 static int assign_device(AssignedDevice *dev)
 {
     struct kvm_assigned_pci_dev assigned_dev_data;
@@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
 #endif
 
     r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
-    if (r < 0)
-	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
+    if (r < 0) {
+        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
                 dev->dev.qdev.id, strerror(-r));
+
+        assign_failed_examine(dev);
+    }
     return r;
 }
 
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 18:30 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf
@ 2009-12-15 18:38   ` Michael S. Tsirkin
  2009-12-15 18:47     ` Alexander Graf
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:38 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
> While trying to get device passthrough working with an emulex hba, kvm
> refused to pass it through because it has a BAR of 256 bytes:
> 
>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>         Region 4: I/O ports at b100 [size=256]
> 
> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> physical to virtual addresses, we can still take the old MMIO callback route.
> 
> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> by looping it through userspace.
> 
> I verified that it works by passing through an e1000 with this additional patch
> applied and the card acted the same way it did without this patch:
> 
>              map_func = assigned_dev_iomem_map;
> -            if (cur_region->size & 0xFFF) {
> +            if (i != PCI_ROM_SLOT){
>                  fprintf(stderr, "PCI region %d at address 0x%llx "
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>


Looks good.
1 question:

> ---
> 
> v1 -> v2:
> 
>   - don't use map_func function pointer
>   - use the same code for mmap on fast and slow path
> 
> v2 -> v3:
> 
>   - address mst's comments
> ---
>  hw/device-assignment.c |  117 +++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 112 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 13a86bb..000fa61 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>      return value;
>  }
>  
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *in = d->u.r_virtbase + addr;
> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *in = d->u.r_virtbase + addr;
> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *in = d->u.r_virtbase + addr;
> +    uint32_t r;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *out = d->u.r_virtbase + addr;
> +
> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> +    *out = val;
> +}
> +
> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> +    &slow_bar_writeb,
> +    &slow_bar_writew,
> +    &slow_bar_writel
> +};
> +
> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> +    &slow_bar_readb,
> +    &slow_bar_readw,
> +    &slow_bar_readl
> +};
> +
> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> +                                        pcibus_t e_phys, pcibus_t e_size,
> +                                        int type)
> +{
> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> +    int m;
> +
> +    DEBUG("slow map\n");
> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> +    cpu_register_physical_memory(e_phys, e_size, m);
> +
> +    /* MSI-X MMIO page */
> +    if ((e_size > 0) &&
> +        real_region->base_addr <= r_dev->msix_table_addr &&
> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> +        cpu_register_physical_memory(e_phys + offset,
> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
> +    }
> +}
> +
>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>  {
> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>  
>          /* handle memory io regions */
>          if (cur_region->type & IORESOURCE_MEM) {
> +            int slow_map = 0;
>              int t = cur_region->type & IORESOURCE_PREFETCH
>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
>              if (cur_region->size & 0xFFF) {
> -                fprintf(stderr, "Unable to assign device: PCI region %d "
> -                        "at address 0x%llx has size 0x%x, "
> -                        " which is not a multiple of 4K\n",
> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> +                        "has size 0x%x, which is not a multiple of 4K. "
> +                        "You might experience some performance hit due to that.\n",
>                          i, (unsigned long long)cur_region->base_addr,
>                          cur_region->size);
> +                slow_map = 1;
> +            }
> +
> +            if (slow_map && (i == PCI_ROM_SLOT)) {
> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>                  return -1;

Why is this? Can't ROM be handled in the same way?
I think it was previously ...

>              }
>  
> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>              } else {
>                  pci_dev->v_addrs[i].u.r_virtbase =
>                      mmap(NULL,
> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> +                         cur_region->size,

Hmm, one assumes this code did work at some point ...
do you know why was it done this way?

>                           PROT_WRITE | PROT_READ, MAP_SHARED,
>                           cur_region->resource_fd, (off_t) 0);
>              }
> @@ -434,7 +540,8 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>  
>              pci_register_bar((PCIDevice *) pci_dev, i,
>                               cur_region->size, t,
> -                             assigned_dev_iomem_map);
> +                             slow_map ? assigned_dev_iomem_map_slow
> +                                      : assigned_dev_iomem_map);
>              continue;
>          }
>          /* handle port io regions */
> -- 
> 1.6.0.2

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] Split off sysfs id retrieval
  2009-12-15 18:30 ` [PATCH 2/3] Split off sysfs id retrieval Alexander Graf
@ 2009-12-15 18:39   ` Michael S. Tsirkin
  2009-12-15 18:54     ` Alexander Graf
  2009-12-15 19:57     ` Chris Wright
  0 siblings, 2 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:39 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

On Tue, Dec 15, 2009 at 07:30:27PM +0100, Alexander Graf wrote:
> To retreive device and vendor ID from a PCI device, we need to read a
> sysfs file. That code is currently hand written at least two times,
> the later patch introducing two more calls.
> 
> So let's move that out to a function.
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>

Shouldn't we be using libpci, as we already do
in other places?

> ---
>  hw/device-assignment.c |   66 ++++++++++++++++++++++++++++++++----------------
>  1 files changed, 44 insertions(+), 22 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 000fa61..566671c 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -562,14 +562,46 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>      return 0;
>  }
>  
> +static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
> +{
> +    FILE *f;
> +    char name[128];
> +    long id;
> +
> +    snprintf(name, sizeof(name), "%s%s", devpath, idname);
> +    f = fopen(name, "r");
> +    if (f == NULL) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return -1;
> +    }
> +    if (fscanf(f, "%li\n", &id) == 1) {
> +        *val = id;
> +    } else {
> +        return -1;
> +    }
> +    fclose(f);
> +
> +    return 0;
> +}
> +
> +static int get_real_vendor_id(const char *devpath, uint16_t *val)
> +{
> +    return get_real_id(devpath, "vendor", val);
> +}
> +
> +static int get_real_device_id(const char *devpath, uint16_t *val)
> +{
> +    return get_real_id(devpath, "device", val);
> +}
> +
>  static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
>                             uint8_t r_dev, uint8_t r_func)
>  {
>      char dir[128], name[128];
> -    int fd, r = 0;
> +    int fd, r = 0, v;
>      FILE *f;
>      unsigned long long start, end, size, flags;
> -    unsigned long id;
> +    uint16_t id;
>      struct stat statbuf;
>      PCIRegion *rp;
>      PCIDevRegions *dev = &pci_dev->real_device;
> @@ -635,31 +667,21 @@ again:
>  
>      fclose(f);
>  
> -    /* read and fill device ID */
> -    snprintf(name, sizeof(name), "%svendor", dir);
> -    f = fopen(name, "r");
> -    if (f == NULL) {
> -        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +    /* read and fill vendor ID */
> +    v = get_real_vendor_id(dir, &id);
> +    if (v) {
>          return 1;
>      }
> -    if (fscanf(f, "%li\n", &id) == 1) {
> -	pci_dev->dev.config[0] = id & 0xff;
> -	pci_dev->dev.config[1] = (id & 0xff00) >> 8;
> -    }
> -    fclose(f);
> +    pci_dev->dev.config[0] = id & 0xff;
> +    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
>  
> -    /* read and fill vendor ID */
> -    snprintf(name, sizeof(name), "%sdevice", dir);
> -    f = fopen(name, "r");
> -    if (f == NULL) {
> -        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +    /* read and fill device ID */
> +    v = get_real_device_id(dir, &id);
> +    if (v) {
>          return 1;
>      }
> -    if (fscanf(f, "%li\n", &id) == 1) {
> -	pci_dev->dev.config[2] = id & 0xff;
> -	pci_dev->dev.config[3] = (id & 0xff00) >> 8;
> -    }
> -    fclose(f);
> +    pci_dev->dev.config[2] = id & 0xff;
> +    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
>  
>      /* dealing with virtual function device */
>      snprintf(name, sizeof(name), "%sphysfn/", dir);
> -- 
> 1.6.0.2

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/3] Inform users about busy device assignment attempt
  2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
@ 2009-12-15 18:41   ` Michael S. Tsirkin
  2009-12-15 20:28   ` Chris Wright
  1 sibling, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:41 UTC (permalink / raw)
  To: Alexander Graf
  Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright,
	Daniel P. Berrange

On Tue, Dec 15, 2009 at 07:30:28PM +0100, Alexander Graf wrote:
> When using -pcidevice on a device that is already in use by a kernel driver
> all the user gets is the following (very useful) information:
> 
>   Failed to assign device "04:00.0" : Device or resource busy
>   Failed to deassign device "04:00.0" : Invalid argument
>   Error initializing device pci-assign
> 
> Since I usually prefer to have my computer do the thinking for me, I figured
> it might be a good idea to check and see if a device is actually used by a
> driver. If so, tell the user.
> 
> So with this patch applied you get the following output:
> 
>   Failed to assign device "04:00.0" : Device or resource busy
>   *** The driver 'igb' is occupying your device 04:00.0.
>   ***
>   *** You can try the following commands to free it:
>   ***
>   *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
>   *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
>   *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
>   *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
>   ***
>   Failed to deassign device "04:00.0" : Invalid argument
>   Error initializing device pci-assign
> 
> That should keep people like me from doing the most obvious misuses :-).
> 
> CC: Daniel P. Berrange <berrange@redhat.com>
> Signed-off-by: Alexander Graf <agraf@suse.de>


Acked-by: Michael S. Tsirkin <mst@redhat.com>

> ---
> 
> v1 -> v2:
> 
>   - add more helpful guidance thanks to Daniel Berrange
> 
> v2 -> v3:
> 
>   - clear name variable before using it, thus 0-terminating the string
>   - fix region numbers
>   - use correct unbind/bind names
> 
> v3 -> v4:
> 
>   - split id retrieval part
>   - mst comments
> ---
>  hw/device-assignment.c |   55 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 53 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 566671c..d7a03c9 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -756,6 +756,54 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
>      return (uint32_t)bus << 8 | (uint32_t)devfn;
>  }
>  
> +static void assign_failed_examine(AssignedDevice *dev)
> +{
> +    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> +    uint16_t vendor_id, device_id;
> +    int r;
> +
> +    /* XXX implement multidomain */
> +    sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
> +             dev->host.bus, dev->host.dev, dev->host.func);
> +
> +    sprintf(name, "%sdriver", dir);
> +
> +    r = readlink(name, driver, sizeof(driver));
> +    if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
> +        goto fail;
> +    }
> +
> +    ns++;
> +
> +    if (get_real_vendor_id(dir, &vendor_id) ||
> +        get_real_device_id(dir, &device_id)) {
> +        goto fail;
> +    }
> +
> +    fprintf(stderr, "*** The driver '%s' is occupying your device "
> +                    "%02x:%02x.%x.\n",
> +            ns, dev->host.bus, dev->host.dev, dev->host.func);
> +    fprintf(stderr, "***\n");
> +    fprintf(stderr, "*** You can try the following commands to free it:\n");
> +    fprintf(stderr, "***\n");
> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> +                    "new_id\n", vendor_id, device_id);
> +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> +                    "%s/unbind\n",
> +            dev->host.bus, dev->host.dev, dev->host.func, ns);
> +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> +                    "pci-stub/bind\n",
> +            dev->host.bus, dev->host.dev, dev->host.func);
> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> +                    "/remove_id\n", vendor_id, device_id);
> +    fprintf(stderr, "***\n");
> +
> +    return;
> +
> +fail:
> +    fprintf(stderr, "Couldn't find out why.\n");
> +}
> +
>  static int assign_device(AssignedDevice *dev)
>  {
>      struct kvm_assigned_pci_dev assigned_dev_data;
> @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
>  #endif
>  
>      r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> -    if (r < 0)
> -	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> +    if (r < 0) {
> +        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>                  dev->dev.qdev.id, strerror(-r));
> +
> +        assign_failed_examine(dev);
> +    }
>      return r;
>  }
>  
> -- 
> 1.6.0.2

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 18:38   ` Michael S. Tsirkin
@ 2009-12-15 18:47     ` Alexander Graf
  2009-12-15 18:50       ` Michael S. Tsirkin
  0 siblings, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:47 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

Michael S. Tsirkin wrote:
> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
>   
>> While trying to get device passthrough working with an emulex hba, kvm
>> refused to pass it through because it has a BAR of 256 bytes:
>>
>>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>         Region 4: I/O ports at b100 [size=256]
>>
>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
>> physical to virtual addresses, we can still take the old MMIO callback route.
>>
>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
>> by looping it through userspace.
>>
>> I verified that it works by passing through an e1000 with this additional patch
>> applied and the card acted the same way it did without this patch:
>>
>>              map_func = assigned_dev_iomem_map;
>> -            if (cur_region->size & 0xFFF) {
>> +            if (i != PCI_ROM_SLOT){
>>                  fprintf(stderr, "PCI region %d at address 0x%llx "
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>     
>
>
> Looks good.
> 1 question:
>
>   
>> ---
>>
>> v1 -> v2:
>>
>>   - don't use map_func function pointer
>>   - use the same code for mmap on fast and slow path
>>
>> v2 -> v3:
>>
>>   - address mst's comments
>> ---
>>  hw/device-assignment.c |  117 +++++++++++++++++++++++++++++++++++++++++++++--
>>  1 files changed, 112 insertions(+), 5 deletions(-)
>>
>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>> index 13a86bb..000fa61 100644
>> --- a/hw/device-assignment.c
>> +++ b/hw/device-assignment.c
>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>>      return value;
>>  }
>>  
>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *in = d->u.r_virtbase + addr;
>> +    uint32_t r;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint16_t *in = d->u.r_virtbase + addr;
>> +    uint32_t r;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint32_t *in = d->u.r_virtbase + addr;
>> +    uint32_t r;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *out = d->u.r_virtbase + addr;
>> +
>> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint16_t *out = d->u.r_virtbase + addr;
>> +
>> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint32_t *out = d->u.r_virtbase + addr;
>> +
>> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
>> +    &slow_bar_writeb,
>> +    &slow_bar_writew,
>> +    &slow_bar_writel
>> +};
>> +
>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
>> +    &slow_bar_readb,
>> +    &slow_bar_readw,
>> +    &slow_bar_readl
>> +};
>> +
>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
>> +                                        pcibus_t e_phys, pcibus_t e_size,
>> +                                        int type)
>> +{
>> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
>> +    int m;
>> +
>> +    DEBUG("slow map\n");
>> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
>> +    cpu_register_physical_memory(e_phys, e_size, m);
>> +
>> +    /* MSI-X MMIO page */
>> +    if ((e_size > 0) &&
>> +        real_region->base_addr <= r_dev->msix_table_addr &&
>> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
>> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
>> +
>> +        cpu_register_physical_memory(e_phys + offset,
>> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
>> +    }
>> +}
>> +
>>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>>  {
>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>  
>>          /* handle memory io regions */
>>          if (cur_region->type & IORESOURCE_MEM) {
>> +            int slow_map = 0;
>>              int t = cur_region->type & IORESOURCE_PREFETCH
>>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
>> +
>>              if (cur_region->size & 0xFFF) {
>> -                fprintf(stderr, "Unable to assign device: PCI region %d "
>> -                        "at address 0x%llx has size 0x%x, "
>> -                        " which is not a multiple of 4K\n",
>> +                fprintf(stderr, "PCI region %d at address 0x%llx "
>> +                        "has size 0x%x, which is not a multiple of 4K. "
>> +                        "You might experience some performance hit due to that.\n",
>>                          i, (unsigned long long)cur_region->base_addr,
>>                          cur_region->size);
>> +                slow_map = 1;
>> +            }
>> +
>> +            if (slow_map && (i == PCI_ROM_SLOT)) {
>> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>>                  return -1;
>>     
>
> Why is this? Can't ROM be handled in the same way?
> I think it was previously ...
>   

You can't execute code from MMIO regions, so slow_map doesn't work there.

>   
>>              }
>>  
>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>              } else {
>>                  pci_dev->v_addrs[i].u.r_virtbase =
>>                      mmap(NULL,
>> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
>> +                         cur_region->size,
>>     
>
> Hmm, one assumes this code did work at some point ...
> do you know why was it done this way?
>   

Nope :-).

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 18:47     ` Alexander Graf
@ 2009-12-15 18:50       ` Michael S. Tsirkin
  2009-12-15 19:04         ` Alexander Graf
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:50 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
> > On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
> >   
> >> While trying to get device passthrough working with an emulex hba, kvm
> >> refused to pass it through because it has a BAR of 256 bytes:
> >>
> >>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> >>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> >>         Region 4: I/O ports at b100 [size=256]
> >>
> >> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> >> physical to virtual addresses, we can still take the old MMIO callback route.
> >>
> >> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> >> by looping it through userspace.
> >>
> >> I verified that it works by passing through an e1000 with this additional patch
> >> applied and the card acted the same way it did without this patch:
> >>
> >>              map_func = assigned_dev_iomem_map;
> >> -            if (cur_region->size & 0xFFF) {
> >> +            if (i != PCI_ROM_SLOT){
> >>                  fprintf(stderr, "PCI region %d at address 0x%llx "
> >>
> >> Signed-off-by: Alexander Graf <agraf@suse.de>
> >>     
> >
> >
> > Looks good.
> > 1 question:
> >
> >   
> >> ---
> >>
> >> v1 -> v2:
> >>
> >>   - don't use map_func function pointer
> >>   - use the same code for mmap on fast and slow path
> >>
> >> v2 -> v3:
> >>
> >>   - address mst's comments
> >> ---
> >>  hw/device-assignment.c |  117 +++++++++++++++++++++++++++++++++++++++++++++--
> >>  1 files changed, 112 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> >> index 13a86bb..000fa61 100644
> >> --- a/hw/device-assignment.c
> >> +++ b/hw/device-assignment.c
> >> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
> >>      return value;
> >>  }
> >>  
> >> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint8_t *in = d->u.r_virtbase + addr;
> >> +    uint32_t r;
> >> +
> >> +    r = *in;
> >> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> +    return r;
> >> +}
> >> +
> >> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint16_t *in = d->u.r_virtbase + addr;
> >> +    uint32_t r;
> >> +
> >> +    r = *in;
> >> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> +    return r;
> >> +}
> >> +
> >> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint32_t *in = d->u.r_virtbase + addr;
> >> +    uint32_t r;
> >> +
> >> +    r = *in;
> >> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> +    return r;
> >> +}
> >> +
> >> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint8_t *out = d->u.r_virtbase + addr;
> >> +
> >> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> >> +    *out = val;
> >> +}
> >> +
> >> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint16_t *out = d->u.r_virtbase + addr;
> >> +
> >> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> >> +    *out = val;
> >> +}
> >> +
> >> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint32_t *out = d->u.r_virtbase + addr;
> >> +
> >> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> >> +    *out = val;
> >> +}
> >> +
> >> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> >> +    &slow_bar_writeb,
> >> +    &slow_bar_writew,
> >> +    &slow_bar_writel
> >> +};
> >> +
> >> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> >> +    &slow_bar_readb,
> >> +    &slow_bar_readw,
> >> +    &slow_bar_readl
> >> +};
> >> +
> >> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> >> +                                        pcibus_t e_phys, pcibus_t e_size,
> >> +                                        int type)
> >> +{
> >> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> >> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> >> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> >> +    int m;
> >> +
> >> +    DEBUG("slow map\n");
> >> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> >> +    cpu_register_physical_memory(e_phys, e_size, m);
> >> +
> >> +    /* MSI-X MMIO page */
> >> +    if ((e_size > 0) &&
> >> +        real_region->base_addr <= r_dev->msix_table_addr &&
> >> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> >> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
> >> +
> >> +        cpu_register_physical_memory(e_phys + offset,
> >> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
> >> +    }
> >> +}
> >> +
> >>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> >>                                     pcibus_t e_phys, pcibus_t e_size, int type)
> >>  {
> >> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>  
> >>          /* handle memory io regions */
> >>          if (cur_region->type & IORESOURCE_MEM) {
> >> +            int slow_map = 0;
> >>              int t = cur_region->type & IORESOURCE_PREFETCH
> >>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
> >>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
> >> +
> >>              if (cur_region->size & 0xFFF) {
> >> -                fprintf(stderr, "Unable to assign device: PCI region %d "
> >> -                        "at address 0x%llx has size 0x%x, "
> >> -                        " which is not a multiple of 4K\n",
> >> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> >> +                        "has size 0x%x, which is not a multiple of 4K. "
> >> +                        "You might experience some performance hit due to that.\n",
> >>                          i, (unsigned long long)cur_region->base_addr,
> >>                          cur_region->size);
> >> +                slow_map = 1;
> >> +            }
> >> +
> >> +            if (slow_map && (i == PCI_ROM_SLOT)) {
> >> +                fprintf(stderr, "ROM not aligned - can't continue\n");
> >>                  return -1;
> >>     
> >
> > Why is this? Can't ROM be handled in the same way?
> > I think it was previously ...
> >   
> 
> You can't execute code from MMIO regions, so slow_map doesn't work there.

Hmm, how does it work with passthrough?
Anyway, I don't think a lot of users run ROM
code directly, they just shadow it and run from there.

> >   
> >>              }
> >>  
> >> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>              } else {
> >>                  pci_dev->v_addrs[i].u.r_virtbase =
> >>                      mmap(NULL,
> >> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> >> +                         cur_region->size,
> >>     
> >
> > Hmm, one assumes this code did work at some point ...
> > do you know why was it done this way?
> >   
> 
> Nope :-).

So maybe keep it that way or make the change in a separate patch.

> Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] Split off sysfs id retrieval
  2009-12-15 18:39   ` Michael S. Tsirkin
@ 2009-12-15 18:54     ` Alexander Graf
  2009-12-15 21:05       ` Michael S. Tsirkin
  2009-12-15 19:57     ` Chris Wright
  1 sibling, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:54 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

Michael S. Tsirkin wrote:
> On Tue, Dec 15, 2009 at 07:30:27PM +0100, Alexander Graf wrote:
>   
>> To retreive device and vendor ID from a PCI device, we need to read a
>> sysfs file. That code is currently hand written at least two times,
>> the later patch introducing two more calls.
>>
>> So let's move that out to a function.
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>     
>
> Shouldn't we be using libpci, as we already do
> in other places?
>   

Yes, please! But converting it to libvirt is a different beast from just
splitting up existing code :-).

Tackling the device assignment code to make it readable and mergable is
definitely something we need to address - just not now :-).

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 18:50       ` Michael S. Tsirkin
@ 2009-12-15 19:04         ` Alexander Graf
  2009-12-15 19:50           ` Chris Wright
  2009-12-15 21:04           ` Michael S. Tsirkin
  0 siblings, 2 replies; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 19:04 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

Michael S. Tsirkin wrote:
> On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
>>>   
>>>       
>>>> While trying to get device passthrough working with an emulex hba, kvm
>>>> refused to pass it through because it has a BAR of 256 bytes:
>>>>
>>>>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>>>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>>>         Region 4: I/O ports at b100 [size=256]
>>>>
>>>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
>>>> physical to virtual addresses, we can still take the old MMIO callback route.
>>>>
>>>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
>>>> by looping it through userspace.
>>>>
>>>> I verified that it works by passing through an e1000 with this additional patch
>>>> applied and the card acted the same way it did without this patch:
>>>>
>>>>              map_func = assigned_dev_iomem_map;
>>>> -            if (cur_region->size & 0xFFF) {
>>>> +            if (i != PCI_ROM_SLOT){
>>>>                  fprintf(stderr, "PCI region %d at address 0x%llx "
>>>>
>>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>>>     
>>>>         
>>> Looks good.
>>> 1 question:
>>>
>>>   
>>>       
>>>> ---
>>>>
>>>> v1 -> v2:
>>>>
>>>>   - don't use map_func function pointer
>>>>   - use the same code for mmap on fast and slow path
>>>>
>>>> v2 -> v3:
>>>>
>>>>   - address mst's comments
>>>> ---
>>>>  hw/device-assignment.c |  117 +++++++++++++++++++++++++++++++++++++++++++++--
>>>>  1 files changed, 112 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>>>> index 13a86bb..000fa61 100644
>>>> --- a/hw/device-assignment.c
>>>> +++ b/hw/device-assignment.c
>>>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>>>>      return value;
>>>>  }
>>>>  
>>>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>>> +{
>>>> +    AssignedDevRegion *d = opaque;
>>>> +    uint8_t *in = d->u.r_virtbase + addr;
>>>> +    uint32_t r;
>>>> +
>>>> +    r = *in;
>>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>> +
>>>> +    return r;
>>>> +}
>>>> +
>>>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
>>>> +{
>>>> +    AssignedDevRegion *d = opaque;
>>>> +    uint16_t *in = d->u.r_virtbase + addr;
>>>> +    uint32_t r;
>>>> +
>>>> +    r = *in;
>>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>> +
>>>> +    return r;
>>>> +}
>>>> +
>>>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
>>>> +{
>>>> +    AssignedDevRegion *d = opaque;
>>>> +    uint32_t *in = d->u.r_virtbase + addr;
>>>> +    uint32_t r;
>>>> +
>>>> +    r = *in;
>>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>> +
>>>> +    return r;
>>>> +}
>>>> +
>>>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>> +{
>>>> +    AssignedDevRegion *d = opaque;
>>>> +    uint8_t *out = d->u.r_virtbase + addr;
>>>> +
>>>> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
>>>> +    *out = val;
>>>> +}
>>>> +
>>>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>> +{
>>>> +    AssignedDevRegion *d = opaque;
>>>> +    uint16_t *out = d->u.r_virtbase + addr;
>>>> +
>>>> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
>>>> +    *out = val;
>>>> +}
>>>> +
>>>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>> +{
>>>> +    AssignedDevRegion *d = opaque;
>>>> +    uint32_t *out = d->u.r_virtbase + addr;
>>>> +
>>>> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
>>>> +    *out = val;
>>>> +}
>>>> +
>>>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
>>>> +    &slow_bar_writeb,
>>>> +    &slow_bar_writew,
>>>> +    &slow_bar_writel
>>>> +};
>>>> +
>>>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
>>>> +    &slow_bar_readb,
>>>> +    &slow_bar_readw,
>>>> +    &slow_bar_readl
>>>> +};
>>>> +
>>>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
>>>> +                                        pcibus_t e_phys, pcibus_t e_size,
>>>> +                                        int type)
>>>> +{
>>>> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
>>>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>>>> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
>>>> +    int m;
>>>> +
>>>> +    DEBUG("slow map\n");
>>>> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
>>>> +    cpu_register_physical_memory(e_phys, e_size, m);
>>>> +
>>>> +    /* MSI-X MMIO page */
>>>> +    if ((e_size > 0) &&
>>>> +        real_region->base_addr <= r_dev->msix_table_addr &&
>>>> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
>>>> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
>>>> +
>>>> +        cpu_register_physical_memory(e_phys + offset,
>>>> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
>>>> +    }
>>>> +}
>>>> +
>>>>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>>>>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>>>>  {
>>>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>>>  
>>>>          /* handle memory io regions */
>>>>          if (cur_region->type & IORESOURCE_MEM) {
>>>> +            int slow_map = 0;
>>>>              int t = cur_region->type & IORESOURCE_PREFETCH
>>>>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>>>>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
>>>> +
>>>>              if (cur_region->size & 0xFFF) {
>>>> -                fprintf(stderr, "Unable to assign device: PCI region %d "
>>>> -                        "at address 0x%llx has size 0x%x, "
>>>> -                        " which is not a multiple of 4K\n",
>>>> +                fprintf(stderr, "PCI region %d at address 0x%llx "
>>>> +                        "has size 0x%x, which is not a multiple of 4K. "
>>>> +                        "You might experience some performance hit due to that.\n",
>>>>                          i, (unsigned long long)cur_region->base_addr,
>>>>                          cur_region->size);
>>>> +                slow_map = 1;
>>>> +            }
>>>> +
>>>> +            if (slow_map && (i == PCI_ROM_SLOT)) {
>>>> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>>>>                  return -1;
>>>>     
>>>>         
>>> Why is this? Can't ROM be handled in the same way?
>>> I think it was previously ...
>>>   
>>>       
>> You can't execute code from MMIO regions, so slow_map doesn't work there.
>>     
>
> Hmm, how does it work with passthrough?
> Anyway, I don't think a lot of users run ROM
> code directly, they just shadow it and run from there.
>
>   
>>>   
>>>       
>>>>              }
>>>>  
>>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>>>              } else {
>>>>                  pci_dev->v_addrs[i].u.r_virtbase =
>>>>                      mmap(NULL,
>>>> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
>>>> +                         cur_region->size,
>>>>     
>>>>         
>>> Hmm, one assumes this code did work at some point ...
>>> do you know why was it done this way?
>>>   
>>>       
>> Nope :-).
>>     
>
> So maybe keep it that way or make the change in a separate patch.
>   

Well we're sure the normal path is size aligned, no? In fact, you
recommended I should change the code to what this looks like ;-).

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 19:04         ` Alexander Graf
@ 2009-12-15 19:50           ` Chris Wright
  2009-12-15 21:12             ` Michael S. Tsirkin
  2009-12-15 21:04           ` Michael S. Tsirkin
  1 sibling, 1 reply; 25+ messages in thread
From: Chris Wright @ 2009-12-15 19:50 UTC (permalink / raw)
  To: Alexander Graf
  Cc: Michael S. Tsirkin, kvm list, Gleb Natapov, Muli Ben-Yehuda,
	Chris Wright

* Alexander Graf (agraf@suse.de) wrote:
> >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>>>              } else {
> >>>>                  pci_dev->v_addrs[i].u.r_virtbase =
> >>>>                      mmap(NULL,
> >>>> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> >>>> +                         cur_region->size,
> >>>>     
> >>>>         
> >>> Hmm, one assumes this code did work at some point ...
> >>> do you know why was it done this way?
> >>>       
> >> Nope :-).
> >
> > So maybe keep it that way or make the change in a separate patch.
> 
> Well we're sure the normal path is size aligned, no? In fact, you
> recommended I should change the code to what this looks like ;-).

Sort of moot.  The typical case is page size aligned, and the kernel is
going to round up anyway.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] Split off sysfs id retrieval
  2009-12-15 18:39   ` Michael S. Tsirkin
  2009-12-15 18:54     ` Alexander Graf
@ 2009-12-15 19:57     ` Chris Wright
  1 sibling, 0 replies; 25+ messages in thread
From: Chris Wright @ 2009-12-15 19:57 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Tue, Dec 15, 2009 at 07:30:27PM +0100, Alexander Graf wrote:
> > To retreive device and vendor ID from a PCI device, we need to read a
> > sysfs file. That code is currently hand written at least two times,
> > the later patch introducing two more calls.
> > 
> > So let's move that out to a function.
> > 
> > Signed-off-by: Alexander Graf <agraf@suse.de>
> 
> Shouldn't we be using libpci, as we already do
> in other places?

The main issue is making sure we get the cooked value of vendor and
device id (they are invalid in config space for VFs).  I looked at
this before (w/ the same intention) and concluded it wasn't a benefit
(of course, I can't recall why now...re-reading libpci, as long as we
force PCI_ACCESS_SYS_BUS_PCI it does the right read).

Anyway, this is a simple, nice cleanup.

Acked-by: Chris Wright <chrisw@redhat.com>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/3] Inform users about busy device assignment attempt
  2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
  2009-12-15 18:41   ` Michael S. Tsirkin
@ 2009-12-15 20:28   ` Chris Wright
  2009-12-15 21:09     ` Michael S. Tsirkin
  2009-12-15 21:10     ` Alexander Graf
  1 sibling, 2 replies; 25+ messages in thread
From: Chris Wright @ 2009-12-15 20:28 UTC (permalink / raw)
  To: Alexander Graf
  Cc: kvm list, Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda,
	Chris Wright, Daniel P. Berrange

* Alexander Graf (agraf@suse.de) wrote:
> +static void assign_failed_examine(AssignedDevice *dev)
> +{
> +    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> +    uint16_t vendor_id, device_id;
> +    int r;
> +
> +    /* XXX implement multidomain */
> +    sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
> +             dev->host.bus, dev->host.dev, dev->host.func);
> +
> +    sprintf(name, "%sdriver", dir);
> +
> +    r = readlink(name, driver, sizeof(driver));
> +    if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {

While the symlink should never be that long, I think you want to
check bytes stored in driver before strrchr, else you may walk off a
non-NULL-terminated buffer.

> +        goto fail;
> +    }
> +
> +    ns++;
> +
> +    if (get_real_vendor_id(dir, &vendor_id) ||
> +        get_real_device_id(dir, &device_id)) {
> +        goto fail;
> +    }
> +
> +    fprintf(stderr, "*** The driver '%s' is occupying your device "
> +                    "%02x:%02x.%x.\n",
> +            ns, dev->host.bus, dev->host.dev, dev->host.func);
> +    fprintf(stderr, "***\n");
> +    fprintf(stderr, "*** You can try the following commands to free it:\n");
> +    fprintf(stderr, "***\n");
> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> +                    "new_id\n", vendor_id, device_id);
> +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> +                    "%s/unbind\n",
> +            dev->host.bus, dev->host.dev, dev->host.func, ns);
> +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> +                    "pci-stub/bind\n",
> +            dev->host.bus, dev->host.dev, dev->host.func);
> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> +                    "/remove_id\n", vendor_id, device_id);
> +    fprintf(stderr, "***\n");
> +
> +    return;
> +
> +fail:
> +    fprintf(stderr, "Couldn't find out why.\n");
> +}
> +
>  static int assign_device(AssignedDevice *dev)
>  {
>      struct kvm_assigned_pci_dev assigned_dev_data;
> @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
>  #endif
>  
>      r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> -    if (r < 0)
> -	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> +    if (r < 0) {
> +        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>                  dev->dev.qdev.id, strerror(-r));
> +
> +        assign_failed_examine(dev);

Only really catching the EBUSY case, maybe test for explicitly?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 19:04         ` Alexander Graf
  2009-12-15 19:50           ` Chris Wright
@ 2009-12-15 21:04           ` Michael S. Tsirkin
  2009-12-15 21:11             ` Alexander Graf
  1 sibling, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 21:04 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

On Tue, Dec 15, 2009 at 08:04:10PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
> > On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote:
> >   
> >> Michael S. Tsirkin wrote:
> >>     
> >>> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
> >>>   
> >>>       
> >>>> While trying to get device passthrough working with an emulex hba, kvm
> >>>> refused to pass it through because it has a BAR of 256 bytes:
> >>>>
> >>>>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> >>>>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> >>>>         Region 4: I/O ports at b100 [size=256]
> >>>>
> >>>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> >>>> physical to virtual addresses, we can still take the old MMIO callback route.
> >>>>
> >>>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> >>>> by looping it through userspace.
> >>>>
> >>>> I verified that it works by passing through an e1000 with this additional patch
> >>>> applied and the card acted the same way it did without this patch:
> >>>>
> >>>>              map_func = assigned_dev_iomem_map;
> >>>> -            if (cur_region->size & 0xFFF) {
> >>>> +            if (i != PCI_ROM_SLOT){
> >>>>                  fprintf(stderr, "PCI region %d at address 0x%llx "
> >>>>
> >>>> Signed-off-by: Alexander Graf <agraf@suse.de>
> >>>>     
> >>>>         
> >>> Looks good.
> >>> 1 question:
> >>>
> >>>   
> >>>       
> >>>> ---
> >>>>
> >>>> v1 -> v2:
> >>>>
> >>>>   - don't use map_func function pointer
> >>>>   - use the same code for mmap on fast and slow path
> >>>>
> >>>> v2 -> v3:
> >>>>
> >>>>   - address mst's comments
> >>>> ---
> >>>>  hw/device-assignment.c |  117 +++++++++++++++++++++++++++++++++++++++++++++--
> >>>>  1 files changed, 112 insertions(+), 5 deletions(-)
> >>>>
> >>>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> >>>> index 13a86bb..000fa61 100644
> >>>> --- a/hw/device-assignment.c
> >>>> +++ b/hw/device-assignment.c
> >>>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
> >>>>      return value;
> >>>>  }
> >>>>  
> >>>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> >>>> +{
> >>>> +    AssignedDevRegion *d = opaque;
> >>>> +    uint8_t *in = d->u.r_virtbase + addr;
> >>>> +    uint32_t r;
> >>>> +
> >>>> +    r = *in;
> >>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >>>> +
> >>>> +    return r;
> >>>> +}
> >>>> +
> >>>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> >>>> +{
> >>>> +    AssignedDevRegion *d = opaque;
> >>>> +    uint16_t *in = d->u.r_virtbase + addr;
> >>>> +    uint32_t r;
> >>>> +
> >>>> +    r = *in;
> >>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >>>> +
> >>>> +    return r;
> >>>> +}
> >>>> +
> >>>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> >>>> +{
> >>>> +    AssignedDevRegion *d = opaque;
> >>>> +    uint32_t *in = d->u.r_virtbase + addr;
> >>>> +    uint32_t r;
> >>>> +
> >>>> +    r = *in;
> >>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >>>> +
> >>>> +    return r;
> >>>> +}
> >>>> +
> >>>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> >>>> +{
> >>>> +    AssignedDevRegion *d = opaque;
> >>>> +    uint8_t *out = d->u.r_virtbase + addr;
> >>>> +
> >>>> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> >>>> +    *out = val;
> >>>> +}
> >>>> +
> >>>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> >>>> +{
> >>>> +    AssignedDevRegion *d = opaque;
> >>>> +    uint16_t *out = d->u.r_virtbase + addr;
> >>>> +
> >>>> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> >>>> +    *out = val;
> >>>> +}
> >>>> +
> >>>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> >>>> +{
> >>>> +    AssignedDevRegion *d = opaque;
> >>>> +    uint32_t *out = d->u.r_virtbase + addr;
> >>>> +
> >>>> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> >>>> +    *out = val;
> >>>> +}
> >>>> +
> >>>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> >>>> +    &slow_bar_writeb,
> >>>> +    &slow_bar_writew,
> >>>> +    &slow_bar_writel
> >>>> +};
> >>>> +
> >>>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> >>>> +    &slow_bar_readb,
> >>>> +    &slow_bar_readw,
> >>>> +    &slow_bar_readl
> >>>> +};
> >>>> +
> >>>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> >>>> +                                        pcibus_t e_phys, pcibus_t e_size,
> >>>> +                                        int type)
> >>>> +{
> >>>> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> >>>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> >>>> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> >>>> +    int m;
> >>>> +
> >>>> +    DEBUG("slow map\n");
> >>>> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> >>>> +    cpu_register_physical_memory(e_phys, e_size, m);
> >>>> +
> >>>> +    /* MSI-X MMIO page */
> >>>> +    if ((e_size > 0) &&
> >>>> +        real_region->base_addr <= r_dev->msix_table_addr &&
> >>>> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> >>>> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
> >>>> +
> >>>> +        cpu_register_physical_memory(e_phys + offset,
> >>>> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
> >>>> +    }
> >>>> +}
> >>>> +
> >>>>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> >>>>                                     pcibus_t e_phys, pcibus_t e_size, int type)
> >>>>  {
> >>>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>>>  
> >>>>          /* handle memory io regions */
> >>>>          if (cur_region->type & IORESOURCE_MEM) {
> >>>> +            int slow_map = 0;
> >>>>              int t = cur_region->type & IORESOURCE_PREFETCH
> >>>>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
> >>>>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
> >>>> +
> >>>>              if (cur_region->size & 0xFFF) {
> >>>> -                fprintf(stderr, "Unable to assign device: PCI region %d "
> >>>> -                        "at address 0x%llx has size 0x%x, "
> >>>> -                        " which is not a multiple of 4K\n",
> >>>> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> >>>> +                        "has size 0x%x, which is not a multiple of 4K. "
> >>>> +                        "You might experience some performance hit due to that.\n",
> >>>>                          i, (unsigned long long)cur_region->base_addr,
> >>>>                          cur_region->size);
> >>>> +                slow_map = 1;
> >>>> +            }
> >>>> +
> >>>> +            if (slow_map && (i == PCI_ROM_SLOT)) {
> >>>> +                fprintf(stderr, "ROM not aligned - can't continue\n");
> >>>>                  return -1;
> >>>>     
> >>>>         
> >>> Why is this? Can't ROM be handled in the same way?
> >>> I think it was previously ...
> >>>   
> >>>       
> >> You can't execute code from MMIO regions, so slow_map doesn't work there.
> >>     
> >
> > Hmm, how does it work with passthrough?
> > Anyway, I don't think a lot of users run ROM
> > code directly, they just shadow it and run from there.
> >
> >   
> >>>   
> >>>       
> >>>>              }
> >>>>  
> >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>>>              } else {
> >>>>                  pci_dev->v_addrs[i].u.r_virtbase =
> >>>>                      mmap(NULL,
> >>>> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> >>>> +                         cur_region->size,
> >>>>     
> >>>>         
> >>> Hmm, one assumes this code did work at some point ...
> >>> do you know why was it done this way?
> >>>   
> >>>       
> >> Nope :-).
> >>     
> >
> > So maybe keep it that way or make the change in a separate patch.
> >   
> 
> Well we're sure the normal path is size aligned, no? In fact, you
> recommended I should change the code to what this looks like ;-).
> 
> Alex


Existing code first page aligned address, passes this to mmap,
then adds page offset. I do not know why. Could be e.g. to handle
different libc/kernel versions. If we are not sure it is
not needed, let's keep it this way. If we are sure, let's
cleanup in a separate patch. Makes sense?

-- 
MST

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/3] Split off sysfs id retrieval
  2009-12-15 18:54     ` Alexander Graf
@ 2009-12-15 21:05       ` Michael S. Tsirkin
  0 siblings, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 21:05 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

On Tue, Dec 15, 2009 at 07:54:41PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
> > On Tue, Dec 15, 2009 at 07:30:27PM +0100, Alexander Graf wrote:
> >   
> >> To retreive device and vendor ID from a PCI device, we need to read a
> >> sysfs file. That code is currently hand written at least two times,
> >> the later patch introducing two more calls.
> >>
> >> So let's move that out to a function.
> >>
> >> Signed-off-by: Alexander Graf <agraf@suse.de>
> >>     
> >
> > Shouldn't we be using libpci, as we already do
> > in other places?
> >   
> 
> Yes, please! But converting it to libvirt is a different beast from just
> splitting up existing code :-).
> 
> Tackling the device assignment code to make it readable and mergable is
> definitely something we need to address - just not now :-).
> 
> Alex


Hmm, you are right.

Acked-by: Michael S. Tsirkin <mst@redhat.com>



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/3] Inform users about busy device assignment attempt
  2009-12-15 20:28   ` Chris Wright
@ 2009-12-15 21:09     ` Michael S. Tsirkin
  2009-12-15 21:10     ` Alexander Graf
  1 sibling, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 21:09 UTC (permalink / raw)
  To: Chris Wright
  Cc: Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda,
	Daniel P. Berrange

On Tue, Dec 15, 2009 at 12:28:52PM -0800, Chris Wright wrote:
> * Alexander Graf (agraf@suse.de) wrote:
> > +static void assign_failed_examine(AssignedDevice *dev)
> > +{
> > +    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> > +    uint16_t vendor_id, device_id;
> > +    int r;
> > +
> > +    /* XXX implement multidomain */
> > +    sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
> > +             dev->host.bus, dev->host.dev, dev->host.func);
> > +
> > +    sprintf(name, "%sdriver", dir);
> > +
> > +    r = readlink(name, driver, sizeof(driver));
> > +    if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
> 
> While the symlink should never be that long, I think you want to
> check bytes stored in driver before strrchr, else you may walk off a
> non-NULL-terminated buffer.

On I missed this.  Yes, sizeof check must be done before strrchr.
Good catch!

> > +        goto fail;
> > +    }
> > +
> > +    ns++;
> > +
> > +    if (get_real_vendor_id(dir, &vendor_id) ||
> > +        get_real_device_id(dir, &device_id)) {
> > +        goto fail;
> > +    }
> > +
> > +    fprintf(stderr, "*** The driver '%s' is occupying your device "
> > +                    "%02x:%02x.%x.\n",
> > +            ns, dev->host.bus, dev->host.dev, dev->host.func);
> > +    fprintf(stderr, "***\n");
> > +    fprintf(stderr, "*** You can try the following commands to free it:\n");
> > +    fprintf(stderr, "***\n");
> > +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> > +                    "new_id\n", vendor_id, device_id);
> > +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> > +                    "%s/unbind\n",
> > +            dev->host.bus, dev->host.dev, dev->host.func, ns);
> > +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> > +                    "pci-stub/bind\n",
> > +            dev->host.bus, dev->host.dev, dev->host.func);
> > +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> > +                    "/remove_id\n", vendor_id, device_id);
> > +    fprintf(stderr, "***\n");
> > +
> > +    return;
> > +
> > +fail:
> > +    fprintf(stderr, "Couldn't find out why.\n");
> > +}
> > +
> >  static int assign_device(AssignedDevice *dev)
> >  {
> >      struct kvm_assigned_pci_dev assigned_dev_data;
> > @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
> >  #endif
> >  
> >      r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> > -    if (r < 0)
> > -	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> > +    if (r < 0) {
> > +        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> >                  dev->dev.qdev.id, strerror(-r));
> > +
> > +        assign_failed_examine(dev);
> 
> Only really catching the EBUSY case, maybe test for explicitly?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/3] Inform users about busy device assignment attempt
  2009-12-15 20:28   ` Chris Wright
  2009-12-15 21:09     ` Michael S. Tsirkin
@ 2009-12-15 21:10     ` Alexander Graf
  2009-12-15 23:13       ` Chris Wright
  1 sibling, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 21:10 UTC (permalink / raw)
  To: Chris Wright
  Cc: kvm list, Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda,
	Daniel P. Berrange

Chris Wright wrote:
> * Alexander Graf (agraf@suse.de) wrote:
>   
>> +static void assign_failed_examine(AssignedDevice *dev)
>> +{
>> +    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
>> +    uint16_t vendor_id, device_id;
>> +    int r;
>> +
>> +    /* XXX implement multidomain */
>> +    sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
>> +             dev->host.bus, dev->host.dev, dev->host.func);
>> +
>> +    sprintf(name, "%sdriver", dir);
>> +
>> +    r = readlink(name, driver, sizeof(driver));
>> +    if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
>>     
>
> While the symlink should never be that long, I think you want to
> check bytes stored in driver before strrchr, else you may walk off a
> non-NULL-terminated buffer.
>   

Hum, yeah. I'm starting to understand why the shell was invented.

>   
>> +        goto fail;
>> +    }
>> +
>> +    ns++;
>> +
>> +    if (get_real_vendor_id(dir, &vendor_id) ||
>> +        get_real_device_id(dir, &device_id)) {
>> +        goto fail;
>> +    }
>> +
>> +    fprintf(stderr, "*** The driver '%s' is occupying your device "
>> +                    "%02x:%02x.%x.\n",
>> +            ns, dev->host.bus, dev->host.dev, dev->host.func);
>> +    fprintf(stderr, "***\n");
>> +    fprintf(stderr, "*** You can try the following commands to free it:\n");
>> +    fprintf(stderr, "***\n");
>> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
>> +                    "new_id\n", vendor_id, device_id);
>> +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
>> +                    "%s/unbind\n",
>> +            dev->host.bus, dev->host.dev, dev->host.func, ns);
>> +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
>> +                    "pci-stub/bind\n",
>> +            dev->host.bus, dev->host.dev, dev->host.func);
>> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
>> +                    "/remove_id\n", vendor_id, device_id);
>> +    fprintf(stderr, "***\n");
>> +
>> +    return;
>> +
>> +fail:
>> +    fprintf(stderr, "Couldn't find out why.\n");
>> +}
>> +
>>  static int assign_device(AssignedDevice *dev)
>>  {
>>      struct kvm_assigned_pci_dev assigned_dev_data;
>> @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
>>  #endif
>>  
>>      r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
>> -    if (r < 0)
>> -	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>> +    if (r < 0) {
>> +        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>>                  dev->dev.qdev.id, strerror(-r));
>> +
>> +        assign_failed_examine(dev);
>>     
>
> Only really catching the EBUSY case, maybe test for explicitly?
>   

What others do we have?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 21:04           ` Michael S. Tsirkin
@ 2009-12-15 21:11             ` Alexander Graf
  2009-12-16  5:47               ` Chris Wright
  0 siblings, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 21:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright

Michael S. Tsirkin wrote:
> On Tue, Dec 15, 2009 at 08:04:10PM +0100, Alexander Graf wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote:
>>>   
>>>       
>>>> Michael S. Tsirkin wrote:
>>>>     
>>>>         
>>>>> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
>>>>>   
>>>>>       
>>>>>           
>>>>>> While trying to get device passthrough working with an emulex hba, kvm
>>>>>> refused to pass it through because it has a BAR of 256 bytes:
>>>>>>
>>>>>>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>>>>>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>>>>>         Region 4: I/O ports at b100 [size=256]
>>>>>>
>>>>>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
>>>>>> physical to virtual addresses, we can still take the old MMIO callback route.
>>>>>>
>>>>>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
>>>>>> by looping it through userspace.
>>>>>>
>>>>>> I verified that it works by passing through an e1000 with this additional patch
>>>>>> applied and the card acted the same way it did without this patch:
>>>>>>
>>>>>>              map_func = assigned_dev_iomem_map;
>>>>>> -            if (cur_region->size & 0xFFF) {
>>>>>> +            if (i != PCI_ROM_SLOT){
>>>>>>                  fprintf(stderr, "PCI region %d at address 0x%llx "
>>>>>>
>>>>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>>>>>     
>>>>>>         
>>>>>>             
>>>>> Looks good.
>>>>> 1 question:
>>>>>
>>>>>   
>>>>>       
>>>>>           
>>>>>> ---
>>>>>>
>>>>>> v1 -> v2:
>>>>>>
>>>>>>   - don't use map_func function pointer
>>>>>>   - use the same code for mmap on fast and slow path
>>>>>>
>>>>>> v2 -> v3:
>>>>>>
>>>>>>   - address mst's comments
>>>>>> ---
>>>>>>  hw/device-assignment.c |  117 +++++++++++++++++++++++++++++++++++++++++++++--
>>>>>>  1 files changed, 112 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>>>>>> index 13a86bb..000fa61 100644
>>>>>> --- a/hw/device-assignment.c
>>>>>> +++ b/hw/device-assignment.c
>>>>>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>>>>>>      return value;
>>>>>>  }
>>>>>>  
>>>>>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>>>>> +{
>>>>>> +    AssignedDevRegion *d = opaque;
>>>>>> +    uint8_t *in = d->u.r_virtbase + addr;
>>>>>> +    uint32_t r;
>>>>>> +
>>>>>> +    r = *in;
>>>>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>>>> +
>>>>>> +    return r;
>>>>>> +}
>>>>>> +
>>>>>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
>>>>>> +{
>>>>>> +    AssignedDevRegion *d = opaque;
>>>>>> +    uint16_t *in = d->u.r_virtbase + addr;
>>>>>> +    uint32_t r;
>>>>>> +
>>>>>> +    r = *in;
>>>>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>>>> +
>>>>>> +    return r;
>>>>>> +}
>>>>>> +
>>>>>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
>>>>>> +{
>>>>>> +    AssignedDevRegion *d = opaque;
>>>>>> +    uint32_t *in = d->u.r_virtbase + addr;
>>>>>> +    uint32_t r;
>>>>>> +
>>>>>> +    r = *in;
>>>>>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>>>> +
>>>>>> +    return r;
>>>>>> +}
>>>>>> +
>>>>>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>>>> +{
>>>>>> +    AssignedDevRegion *d = opaque;
>>>>>> +    uint8_t *out = d->u.r_virtbase + addr;
>>>>>> +
>>>>>> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
>>>>>> +    *out = val;
>>>>>> +}
>>>>>> +
>>>>>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>>>> +{
>>>>>> +    AssignedDevRegion *d = opaque;
>>>>>> +    uint16_t *out = d->u.r_virtbase + addr;
>>>>>> +
>>>>>> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
>>>>>> +    *out = val;
>>>>>> +}
>>>>>> +
>>>>>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>>>> +{
>>>>>> +    AssignedDevRegion *d = opaque;
>>>>>> +    uint32_t *out = d->u.r_virtbase + addr;
>>>>>> +
>>>>>> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
>>>>>> +    *out = val;
>>>>>> +}
>>>>>> +
>>>>>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
>>>>>> +    &slow_bar_writeb,
>>>>>> +    &slow_bar_writew,
>>>>>> +    &slow_bar_writel
>>>>>> +};
>>>>>> +
>>>>>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
>>>>>> +    &slow_bar_readb,
>>>>>> +    &slow_bar_readw,
>>>>>> +    &slow_bar_readl
>>>>>> +};
>>>>>> +
>>>>>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
>>>>>> +                                        pcibus_t e_phys, pcibus_t e_size,
>>>>>> +                                        int type)
>>>>>> +{
>>>>>> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
>>>>>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>>>>>> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
>>>>>> +    int m;
>>>>>> +
>>>>>> +    DEBUG("slow map\n");
>>>>>> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
>>>>>> +    cpu_register_physical_memory(e_phys, e_size, m);
>>>>>> +
>>>>>> +    /* MSI-X MMIO page */
>>>>>> +    if ((e_size > 0) &&
>>>>>> +        real_region->base_addr <= r_dev->msix_table_addr &&
>>>>>> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
>>>>>> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
>>>>>> +
>>>>>> +        cpu_register_physical_memory(e_phys + offset,
>>>>>> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>>>>>>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>>>>>>  {
>>>>>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>>>>>  
>>>>>>          /* handle memory io regions */
>>>>>>          if (cur_region->type & IORESOURCE_MEM) {
>>>>>> +            int slow_map = 0;
>>>>>>              int t = cur_region->type & IORESOURCE_PREFETCH
>>>>>>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>>>>>>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
>>>>>> +
>>>>>>              if (cur_region->size & 0xFFF) {
>>>>>> -                fprintf(stderr, "Unable to assign device: PCI region %d "
>>>>>> -                        "at address 0x%llx has size 0x%x, "
>>>>>> -                        " which is not a multiple of 4K\n",
>>>>>> +                fprintf(stderr, "PCI region %d at address 0x%llx "
>>>>>> +                        "has size 0x%x, which is not a multiple of 4K. "
>>>>>> +                        "You might experience some performance hit due to that.\n",
>>>>>>                          i, (unsigned long long)cur_region->base_addr,
>>>>>>                          cur_region->size);
>>>>>> +                slow_map = 1;
>>>>>> +            }
>>>>>> +
>>>>>> +            if (slow_map && (i == PCI_ROM_SLOT)) {
>>>>>> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>>>>>>                  return -1;
>>>>>>     
>>>>>>         
>>>>>>             
>>>>> Why is this? Can't ROM be handled in the same way?
>>>>> I think it was previously ...
>>>>>   
>>>>>       
>>>>>           
>>>> You can't execute code from MMIO regions, so slow_map doesn't work there.
>>>>     
>>>>         
>>> Hmm, how does it work with passthrough?
>>> Anyway, I don't think a lot of users run ROM
>>> code directly, they just shadow it and run from there.
>>>
>>>   
>>>       
>>>>>   
>>>>>       
>>>>>           
>>>>>>              }
>>>>>>  
>>>>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>>>>>              } else {
>>>>>>                  pci_dev->v_addrs[i].u.r_virtbase =
>>>>>>                      mmap(NULL,
>>>>>> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
>>>>>> +                         cur_region->size,
>>>>>>     
>>>>>>         
>>>>>>             
>>>>> Hmm, one assumes this code did work at some point ...
>>>>> do you know why was it done this way?
>>>>>   
>>>>>       
>>>>>           
>>>> Nope :-).
>>>>     
>>>>         
>>> So maybe keep it that way or make the change in a separate patch.
>>>   
>>>       
>> Well we're sure the normal path is size aligned, no? In fact, you
>> recommended I should change the code to what this looks like ;-).
>>
>> Alex
>>     
>
>
> Existing code first page aligned address, passes this to mmap,
> then adds page offset. I do not know why. Could be e.g. to handle
> different libc/kernel versions. If we are not sure it is
> not needed, let's keep it this way. If we are sure, let's
> cleanup in a separate patch. Makes sense?
>   


Chris, want to make a statement here? :-)

Alex

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 19:50           ` Chris Wright
@ 2009-12-15 21:12             ` Michael S. Tsirkin
  2009-12-16  5:44               ` Chris Wright
  0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 21:12 UTC (permalink / raw)
  To: Chris Wright; +Cc: Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda

On Tue, Dec 15, 2009 at 11:50:23AM -0800, Chris Wright wrote:
> * Alexander Graf (agraf@suse.de) wrote:
> > >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> > >>>>              } else {
> > >>>>                  pci_dev->v_addrs[i].u.r_virtbase =
> > >>>>                      mmap(NULL,
> > >>>> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> > >>>> +                         cur_region->size,
> > >>>>     
> > >>>>         
> > >>> Hmm, one assumes this code did work at some point ...
> > >>> do you know why was it done this way?
> > >>>       
> > >> Nope :-).
> > >
> > > So maybe keep it that way or make the change in a separate patch.
> > 
> > Well we're sure the normal path is size aligned, no? In fact, you
> > recommended I should change the code to what this looks like ;-).
> 
> Sort of moot.  The typical case is page size aligned, and the kernel is
> going to round up anyway.

Why was this code here originally then? Any idea?

-- 
MST

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/3] Inform users about busy device assignment attempt
  2009-12-15 21:10     ` Alexander Graf
@ 2009-12-15 23:13       ` Chris Wright
  0 siblings, 0 replies; 25+ messages in thread
From: Chris Wright @ 2009-12-15 23:13 UTC (permalink / raw)
  To: Alexander Graf
  Cc: Chris Wright, kvm list, Gleb Natapov, Michael S. Tsirkin,
	Muli Ben-Yehuda, Daniel P. Berrange

* Alexander Graf (agraf@suse.de) wrote:
> Chris Wright wrote:
> > * Alexander Graf (agraf@suse.de) wrote:
> >   
> >> +static void assign_failed_examine(AssignedDevice *dev)
> >> +{
> >> +    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> >> +    uint16_t vendor_id, device_id;
> >> +    int r;
> >> +
> >> +    /* XXX implement multidomain */
> >> +    sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
> >> +             dev->host.bus, dev->host.dev, dev->host.func);
> >> +
> >> +    sprintf(name, "%sdriver", dir);
> >> +
> >> +    r = readlink(name, driver, sizeof(driver));
> >> +    if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
> >>     
> >
> > While the symlink should never be that long, I think you want to
> > check bytes stored in driver before strrchr, else you may walk off a
> > non-NULL-terminated buffer.
> >   
> 
> Hum, yeah. I'm starting to understand why the shell was invented.

Heh

> >> +        goto fail;
> >> +    }
> >> +
> >> +    ns++;
> >> +
> >> +    if (get_real_vendor_id(dir, &vendor_id) ||
> >> +        get_real_device_id(dir, &device_id)) {
> >> +        goto fail;
> >> +    }
> >> +
> >> +    fprintf(stderr, "*** The driver '%s' is occupying your device "
> >> +                    "%02x:%02x.%x.\n",
> >> +            ns, dev->host.bus, dev->host.dev, dev->host.func);
> >> +    fprintf(stderr, "***\n");
> >> +    fprintf(stderr, "*** You can try the following commands to free it:\n");
> >> +    fprintf(stderr, "***\n");
> >> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> >> +                    "new_id\n", vendor_id, device_id);
> >> +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> >> +                    "%s/unbind\n",
> >> +            dev->host.bus, dev->host.dev, dev->host.func, ns);
> >> +    fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> >> +                    "pci-stub/bind\n",
> >> +            dev->host.bus, dev->host.dev, dev->host.func);
> >> +    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> >> +                    "/remove_id\n", vendor_id, device_id);
> >> +    fprintf(stderr, "***\n");
> >> +
> >> +    return;
> >> +
> >> +fail:
> >> +    fprintf(stderr, "Couldn't find out why.\n");
> >> +}
> >> +
> >>  static int assign_device(AssignedDevice *dev)
> >>  {
> >>      struct kvm_assigned_pci_dev assigned_dev_data;
> >> @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
> >>  #endif
> >>  
> >>      r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> >> -    if (r < 0)
> >> -	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> >> +    if (r < 0) {
> >> +        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> >>                  dev->dev.qdev.id, strerror(-r));
> >> +
> >> +        assign_failed_examine(dev);
> >>     
> >
> > Only really catching the EBUSY case, maybe test for explicitly?
> 
> What others do we have?

The above would trigger when you've already assigned the same device,
and the error message would be confusing: "The driver pci_stub
is occupying..." (btw, perhaps s/occupying your/bound to/ would be
clearer). EEXIST is used for that case.  I think that's it for errors the
user could do something with.

thanks,
-chris

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 21:12             ` Michael S. Tsirkin
@ 2009-12-16  5:44               ` Chris Wright
  0 siblings, 0 replies; 25+ messages in thread
From: Chris Wright @ 2009-12-16  5:44 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Chris Wright, Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda

* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Tue, Dec 15, 2009 at 11:50:23AM -0800, Chris Wright wrote:
> > * Alexander Graf (agraf@suse.de) wrote:
> > > >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> > > >>>>              } else {
> > > >>>>                  pci_dev->v_addrs[i].u.r_virtbase =
> > > >>>>                      mmap(NULL,
> > > >>>> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> > > >>>> +                         cur_region->size,
> > > >>>>     
> > > >>>>         
> > > >>> Hmm, one assumes this code did work at some point ...
> > > >>> do you know why was it done this way?
> > > >>>       
> > > >> Nope :-).
> > > >
> > > > So maybe keep it that way or make the change in a separate patch.
> > > 
> > > Well we're sure the normal path is size aligned, no? In fact, you
> > > recommended I should change the code to what this looks like ;-).
> > 
> > Sort of moot.  The typical case is page size aligned, and the kernel is
> > going to round up anyway.
> 
> Why was this code here originally then? Any idea?

I think it's redundant, being paranoid to ensure a multiple of page mappings
w/out realizing that's what mmap will do anyway.

thanks,
-chris

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-15 21:11             ` Alexander Graf
@ 2009-12-16  5:47               ` Chris Wright
  2009-12-16 10:41                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 25+ messages in thread
From: Chris Wright @ 2009-12-16  5:47 UTC (permalink / raw)
  To: Alexander Graf
  Cc: Michael S. Tsirkin, kvm list, Gleb Natapov, Muli Ben-Yehuda,
	Chris Wright

* Alexander Graf (agraf@suse.de) wrote:
> Michael S. Tsirkin wrote:
> > Existing code first page aligned address, passes this to mmap,

Did you mean page aligned size?

> > then adds page offset. I do not know why. Could be e.g. to handle
> > different libc/kernel versions. If we are not sure it is
> > not needed, let's keep it this way. If we are sure, let's
> > cleanup in a separate patch. Makes sense?
> 
> Chris, want to make a statement here? :-)

I have no issue w/ splitting that out.  I don't think it is necessary.
The "adds page offset" bit is, however.

thanks,
-chris

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-16  5:47               ` Chris Wright
@ 2009-12-16 10:41                 ` Michael S. Tsirkin
  0 siblings, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-16 10:41 UTC (permalink / raw)
  To: Chris Wright; +Cc: Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda

On Tue, Dec 15, 2009 at 09:47:09PM -0800, Chris Wright wrote:
> * Alexander Graf (agraf@suse.de) wrote:
> > Michael S. Tsirkin wrote:
> > > Existing code first page aligned address, passes this to mmap,
> 
> Did you mean page aligned size?

Yes, of course.

> > > then adds page offset. I do not know why. Could be e.g. to handle
> > > different libc/kernel versions. If we are not sure it is
> > > not needed, let's keep it this way. If we are sure, let's
> > > cleanup in a separate patch. Makes sense?
> > 
> > Chris, want to make a statement here? :-)
> 
> I have no issue w/ splitting that out.  I don't think it is necessary.
> The "adds page offset" bit is, however.
> 
> thanks,
> -chris

I looked at kernel source history, as far as I can tell
all mmap implementations round size up to full page size.
So yes, this seems safe.

-- 
MST

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 1/3] Enable non page boundary BAR device assignment
  2009-12-17 15:04 [PATCH 0/3] Device Assignment fixes Alexander Graf
@ 2009-12-17 15:04 ` Alexander Graf
  0 siblings, 0 replies; 25+ messages in thread
From: Alexander Graf @ 2009-12-17 15:04 UTC (permalink / raw)
  To: kvm list; +Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright

While trying to get device passthrough working with an emulex hba, kvm
refused to pass it through because it has a BAR of 256 bytes:

        Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
        Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
        Region 4: I/O ports at b100 [size=256]

Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
physical to virtual addresses, we can still take the old MMIO callback route.

So let's add a second code path that allows for size & 0xFFF != 0 sized regions
by looping it through userspace.

I verified that it works by passing through an e1000 with this additional patch
applied and the card acted the same way it did without this patch:

             map_func = assigned_dev_iomem_map;
-            if (cur_region->size & 0xFFF) {
+            if (i != PCI_ROM_SLOT){
                 fprintf(stderr, "PCI region %d at address 0x%llx "

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - don't use map_func function pointer
  - use the same code for mmap on fast and slow path

v2 -> v3:

  - address mst's comments
---
 hw/device-assignment.c |  117 +++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 13a86bb..000fa61 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
     return value;
 }
 
+static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *in = d->u.r_virtbase + addr;
+    uint32_t r;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *out = d->u.r_virtbase + addr;
+
+    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
+    *out = val;
+}
+
+static CPUWriteMemoryFunc * const slow_bar_write[] = {
+    &slow_bar_writeb,
+    &slow_bar_writew,
+    &slow_bar_writel
+};
+
+static CPUReadMemoryFunc * const slow_bar_read[] = {
+    &slow_bar_readb,
+    &slow_bar_readw,
+    &slow_bar_readl
+};
+
+static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
+                                        pcibus_t e_phys, pcibus_t e_size,
+                                        int type)
+{
+    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
+    int m;
+
+    DEBUG("slow map\n");
+    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
+    cpu_register_physical_memory(e_phys, e_size, m);
+
+    /* MSI-X MMIO page */
+    if ((e_size > 0) &&
+        real_region->base_addr <= r_dev->msix_table_addr &&
+        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
+        int offset = r_dev->msix_table_addr - real_region->base_addr;
+
+        cpu_register_physical_memory(e_phys + offset,
+                TARGET_PAGE_SIZE, r_dev->mmio_index);
+    }
+}
+
 static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
                                    pcibus_t e_phys, pcibus_t e_size, int type)
 {
@@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
 
         /* handle memory io regions */
         if (cur_region->type & IORESOURCE_MEM) {
+            int slow_map = 0;
             int t = cur_region->type & IORESOURCE_PREFETCH
                 ? PCI_BASE_ADDRESS_MEM_PREFETCH
                 : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
             if (cur_region->size & 0xFFF) {
-                fprintf(stderr, "Unable to assign device: PCI region %d "
-                        "at address 0x%llx has size 0x%x, "
-                        " which is not a multiple of 4K\n",
+                fprintf(stderr, "PCI region %d at address 0x%llx "
+                        "has size 0x%x, which is not a multiple of 4K. "
+                        "You might experience some performance hit due to that.\n",
                         i, (unsigned long long)cur_region->base_addr,
                         cur_region->size);
+                slow_map = 1;
+            }
+
+            if (slow_map && (i == PCI_ROM_SLOT)) {
+                fprintf(stderr, "ROM not aligned - can't continue\n");
                 return -1;
             }
 
@@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
             } else {
                 pci_dev->v_addrs[i].u.r_virtbase =
                     mmap(NULL,
-                         (cur_region->size + 0xFFF) & 0xFFFFF000,
+                         cur_region->size,
                          PROT_WRITE | PROT_READ, MAP_SHARED,
                          cur_region->resource_fd, (off_t) 0);
             }
@@ -434,7 +540,8 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
 
             pci_register_bar((PCIDevice *) pci_dev, i,
                              cur_region->size, t,
-                             assigned_dev_iomem_map);
+                             slow_map ? assigned_dev_iomem_map_slow
+                                      : assigned_dev_iomem_map);
             continue;
         }
         /* handle port io regions */
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2009-12-17 15:04 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-12-15 18:30 [PATCH 0/3] Device Assignment fixes Alexander Graf
2009-12-15 18:30 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf
2009-12-15 18:38   ` Michael S. Tsirkin
2009-12-15 18:47     ` Alexander Graf
2009-12-15 18:50       ` Michael S. Tsirkin
2009-12-15 19:04         ` Alexander Graf
2009-12-15 19:50           ` Chris Wright
2009-12-15 21:12             ` Michael S. Tsirkin
2009-12-16  5:44               ` Chris Wright
2009-12-15 21:04           ` Michael S. Tsirkin
2009-12-15 21:11             ` Alexander Graf
2009-12-16  5:47               ` Chris Wright
2009-12-16 10:41                 ` Michael S. Tsirkin
2009-12-15 18:30 ` [PATCH 2/3] Split off sysfs id retrieval Alexander Graf
2009-12-15 18:39   ` Michael S. Tsirkin
2009-12-15 18:54     ` Alexander Graf
2009-12-15 21:05       ` Michael S. Tsirkin
2009-12-15 19:57     ` Chris Wright
2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
2009-12-15 18:41   ` Michael S. Tsirkin
2009-12-15 20:28   ` Chris Wright
2009-12-15 21:09     ` Michael S. Tsirkin
2009-12-15 21:10     ` Alexander Graf
2009-12-15 23:13       ` Chris Wright
2009-12-17 15:04 [PATCH 0/3] Device Assignment fixes Alexander Graf
2009-12-17 15:04 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.