All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer
@ 2011-06-06 16:26 Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 1/9] make qbus_reset_all public Paolo Bonzini
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

This is the third part in my SCSI s/g series.  From quick benchmarking,
it speeds up some workloads by 2-4 times.  Benefits vary depending on
the number of sectors read/written by the typical operation.  More precise
numbers of course will come.

Unlike the previous attempts, which forced devices to use an iovec
provided by the HBA, here the devices can choose whether to use the
old mechanism or the iovecs.  In addition, I do not plan to add this
to real devices, only paravirtual.  For this reason, this series includes
an emulation of the vmware pvscsi device (the emulation is complete
except for hotplug and PIO operation, both of which are undocumented
AFAICS).

The series currently depends on the cpu_physical_memory_fast operations.
The dependency can be removed if necessary though.  In fact, I would like
an overall comment on the API since I might as well drop pvscsi completely
and just implement this for virtio-scsi.

Paolo Bonzini (9):
  make qbus_reset_all public
  pvscsi: first commit
  pvscsi: check validity of DMA addresses in advance
  scsi: always use get_sense
  scsi-disk: lazily allocate bounce buffer
  allow switching a qiov between internal and external storage
  scsi: push qiov to SCSIRequest
  scsi: add get_iovec to SCSIBusOps
  pvscsi: implement s/g operation without a bounce buffer

 Makefile.objs           |    1 +
 cutils.c                |   14 +-
 default-configs/pci.mak |    1 +
 hw/pci.h                |    1 +
 hw/qdev.c               |    7 +-
 hw/qdev.h               |    2 +-
 hw/scsi-bus.c           |   20 +-
 hw/scsi-disk.c          |   62 ++-
 hw/scsi.h               |    4 +
 hw/spapr_vscsi.c        |   90 +----
 hw/vmw_pvscsi.c         | 1014 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/vmw_pvscsi.h         |  389 ++++++++++++++++++
 trace-events            |   15 +
 vl.c                    |    2 +-
 14 files changed, 1518 insertions(+), 104 deletions(-)
 create mode 100644 hw/vmw_pvscsi.c
 create mode 100644 hw/vmw_pvscsi.h

-- 
1.7.4.4

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 1/9] make qbus_reset_all public
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
@ 2011-06-06 16:26 ` Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 2/9] pvscsi: first commit Paolo Bonzini
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/qdev.c |    7 +++----
 hw/qdev.h |    2 +-
 vl.c      |    2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/hw/qdev.c b/hw/qdev.c
index 9519f5d..ccfe68d 100644
--- a/hw/qdev.c
+++ b/hw/qdev.c
@@ -344,9 +344,8 @@ void qdev_reset_all(DeviceState *dev)
     qdev_walk_children(dev, qdev_reset_one, qbus_reset_one, NULL);
 }
 
-void qbus_reset_all_fn(void *opaque)
+void qbus_reset_all(BusState *bus)
 {
-    BusState *bus = opaque;
     qbus_walk_children(bus, qdev_reset_one, qbus_reset_one, NULL);
 }
 
@@ -766,7 +765,7 @@ void qbus_create_inplace(BusState *bus, BusInfo *info,
     } else if (bus != main_system_bus) {
         /* TODO: once all bus devices are qdevified,
            only reset handler for main_system_bus should be registered here. */
-        qemu_register_reset(qbus_reset_all_fn, bus);
+        qemu_register_reset((void (*)(void *))qbus_reset_all, bus);
     }
 }
 
@@ -792,7 +791,7 @@ void qbus_free(BusState *bus)
         bus->parent->num_child_bus--;
     } else {
         assert(bus != main_system_bus); /* main_system_bus is never freed */
-        qemu_unregister_reset(qbus_reset_all_fn, bus);
+        qemu_unregister_reset((void (*)(void *))qbus_reset_all, bus);
     }
     qemu_free((void*)bus->name);
     if (bus->qdev_allocated) {
diff --git a/hw/qdev.h b/hw/qdev.h
index 8a13ec9..0ecd31a 100644
--- a/hw/qdev.h
+++ b/hw/qdev.h
@@ -200,7 +200,7 @@ int qbus_walk_children(BusState *bus, qdev_walkerfn *devfn,
 int qdev_walk_children(DeviceState *dev, qdev_walkerfn *devfn,
                        qbus_walkerfn *busfn, void *opaque);
 void qdev_reset_all(DeviceState *dev);
-void qbus_reset_all_fn(void *opaque);
+void qbus_reset_all(BusState *bus);
 
 void qbus_free(BusState *bus);
 
diff --git a/vl.c b/vl.c
index b362871..d359917 100644
--- a/vl.c
+++ b/vl.c
@@ -3266,7 +3266,7 @@ int main(int argc, char **argv, char **envp)
 
     /* TODO: once all bus devices are qdevified, this should be done
      * when bus is created by qdev.c */
-    qemu_register_reset(qbus_reset_all_fn, sysbus_get_default());
+    qemu_register_reset((void (*)(void *))qbus_reset_all, sysbus_get_default());
     qemu_run_machine_init_done_notifiers();
 
     qemu_system_reset();
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 2/9] pvscsi: first commit
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 1/9] make qbus_reset_all public Paolo Bonzini
@ 2011-06-06 16:26 ` Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 3/9] pvscsi: check validity of DMA addresses in advance Paolo Bonzini
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Makefile.objs           |    1 +
 default-configs/pci.mak |    1 +
 hw/pci.h                |    1 +
 hw/vmw_pvscsi.c         |  912 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/vmw_pvscsi.h         |  389 ++++++++++++++++++++
 trace-events            |   15 +
 6 files changed, 1319 insertions(+), 0 deletions(-)
 create mode 100644 hw/vmw_pvscsi.c
 create mode 100644 hw/vmw_pvscsi.h

diff --git a/Makefile.objs b/Makefile.objs
index 90838f6..0a39d4f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -255,6 +255,7 @@ hw-obj-$(CONFIG_AHCI) += ide/ich.o
 
 # SCSI layer
 hw-obj-$(CONFIG_LSI_SCSI_PCI) += lsi53c895a.o
+hw-obj-$(CONFIG_VMWARE_PVSCSI_PCI) += vmw_pvscsi.o
 hw-obj-$(CONFIG_ESP) += esp.o
 
 hw-obj-y += dma-helpers.o sysbus.o isa-bus.o
diff --git a/default-configs/pci.mak b/default-configs/pci.mak
index 22bd350..280101b 100644
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -9,6 +9,7 @@ CONFIG_EEPRO100_PCI=y
 CONFIG_PCNET_PCI=y
 CONFIG_PCNET_COMMON=y
 CONFIG_LSI_SCSI_PCI=y
+CONFIG_VMWARE_PVSCSI_PCI=y
 CONFIG_RTL8139_PCI=y
 CONFIG_E1000_PCI=y
 CONFIG_IDE_CORE=y
diff --git a/hw/pci.h b/hw/pci.h
index 0d288ce..4499a30 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -59,6 +59,7 @@
 #define PCI_DEVICE_ID_VMWARE_NET         0x0720
 #define PCI_DEVICE_ID_VMWARE_SCSI        0x0730
 #define PCI_DEVICE_ID_VMWARE_IDE         0x1729
+#define PCI_DEVICE_ID_VMWARE_PVSCSI      0x07c0
 
 /* Intel (0x8086) */
 #define PCI_DEVICE_ID_INTEL_82551IT      0x1209
diff --git a/hw/vmw_pvscsi.c b/hw/vmw_pvscsi.c
new file mode 100644
index 0000000..6f0b0b9
--- /dev/null
+++ b/hw/vmw_pvscsi.c
@@ -0,0 +1,912 @@
+/*
+ * VMware Paravirtualized SCSI Host Bus Adapter emulation
+ *
+ * Copyright (c) 2011 Red Hat, Inc.
+ * Written by Paolo Bonzini
+ *
+ * This code is licensed under GPLv2.
+ */
+
+#include <assert.h>
+
+#include "hw.h"
+#include "pci.h"
+#include "scsi.h"
+#include "scsi-defs.h"
+#include "vmw_pvscsi.h"
+#include "block_int.h"
+#include "host-utils.h"
+#include "trace.h"
+
+#define PVSCSI_MAX_DEVS 127
+#define PAGE_SIZE       4096
+#define PAGE_SHIFT      12
+
+/* Cursor into a guest scatter/gather list while a transfer is in progress. */
+typedef struct PVSCSISGState {
+    /* Guest address of the next s/g element to fetch.  */
+    target_phys_addr_t elemAddr;
+    /* Guest address of the next data byte within the current element.  */
+    target_phys_addr_t dataAddr;
+    /* Bytes of the current element that have not been transferred yet.  */
+    uint32_t resid;
+} PVSCSISGState;
+
+/* Per-command bookkeeping: one is allocated for each descriptor taken off
+ * the request ring and freed once its completion has been posted.  */
+typedef struct PVSCSIRequest {
+    /* NOTE(review): not assigned anywhere in the visible code -- confirm
+     * whether this field is still needed.  */
+    SCSIDevice *sdev;
+    /* SCSI-layer request; NULL when no device was found or after
+     * completion has dropped the reference.  */
+    SCSIRequest *sreq;
+    /* 0: normal command; 1: automatic REQUEST SENSE issued;
+     * 2: sense data has been received.  */
+    uint8_t sensing;
+    /* Byte 2 of the last sense buffer written back (used for tracing).  */
+    uint8_t sense_key;
+    /* Set once the request has been moved to the completion queue.  */
+    uint8_t completed;
+    int lun;
+    /* Bytes that may still be transferred for this command.  */
+    uint64_t resid;
+    PVSCSISGState sg;
+    /* Private copy of the guest's request descriptor.  */
+    struct PVSCSIRingReqDesc req;
+    /* Completion descriptor being built for the guest.  */
+    struct PVSCSIRingCmpDesc cmp;
+    QTAILQ_ENTRY(PVSCSIRequest) next;
+} PVSCSIRequest;
+
+typedef QTAILQ_HEAD(, PVSCSIRequest) PVSCSIRequestList;
+
+/* Device state of the emulated adapter.
+ *
+ * Field order matters: pvscsi_soft_reset() clears everything from cmd_latch
+ * to the end of the structure with a single memset, so state that must
+ * survive a reset has to be declared before cmd_latch.  */
+typedef struct {
+    PCIDevice dev;
+    SCSIBus bus;
+    /* Bottom half that posts queued completions to the completion ring.  */
+    QEMUBH *complete_reqs_bh;
+
+    /* Handle returned by cpu_register_io_memory() for the register BAR.  */
+    int mmio_io_addr;
+
+    /* zeroed on reset */
+    /* Command currently being assembled through the COMMAND register.  */
+    uint32_t cmd_latch;
+    /* Argument words written through COMMAND_DATA; sized for the largest
+     * descriptor handled here (SETUP_RINGS).  */
+    uint32_t cmd_buffer[sizeof(struct PVSCSICmdDescSetupRings)
+                        / sizeof(uint32_t)];
+    /* Number of argument words received so far.  */
+    uint32_t cmd_ptr;
+    uint32_t cmd_status;
+    uint32_t intr_status;
+    uint32_t intr_mask;
+    /* Interrupt bits currently used for completions and messages.  */
+    uint32_t intr_cmpl;
+    uint32_t intr_msg;
+    struct PVSCSICmdDescSetupRings rings;
+    struct PVSCSICmdDescSetupMsgRing msgRing;
+    uint32_t reqNumEntriesLog2;
+    uint32_t cmpNumEntriesLog2;
+    uint32_t msgNumEntriesLog2;
+
+    /* Requests handed to the SCSI layer and not yet finished.  */
+    PVSCSIRequestList pending_queue;
+    /* Finished requests waiting for a free completion ring slot.  */
+    PVSCSIRequestList complete_queue;
+} PVSCSIState;
+
+\f

+/* Decode the 8-byte LUN field of a descriptor.  Only byte 1 may be
+ * non-zero; it is returned as the LUN number.  Any other non-zero byte
+ * makes the encoding unsupported and yields -1.  */
+static inline int pvscsi_get_lun(uint8_t *lun)
+{
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        if (i != 1 && lun[i] != 0) {
+            return -1;
+        }
+    }
+    return lun[1];
+}
+
+/* Resolve the target/LUN pair addressed by a descriptor.
+ *
+ * On success *sdev points to the device attached at @target and the LUN
+ * number (>= 0) is returned.  On failure (target out of range, unsupported
+ * LUN encoding, or no device attached at that target) *sdev is NULL and -1
+ * is returned.  */
+static inline int pvscsi_get_dev_lun(PVSCSIState *s,
+                                     uint8_t *lun, uint32_t target,
+                                     SCSIDevice **sdev)
+{
+    SCSIBus *bus = &s->bus;
+    int lunval;
+
+    *sdev = NULL;
+    /* The bus is created with PVSCSI_MAX_DEVS device slots, so the highest
+     * valid target id is PVSCSI_MAX_DEVS - 1.  */
+    if (target >= PVSCSI_MAX_DEVS) {
+        return -1;
+    }
+    lunval = pvscsi_get_lun(lun);
+    if (lunval < 0) {
+        return -1;
+    }
+    *sdev = bus->devs[target];
+    /* Test the device that was looked up, not the output pointer itself
+     * (which is never NULL).  */
+    if (!*sdev) {
+        return -1;
+    }
+    return lunval;
+}
+
+\f

+/* Allocate tracking state for a ring descriptor and append it to the
+ * pending queue.  *d receives the addressed device, or NULL if there is
+ * none; the caller must check *d, because a request is returned (and
+ * queued) in both cases.  */
+static PVSCSIRequest *pvscsi_queue_request(PVSCSIState *s, SCSIDevice **d,
+                                           struct PVSCSIRingReqDesc *req)
+{
+    PVSCSIRequest *entry;
+    int lunval;
+
+    trace_pvscsi_queue_request(req->context, req->cdb[0], req->dataLen);
+
+    entry = qemu_mallocz(sizeof(*entry));
+    entry->req = *req;
+    entry->cmp.context = entry->req.context;
+    QTAILQ_INSERT_TAIL(&s->pending_queue, entry, next);
+
+    lunval = pvscsi_get_dev_lun(s, req->lun, req->target, d);
+    if (*d) {
+        /* Only record the LUN when a device was actually found.  */
+        entry->lun = lunval;
+    }
+    return entry;
+}
+
+/* Look up the pending PVSCSIRequest that wraps the given SCSIRequest.
+ * Returns NULL if no pending request refers to it.  */
+static PVSCSIRequest *pvscsi_find_request(PVSCSIState *s, SCSIRequest *sreq)
+{
+    PVSCSIRequest *cur;
+    PVSCSIRequest *match = NULL;
+
+    QTAILQ_FOREACH(cur, &s->pending_queue, next) {
+        if (cur->sreq == sreq) {
+            match = cur;
+            break;
+        }
+    }
+    return match;
+}
+
+/* Unlink and free every request on the given queue.  */
+static void pvscsi_free_queue(PVSCSIRequestList *q)
+{
+    PVSCSIRequest *head;
+
+    while ((head = QTAILQ_FIRST(q)) != NULL) {
+        QTAILQ_REMOVE(q, head, next);
+        qemu_free(head);
+    }
+}
+
+/* Reset the adapter: reset everything on the SCSI bus, drop completions
+ * that were never posted to the guest, and clear register and ring state.  */
+static void pvscsi_soft_reset(PVSCSIState *s)
+{
+    qbus_reset_all(&s->bus.qbus);
+    pvscsi_free_queue(&s->complete_queue);
+    /* NOTE(review): assumes the bus reset above has completed or cancelled
+     * every in-flight request -- confirm.  */
+    assert(QTAILQ_EMPTY(&s->pending_queue));
+    /* Zero every field from cmd_latch to the end of the structure.  This
+     * also wipes both queue heads, hence the QTAILQ_INIT calls below.  */
+    memset(&s->cmd_latch, 0, sizeof(*s) - offsetof(PVSCSIState, cmd_latch));
+    s->intr_cmpl = PVSCSI_INTR_CMPL_0;
+    s->intr_msg = PVSCSI_INTR_MSG_0;
+    QTAILQ_INIT(&s->pending_queue);
+    QTAILQ_INIT(&s->complete_queue);
+}
+
+\f

+/* Latch the interrupt bits in @mask into the status register and assert
+ * the IRQ line if any newly set bit is enabled in the interrupt mask.  */
+static void pvscsi_raise_intr(PVSCSIState *s, int mask)
+{
+    int newly_set = mask & ~s->intr_status;
+    int deliver;
+
+    s->intr_status |= mask;
+    deliver = (newly_set & s->intr_mask) != 0;
+    trace_pvscsi_raise_intr(newly_set, deliver ? "" : "masked");
+    if (deliver) {
+        qemu_set_irq(s->dev.irq[0], 1);
+    }
+}
+
+/* Handle a guest write to the interrupt status register: clear the
+ * acknowledged bits and lower the IRQ line when nothing enabled remains.  */
+static void pvscsi_acknowledge_intr(PVSCSIState *s, int mask)
+{
+    trace_pvscsi_acknowledge_intr(mask);
+    s->intr_status &= ~mask;
+    /* The special handling below only triggers when the guest acknowledges
+     * exactly the current completion (or message) bit.  */
+    if (mask == s->intr_cmpl) {
+        /* Flip the completion interrupt bit for the next batch (presumably
+         * alternating between the two completion bits -- confirm against
+         * the definition of PVSCSI_INTR_CMPL_MASK).  */
+        s->intr_cmpl ^= PVSCSI_INTR_CMPL_MASK;
+
+        /* Try putting more complete requests on the ring.  */
+        if (!QTAILQ_EMPTY(&s->complete_queue)) {
+            qemu_bh_schedule(s->complete_reqs_bh);
+        }
+    }
+    if (mask == s->intr_msg) {
+        s->intr_msg ^= PVSCSI_INTR_MSG_MASK;
+    }
+    if ((s->intr_status & s->intr_mask) == 0) {
+        qemu_set_irq(s->dev.irq[0], 0);
+    }
+}
+
+/* Install a new interrupt mask and update the IRQ line: assert it if a
+ * pending interrupt has just become enabled, deassert it if no pending
+ * interrupt is enabled any more.  */
+static void pvscsi_set_intr_mask(PVSCSIState *s, int mask)
+{
+    int newly_enabled = mask & ~s->intr_mask;
+
+    s->intr_mask = mask;
+    /* The two conditions are mutually exclusive: newly_enabled is a
+     * subset of mask.  */
+    if (s->intr_status & newly_enabled) {
+        qemu_set_irq(s->dev.irq[0], 1);
+    } else if ((s->intr_status & mask) == 0) {
+        qemu_set_irq(s->dev.irq[0], 0);
+    }
+}
+
+\f

+/* Load/store one 32-bit field of the guest-resident PVSCSIRingsState page,
+ * located through the page number the guest supplied when setting up the
+ * rings.  Macro arguments are parenthesized so that arbitrary expressions
+ * can be passed for s and val.  */
+#define pvscsi_ld_ring_state(s, field) \
+    ldl_phys((s)->rings.ringsStatePPN * PAGE_SIZE + \
+             offsetof(struct PVSCSIRingsState, field))
+
+#define pvscsi_st_ring_state(s, field, val) \
+    stl_phys((s)->rings.ringsStatePPN * PAGE_SIZE + \
+             offsetof(struct PVSCSIRingsState, field), (val))
+
+/* Number of completion ring slots that can still be filled: ring size
+ * minus one, minus the entries produced but not yet consumed.  */
+static inline int pvscsi_cmp_free(PVSCSIState *s)
+{
+    uint32_t produced = pvscsi_ld_ring_state(s, cmpProdIdx);
+    uint32_t consumed = pvscsi_ld_ring_state(s, cmpConsIdx);
+
+    return (1 << s->cmpNumEntriesLog2) - 1 - (produced - consumed);
+}
+
+/* Number of request ring entries produced by the guest that have not been
+ * consumed yet.  */
+static inline int pvscsi_req_pending(PVSCSIState *s)
+{
+    uint32_t produced = pvscsi_ld_ring_state(s, reqProdIdx);
+    uint32_t consumed = pvscsi_ld_ring_state(s, reqConsIdx);
+
+    return produced - consumed;
+}
+
+/* Translate a ring index into a guest physical address.
+ *
+ * The ring consists of elements of @size bytes laid out over the pages
+ * whose page numbers are listed in @ppn.  The byte offset of element @idx
+ * selects the page, and its low bits give the position inside that page.
+ * The caller must make sure @idx stays within the pages described by
+ * @ppn.  */
+static target_phys_addr_t pvscsi_get_ring_addr(PVSCSIState *s, int idx,
+                                               int size, uint64_t *ppn)
+{
+    uint32_t ofs = idx * size;
+    uint32_t page = ofs >> PAGE_SHIFT;
+    uint32_t in_page = ofs & (PAGE_SIZE - 1);
+
+    return (ppn[page] * PAGE_SIZE) | in_page;
+}
+\f

+
+#define barrier()
+
+/* Copy cmp_desc on the completion ring, assuming there is a free entry.
+ * The descriptor is written to guest memory first and the producer index
+ * is advanced afterwards, so the guest never sees an index that covers an
+ * unwritten slot.  */
+static void pvscsi_cmp_ring_put(PVSCSIState *s,
+                                struct PVSCSIRingCmpDesc *cmp_desc)
+{
+    uint32_t cmp_entries = s->cmpNumEntriesLog2;
+    uint32_t val = pvscsi_ld_ring_state(s, cmpProdIdx);
+    /* The producer index is free-running; mask it down to a ring slot.  */
+    uint32_t idx = val & MASK(cmp_entries);
+    target_phys_addr_t addr;
+
+    trace_pvscsi_cmp_ring_put(cmp_desc->context);
+    addr = pvscsi_get_ring_addr(s, idx, sizeof(struct PVSCSIRingCmpDesc),
+                                s->rings.cmpRingPPNs);
+
+    /* barrier() is defined empty in this file; the calls only mark where
+     * ordering matters.  */
+    barrier();
+    cpu_physical_memory_write(addr, (void *)cmp_desc, sizeof(*cmp_desc));
+    barrier();
+    pvscsi_st_ring_state(s, cmpProdIdx, val + 1);
+}
+
+/* Bottom-half handler: move as many finished requests as fit from the
+ * completion queue to the guest's completion ring, free them, and raise
+ * the completion interrupt if at least one was posted.  */
+static void pvscsi_complete_reqs(void *opaque)
+{
+    PVSCSIState *s = opaque;
+    int room = pvscsi_cmp_free(s);
+    int posted;
+
+    for (posted = 0;
+         posted < room && !QTAILQ_EMPTY(&s->complete_queue);
+         posted++) {
+        PVSCSIRequest *head = QTAILQ_FIRST(&s->complete_queue);
+
+        QTAILQ_REMOVE(&s->complete_queue, head, next);
+        pvscsi_cmp_ring_put(s, &head->cmp);
+        qemu_free(head);
+    }
+    if (posted) {
+        pvscsi_raise_intr(s, s->intr_cmpl);
+    }
+}
+
+/* Prepare to put p on the completion ring: drop the reference to the
+ * SCSI-layer request, move p from the pending queue to the completion
+ * queue and schedule the bottom half that posts it to the guest.  Must be
+ * called at most once per request.  */
+static void pvscsi_complete_req(PVSCSIState *s, PVSCSIRequest *p)
+{
+    assert(!p->completed);
+    trace_pvscsi_complete_req(p->cmp.context, p->cmp.dataLen, p->sense_key);
+    /* sreq is NULL for requests that never reached a device.  */
+    if (p->sreq != NULL) {
+        scsi_req_unref(p->sreq);
+        p->sreq = NULL;
+    }
+    p->completed = 1;
+    QTAILQ_REMOVE(&s->pending_queue, p, next);
+    QTAILQ_INSERT_TAIL(&s->complete_queue, p, next);
+    qemu_bh_schedule(s->complete_reqs_bh);
+}
+
+/* Write sense data for a completed request into the guest's sense buffer.
+ * At most req.senseLen bytes are written; the length actually written is
+ * recorded in the completion descriptor.  */
+static void pvscsi_write_sense(PVSCSIRequest *p, uint8_t *buf, int len)
+{
+    p->cmp.senseLen = MIN(p->req.senseLen, len);
+    /* NOTE(review): reads buf[2] regardless of len; callers are assumed to
+     * pass at least three valid bytes -- confirm.  */
+    p->sense_key = buf[2];
+    cpu_physical_memory_write(p->req.senseAddr, buf, p->cmp.senseLen);
+}
+
+/* Obtain sense data for a request that ended with CHECK CONDITION.
+ *
+ * If the SCSI layer already has sense data, it is written to the guest
+ * immediately and false is returned (the caller completes the request).
+ * Otherwise a REQUEST SENSE command is issued on the same SCSIRequest and
+ * true is returned; completion then happens from the SCSI callbacks.  */
+static bool pvscsi_send_request_sense(PVSCSIRequest *p)
+{
+    /* REQUEST SENSE (opcode 3) with an allocation length of 96 bytes,
+     * matching the size of the local sense buffer.  */
+    uint8_t cdb[6] = { 3, p->lun << 5, 0, 0, 96, 0 };
+    uint8_t sense[96];
+    int n;
+
+    n = scsi_req_get_sense(p->sreq, sense, sizeof(sense));
+    if (n) {
+        pvscsi_write_sense(p, sense, n);
+        return false;
+    }
+
+    trace_pvscsi_request_sense(p->sreq->tag, p->lun);
+    n = scsi_req_enqueue(p->sreq, cdb);
+    if (n < 0) {
+        /* should not happen, just leave sense data empty in this case. */
+        scsi_req_cancel(p->sreq);
+    } else if (n > 0) {
+        scsi_req_continue(p->sreq);
+    }
+    return true;
+}
+
+/* Transfer @len bytes between @buf and the single contiguous guest buffer
+ * described by the request, then advance the request's data pointer and
+ * byte counters.  A zero-length transfer is a no-op.  */
+static void pvscsi_transfer_data_with_buffer(PVSCSIRequest *p, bool to_host,
+                                             uint8_t *buf, int len)
+{
+    if (!len) {
+        return;
+    }
+
+    cpu_physical_memory_rw(p->req.dataAddr, buf, len, to_host);
+    p->req.dataAddr += len;
+    p->cmp.dataLen += len;
+    p->resid -= len;
+}
+
+/* Fetch the next scatter/gather element from guest memory and make it the
+ * current one: dataAddr/resid describe its data, and elemAddr is advanced
+ * to the element that follows.  */
+static void pvscsi_get_next_sg_elem(PVSCSISGState *sg)
+{
+    struct PVSCSISGElement elem;
+
+    /* With chain-element support compiled out (#if 0 below) this loop
+     * always reads exactly one element and leaves.  */
+    for (;; sg->elemAddr = elem.addr) {
+        cpu_physical_memory_read(sg->elemAddr, (void *)&elem,
+                                 sizeof(elem));
+#if 0
+        /* PVSCSI_SGE_FLAG_CHAIN_ELEMENT not in the header file! */
+        if ((elem.flags & PVSCSI_SGE_FLAG_CHAIN_ELEMENT) == 0) {
+            break;
+        }
+#else
+        break;
+#endif
+    }
+
+    sg->elemAddr += sizeof(elem);
+    sg->dataAddr = elem.addr;
+    sg->resid = elem.length;
+}
+
+/* Transfer @len bytes between @buf and the guest scatter/gather list of
+ * the request, walking from element to element as each one is used up.  */
+static void pvscsi_transfer_data_with_sg_list(PVSCSIRequest *p, bool to_host,
+                                              uint8_t *buf, int len)
+{
+    int n;
+    while (len) {
+        /* Skip to the next element with data left.
+         * NOTE(review): nothing bounds this walk; a guest list made of
+         * zero-length elements would keep it spinning -- confirm whether a
+         * limit is needed.  */
+        while (!p->sg.resid) {
+            pvscsi_get_next_sg_elem(&p->sg);
+            trace_pvscsi_sg_elem(p->req.context, p->sg.dataAddr, p->sg.resid);
+        }
+        assert(len > 0);
+        /* Transfer no more than what is left in the current element.  */
+        n = MIN((unsigned) len, p->sg.resid);
+        if (n) {
+            cpu_physical_memory_rw(p->sg.dataAddr, buf, n, to_host);
+        }
+
+        buf += n;
+        p->cmp.dataLen += n;
+        p->sg.dataAddr += n;
+
+        len -= n;
+        p->resid -= n;
+        p->sg.resid -= n;
+    }
+}
+
+/* Callback to indicate that the SCSI layer has completed a transfer.
+ *
+ * For a normal command the @len bytes in the request's buffer are copied
+ * to or from guest memory.  While an automatic REQUEST SENSE is in flight
+ * (sensing == 1) the bytes are sense data and are written to the guest's
+ * sense buffer instead.  The request is then continued, unless it had to
+ * be cancelled because of a length mismatch.  */
+static void pvscsi_transfer_data(SCSIRequest *req, uint32_t len)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, req->bus->qbus.parent);
+    PVSCSIRequest *p = pvscsi_find_request(s, req);
+    uint8_t *buf = scsi_req_get_buf(req);
+
+    if (!p) {
+        fprintf(stderr, "PVSCSI: Can't find request for tag 0x%x\n", req->tag);
+        return;
+    }
+
+    if (!p->sensing) {
+        int to_host = (p->req.flags & PVSCSI_FLAG_CMD_DIR_TOHOST) != 0;
+
+        assert(p->resid);
+        trace_pvscsi_transfer_data(p->req.context, len);
+        if (!len) {
+            /* Short transfer.  */
+            p->cmp.hostStatus = BTSTAT_DATARUN;
+            scsi_req_cancel(req);
+            return;
+        }
+
+        if (len > p->resid) {
+            /* Small buffer.  */
+            p->cmp.hostStatus = BTSTAT_DATARUN;
+            scsi_req_cancel(req);
+            return;
+        }
+
+        if (p->req.flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+            pvscsi_transfer_data_with_sg_list(p, to_host, buf, len);
+        } else {
+            pvscsi_transfer_data_with_buffer(p, to_host, buf, len);
+        }
+    }
+
+    else if (p->sensing == 1) {
+        /* Got sense data.  Write it back and kick the device to complete
+         * the request.  */
+        if (len) {
+            pvscsi_write_sense(p, buf, len);
+            /* No sense information means the command is reported as
+             * successful after all.  */
+            if (buf[2] == NO_SENSE) {
+                p->cmp.scsiStatus = GOOD;
+            }
+        }
+        p->sensing = 2;
+    }
+
+    scsi_req_continue(req);
+}
+
+/* Callback to indicate that the SCSI layer has completed a command.
+ *
+ * The SCSI status is recorded in the completion descriptor.  If the
+ * command ended with CHECK CONDITION, sense data is fetched first; when
+ * that requires issuing REQUEST SENSE, the request is completed only when
+ * this callback runs again for the sense command.  */
+static void pvscsi_command_complete(SCSIRequest *req, uint32_t status)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, req->bus->qbus.parent);
+    PVSCSIRequest *p = pvscsi_find_request(s, req);
+
+    if (!p) {
+        fprintf(stderr, "PVSCSI: Can't find request for tag 0x%x\n", req->tag);
+        return;
+    }
+
+    /* Here to complete the request.  */
+    if (!p->sensing) {
+        p->cmp.scsiStatus = status;
+
+        if (p->cmp.scsiStatus == CHECK_CONDITION) {
+            p->sensing = 1;
+            /* true: REQUEST SENSE is now in flight, finish later.  */
+            if (pvscsi_send_request_sense(p)) {
+                return;
+            }
+        }
+    }
+
+    pvscsi_complete_req(s, p);
+}
+
+/* Callback to indicate that the SCSI layer has cancelled a request.
+ *
+ * The request is reported to the guest as aborted, unless a more specific
+ * host status was recorded before the cancellation was requested.  */
+static void pvscsi_request_cancelled(SCSIRequest *req)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, req->bus->qbus.parent);
+    PVSCSIRequest *p = pvscsi_find_request(s, req);
+
+    /* Same defensive check as the other SCSI callbacks: the request may
+     * not be on the pending queue.  */
+    if (!p) {
+        fprintf(stderr, "PVSCSI: Can't find request for tag 0x%x\n", req->tag);
+        return;
+    }
+
+    if (p->cmp.hostStatus == BTSTAT_SUCCESS) {
+        p->cmp.hostStatus = BTSTAT_ABORTQUEUE;
+    }
+    pvscsi_complete_req(s, p);
+}
+\f

+
+/* Process a request from the request ring: queue it, hand its CDB to the
+ * SCSI layer and start the data phase if there is one.  Requests that
+ * address no device complete immediately with a selection timeout.  */
+static void pvscsi_process_req(PVSCSIState *s, struct PVSCSIRingReqDesc *r)
+{
+    SCSIDevice *d;
+    PVSCSIRequest *p = pvscsi_queue_request(s, &d, r);
+    int64_t datalen, n;
+
+    if (!d) {
+        p->cmp.hostStatus = BTSTAT_SELTIMEO;
+        pvscsi_complete_req(s, p);
+        return;
+    }
+
+    /* With a s/g list, dataAddr points at the list, not at the data.  */
+    if (r->flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+        p->sg.elemAddr = r->dataAddr;
+    }
+
+    p->sreq = scsi_req_new(d, r->context, p->lun);
+    n = scsi_req_enqueue(p->sreq, r->cdb);
+    /* The checks below treat a positive n as a transfer towards the host
+     * and a negative n as a transfer towards the device; a direction flag
+     * that contradicts the command is rejected.  Completion of the
+     * cancelled request presumably arrives through the cancel callback --
+     * confirm.  */
+    if ((n > 0) && (r->flags & PVSCSI_FLAG_CMD_DIR_TODEVICE)) {
+        p->cmp.hostStatus = BTSTAT_BADMSG;
+        scsi_req_cancel(p->sreq);
+        return;
+    }
+    if ((n < 0) && (r->flags & PVSCSI_FLAG_CMD_DIR_TOHOST)) {
+        p->cmp.hostStatus = BTSTAT_BADMSG;
+        scsi_req_cancel(p->sreq);
+        return;
+    }
+
+    if (n) {
+        /* Never transfer more than the guest's buffer can hold.  */
+        datalen = (n < 0 ? -n : n);
+        p->resid = MIN(datalen, r->dataLen);
+        scsi_req_continue(p->sreq);
+    }
+}
+
+/* Process pending requests on the request ring.  Each descriptor is copied
+ * out of guest memory and processed before the consumer index is advanced
+ * past it.  */
+static void pvscsi_process_req_ring(PVSCSIState *s)
+{
+    uint32_t req_entries = s->reqNumEntriesLog2;
+
+    trace_pvscsi_kick_io();
+    while (pvscsi_req_pending(s)) {
+        uint32_t val = pvscsi_ld_ring_state(s, reqConsIdx);
+        /* The consumer index is free-running; mask it down to a slot.  */
+        uint32_t idx = val & MASK(req_entries);
+        target_phys_addr_t addr;
+        struct PVSCSIRingReqDesc req_desc;
+
+        addr = pvscsi_get_ring_addr(s, idx, sizeof(struct PVSCSIRingReqDesc),
+                                    s->rings.reqRingPPNs);
+
+        barrier();
+        cpu_physical_memory_read(addr, (void *)&req_desc, sizeof(req_desc));
+        pvscsi_process_req(s, &req_desc);
+        barrier();
+        pvscsi_st_ring_state(s, reqConsIdx, val + 1);
+    }
+}
+
+\f

+/* Handler for command numbers outside the known range: log and fail.  */
+static int32_t pvscsi_cmd_bad(PVSCSIState *s)
+{
+    /* cmd_latch is an unsigned guest-supplied value; print it as such.  */
+    fprintf(stderr, "vmw_pvscsi: bad command %u\n", s->cmd_latch);
+    return -1;
+}
+
+/* Handler for known commands that are not emulated: log and fail.  */
+static int32_t pvscsi_cmd_unimpl(PVSCSIState *s)
+{
+    /* cmd_latch is an unsigned guest-supplied value; print it as such.  */
+    fprintf(stderr, "vmw_pvscsi: unimplemented command %u\n", s->cmd_latch);
+    return -1;
+}
+
+/* ADAPTER_RESET command: perform the same soft reset as a device reset.
+ * Always reports success.  */
+static int32_t pvscsi_cmd_adapter_reset(PVSCSIState *s)
+{
+    pvscsi_soft_reset(s);
+    return 0;
+}
+
+/* Index of the highest set bit of x, i.e. log2(x) rounded down.
+ * x must be non-zero.  */
+static int floor_log2(int x)
+{
+    int leading_zeros;
+
+    assert(x);
+    leading_zeros = clz32(x);
+    return 31 - leading_zeros;
+}
+
+/* Setup ring buffers and initialize the ring state page.
+ *
+ * The descriptor assembled in cmd_buffer is validated before it is
+ * committed: each ring must have at least one page and no more pages than
+ * its page-number array can describe.  Without the upper bound, a guest
+ * could make later ring accesses index past the end of reqRingPPNs or
+ * cmpRingPPNs.  Returns 0 on success and -1 (leaving the previous ring
+ * setup untouched) on an invalid descriptor.  */
+static int32_t pvscsi_cmd_setup_rings(PVSCSIState *s)
+{
+    struct PVSCSICmdDescSetupRings rings;
+    const uint32_t max_req_pages =
+        sizeof(rings.reqRingPPNs) / sizeof(rings.reqRingPPNs[0]);
+    const uint32_t max_cmp_pages =
+        sizeof(rings.cmpRingPPNs) / sizeof(rings.cmpRingPPNs[0]);
+
+    memcpy(&rings, s->cmd_buffer, sizeof(rings));
+    if (rings.reqRingNumPages == 0 ||
+        rings.reqRingNumPages > max_req_pages ||
+        rings.cmpRingNumPages == 0 ||
+        rings.cmpRingNumPages > max_cmp_pages) {
+        return -1;
+    }
+    s->rings = rings;
+
+    /* Only a power-of-two number of entries is used, so that ring indices
+     * can be reduced with a mask.  */
+    s->reqNumEntriesLog2 = floor_log2(s->rings.reqRingNumPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingReqDesc));
+    s->cmpNumEntriesLog2 = floor_log2(s->rings.cmpRingNumPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingCmpDesc));
+
+    trace_pvscsi_setup_req_ring(s->rings.reqRingNumPages,
+                                1 << s->reqNumEntriesLog2);
+    trace_pvscsi_setup_cmp_ring(s->rings.cmpRingNumPages,
+                                1 << s->cmpNumEntriesLog2);
+
+    pvscsi_st_ring_state(s, reqNumEntriesLog2, s->reqNumEntriesLog2);
+    pvscsi_st_ring_state(s, cmpNumEntriesLog2, s->cmpNumEntriesLog2);
+    pvscsi_st_ring_state(s, cmpProdIdx, 0);
+    pvscsi_st_ring_state(s, cmpConsIdx, 0);
+    pvscsi_st_ring_state(s, reqProdIdx, 0);
+    pvscsi_st_ring_state(s, reqConsIdx, 0);
+    return 0;
+}
+
+/* RESET_BUS command: reset everything attached to the SCSI bus while
+ * leaving the adapter's own register and ring state alone.  Always
+ * reports success.  */
+static int32_t pvscsi_cmd_reset_bus(PVSCSIState *s)
+{
+    qbus_reset_all(&s->bus.qbus);
+    return 0;
+}
+
+/* RESET_DEVICE command: invoke the reset handler of the device addressed
+ * by the descriptor in cmd_buffer.  Missing devices and devices without a
+ * reset handler are silently ignored; the command always reports
+ * success.  */
+static int32_t pvscsi_cmd_reset_device(PVSCSIState *s)
+{
+    struct PVSCSICmdDescResetDevice *desc;
+    SCSIDevice *target_dev;
+
+    desc = (struct PVSCSICmdDescResetDevice *) s->cmd_buffer;
+    pvscsi_get_dev_lun(s, desc->lun, desc->target, &target_dev);
+    if (target_dev == NULL || !target_dev->info->qdev.reset) {
+        return 0;
+    }
+
+    target_dev->info->qdev.reset(&target_dev->qdev);
+    return 0;
+}
+
+/* ABORT_CMD command: currently a stub.  Nothing is aborted, but success
+ * is reported to the guest.  */
+static int32_t pvscsi_cmd_abort_cmd(PVSCSIState *s)
+{
+    return 0;
+}
+
+/* SETUP_MSG_RING command: record the message ring described in cmd_buffer
+ * and initialize its fields in the ring state page.  Returns -1 if the
+ * ring has no pages, 0 otherwise.
+ *
+ * NOTE(review): numPages is guest-supplied and has no upper bound here;
+ * it should probably be limited to the size of the descriptor's page
+ * array before the message ring is put to use -- confirm.  */
+static int32_t pvscsi_cmd_setup_msg_ring(PVSCSIState *s)
+{
+    memcpy(&s->msgRing, s->cmd_buffer, sizeof(s->msgRing));
+    if (s->msgRing.numPages == 0) {
+        return -1;
+    }
+
+    s->msgNumEntriesLog2 = floor_log2(s->msgRing.numPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingMsgDesc));
+
+    trace_pvscsi_setup_msg_ring(s->msgRing.numPages,
+                                1 << s->msgNumEntriesLog2);
+
+    pvscsi_st_ring_state(s, msgNumEntriesLog2, s->msgNumEntriesLog2);
+    pvscsi_st_ring_state(s, msgProdIdx, 0);
+    pvscsi_st_ring_state(s, msgConsIdx, 0);
+    return 0;
+}
+
+/* Description of one adapter command.  */
+typedef struct {
+    /* Number of 32-bit argument words to collect before running fn.  */
+    int nargs;
+    /* Handler; its return value becomes the command status register.  */
+    int32_t (*fn)(PVSCSIState *);
+} PVSCSICmd;
+
+/* Command dispatch table, indexed by command number.  Commands with
+ * nargs == 0 run as soon as they are written to the COMMAND register.
+ * The PVSCSI_CMD_FIRST slot doubles as the handler for out-of-range
+ * command numbers (see pvscsi_maybe_do_cmd).  */
+static const PVSCSICmd pvscsi_commands[PVSCSI_CMD_LAST] = {
+    [PVSCSI_CMD_FIRST] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_bad,
+    },
+    [PVSCSI_CMD_ADAPTER_RESET] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_adapter_reset
+    },
+    [PVSCSI_CMD_ISSUE_SCSI] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    },
+    [PVSCSI_CMD_SETUP_RINGS] = {
+        .nargs = sizeof(struct PVSCSICmdDescSetupRings) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_setup_rings
+    },
+    [PVSCSI_CMD_RESET_BUS] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_reset_bus
+    },
+    [PVSCSI_CMD_RESET_DEVICE] = {
+        .nargs = sizeof(struct PVSCSICmdDescResetDevice) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_reset_device
+    },
+    [PVSCSI_CMD_ABORT_CMD] = {
+        .nargs = sizeof(struct PVSCSICmdDescAbortCmd) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_abort_cmd
+    },
+    [PVSCSI_CMD_CONFIG] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    },
+    [PVSCSI_CMD_SETUP_MSG_RING] = {
+        .nargs = sizeof(struct PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_setup_msg_ring
+    },
+    [PVSCSI_CMD_DEVICE_UNPLUG] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    }
+};
+
+\f

+/* Run the latched command once all of its argument words have arrived.
+ * Command numbers outside the table are mapped to the PVSCSI_CMD_FIRST
+ * entry.  After execution the status register holds the handler's result
+ * and the command latch and argument counter are cleared.  */
+static void pvscsi_maybe_do_cmd(PVSCSIState *s)
+{
+    const PVSCSICmd *entry;
+    int cmd;
+
+    if (s->cmd_latch >= PVSCSI_CMD_LAST) {
+        cmd = PVSCSI_CMD_FIRST;
+    } else {
+        cmd = s->cmd_latch;
+    }
+    entry = &pvscsi_commands[cmd];
+
+    if (s->cmd_ptr < entry->nargs) {
+        /* Still waiting for more argument words.  */
+        return;
+    }
+
+    s->cmd_status = entry->fn(s);
+    s->cmd_latch = 0;
+    s->cmd_ptr = 0;
+}
+
+/* Read a 32-bit adapter register.  Only the command status, interrupt
+ * status and interrupt mask registers return state; write-only and
+ * unsupported registers log a message, and every other offset reads as
+ * zero.  */
+static uint32_t pvscsi_reg_readl(PVSCSIState *s, int offset)
+{
+    uint32_t value = 0;
+
+    switch (offset) {
+    case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        value = s->cmd_status;
+        break;
+    case PVSCSI_REG_OFFSET_INTR_STATUS:
+        value = s->intr_status;
+        break;
+    case PVSCSI_REG_OFFSET_INTR_MASK:
+        value = s->intr_mask;
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND:
+    case PVSCSI_REG_OFFSET_COMMAND_DATA:
+    case PVSCSI_REG_OFFSET_KICK_NON_RW_IO:
+    case PVSCSI_REG_OFFSET_KICK_RW_IO:
+        fprintf(stderr, "vmw_pvscsi: read to write-only register %x\n", offset);
+        break;
+    case PVSCSI_REG_OFFSET_LAST_STS_0:
+    case PVSCSI_REG_OFFSET_LAST_STS_1:
+    case PVSCSI_REG_OFFSET_LAST_STS_2:
+    case PVSCSI_REG_OFFSET_LAST_STS_3:
+    case PVSCSI_REG_OFFSET_DEBUG:
+        fprintf(stderr, "vmw_pvscsi: read from unsupported register %x\n", offset);
+        break;
+    default:
+        break;
+    }
+    return value;
+}
+
+/* Handle a guest write of @size bytes to the register at @offset.
+ *
+ * NOTE(review): a write narrower than 32 bits to one of the command or
+ * interrupt registers aborts QEMU, which means the guest can trigger the
+ * abort -- confirm whether such writes should be ignored instead.  */
+static void pvscsi_reg_write(PVSCSIState *s, int offset, uint32_t val, int size)
+{
+    if (size != 4) {
+        switch (offset) {
+        case PVSCSI_REG_OFFSET_COMMAND:
+        case PVSCSI_REG_OFFSET_COMMAND_DATA:
+        case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        case PVSCSI_REG_OFFSET_INTR_STATUS:
+        case PVSCSI_REG_OFFSET_INTR_MASK:
+            abort();
+        default:
+            break;
+        }
+    }
+
+    switch (offset) {
+    case PVSCSI_REG_OFFSET_COMMAND:
+        /* Latch a new command; it runs right away if it takes no
+         * arguments.  */
+        trace_pvscsi_cmd(val);
+        s->cmd_latch = val;
+        s->cmd_ptr = 0;
+        pvscsi_maybe_do_cmd(s);
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND_DATA:
+        /* Collect one argument word.  cmd_ptr stays inside cmd_buffer
+         * because pvscsi_maybe_do_cmd() resets it as soon as the latched
+         * command has all of its words.  */
+        s->cmd_buffer[s->cmd_ptr++] = val;
+        pvscsi_maybe_do_cmd(s);
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        fprintf(stderr, "vmw_pvscsi: write to read-only register %x\n", offset);
+        break;
+    case PVSCSI_REG_OFFSET_INTR_STATUS:
+        pvscsi_acknowledge_intr(s, val);
+        break;
+    case PVSCSI_REG_OFFSET_INTR_MASK:
+        pvscsi_set_intr_mask(s, val);
+        break;
+    case PVSCSI_REG_OFFSET_KICK_NON_RW_IO:
+    case PVSCSI_REG_OFFSET_KICK_RW_IO:
+        /* Both doorbells are handled identically.  */
+        pvscsi_process_req_ring(s);
+        break;
+
+    case PVSCSI_REG_OFFSET_LAST_STS_0:
+    case PVSCSI_REG_OFFSET_LAST_STS_1:
+    case PVSCSI_REG_OFFSET_LAST_STS_2:
+    case PVSCSI_REG_OFFSET_LAST_STS_3:
+    case PVSCSI_REG_OFFSET_DEBUG:
+        fprintf(stderr, "vmw_pvscsi: write to unsupported register %x\n", offset);
+        break;
+    default:
+            break;
+    }
+}
+
+/* 8-bit MMIO write handler: reduce the address to an offset inside the
+ * register window and forward it as a 1-byte register write.  */
+static void pvscsi_mmio_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    pvscsi_reg_write(opaque, addr & (PVSCSI_MEM_SPACE_SIZE - 1), val, 1);
+}
+
+/* 16-bit MMIO write handler: reduce the address to an offset inside the
+ * register window and forward it as a 2-byte register write.  */
+static void pvscsi_mmio_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    pvscsi_reg_write(opaque, addr & (PVSCSI_MEM_SPACE_SIZE - 1), val, 2);
+}
+
+/* 32-bit MMIO write handler: reduce the address to an offset inside the
+ * register window and forward it as a 4-byte register write.  */
+static void pvscsi_mmio_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    pvscsi_reg_write(opaque, addr & (PVSCSI_MEM_SPACE_SIZE - 1), val, 4);
+}
+
+/* 8-bit MMIO reads are not supported.
+ * NOTE(review): this aborts QEMU on a guest access -- confirm whether
+ * returning a dummy value would be preferable.  */
+static uint32_t pvscsi_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    abort();
+}
+
+/* 16-bit MMIO reads are not supported.
+ * NOTE(review): this aborts QEMU on a guest access -- confirm whether
+ * returning a dummy value would be preferable.  */
+static uint32_t pvscsi_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    abort();
+}
+
+/* 32-bit MMIO read handler: reduce the address to an offset inside the
+ * register window and return the value of that register.  */
+static uint32_t pvscsi_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    return pvscsi_reg_readl(opaque, addr & (PVSCSI_MEM_SPACE_SIZE - 1));
+}
+
+/* MMIO read handlers for byte, word and long accesses, in that order.  */
+static CPUReadMemoryFunc * const pvscsi_mmio_readfn[3] = {
+    pvscsi_mmio_readb,
+    pvscsi_mmio_readw,
+    pvscsi_mmio_readl,
+};
+
+/* MMIO write handlers for byte, word and long accesses, in that order.  */
+static CPUWriteMemoryFunc * const pvscsi_mmio_writefn[3] = {
+    pvscsi_mmio_writeb,
+    pvscsi_mmio_writew,
+    pvscsi_mmio_writel,
+};
+
+/* qdev reset handler: performs the same soft reset as the guest-issued
+ * ADAPTER_RESET command.  */
+static void pvscsi_reset(DeviceState *dev)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, dev);
+
+    pvscsi_soft_reset(s);
+}
+
+/* PCI exit handler: release everything pvscsi_init() set up.
+ *
+ * The completion bottom half is deleted so that it can no longer run with
+ * a pointer to the freed device state, and completions that never made it
+ * to the guest are freed.  */
+static int pvscsi_uninit(PCIDevice *d)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev, d);
+
+    qemu_bh_delete(s->complete_reqs_bh);
+    pvscsi_free_queue(&s->complete_queue);
+    cpu_unregister_io_memory(s->mmio_io_addr);
+
+    return 0;
+}
+
+/* Callbacks through which the SCSI layer reports progress on requests
+ * issued by this adapter.  */
+static struct SCSIBusOps pvscsi_scsi_ops = {
+    .transfer_data = pvscsi_transfer_data,
+    .complete = pvscsi_command_complete,
+    .cancel = pvscsi_request_cancelled
+};
+
+/* PCI init handler: fill in the configuration space, register the MMIO
+ * register window as BAR 0, create the completion bottom half and the
+ * SCSI bus, and attach drives given on the command line unless the device
+ * was hotplugged.  */
+static int pvscsi_init(PCIDevice *dev)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev, dev);
+    uint8_t *pci_conf;
+
+    pci_conf = s->dev.config;
+
+    pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_VMWARE);
+    pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_VMWARE_PVSCSI);
+    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_SCSI);
+
+    /* PCI subsystem ID */
+    pci_conf[PCI_SUBSYSTEM_ID] = 0x00;
+    pci_conf[PCI_SUBSYSTEM_ID + 1] = 0x10;
+
+    /* PCI latency timer = 255 */
+    pci_conf[PCI_LATENCY_TIMER] = 0xff;
+
+    /* Interrupt pin 1 */
+    pci_conf[PCI_INTERRUPT_PIN] = 0x01;
+
+    s->mmio_io_addr = cpu_register_io_memory(pvscsi_mmio_readfn,
+                                             pvscsi_mmio_writefn, s,
+                                             DEVICE_NATIVE_ENDIAN);
+    pci_register_bar_simple(&s->dev, 0, PVSCSI_MEM_SPACE_SIZE,
+                            0, s->mmio_io_addr);
+
+    /* Port I/O access to the registers is not implemented; the disabled
+     * code below is a placeholder.  */
+#if 0
+    s->pio_io_addr = cpu_register_io_memory(pvscsi_mmio_readfn,
+                                             pvscsi_mmio_writefn, s,
+                                             DEVICE_NATIVE_ENDIAN);
+    pci_register_bar(&s->dev, 1, 256, PCI_BASE_ADDRESS_SPACE_IO,
+		     pvscsi_io_mapfunc);
+#endif
+
+    s->complete_reqs_bh = qemu_bh_new(pvscsi_complete_reqs, s);
+
+    scsi_bus_new(&s->bus, &dev->qdev, 1, PVSCSI_MAX_DEVS,
+                 &pvscsi_scsi_ops);
+    if (!dev->qdev.hotplugged) {
+        return scsi_bus_legacy_handle_cmdline(&s->bus);
+    }
+    return 0;
+}
+
+/* qdev/PCI description of the device, registered under the name
+ * "vmw_pvscsi".  */
+static PCIDeviceInfo pvscsi_info = {
+    .qdev.name  = "vmw_pvscsi",
+    .qdev.size  = sizeof(PVSCSIState),
+    .qdev.reset = pvscsi_reset,
+    .init       = pvscsi_init,
+    .exit       = pvscsi_uninit,
+};
+
+/* Register the device model with qdev; hooked up through device_init()
+ * below.  */
+static void vmw_pvscsi_register_devices(void)
+{
+    pci_qdev_register(&pvscsi_info);
+}
+
+device_init(vmw_pvscsi_register_devices);
diff --git a/hw/vmw_pvscsi.h b/hw/vmw_pvscsi.h
new file mode 100644
index 0000000..b7fa3f6
--- /dev/null
+++ b/hw/vmw_pvscsi.h
@@ -0,0 +1,389 @@
+/*
+ * VMware PVSCSI header file
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _VMW_PVSCSI_H_
+#define _VMW_PVSCSI_H_
+
+#define PVSCSI_MAX_NUM_SG_ENTRIES_PER_SEGMENT 128
+
+#define MASK(n)        ((1 << (n)) - 1)        /* make an n-bit mask */
+
+#define __packed __attribute__((packed))
+
+/*
+ * host adapter status/error codes
+ */
+enum HostBusAdapterStatus {
+   BTSTAT_SUCCESS       = 0x00,  /* CCB complete normally with no errors */
+   BTSTAT_LINKED_COMMAND_COMPLETED           = 0x0a,
+   BTSTAT_LINKED_COMMAND_COMPLETED_WITH_FLAG = 0x0b,
+   BTSTAT_DATA_UNDERRUN = 0x0c,
+   BTSTAT_SELTIMEO      = 0x11,  /* SCSI selection timeout */
+   BTSTAT_DATARUN       = 0x12,  /* data overrun/underrun */
+   BTSTAT_BUSFREE       = 0x13,  /* unexpected bus free */
+   BTSTAT_INVPHASE      = 0x14,  /* invalid bus phase or sequence requested by target */
+   BTSTAT_LUNMISMATCH   = 0x17,  /* linked CCB has different LUN from first CCB */
+   BTSTAT_SENSFAILED    = 0x1b,  /* auto request sense failed */
+   BTSTAT_TAGREJECT     = 0x1c,  /* SCSI II tagged queueing message rejected by target */
+   BTSTAT_BADMSG        = 0x1d,  /* unsupported message received by the host adapter */
+   BTSTAT_HAHARDWARE    = 0x20,  /* host adapter hardware failed */
+   BTSTAT_NORESPONSE    = 0x21,  /* target did not respond to SCSI ATN, sent a SCSI RST */
+   BTSTAT_SENTRST       = 0x22,  /* host adapter asserted a SCSI RST */
+   BTSTAT_RECVRST       = 0x23,  /* other SCSI devices asserted a SCSI RST */
+   BTSTAT_DISCONNECT    = 0x24,  /* target device reconnected improperly (w/o tag) */
+   BTSTAT_BUSRESET      = 0x25,  /* host adapter issued BUS device reset */
+   BTSTAT_ABORTQUEUE    = 0x26,  /* abort queue generated */
+   BTSTAT_HASOFTWARE    = 0x27,  /* host adapter software error */
+   BTSTAT_HATIMEOUT     = 0x30,  /* host adapter hardware timeout error */
+   BTSTAT_SCSIPARITY    = 0x34,  /* SCSI parity error detected */
+};
+
+/*
+ * Register offsets.
+ *
+ * These registers are accessible both via i/o space and mm i/o.
+ */
+
+enum PVSCSIRegOffset {
+	PVSCSI_REG_OFFSET_COMMAND        =    0x0,
+	PVSCSI_REG_OFFSET_COMMAND_DATA   =    0x4,
+	PVSCSI_REG_OFFSET_COMMAND_STATUS =    0x8,
+	PVSCSI_REG_OFFSET_LAST_STS_0     =  0x100,
+	PVSCSI_REG_OFFSET_LAST_STS_1     =  0x104,
+	PVSCSI_REG_OFFSET_LAST_STS_2     =  0x108,
+	PVSCSI_REG_OFFSET_LAST_STS_3     =  0x10c,
+	PVSCSI_REG_OFFSET_INTR_STATUS    = 0x100c,
+	PVSCSI_REG_OFFSET_INTR_MASK      = 0x2010,
+	PVSCSI_REG_OFFSET_KICK_NON_RW_IO = 0x3014,
+	PVSCSI_REG_OFFSET_DEBUG          = 0x3018,
+	PVSCSI_REG_OFFSET_KICK_RW_IO     = 0x4018,
+};
+
+/*
+ * Virtual h/w commands.
+ */
+
+enum PVSCSICommands {
+	PVSCSI_CMD_FIRST             = 0, /* has to be first */
+
+	PVSCSI_CMD_ADAPTER_RESET     = 1,
+	PVSCSI_CMD_ISSUE_SCSI        = 2,
+	PVSCSI_CMD_SETUP_RINGS       = 3,
+	PVSCSI_CMD_RESET_BUS         = 4,
+	PVSCSI_CMD_RESET_DEVICE      = 5,
+	PVSCSI_CMD_ABORT_CMD         = 6,
+	PVSCSI_CMD_CONFIG            = 7,
+	PVSCSI_CMD_SETUP_MSG_RING    = 8,
+	PVSCSI_CMD_DEVICE_UNPLUG     = 9,
+
+	PVSCSI_CMD_LAST              = 10  /* has to be last */
+};
+
+/*
+ * Command descriptor for PVSCSI_CMD_RESET_DEVICE --
+ */
+
+struct PVSCSICmdDescResetDevice {
+	uint32_t	target;
+	uint8_t		lun[8];
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_ABORT_CMD --
+ *
+ * - currently does not support specifying the LUN.
+ * - _pad should be 0.
+ */
+
+struct PVSCSICmdDescAbortCmd {
+	uint64_t	context;
+	uint32_t	target;
+	uint32_t	_pad;
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_SETUP_RINGS --
+ *
+ * Notes:
+ * - reqRingNumPages and cmpRingNumPages need to be power of two.
+ * - reqRingNumPages and cmpRingNumPages need to be different from 0,
+ * - reqRingNumPages and cmpRingNumPages need to be inferior to
+ *   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES.
+ */
+
+#define PVSCSI_SETUP_RINGS_MAX_NUM_PAGES        32
+struct PVSCSICmdDescSetupRings {
+	uint32_t	reqRingNumPages;
+	uint32_t	cmpRingNumPages;
+	uint64_t	ringsStatePPN;
+	uint64_t	reqRingPPNs[PVSCSI_SETUP_RINGS_MAX_NUM_PAGES];
+	uint64_t	cmpRingPPNs[PVSCSI_SETUP_RINGS_MAX_NUM_PAGES];
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_SETUP_MSG_RING --
+ *
+ * Notes:
+ * - this command was not supported in the initial revision of the h/w
+ *   interface. Before using it, you need to check that it is supported by
+ *   writing PVSCSI_CMD_SETUP_MSG_RING to the 'command' register, then
+ *   immediately after read the 'command status' register:
+ *       * a value of -1 means that the cmd is NOT supported,
+ *       * a value != -1 means that the cmd IS supported.
+ *   If it's supported the 'command status' register should return:
+ *      sizeof(PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t).
+ * - this command should be issued _after_ the usual SETUP_RINGS so that the
+ *   RingsState page is already setup. If not, the command is a nop.
+ * - numPages needs to be a power of two,
+ * - numPages needs to be different from 0,
+ * - _pad should be zero.
+ */
+
+#define PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES  16
+
+struct PVSCSICmdDescSetupMsgRing {
+	uint32_t	numPages;
+	uint32_t	_pad;
+	uint64_t	ringPPNs[PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES];
+} __packed;
+
+enum PVSCSIMsgType {
+	PVSCSI_MSG_DEV_ADDED          = 0,
+	PVSCSI_MSG_DEV_REMOVED        = 1,
+	PVSCSI_MSG_LAST               = 2,
+};
+
+/*
+ * Msg descriptor.
+ *
+ * sizeof(struct PVSCSIRingMsgDesc) == 128.
+ *
+ * - type is of type enum PVSCSIMsgType.
+ * - the content of args depends on the type of event being delivered.
+ */
+
+struct PVSCSIRingMsgDesc {
+	uint32_t	type;
+	uint32_t	args[31];
+} __packed;
+
+struct PVSCSIMsgDescDevStatusChanged {
+	uint32_t	type;  /* PVSCSI_MSG_DEV _ADDED / _REMOVED */
+	uint32_t	bus;
+	uint32_t	target;
+	uint8_t		lun[8];
+	uint32_t	pad[27];
+} __packed;
+
+/*
+ * Rings state.
+ *
+ * - the fields:
+ *    . msgProdIdx,
+ *    . msgConsIdx,
+ *    . msgNumEntriesLog2,
+ *   .. are only used once the SETUP_MSG_RING cmd has been issued.
+ * - '_pad' helps to ensure that the msg related fields are on their own
+ *   cache-line.
+ */
+
+struct PVSCSIRingsState {
+	uint32_t	reqProdIdx;
+	uint32_t	reqConsIdx;
+	uint32_t	reqNumEntriesLog2;
+
+	uint32_t	cmpProdIdx;
+	uint32_t	cmpConsIdx;
+	uint32_t	cmpNumEntriesLog2;
+
+	uint8_t		_pad[104];
+
+	uint32_t	msgProdIdx;
+	uint32_t	msgConsIdx;
+	uint32_t	msgNumEntriesLog2;
+} __packed;
+
+/*
+ * Request descriptor.
+ *
+ * sizeof(RingReqDesc) = 128
+ *
+ * - context: is a unique identifier of a command. It could normally be any
+ *   64bit value, however we currently store it in the serialNumber variable
+ *   of struct SCSI_Command, so we have the following restrictions due to the
+ *   way this field is handled in the vmkernel storage stack:
+ *    * this value can't be 0,
+ *    * the upper 32 bits need to be 0 since serialNumber is a uint32_t.
+ *   Currently tracked as PR 292060.
+ * - dataLen: contains the total number of bytes that need to be transferred.
+ * - dataAddr:
+ *   * if PVSCSI_FLAG_CMD_WITH_SG_LIST is set: dataAddr is the PA of the first
+ *     s/g table segment, each s/g segment is entirely contained on a single
+ *     page of physical memory,
+ *   * if PVSCSI_FLAG_CMD_WITH_SG_LIST is NOT set, then dataAddr is the PA of
+ *     the buffer used for the DMA transfer,
+ * - flags:
+ *   * PVSCSI_FLAG_CMD_WITH_SG_LIST: see dataAddr above,
+ *   * PVSCSI_FLAG_CMD_DIR_NONE: no DMA involved,
+ *   * PVSCSI_FLAG_CMD_DIR_TOHOST: transfer from device to main memory,
+ *   * PVSCSI_FLAG_CMD_DIR_TODEVICE: transfer from main memory to device,
+ *   * PVSCSI_FLAG_CMD_OUT_OF_BAND_CDB: reserved to handle CDBs larger than
+ *     16bytes. To be specified.
+ * - vcpuHint: vcpuId of the processor that will be most likely waiting for the
+ *   completion of the i/o. For guest OSes that use lowest priority message
+ *   delivery mode (such as windows), we use this "hint" to deliver the
+ *   completion action to the proper vcpu. For now, we can use the vcpuId of
+ *   the processor that initiated the i/o as a likely candidate for the vcpu
+ *   that will be waiting for the completion.
+ * - bus should be 0: we only support bus 0 for now.
+ * - unused should be zero'd.
+ */
+
+#define PVSCSI_FLAG_CMD_WITH_SG_LIST        (1 << 0)
+#define PVSCSI_FLAG_CMD_OUT_OF_BAND_CDB     (1 << 1)
+#define PVSCSI_FLAG_CMD_DIR_NONE            (1 << 2)
+#define PVSCSI_FLAG_CMD_DIR_TOHOST          (1 << 3)
+#define PVSCSI_FLAG_CMD_DIR_TODEVICE        (1 << 4)
+
+struct PVSCSIRingReqDesc {
+	uint64_t	context;
+	uint64_t	dataAddr;
+	uint64_t	dataLen;
+	uint64_t	senseAddr;
+	uint32_t	senseLen;
+	uint32_t	flags;
+	uint8_t		cdb[16];
+	uint8_t		cdbLen;
+	uint8_t		lun[8];
+	uint8_t		tag;
+	uint8_t		bus;
+	uint8_t		target;
+	uint8_t		vcpuHint;
+	uint8_t		unused[59];
+} __packed;
+
+/*
+ * Scatter-gather list management.
+ *
+ * As described above, when PVSCSI_FLAG_CMD_WITH_SG_LIST is set in the
+ * RingReqDesc.flags, then RingReqDesc.dataAddr is the PA of the first s/g
+ * table segment.
+ *
+ * - each segment of the s/g table contains a succession of struct
+ *   PVSCSISGElement.
+ * - each segment is entirely contained on a single physical page of memory.
+ * - a "chain" s/g element has the flag PVSCSI_SGE_FLAG_CHAIN_ELEMENT set in
+ *   PVSCSISGElement.flags and in this case:
+ *     * addr is the PA of the next s/g segment,
+ *     * length is undefined, assumed to be 0.
+ */
+
+struct PVSCSISGElement {
+	uint64_t	addr;
+	uint32_t	length;
+	uint32_t	flags;
+} __packed;
+
+/*
+ * Completion descriptor.
+ *
+ * sizeof(RingCmpDesc) = 32
+ *
+ * - context: identifier of the command. The same thing that was specified
+ *   under "context" as part of struct RingReqDesc at initiation time,
+ * - dataLen: number of bytes transferred for the actual i/o operation,
+ * - senseLen: number of bytes written into the sense buffer,
+ * - hostStatus: adapter status,
+ * - scsiStatus: device status,
+ * - _pad should be zero.
+ */
+
+struct PVSCSIRingCmpDesc {
+	uint64_t	context;
+	uint64_t	dataLen;
+	uint32_t	senseLen;
+	uint16_t	hostStatus;
+	uint16_t	scsiStatus;
+	uint32_t	_pad[2];
+} __packed;
+
+/*
+ * Interrupt status / IRQ bits.
+ */
+
+#define PVSCSI_INTR_CMPL_0                 (1 << 0)
+#define PVSCSI_INTR_CMPL_1                 (1 << 1)
+#define PVSCSI_INTR_CMPL_MASK              MASK(2)
+
+#define PVSCSI_INTR_MSG_0                  (1 << 2)
+#define PVSCSI_INTR_MSG_1                  (1 << 3)
+#define PVSCSI_INTR_MSG_MASK               (MASK(2) << 2)
+
+#define PVSCSI_INTR_ALL_SUPPORTED          MASK(4)
+
+/*
+ * Number of MSI-X vectors supported.
+ */
+#define PVSCSI_MAX_INTRS        24
+
+/*
+ * Enumeration of supported MSI-X vectors
+ */
+#define PVSCSI_VECTOR_COMPLETION   0
+
+/*
+ * Misc constants for the rings.
+ */
+
+#define PVSCSI_MAX_NUM_PAGES_REQ_RING   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES
+#define PVSCSI_MAX_NUM_PAGES_CMP_RING   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES
+#define PVSCSI_MAX_NUM_PAGES_MSG_RING   PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES
+
+#define PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE \
+				(PAGE_SIZE / sizeof(struct PVSCSIRingReqDesc))
+
+#define PVSCSI_MAX_REQ_QUEUE_DEPTH \
+	(PVSCSI_MAX_NUM_PAGES_REQ_RING * PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE)
+
+#define PVSCSI_MEM_SPACE_COMMAND_NUM_PAGES     1
+#define PVSCSI_MEM_SPACE_INTR_STATUS_NUM_PAGES 1
+#define PVSCSI_MEM_SPACE_MISC_NUM_PAGES        2
+#define PVSCSI_MEM_SPACE_KICK_IO_NUM_PAGES     2
+#define PVSCSI_MEM_SPACE_MSIX_NUM_PAGES        2
+
+enum PVSCSIMemSpace {
+	PVSCSI_MEM_SPACE_COMMAND_PAGE		= 0,
+	PVSCSI_MEM_SPACE_INTR_STATUS_PAGE	= 1,
+	PVSCSI_MEM_SPACE_MISC_PAGE		= 2,
+	PVSCSI_MEM_SPACE_KICK_IO_PAGE		= 4,
+	PVSCSI_MEM_SPACE_MSIX_TABLE_PAGE	= 6,
+	PVSCSI_MEM_SPACE_MSIX_PBA_PAGE		= 7,
+};
+
+#define PVSCSI_MEM_SPACE_NUM_PAGES \
+	(PVSCSI_MEM_SPACE_COMMAND_NUM_PAGES +       \
+	 PVSCSI_MEM_SPACE_INTR_STATUS_NUM_PAGES +   \
+	 PVSCSI_MEM_SPACE_MISC_NUM_PAGES +          \
+	 PVSCSI_MEM_SPACE_KICK_IO_NUM_PAGES +       \
+	 PVSCSI_MEM_SPACE_MSIX_NUM_PAGES)
+
+#define PVSCSI_MEM_SPACE_SIZE        (PVSCSI_MEM_SPACE_NUM_PAGES * PAGE_SIZE)
+
+#endif /* _VMW_PVSCSI_H_ */
diff --git a/trace-events b/trace-events
index e0e9574..b4af0fd 100644
--- a/trace-events
+++ b/trace-events
@@ -214,6 +214,21 @@ disable scsi_req_parsed(int target, int lun, int tag, int cmd, int mode, int xfe
 disable scsi_req_parsed_lba(int target, int lun, int tag, int cmd, uint64_t lba) "target %d lun %d tag %d command %d lba %"PRIu64""
 disable scsi_req_parse_bad(int target, int lun, int tag, int cmd) "target %d lun %d tag %d command %d"
 
+# hw/vmw_pvscsi.c
+disable pvscsi_queue_request(uint64_t context, uint8_t command, uint64_t dataLen) "context %"PRIu64" command %d length %"PRIu64""
+disable pvscsi_sg_elem(uint64_t context, uint64_t addr, uint64_t length) "context %"PRIu64" addr %"PRIu64" length %"PRIu64""
+disable pvscsi_transfer_data(uint64_t context, uint64_t length) "context %"PRIu64" length %"PRIu64""
+disable pvscsi_request_sense(uint64_t context, int lun) "context %"PRIu64" lun %d"
+disable pvscsi_kick_io(void) "kick request ring"
+disable pvscsi_complete_req(uint64_t context, uint64_t length, uint8_t sense) "context %"PRIu64" length %"PRIu64" sense %d"
+disable pvscsi_cmp_ring_put(uint64_t context) "context %"PRIu64""
+disable pvscsi_raise_intr(uint32_t intr, const char *state) "raised intr %d %s"
+disable pvscsi_acknowledge_intr(uint32_t intr) "acknowledged intr %d"
+disable pvscsi_setup_req_ring(uint32_t pages, uint32_t entries) "req ring - %d pages %d entries"
+disable pvscsi_setup_cmp_ring(uint32_t pages, uint32_t entries) "cmp ring - %d pages %d entries"
+disable pvscsi_setup_msg_ring(uint32_t pages, uint32_t entries) "msg ring - %d pages %d entries"
+disable pvscsi_cmd(int cmd) "command %d"
+
 # vl.c
 disable vm_state_notify(int running, int reason) "running %d reason %d"
 
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 3/9] pvscsi: check validity of DMA addresses in advance
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 1/9] make qbus_reset_all public Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 2/9] pvscsi: first commit Paolo Bonzini
@ 2011-06-06 16:26 ` Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 4/9] scsi: always use get_sense Paolo Bonzini
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

This also depends on the cpu_physical_memory_map_fast series, so I'm
keeping it separate.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/vmw_pvscsi.c |   35 +++++++++++++++++++++++++++++++++++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/hw/vmw_pvscsi.c b/hw/vmw_pvscsi.c
index 6f0b0b9..84bcfe6 100644
--- a/hw/vmw_pvscsi.c
+++ b/hw/vmw_pvscsi.c
@@ -383,6 +383,35 @@ static void pvscsi_transfer_data_with_sg_list(PVSCSIRequest *p, bool to_host,
     }
 }
 
+static bool pvscsi_check_sg_list_addresses(PVSCSIRequest *p)
+{
+    uint64_t len = p->req.dataLen;
+    uint64_t n;
+    PVSCSISGState sg = { .elemAddr = p->req.dataAddr };
+    while (len) {
+        while (!sg.resid) {
+            pvscsi_get_next_sg_elem(&sg);
+        }
+        n = MIN(len, sg.resid);
+        if (!cpu_physical_memory_map_check(sg.dataAddr, n)) {
+            return false;
+        }
+
+        len -= n;
+	sg.resid -= n;
+    }
+    return true;
+}
+
+static bool pvscsi_check_addresses(PVSCSIRequest *p)
+{
+    if (p->req.flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+        return pvscsi_check_sg_list_addresses(p);
+    } else {
+        return cpu_physical_memory_map_check(p->req.dataAddr, p->req.dataLen);
+    }
+}
+
 /* Callback to indicate that the SCSI layer has completed a transfer.  */
 static void pvscsi_transfer_data(SCSIRequest *req, uint32_t len)
 {
@@ -487,6 +516,12 @@ static void pvscsi_process_req(PVSCSIState *s, struct PVSCSIRingReqDesc *r)
         return;
     }
 
+    if (!pvscsi_check_addresses(p)) {
+        p->cmp.hostStatus = BTSTAT_DATARUN;
+        pvscsi_complete_req(s, p);
+        return;
+    }
+
     if (r->flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
         p->sg.elemAddr = r->dataAddr;
     }
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 4/9] scsi: always use get_sense
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
                   ` (2 preceding siblings ...)
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 3/9] pvscsi: check validity of DMA addresses in advance Paolo Bonzini
@ 2011-06-06 16:26 ` Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 5/9] scsi-disk: lazily allocate bounce buffer Paolo Bonzini
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

vscsi and vmw_pvscsi support autosensing by providing sense data
directly in the response.  Previous patches added usage of get_sense,
but kept the older state machine approach that sent REQUEST SENSE
commands separately.  Remove it; from now on all SCSIDevices will
have to support autosensing (as well as REQUEST SENSE, of course).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/spapr_vscsi.c |   90 ++++++++++++-----------------------------------------
 hw/vmw_pvscsi.c  |   92 +++++++++++++----------------------------------------
 2 files changed, 43 insertions(+), 139 deletions(-)

diff --git a/hw/spapr_vscsi.c b/hw/spapr_vscsi.c
index 1c901ef..57b1c09 100644
--- a/hw/spapr_vscsi.c
+++ b/hw/spapr_vscsi.c
@@ -80,7 +80,6 @@ typedef struct vscsi_req {
     int                     active;
     long                    data_len;
     int                     writing;
-    int                     sensing;
     int                     senselen;
     uint8_t                 sense[SCSI_SENSE_BUF_SIZE];
 
@@ -445,40 +444,6 @@ static int vscsi_preprocess_desc(vscsi_req *req)
     return 0;
 }
 
-static void vscsi_send_request_sense(VSCSIState *s, vscsi_req *req)
-{
-    uint8_t *cdb = req->iu.srp.cmd.cdb;
-    int n;
-
-    n = scsi_req_get_sense(req->sreq, req->sense, sizeof(req->sense));
-    if (n) {
-        req->senselen = n;
-        vscsi_send_rsp(s, req, CHECK_CONDITION, 0, 0);
-        vscsi_put_req(s, req);
-        return;
-    }
-
-    dprintf("VSCSI: Got CHECK_CONDITION, requesting sense...\n");
-    cdb[0] = 3;
-    cdb[1] = 0;
-    cdb[2] = 0;
-    cdb[3] = 0;
-    cdb[4] = 96;
-    cdb[5] = 0;
-    req->sensing = 1;
-    n = scsi_req_enqueue(req->sreq, cdb);
-    dprintf("VSCSI: Queued request sense tag 0x%x\n", req->qtag);
-    if (n < 0) {
-        fprintf(stderr, "VSCSI: REQUEST_SENSE wants write data !?!?!?\n");
-        vscsi_makeup_sense(s, req, HARDWARE_ERROR, 0, 0);
-        scsi_req_abort(req->sreq, CHECK_CONDITION);
-        return;
-    } else if (n == 0) {
-        return;
-    }
-    scsi_req_continue(req->sreq);
-}
-
 /* Callback to indicate that the SCSI layer has completed a transfer.  */
 static void vscsi_transfer_data(SCSIRequest *sreq, uint32_t len)
 {
@@ -494,23 +459,6 @@ static void vscsi_transfer_data(SCSIRequest *sreq, uint32_t len)
         return;
     }
 
-    if (req->sensing) {
-        uint8_t *buf = scsi_req_get_buf(sreq);
-
-        len = MIN(len, SCSI_SENSE_BUF_SIZE);
-        dprintf("VSCSI: Sense data, %d bytes:\n", len);
-        dprintf("       %02x  %02x  %02x  %02x  %02x  %02x  %02x  %02x\n",
-                buf[0], buf[1], buf[2], buf[3],
-                buf[4], buf[5], buf[6], buf[7]);
-        dprintf("       %02x  %02x  %02x  %02x  %02x  %02x  %02x  %02x\n",
-                buf[8], buf[9], buf[10], buf[11],
-                buf[12], buf[13], buf[14], buf[15]);
-        memcpy(req->sense, buf, len);
-        req->senselen = len;
-        scsi_req_continue(req->sreq);
-        return;
-    }
-
     if (len) {
         buf = scsi_req_get_buf(sreq);
         rc = vscsi_srp_transfer_data(s, req, req->writing, buf, len);
@@ -541,28 +489,30 @@ static void vscsi_command_complete(SCSIRequest *sreq, uint32_t status)
         return;
     }
 
-    if (!req->sensing && status == CHECK_CONDITION) {
-        vscsi_send_request_sense(s, req);
-        return;
+    if (status == CHECK_CONDITION) {
+        req->senselen = scsi_req_get_sense(req->sreq, req->sense,
+					   sizeof(req->sense));
+        dprintf("VSCSI: Sense data, %d bytes:\n", len);
+        dprintf("       %02x  %02x  %02x  %02x  %02x  %02x  %02x  %02x\n",
+                req->sense[0], req->sense[1], req->sense[2], req->sense[3],
+                req->sense[4], req->sense[5], req->sense[6], req->sense[7]);
+        dprintf("       %02x  %02x  %02x  %02x  %02x  %02x  %02x  %02x\n",
+                req->sense[8], req->sense[9], req->sense[10], req->sense[11],
+                req->sense[12], req->sense[13], req->sense[14], req->sense[15]);
     }
 
-    if (req->sensing) {
-        dprintf("VSCSI: Sense done !\n");
-        status = CHECK_CONDITION;
-    } else {
-        dprintf("VSCSI: Command complete err=%d\n", status);
-        if (status == 0) {
-            /* We handle overflows, not underflows for normal commands,
-             * but hopefully nobody cares
-             */
-            if (req->writing) {
-                res_out = req->data_len;
-            } else {
-                res_in = req->data_len;
-            }
+    dprintf("VSCSI: Command complete err=%d\n", status);
+    if (status == 0) {
+        /* We handle overflows, not underflows for normal commands,
+         * but hopefully nobody cares
+         */
+        if (req->writing) {
+            res_out = req->data_len;
+        } else {
+            res_in = req->data_len;
         }
     }
-    vscsi_send_rsp(s, req, 0, res_in, res_out);
+    vscsi_send_rsp(s, req, status, res_in, res_out);
     vscsi_put_req(s, req);
 }
 
diff --git a/hw/vmw_pvscsi.c b/hw/vmw_pvscsi.c
index 84bcfe6..1cb6715 100644
--- a/hw/vmw_pvscsi.c
+++ b/hw/vmw_pvscsi.c
@@ -31,7 +31,6 @@ typedef struct PVSCSISGState {
 typedef struct PVSCSIRequest {
     SCSIDevice *sdev;
     SCSIRequest *sreq;
-    uint8_t sensing;
     uint8_t sense_key;
     uint8_t completed;
     int lun;
@@ -293,7 +292,6 @@ static void pvscsi_complete_req(PVSCSIState *s, PVSCSIRequest *p)
     qemu_bh_schedule(s->complete_reqs_bh);
 }
 
-/* Fetch sense data for a completed request.  */
 /* Write sense data for a completed request.  */
 static void pvscsi_write_sense(PVSCSIRequest *p, uint8_t *buf, int len)
 {
@@ -302,29 +300,6 @@ static void pvscsi_write_sense(PVSCSIRequest *p, uint8_t *buf, int len)
     cpu_physical_memory_write(p->req.senseAddr, buf, p->cmp.senseLen);
 }
 
-static bool pvscsi_send_request_sense(PVSCSIRequest *p)
-{
-    uint8_t cdb[6] = { 3, p->lun << 5, 0, 0, 96, 0 };
-    uint8_t sense[96];
-    int n;
-
-    n = scsi_req_get_sense(p->sreq, sense, sizeof(sense));
-    if (n) {
-        pvscsi_write_sense(p, sense, n);
-        return false;
-    }
-
-    trace_pvscsi_request_sense(p->sreq->tag, p->lun);
-    n = scsi_req_enqueue(p->sreq, cdb);
-    if (n < 0) {
-        /* should not happen, just leave sense data empty in this case. */
-        scsi_req_cancel(p->sreq);
-    } else if (n > 0) {
-        scsi_req_continue(p->sreq);
-    }
-    return true;
-}
-
 static void pvscsi_transfer_data_with_buffer(PVSCSIRequest *p, bool to_host,
                                              uint8_t *buf, int len)
 {
@@ -418,48 +393,33 @@ static void pvscsi_transfer_data(SCSIRequest *req, uint32_t len)
     PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, req->bus->qbus.parent);
     PVSCSIRequest *p = pvscsi_find_request(s, req);
     uint8_t *buf = scsi_req_get_buf(req);
+    int to_host = (p->req.flags & PVSCSI_FLAG_CMD_DIR_TOHOST) != 0;
 
     if (!p) {
         fprintf(stderr, "PVSCSI: Can't find request for tag 0x%x\n", req->tag);
         return;
     }
 
-    if (!p->sensing) {
-        int to_host = (p->req.flags & PVSCSI_FLAG_CMD_DIR_TOHOST) != 0;
-
-        assert(p->resid);
-        trace_pvscsi_transfer_data(p->req.context, len);
-        if (!len) {
-            /* Short transfer.  */
-            p->cmp.hostStatus = BTSTAT_DATARUN;
-            scsi_req_cancel(req);
-            return;
-        }
-
-        if (len > p->resid) {
-            /* Small buffer.  */
-            p->cmp.hostStatus = BTSTAT_DATARUN;
-            scsi_req_cancel(req);
-            return;
-        }
+    assert(p->resid);
+    trace_pvscsi_transfer_data(p->req.context, len);
+    if (!len) {
+        /* Short transfer.  */
+        p->cmp.hostStatus = BTSTAT_DATARUN;
+        scsi_req_cancel(req);
+        return;
+    }
 
-        if (p->req.flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
-            pvscsi_transfer_data_with_sg_list(p, to_host, buf, len);
-        } else {
-            pvscsi_transfer_data_with_buffer(p, to_host, buf, len);
-        }
+    if (len > p->resid) {
+        /* Small buffer.  */
+        p->cmp.hostStatus = BTSTAT_DATARUN;
+        scsi_req_cancel(req);
+        return;
     }
 
-    else if (p->sensing == 1) {
-        /* Got sense data.  Write it back and kick the device to complete
-         * the request.  */
-        if (len) {
-            pvscsi_write_sense(p, buf, len);
-            if (buf[2] == NO_SENSE) {
-                p->cmp.scsiStatus = GOOD;
-            }
-        }
-        p->sensing = 2;
+    if (p->req.flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+        pvscsi_transfer_data_with_sg_list(p, to_host, buf, len);
+    } else {
+        pvscsi_transfer_data_with_buffer(p, to_host, buf, len);
     }
 
     scsi_req_continue(req);
@@ -476,18 +436,12 @@ static void pvscsi_command_complete(SCSIRequest *req, uint32_t status)
         return;
     }
 
-    /* Here to complete the request.  */
-    if (!p->sensing) {
-        p->cmp.scsiStatus = status;
-
-        if (p->cmp.scsiStatus == CHECK_CONDITION) {
-            p->sensing = 1;
-            if (pvscsi_send_request_sense(p)) {
-                return;
-            }
-        }
+    p->cmp.scsiStatus = status;
+    if (p->cmp.scsiStatus == CHECK_CONDITION) {
+	uint8_t sense[96];
+        int n = scsi_req_get_sense(p->sreq, sense, sizeof(sense));
+        pvscsi_write_sense(p, sense, n);
     }
-
     pvscsi_complete_req(s, p);
 }
 
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 5/9] scsi-disk: lazily allocate bounce buffer
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
                   ` (3 preceding siblings ...)
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 4/9] scsi: always use get_sense Paolo Bonzini
@ 2011-06-06 16:26 ` Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 6/9] allow switching a qiov between internal and external storage Paolo Bonzini
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi-disk.c |   32 ++++++++++++++++++++++----------
 1 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 21eb249..bc6003c 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -54,6 +54,7 @@ typedef struct SCSIDiskReq {
     /* Both sector and sector_count are in terms of qemu 512 byte blocks.  */
     uint64_t sector;
     uint32_t sector_count;
+    uint32_t buflen;
     struct iovec iov;
     QEMUIOVector qiov;
     uint32_t status;
@@ -78,7 +79,7 @@ struct SCSIDiskState
 };
 
 static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type);
-static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf);
+static int scsi_disk_emulate_command(SCSIDiskReq *r);
 
 static SCSIRequest *scsi_new_request(SCSIDevice *d, uint32_t tag,
         uint32_t lun)
@@ -89,7 +90,6 @@ static SCSIRequest *scsi_new_request(SCSIDevice *d, uint32_t tag,
 
     req = scsi_req_alloc(sizeof(SCSIDiskReq), &s->qdev, tag, lun);
     r = DO_UPCAST(SCSIDiskReq, req, req);
-    r->iov.iov_base = qemu_blockalign(s->bs, SCSI_DMA_BUF_SIZE);
     return req;
 }
 
@@ -97,7 +97,9 @@ static void scsi_free_request(SCSIRequest *req)
 {
     SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
 
-    qemu_vfree(r->iov.iov_base);
+    if (r->iov.iov_base) {
+        qemu_vfree(r->iov.iov_base);
+    }
 }
 
 static void scsi_disk_clear_sense(SCSIDiskState *s)
@@ -136,8 +138,13 @@ static void scsi_cancel_io(SCSIRequest *req)
 
 static uint32_t scsi_init_iovec(SCSIDiskReq *r)
 {
-    n = MIN(r->sector_count, SCSI_DMA_BUF_SIZE / 512);
-    r->iov.iov_len = n * 512;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+
+    if (!r->iov.iov_base) {
+        r->buflen = SCSI_DMA_BUF_SIZE;
+        r->iov.iov_base = qemu_blockalign(s->bs, r->buflen);
+    }
+    r->iov.iov_len = MIN(r->sector_count * 512, r->buflen);
     qemu_iovec_init_external(&r->qiov, &r->iov, 1);
     return r->qiov.size / 512;
 }
@@ -322,7 +329,7 @@ static void scsi_dma_restart_bh(void *opaque)
                 scsi_write_data(&r->req);
                 break;
             case SCSI_REQ_STATUS_RETRY_FLUSH:
-                ret = scsi_disk_emulate_command(r, r->iov.iov_base);
+                ret = scsi_disk_emulate_command(r);
                 if (ret == 0) {
                     scsi_command_complete(r, GOOD, SENSE_CODE(NO_SENSE));
                 }
@@ -815,14 +822,21 @@ static int scsi_disk_emulate_read_toc(SCSIRequest *req, uint8_t *outbuf)
     return toclen;
 }
 
-static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
+static int scsi_disk_emulate_command(SCSIDiskReq *r)
 {
     SCSIRequest *req = &r->req;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
     uint64_t nb_sectors;
+    uint8_t *outbuf;
     int buflen = 0;
     int ret;
 
+    if (!r->iov.iov_base) {
+        r->iov.iov_base = qemu_blockalign(s->bs, 4096);
+        r->buflen = 4096;
+    }
+
+    outbuf = r->iov.iov_base;
     switch (req->cmd.buf[0]) {
     case TEST_UNIT_READY:
         if (!bdrv_is_inserted(s->bs))
@@ -1000,11 +1014,9 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
     int32_t len;
     uint8_t command;
-    uint8_t *outbuf;
     int rc;
 
     command = buf[0];
-    outbuf = (uint8_t *)r->iov.iov_base;
     DPRINTF("Command: lun=%d tag=0x%x data=0x%02x", lun, tag, buf[0]);
 
     if (scsi_req_parse(&r->req, buf) != 0) {
@@ -1051,7 +1063,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
     case REPORT_LUNS:
     case VERIFY:
     case REZERO_UNIT:
-        rc = scsi_disk_emulate_command(r, outbuf);
+        rc = scsi_disk_emulate_command(r);
         if (rc < 0) {
             return 0;
         }
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 6/9] allow switching a qiov between internal and external storage
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
                   ` (4 preceding siblings ...)
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 5/9] scsi-disk: lazily allocate bounce buffer Paolo Bonzini
@ 2011-06-06 16:26 ` Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 7/9] scsi: push qiov to SCSIRequest Paolo Bonzini
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

qemu_iovec_reset and qemu_iovec_destroy will switch from external to
internal storage (it was previously forbidden to call them on a qiov
with external storage).  So, qemu_iovec_destroy followed by
qemu_iovec_init_external will not leak memory when called on a qiov
that already had internal storage allocated.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cutils.c |   14 +++++++++++---
 1 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/cutils.c b/cutils.c
index f9a7e36..0fef64a 100644
--- a/cutils.c
+++ b/cutils.c
@@ -215,14 +215,22 @@ void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size)
 
 void qemu_iovec_destroy(QEMUIOVector *qiov)
 {
-    assert(qiov->nalloc != -1);
+    if (qiov->nalloc != -1) {
+        qemu_free(qiov->iov);
+        qiov->nalloc = 0;
+        qiov->iov = NULL;
+    }
 
-    qemu_free(qiov->iov);
+    qiov->niov = 0;
+    qiov->size = 0;
 }
 
 void qemu_iovec_reset(QEMUIOVector *qiov)
 {
-    assert(qiov->nalloc != -1);
+    if (qiov->nalloc == -1) {
+        qiov->nalloc = 0;
+        qiov->iov = NULL;
+    }
 
     qiov->niov = 0;
     qiov->size = 0;
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 7/9] scsi: push qiov to SCSIRequest
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
                   ` (5 preceding siblings ...)
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 6/9] allow switching a qiov between internal and external storage Paolo Bonzini
@ 2011-06-06 16:26 ` Paolo Bonzini
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 8/9] scsi: add get_iovec/unmap_iovec to SCSIBusOps Paolo Bonzini
  2011-06-06 16:27 ` [Qemu-devel] [RFC PATCH 9/9] pvscsi: implement s/g operation without a bounce buffer Paolo Bonzini
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

The simplest place to put a QEMUIOVector for the HBA is the SCSIRequest.
So, push the SCSIDiskReq's qiov member up and make it visible to the targets.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi-bus.c  |    4 +++-
 hw/scsi-disk.c |   19 +++++++++----------
 hw/scsi.h      |    1 +
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index ad6a730..3b545ed 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -148,7 +148,9 @@ SCSIRequest *scsi_req_alloc(size_t size, SCSIDevice *d, uint32_t tag, uint32_t l
 
 SCSIRequest *scsi_req_new(SCSIDevice *d, uint32_t tag, uint32_t lun)
 {
-    return d->info->alloc_req(d, tag, lun);
+    SCSIRequest *req = d->info->alloc_req(d, tag, lun);
+    qemu_iovec_reset(&req->qiov);
+    return req;
 }
 
 uint8_t *scsi_req_get_buf(SCSIRequest *req)
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index bc6003c..4f5ba1c 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -56,7 +56,6 @@ typedef struct SCSIDiskReq {
     uint32_t sector_count;
     uint32_t buflen;
     struct iovec iov;
-    QEMUIOVector qiov;
     uint32_t status;
 } SCSIDiskReq;
 
@@ -145,8 +144,8 @@ static uint32_t scsi_init_iovec(SCSIDiskReq *r)
         r->iov.iov_base = qemu_blockalign(s->bs, r->buflen);
     }
     r->iov.iov_len = MIN(r->sector_count * 512, r->buflen);
-    qemu_iovec_init_external(&r->qiov, &r->iov, 1);
-    return r->qiov.size / 512;
+    qemu_iovec_init_external(&r->req.qiov, &r->iov, 1);
+    return r->req.qiov.size / 512;
 }
 
 static void scsi_read_complete(void * opaque, int ret)
@@ -164,10 +163,10 @@ static void scsi_read_complete(void * opaque, int ret)
 
     DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);
 
-    n = r->qiov.size / 512;
+    n = r->req.qiov.size / 512;
     r->sector += n;
     r->sector_count -= n;
-    scsi_req_data(&r->req, r->qiov.size);
+    scsi_req_data(&r->req, r->req.qiov.size);
 }
 
 
@@ -200,7 +199,7 @@ static void scsi_read_data(SCSIRequest *req)
     }
 
     n = scsi_init_iovec(r);
-    r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
+    r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->req.qiov, n,
                               scsi_read_complete, r);
     if (r->req.aiocb == NULL) {
         scsi_read_complete(r, -EIO);
@@ -263,7 +262,7 @@ static void scsi_write_complete(void * opaque, int ret)
         }
     }
 
-    n = r->qiov.size / 512;
+    n = r->req.qiov.size / 512;
     r->sector += n;
     r->sector_count -= n;
     if (r->sector_count == 0) {
@@ -271,7 +270,7 @@ static void scsi_write_complete(void * opaque, int ret)
     } else {
         len = scsi_init_iovec(r);
         DPRINTF("Write complete tag=0x%x more=%d\n", r->req.tag, len);
-        scsi_req_data(&r->req, r->qiov.size);
+        scsi_req_data(&r->req, r->req.qiov.size);
     }
 }
 
@@ -290,9 +289,9 @@ static void scsi_write_data(SCSIRequest *req)
         return;
     }
 
-    n = r->qiov.size / 512;
+    n = r->req.qiov.size / 512;
     if (n) {
-        r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
+        r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->req.qiov, n,
                                        scsi_write_complete, r);
         if (r->req.aiocb == NULL) {
             scsi_write_complete(r, -ENOMEM);
diff --git a/hw/scsi.h b/hw/scsi.h
index c1dca35..b551b57 100644
--- a/hw/scsi.h
+++ b/hw/scsi.h
@@ -30,6 +30,7 @@ typedef struct SCSISense {
 struct SCSIRequest {
     SCSIBus           *bus;
     SCSIDevice        *dev;
+    QEMUIOVector      qiov;
     uint32_t          refcount;
     uint32_t          tag;
     uint32_t          lun;
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 8/9] scsi: add get_iovec/unmap_iovec to SCSIBusOps
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
                   ` (6 preceding siblings ...)
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 7/9] scsi: push qiov to SCSIRequest Paolo Bonzini
@ 2011-06-06 16:26 ` Paolo Bonzini
  2011-06-06 16:27 ` [Qemu-devel] [RFC PATCH 9/9] pvscsi: implement s/g operation without a bounce buffer Paolo Bonzini
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:26 UTC (permalink / raw)
  To: qemu-devel

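Add two callbacks to SCSIBusOps, get_iovec and unmap_iovec.  If the HBA
implements get_iovec and fills the request's qiov, scsi-disk does I/O
directly on that iovec and scsi_req_data calls unmap_iovec instead of
transfer_data.  This lets scsi-disk avoid a bounce buffer.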

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi-bus.c  |   16 ++++++++++++----
 hw/scsi-disk.c |   21 ++++++++++++++++-----
 hw/scsi.h      |    3 +++
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 3b545ed..8e888e8 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -618,6 +618,8 @@ void scsi_req_unref(SCSIRequest *req)
         if (req->dev->info->free_req) {
             req->dev->info->free_req(req);
         }
+        qemu_iovec_reset(&req->qiov);
+        qemu_iovec_destroy(&req->qiov);
         qemu_free(req);
     }
 }
@@ -634,13 +636,19 @@ void scsi_req_continue(SCSIRequest *req)
     }
 }
 
-/* Called by the devices when data is ready for the HBA.  The HBA should
-   start a DMA operation to read or fill the device's data buffer.
-   Once it completes, calling scsi_req_continue will restart I/O.  */
+/* Called by the devices when data is ready for the HBA.  Depending on
+   the transfer method it will call either unmap_iovec or transfer_data.
+   Execution of the command resumes when the HBA calls scsi_req_continue;
+   if applicable, the HBA should do so only after completing any DMA
+   operations involving the request buffer.  */
 void scsi_req_data(SCSIRequest *req, int len)
 {
     trace_scsi_req_data(req->dev->id, req->lun, req->tag, len);
-    req->bus->ops->transfer_data(req, len);
+    if (req->has_sg_list) {
+        req->bus->ops->unmap_iovec(req, len);
+    } else {
+        req->bus->ops->transfer_data(req, len);
+    }
 }
 
 void scsi_req_print(SCSIRequest *req)
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 4f5ba1c..e13b633 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -139,12 +139,23 @@ static uint32_t scsi_init_iovec(SCSIDiskReq *r)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
 
-    if (!r->iov.iov_base) {
-        r->buflen = SCSI_DMA_BUF_SIZE;
-        r->iov.iov_base = qemu_blockalign(s->bs, r->buflen);
+    qemu_iovec_reset(&r->req.qiov);
+    if (r->req.bus->ops->get_iovec) {
+        r->req.bus->ops->get_iovec(&r->req, r->sector_count * 512);
+    }
+    if (r->req.qiov.size) {
+        assert((r->req.qiov.size % 512) == 0);
+        r->req.has_sg_list = true;
+    } else {
+        if (!r->iov.iov_base) {
+            r->buflen = SCSI_DMA_BUF_SIZE;
+            r->iov.iov_base = qemu_blockalign(s->bs, r->buflen);
+        }
+        r->iov.iov_len = MIN(r->sector_count * 512, r->buflen);
+        qemu_iovec_destroy(&r->req.qiov);
+        qemu_iovec_init_external(&r->req.qiov, &r->iov, 1);
+        r->req.has_sg_list = false;
     }
-    r->iov.iov_len = MIN(r->sector_count * 512, r->buflen);
-    qemu_iovec_init_external(&r->req.qiov, &r->iov, 1);
     return r->req.qiov.size / 512;
 }
 
diff --git a/hw/scsi.h b/hw/scsi.h
index b551b57..9627404 100644
--- a/hw/scsi.h
+++ b/hw/scsi.h
@@ -35,6 +35,7 @@ struct SCSIRequest {
     uint32_t          tag;
     uint32_t          lun;
     uint32_t          status;
+    bool              has_sg_list;
     struct {
         uint8_t buf[SCSI_CMD_BUF_SIZE];
         int len;
@@ -82,6 +83,8 @@ struct SCSIBusOps {
     void (*transfer_data)(SCSIRequest *req, uint32_t arg);
     void (*complete)(SCSIRequest *req, uint32_t arg);
     void (*cancel)(SCSIRequest *req);
+    void (*get_iovec)(SCSIRequest *req, uint64_t len);
+    void (*unmap_iovec)(SCSIRequest *req, uint64_t len);
 };
 
 struct SCSIBus {
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [RFC PATCH 9/9] pvscsi: implement s/g operation without a bounce buffer
  2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
                   ` (7 preceding siblings ...)
  2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 8/9] scsi: add get_iovec/unmap_iovec to SCSIBusOps Paolo Bonzini
@ 2011-06-06 16:27 ` Paolo Bonzini
  8 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2011-06-06 16:27 UTC (permalink / raw)
  To: qemu-devel

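This implements the new get_iovec and unmap_iovec callbacks in the pvscsi
device.  They are controlled by a new "sg" property (on by default); when
it is off, get_iovec adds nothing to the qiov and the bounce buffer is used.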

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/vmw_pvscsi.c |  115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 114 insertions(+), 1 deletions(-)

diff --git a/hw/vmw_pvscsi.c b/hw/vmw_pvscsi.c
index 1cb6715..ac16e52 100644
--- a/hw/vmw_pvscsi.c
+++ b/hw/vmw_pvscsi.c
@@ -49,6 +49,7 @@ typedef struct {
     QEMUBH *complete_reqs_bh;
 
     int mmio_io_addr;
+    uint32_t use_iovec;
 
     /* zeroed on reset */
     uint32_t cmd_latch;
@@ -387,6 +388,112 @@ static bool pvscsi_check_addresses(PVSCSIRequest *p)
     }
 }
 
+static bool pvscsi_iovec_add(PVSCSIRequest *p, target_phys_addr_t addr,
+                             uint64_t len)
+{
+    while (len) {
+        target_phys_addr_t n = len;
+        uint8_t *buf = cpu_physical_memory_map_fast(addr, &n);
+        if (!buf) {
+            return false;
+        }
+        qemu_iovec_add(&p->sreq->qiov, buf, n);
+        addr += n;
+        len -= n;
+    }
+    return true;
+}
+
+static bool pvscsi_get_sg_list_iovec(PVSCSIRequest *p, uint64_t len)
+{
+    int n;
+    PVSCSISGState sg = p->sg;
+    while (len) {
+        while (!sg.resid) {
+            pvscsi_get_next_sg_elem(&sg);
+            trace_pvscsi_sg_elem(p->req.context, sg.dataAddr, sg.resid);
+        }
+        assert(len > 0);
+        n = MIN((unsigned) len, sg.resid);
+        if (n) {
+            if (!pvscsi_iovec_add(p, sg.dataAddr, n)) {
+	        return false;
+	    }
+        }
+
+        sg.dataAddr += n;
+
+        len -= n;
+        sg.resid -= n;
+    }
+    return true;
+}
+
+static void pvscsi_get_iovec(SCSIRequest *req, uint64_t len)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, req->bus->qbus.parent);
+    PVSCSIRequest *p = pvscsi_find_request(s, req);
+    bool ok;
+
+    if (!s->use_iovec) {
+        return;
+    }
+    if (p->req.flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+        ok = pvscsi_get_sg_list_iovec(p, len);
+    } else {
+        ok = pvscsi_iovec_add(p, p->req.dataAddr, MIN(len, p->req.dataLen));
+    }
+    if (!ok) {
+        qemu_iovec_reset(&p->sreq->qiov);
+    }
+}
+
+/* Callback to indicate that the SCSI layer has completed a transfer.  */
+static void pvscsi_unmap_iovec(SCSIRequest *req, uint64_t len)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, req->bus->qbus.parent);
+    PVSCSIRequest *p = pvscsi_find_request(s, req);
+    int to_host = (p->req.flags & PVSCSI_FLAG_CMD_DIR_TOHOST) != 0;
+    QEMUIOVector *qiov = &req->qiov;
+    int i;
+
+    if (!p) {
+        fprintf(stderr, "PVSCSI: Can't find request for tag 0x%x\n", req->tag);
+        return;
+    }
+
+    trace_pvscsi_transfer_data(p->req.context, len);
+    if (!len) {
+        /* Short transfer.  */
+        p->cmp.hostStatus = BTSTAT_DATARUN;
+        scsi_req_cancel(req);
+        return;
+    }
+
+    for (i = 0; i < qiov->niov; i++) {
+	uint64_t n = req->qiov.iov[i].iov_len;
+        uint64_t access_len = MIN(len, n);
+	cpu_physical_memory_unmap(req->qiov.iov[i].iov_base, n,
+                                  to_host, access_len);
+
+        if (p->req.flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+            while (!p->sg.resid) {
+                pvscsi_get_next_sg_elem(&p->sg);
+            }
+            assert(n <= p->sg.resid);
+            p->sg.dataAddr += n;
+            p->sg.resid -= n;
+        }
+
+        assert(access_len <= p->resid);
+        p->cmp.dataLen += access_len;
+        p->resid -= access_len;
+        len -= access_len;
+    }
+
+    scsi_req_continue(req);
+}
+
 /* Callback to indicate that the SCSI layer has completed a transfer.  */
 static void pvscsi_transfer_data(SCSIRequest *req, uint32_t len)
 {
@@ -837,7 +944,9 @@ static int pvscsi_uninit(PCIDevice *d)
 static struct SCSIBusOps pvscsi_scsi_ops = {
     .transfer_data = pvscsi_transfer_data,
     .complete = pvscsi_command_complete,
-    .cancel = pvscsi_request_cancelled
+    .cancel = pvscsi_request_cancelled,
+    .get_iovec = pvscsi_get_iovec,
+    .unmap_iovec = pvscsi_unmap_iovec
 };
 
 static int pvscsi_init(PCIDevice *dev)
@@ -891,6 +1000,10 @@ static PCIDeviceInfo pvscsi_info = {
     .qdev.reset = pvscsi_reset,
     .init       = pvscsi_init,
     .exit       = pvscsi_uninit,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_BIT("sg", PVSCSIState, use_iovec,   0, true),
+        DEFINE_PROP_END_OF_LIST(),
+    },
 };
 
 static void vmw_pvscsi_register_devices(void)
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-06-06 16:27 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-06-06 16:26 [Qemu-devel] [RFC PATCH 0/9] scsi: support s/g operation without a bounce buffer Paolo Bonzini
2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 1/9] make qbus_reset_all public Paolo Bonzini
2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 2/9] pvscsi: first commit Paolo Bonzini
2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 3/9] pvscsi: check validity of DMA addresses in advance Paolo Bonzini
2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 4/9] scsi: always use get_sense Paolo Bonzini
2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 5/9] scsi-disk: lazily allocate bounce buffer Paolo Bonzini
2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 6/9] allow switching a qiov between internal and external storage Paolo Bonzini
2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 7/9] scsi: push qiov to SCSIRequest Paolo Bonzini
2011-06-06 16:26 ` [Qemu-devel] [RFC PATCH 8/9] scsi: add get_iovec/unmap_iovec to SCSIBusOps Paolo Bonzini
2011-06-06 16:27 ` [Qemu-devel] [RFC PATCH 9/9] pvscsi: implement s/g operation without a bounce buffer Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.