* [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
@ 2011-04-15 13:42 Paolo Bonzini
  2011-04-15 14:01 ` Stefan Hajnoczi
From: Paolo Bonzini @ 2011-04-15 13:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: Zachary Amsden

Lightly tested with Linux guests; at least it can successfully partition
and format a disk.  scsi-generic also lightly tested.

Doesn't do migration, doesn't do hotplug (the device would support that,
but it is not 100% documented and the Linux driver in particular cannot
initiate hot-unplug).  I did it as a quick one-day hack to study the SCSI
subsystem, and it is my first real foray into device model land, so please
be gentle. :)

vmw_pvscsi.h is taken from Linux, so it doesn't fully respect coding
standards.  I think that's fair.

Size is curiously close to the recently added sPAPR adapter:

  911  2354 25553 hw/vmw_pvscsi.c
  988  3177 29628 hw/spapr_vscsi.c

Sounds like that's just the amount of code it takes to implement a SCSI
HBA in QEMU. :)

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Zachary Amsden <zamsden@redhat.com>
---
 Makefile.objs           |    1 +
 default-configs/pci.mak |    1 +
 hw/pci.h                |    1 +
 hw/vmw_pvscsi.c         |  911 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/vmw_pvscsi.h         |  389 ++++++++++++++++++++
 trace-events            |   15 +
 6 files changed, 1318 insertions(+), 0 deletions(-)
 create mode 100644 hw/vmw_pvscsi.c
 create mode 100644 hw/vmw_pvscsi.h

diff --git a/Makefile.objs b/Makefile.objs
index 44ce368..f056502 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -255,6 +255,7 @@ hw-obj-$(CONFIG_AHCI) += ide/ich.o
 
 # SCSI layer
 hw-obj-$(CONFIG_LSI_SCSI_PCI) += lsi53c895a.o
+hw-obj-$(CONFIG_VMWARE_PVSCSI_PCI) += vmw_pvscsi.o
 hw-obj-$(CONFIG_ESP) += esp.o
 
 hw-obj-y += dma-helpers.o sysbus.o isa-bus.o
diff --git a/default-configs/pci.mak b/default-configs/pci.mak
index 0471efb..b1817f5 100644
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -8,6 +8,7 @@ CONFIG_EEPRO100_PCI=y
 CONFIG_PCNET_PCI=y
 CONFIG_PCNET_COMMON=y
 CONFIG_LSI_SCSI_PCI=y
+CONFIG_VMWARE_PVSCSI_PCI=y
 CONFIG_RTL8139_PCI=y
 CONFIG_E1000_PCI=y
 CONFIG_IDE_CORE=y
diff --git a/hw/pci.h b/hw/pci.h
index 52ee8c9..26ce6d7 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -59,6 +59,7 @@
 #define PCI_DEVICE_ID_VMWARE_NET         0x0720
 #define PCI_DEVICE_ID_VMWARE_SCSI        0x0730
 #define PCI_DEVICE_ID_VMWARE_IDE         0x1729
+#define PCI_DEVICE_ID_VMWARE_PVSCSI      0x07c0
 
 /* Intel (0x8086) */
 #define PCI_DEVICE_ID_INTEL_82551IT      0x1209
diff --git a/hw/vmw_pvscsi.c b/hw/vmw_pvscsi.c
new file mode 100644
index 0000000..fdda652
--- /dev/null
+++ b/hw/vmw_pvscsi.c
@@ -0,0 +1,911 @@
+/*
+ * VMware Paravirtualized SCSI Host Bus Adapter emulation
+ *
+ * Copyright (c) 2011 Red Hat, Inc.
+ * Written by Paolo Bonzini
+ *
+ * This code is licensed under GPLv2 or later.
+ */
+
+#include <assert.h>
+
+#include "hw.h"
+#include "pci.h"
+#include "scsi.h"
+#include "scsi-defs.h"
+#include "vmw_pvscsi.h"
+#include "block_int.h"
+#include "host-utils.h"
+#include "trace.h"
+
+#define PVSCSI_MAX_DEVS 127
+#define PAGE_SIZE       4096
+#define PAGE_SHIFT      12
+
+typedef struct PVSCSIRequest {
+    SCSIDevice *sdev;
+    uint8_t sensing;
+    uint8_t sense_key;
+    uint8_t completed;
+    int lun;
+    target_phys_addr_t sg_current_addr;
+    target_phys_addr_t sg_current_dataAddr;
+    uint32_t sg_current_resid;
+    uint64_t resid;
+    struct PVSCSIRingReqDesc req;
+    struct PVSCSIRingCmpDesc cmp;
+    QTAILQ_ENTRY(PVSCSIRequest) next;
+} PVSCSIRequest;
+
+typedef QTAILQ_HEAD(, PVSCSIRequest) PVSCSIRequestList;
+
+typedef struct {
+    PCIDevice dev;
+    SCSIBus bus;
+    QEMUBH *complete_reqs_bh;
+
+    int mmio_io_addr;
+
+    /* zeroed on reset */
+    uint32_t cmd_latch;
+    uint32_t cmd_buffer[sizeof(struct PVSCSICmdDescSetupRings)
+                        / sizeof(uint32_t)];
+    uint32_t cmd_ptr;
+    uint32_t cmd_status;
+    uint32_t intr_status;
+    uint32_t intr_mask;
+    uint32_t intr_cmpl;
+    uint32_t intr_msg;
+    struct PVSCSICmdDescSetupRings rings;
+    struct PVSCSICmdDescSetupMsgRing msgRing;
+    uint32_t reqNumEntriesLog2;
+    uint32_t cmpNumEntriesLog2;
+    uint32_t msgNumEntriesLog2;
+
+    PVSCSIRequestList pending_queue;
+    PVSCSIRequestList complete_queue;
+} PVSCSIState;
+
+\f
+static inline int pvscsi_get_lun(uint8_t *lun)
+{
+    uint64_t lunval;
+    lunval = ((uint64_t)lun[0] << 56) | ((uint64_t)lun[1] << 48) |
+             ((uint64_t)lun[2] << 40) | ((uint64_t)lun[3] << 32) |
+             ((uint64_t)lun[4] << 24) | ((uint64_t)lun[5] << 16) |
+             ((uint64_t)lun[6] <<  8) |  (uint64_t)lun[7];
+    if ((lunval & ~(uint64_t) 255) != 0) {
+        return -1;
+    }
+    return lunval & 255;
+}
+
+static inline int pvscsi_get_dev_lun(PVSCSIState *s,
+                                     uint8_t *lun, uint32_t target,
+                                     SCSIDevice **sdev)
+{
+    SCSIBus *bus = &s->bus;
+    int lunval;
+    *sdev = NULL;
+    if (target >= PVSCSI_MAX_DEVS) {
+        return -1;
+    }
+    lunval = pvscsi_get_lun(lun);
+    if (lunval < 0) {
+        return -1;
+    }
+    *sdev = bus->devs[target];
+    if (!*sdev) {
+        return -1;
+    }
+    return lunval;
+}
+
+\f
+/* Add a command to the pending queue.  */
+static PVSCSIRequest *pvscsi_queue_request(PVSCSIState *s,
+                                           struct PVSCSIRingReqDesc *req)
+{
+    SCSIDevice *sdev;
+    PVSCSIRequest *p;
+    int lun;
+
+    trace_pvscsi_queue_request(req->context, req->cdb[0], req->dataLen);
+
+    p = qemu_mallocz(sizeof(*p));
+    p->req = *req;
+    p->cmp.context = p->req.context;
+    QTAILQ_INSERT_TAIL(&s->pending_queue, p, next);
+
+    lun = pvscsi_get_dev_lun(s, req->lun, req->target, &sdev);
+    if (!sdev) {
+        return p;
+    }
+
+    p->lun = lun;
+    p->sdev = sdev;
+    return p;
+}
+
+/* Get PVSCSIRequest for this tag.  */
+static PVSCSIRequest *pvscsi_find_request(PVSCSIState *s, uint32_t tag)
+{
+    PVSCSIRequest *p;
+
+    QTAILQ_FOREACH(p, &s->pending_queue, next) {
+        if (p->req.context == tag) {
+            return p;
+        }
+    }
+    return NULL;
+}
+
+static void pvscsi_free_queue(PVSCSIRequestList *q)
+{
+    PVSCSIRequest *p;
+
+    while (!QTAILQ_EMPTY(q)) {
+        p = QTAILQ_FIRST(q);
+        QTAILQ_REMOVE(q, p, next);
+        qemu_free(p);
+    }
+}
+
+static void pvscsi_soft_reset(PVSCSIState *s)
+{
+    qbus_reset_all_fn(&s->bus);
+    pvscsi_free_queue(&s->complete_queue);
+    assert(QTAILQ_EMPTY(&s->pending_queue));
+    memset(&s->cmd_latch, 0, sizeof(*s) - offsetof(PVSCSIState, cmd_latch));
+    s->intr_cmpl = PVSCSI_INTR_CMPL_0;
+    s->intr_msg = PVSCSI_INTR_MSG_0;
+    QTAILQ_INIT(&s->pending_queue);
+    QTAILQ_INIT(&s->complete_queue);
+}
+
+\f
+static void pvscsi_raise_intr(PVSCSIState *s, int mask)
+{
+    int intr_raised = mask & ~s->intr_status;
+    s->intr_status |= mask;
+    trace_pvscsi_raise_intr(intr_raised,
+                            (intr_raised & s->intr_mask) == 0 ? "masked" : "");
+    if (intr_raised & s->intr_mask) {
+        qemu_set_irq(s->dev.irq[0], 1);
+    }
+}
+
+static void pvscsi_acknowledge_intr(PVSCSIState *s, int mask)
+{
+    trace_pvscsi_acknowledge_intr(mask);
+    s->intr_status &= ~mask;
+    if (mask == s->intr_cmpl) {
+        s->intr_cmpl ^= PVSCSI_INTR_CMPL_MASK;
+
+        /* Try putting more complete requests on the ring.  */
+        if (!QTAILQ_EMPTY(&s->complete_queue)) {
+            qemu_bh_schedule(s->complete_reqs_bh);
+        }
+    }
+    if (mask == s->intr_msg) {
+        s->intr_msg ^= PVSCSI_INTR_MSG_MASK;
+    }
+    if ((s->intr_status & s->intr_mask) == 0) {
+        qemu_set_irq(s->dev.irq[0], 0);
+    }
+}
+
+static void pvscsi_set_intr_mask(PVSCSIState *s, int mask)
+{
+    int intr_enabled = mask & ~s->intr_mask;
+    s->intr_mask = mask;
+    if (s->intr_status & intr_enabled) {
+        qemu_set_irq(s->dev.irq[0], 1);
+    }
+    if ((s->intr_status & mask) == 0) {
+        qemu_set_irq(s->dev.irq[0], 0);
+    }
+}
+
+\f
+#define pvscsi_ld_ring_state(s, field) \
+    ldl_phys(s->rings.ringsStatePPN * PAGE_SIZE + offsetof(struct PVSCSIRingsState, field))
+
+#define pvscsi_st_ring_state(s, field, val) \
+    stl_phys(s->rings.ringsStatePPN * PAGE_SIZE + offsetof(struct PVSCSIRingsState, field), \
+             val)
+
+/* Return number of free elements in the completion ring.  */
+static inline int pvscsi_cmp_free(PVSCSIState *s)
+{
+    return ((1 << s->cmpNumEntriesLog2) - 1 -
+            (pvscsi_ld_ring_state(s, cmpProdIdx) - pvscsi_ld_ring_state(s, cmpConsIdx)));
+}
+
+/* Return number of pending elements in the request ring.  */
+static inline int pvscsi_req_pending(PVSCSIState *s)
+{
+    return pvscsi_ld_ring_state(s, reqProdIdx) - pvscsi_ld_ring_state(s, reqConsIdx);
+}
+
+/* Return the physical address of the idx-th element in the ring
+ * whose physical page numbers are given by ppn.  Each element in
+ * the ring has size bytes.  */
+static target_phys_addr_t pvscsi_get_ring_addr(PVSCSIState *s, int idx,
+                                               int size, uint64_t *ppn)
+{
+    uint32_t ofs = idx * size;
+    return (ppn[ofs >> PAGE_SHIFT] * PAGE_SIZE) | (ofs & (PAGE_SIZE - 1));
+}
+\f
+
+#define barrier()
+
+/* Copy cmp_desc on the completion ring, assuming there is a free entry.  */
+static void pvscsi_cmp_ring_put(PVSCSIState *s,
+                                struct PVSCSIRingCmpDesc *cmp_desc)
+{
+    uint32_t cmp_entries = s->cmpNumEntriesLog2;
+    uint32_t val = pvscsi_ld_ring_state(s, cmpProdIdx);
+    uint32_t idx = val & MASK(cmp_entries);
+    target_phys_addr_t addr;
+
+    trace_pvscsi_cmp_ring_put(cmp_desc->context);
+    addr = pvscsi_get_ring_addr(s, idx, sizeof(struct PVSCSIRingCmpDesc),
+                                s->rings.cmpRingPPNs);
+
+    barrier();
+    cpu_physical_memory_write(addr, (void *)cmp_desc, sizeof(*cmp_desc));
+    barrier();
+    pvscsi_st_ring_state(s, cmpProdIdx, val + 1);
+}
+
+/* Put all completed requests on the completion ring.  */
+static void pvscsi_complete_reqs(void *opaque)
+{
+    PVSCSIState *s = opaque;
+    PVSCSIRequest *p;
+    int n = pvscsi_cmp_free(s);
+    int done = 0;
+    while (n > 0 && !QTAILQ_EMPTY(&s->complete_queue)) {
+        p = QTAILQ_FIRST(&s->complete_queue);
+        QTAILQ_REMOVE(&s->complete_queue, p, next);
+        pvscsi_cmp_ring_put(s, &p->cmp);
+        qemu_free(p);
+        n--;
+        done++;
+    }
+    if (done) {
+        pvscsi_raise_intr(s, s->intr_cmpl);
+    }
+}
+
+/* Prepare to put r on the completion ring.  */
+static void pvscsi_complete_req(PVSCSIState *s, PVSCSIRequest *p)
+{
+    assert(!p->completed);
+    trace_pvscsi_complete_req(p->cmp.context, p->cmp.dataLen, p->sense_key);
+    p->completed = 1;
+    QTAILQ_REMOVE(&s->pending_queue, p, next);
+    QTAILQ_INSERT_TAIL(&s->complete_queue, p, next);
+    qemu_bh_schedule(s->complete_reqs_bh);
+}
+
+/* Fetch sense data for a completed request.  */
+static bool pvscsi_send_request_sense(SCSIDevice *sdev, int tag, int lun)
+{
+    uint8_t cdb[6] = { REQUEST_SENSE, lun << 5, 0, 0, 96, 0 };
+    int n;
+
+    trace_pvscsi_request_sense(tag, lun);
+    n = sdev->info->send_command(sdev, tag, cdb, lun);
+    if (n < 0) {
+        /* should not happen, just leave sense data empty in this case. */
+        sdev->info->cancel_io(sdev, tag);
+    } else if (n > 0) {
+        sdev->info->read_data(sdev, tag);
+        return true;
+    }
+    return false;
+}
+
+/* Write sense data for a completed request.  */
+static void pvscsi_write_sense(PVSCSIRequest *p, uint8_t *buf, int len)
+{
+    p->cmp.senseLen = MIN(p->req.senseLen, len);
+    p->sense_key = buf[2];
+    cpu_physical_memory_write(p->req.senseAddr, buf, p->cmp.senseLen);
+}
+
+static void pvscsi_transfer_data_with_buffer(PVSCSIRequest *p, bool to_host,
+                                             uint8_t *buf, int len)
+{
+    if (len) {
+        cpu_physical_memory_rw(p->req.dataAddr, buf, len, to_host);
+        p->cmp.dataLen += len;
+        p->req.dataAddr += len;
+        p->resid -= len;
+    }
+}
+
+static void pvscsi_get_next_sg_elem(struct PVSCSIRequest *p)
+{
+    struct PVSCSISGElement elem;
+
+    for (;; p->sg_current_addr = elem.addr) {
+        cpu_physical_memory_read(p->sg_current_addr, (void *)&elem,
+                                 sizeof(elem));
+#if 0
+        /* PVSCSI_SGE_FLAG_CHAIN_ELEMENT not in the header file! */
+        if ((elem.flags & PVSCSI_SGE_FLAG_CHAIN_ELEMENT) == 0) {
+            break;
+        }
+#else
+        break;
+#endif
+    }
+
+    p->sg_current_addr += sizeof(elem);
+    p->sg_current_dataAddr = elem.addr;
+    p->sg_current_resid = elem.length;
+    trace_pvscsi_sg_elem(p->req.context, elem.addr, elem.length);
+}
+
+static void pvscsi_transfer_data_with_sg_list(PVSCSIRequest *p, bool to_host,
+                                              uint8_t *buf, int len)
+{
+    int n;
+    while (len) {
+        while (!p->sg_current_resid) {
+            pvscsi_get_next_sg_elem(p);
+        }
+        assert(len > 0);
+        n = MIN((unsigned) len, p->sg_current_resid);
+        if (n) {
+            cpu_physical_memory_rw(p->sg_current_dataAddr, buf, n, to_host);
+        }
+
+        buf += n;
+        p->cmp.dataLen += n;
+        p->sg_current_dataAddr += n;
+
+        len -= n;
+        p->resid -= n;
+        p->sg_current_resid -= n;
+    }
+}
+
+static bool pvscsi_transfer_data(PVSCSIRequest *p, void *buf, int len)
+{
+    int to_host = (p->req.flags & PVSCSI_FLAG_CMD_DIR_TOHOST) != 0;
+    if (len > p->resid) {
+        /* Do nothing upon underrun.  */
+        return false;
+    }
+
+    trace_pvscsi_transfer_data(p->req.context, len);
+    if (p->req.flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+        pvscsi_transfer_data_with_sg_list(p, to_host, buf, len);
+    } else {
+        pvscsi_transfer_data_with_buffer(p, to_host, buf, len);
+    }
+    return true;
+}
+
+static void pvscsi_kick_device(PVSCSIRequest *p)
+{
+    if (p->req.flags & PVSCSI_FLAG_CMD_DIR_TODEVICE) {
+        p->sdev->info->write_data(p->sdev, p->req.context);
+    } else {
+        p->sdev->info->read_data(p->sdev, p->req.context);
+    }
+}
+
+/* Callback to indicate that the SCSI layer has completed a transfer.  */
+static void pvscsi_command_complete(SCSIBus *bus, int reason, uint32_t tag,
+                                    uint32_t arg)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, bus->qbus.parent);
+    PVSCSIRequest *p = pvscsi_find_request(s, tag);
+    SCSIDevice *sdev;
+    uint8_t *buf;
+
+    if (!p) {
+        fprintf(stderr, "PVSCSI: Can't find request for tag 0x%x\n", tag);
+        return;
+    }
+
+    sdev = p->sdev;
+    if (reason == SCSI_REASON_DATA && !p->sensing) {
+        assert(p->resid);
+        if (!arg) {
+            /* Short transfer.  */
+            sdev->info->cancel_io(sdev, tag);
+            p->cmp.hostStatus = BTSTAT_DATARUN;
+            p->cmp.scsiStatus = CHECK_CONDITION;
+            goto complete;
+        }
+
+        buf = sdev->info->get_buf(sdev, tag);
+        if (!pvscsi_transfer_data(p, buf, arg)) {
+            /* Small buffer.  */
+            sdev->info->cancel_io(sdev, tag);
+            p->cmp.hostStatus = BTSTAT_DATARUN;
+            p->cmp.scsiStatus = CHECK_CONDITION;
+            goto complete;
+        }
+
+        pvscsi_kick_device(p);
+
+        /* We'll be called back asynchronously, exit.  */
+        return;
+    }
+
+    /* Here to complete the request.  */
+    if (reason == SCSI_REASON_DONE) {
+        p->cmp.scsiStatus = arg;
+    }
+
+complete:
+    if (p->sensing == 0 && p->cmp.scsiStatus == CHECK_CONDITION) {
+        p->sensing = 1;
+        if (pvscsi_send_request_sense(sdev, tag, p->lun)) {
+            return;
+        }
+
+    } else if (p->sensing == 1 && reason == SCSI_REASON_DATA) {
+        /* Got sense data.  Write it back and kick the device to complete
+         * the request.  */
+        if (arg) {
+            buf = sdev->info->get_buf(sdev, tag);
+            pvscsi_write_sense(p, buf, arg);
+            if (buf[2] == NO_SENSE) {
+                p->cmp.scsiStatus = GOOD;
+            }
+        }
+        p->sensing = 2;
+        pvscsi_kick_device(p);
+        return;
+    }
+
+    pvscsi_complete_req(s, p);
+}
+\f
+
+/* Process a request from the request ring.  */
+static void pvscsi_process_req(PVSCSIState *s, struct PVSCSIRingReqDesc *r)
+{
+    PVSCSIRequest *p = pvscsi_queue_request(s, r);
+    int64_t datalen, n;
+
+    if (!p->sdev) {
+        p->cmp.hostStatus = BTSTAT_SELTIMEO;
+        goto fail_nocancel;
+    }
+
+    if (r->flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+        p->sg_current_addr = r->dataAddr;
+    }
+
+    n = p->sdev->info->send_command(p->sdev, r->context, r->cdb, p->lun);
+    if ((n > 0) && (r->flags & PVSCSI_FLAG_CMD_DIR_TODEVICE)) {
+        p->cmp.hostStatus = BTSTAT_BADMSG;
+        goto fail;
+    }
+    if ((n < 0) && (r->flags & PVSCSI_FLAG_CMD_DIR_TOHOST)) {
+        p->cmp.hostStatus = BTSTAT_BADMSG;
+        goto fail;
+    }
+
+    datalen = (n < 0 ? -n : n);
+    p->resid = MIN(datalen, r->dataLen);
+    if (n) {
+        pvscsi_kick_device(p);
+    }
+    return;
+
+fail:
+    p->sdev->info->cancel_io(p->sdev, r->context);
+fail_nocancel:
+    pvscsi_complete_req(s, p);
+}
+
+/* Process pending requests on the request ring.  */
+static void pvscsi_process_req_ring(PVSCSIState *s)
+{
+    uint32_t req_entries = s->reqNumEntriesLog2;
+
+    trace_pvscsi_kick_io();
+    while (pvscsi_req_pending(s)) {
+        uint32_t val = pvscsi_ld_ring_state(s, reqConsIdx);
+        uint32_t idx = val & MASK(req_entries);
+        target_phys_addr_t addr;
+        struct PVSCSIRingReqDesc req_desc;
+
+        addr = pvscsi_get_ring_addr(s, idx, sizeof(struct PVSCSIRingReqDesc),
+                                    s->rings.reqRingPPNs);
+
+        barrier();
+        cpu_physical_memory_read(addr, (void *)&req_desc, sizeof(req_desc));
+        pvscsi_process_req(s, &req_desc);
+        barrier();
+        pvscsi_st_ring_state(s, reqConsIdx, val + 1);
+    }
+}
+
+\f
+static int32_t pvscsi_cmd_bad(PVSCSIState *s)
+{
+    fprintf(stderr, "vmw_pvscsi: bad command %d\n", s->cmd_latch);
+    return -1;
+}
+
+static int32_t pvscsi_cmd_unimpl(PVSCSIState *s)
+{
+    fprintf(stderr, "vmw_pvscsi: unimplemented command %d\n", s->cmd_latch);
+    return -1;
+}
+
+static int32_t pvscsi_cmd_adapter_reset(PVSCSIState *s)
+{
+    pvscsi_soft_reset(s);
+    return 0;
+}
+
+static int floor_log2(int x)
+{
+    assert(x);
+    return 31 - clz32(x);
+}
+
+/* Setup ring buffers and initialize the ring state page.  */
+static int32_t pvscsi_cmd_setup_rings(PVSCSIState *s)
+{
+    memcpy(&s->rings, s->cmd_buffer, sizeof(s->rings));
+    if (s->rings.reqRingNumPages == 0 ||
+        s->rings.cmpRingNumPages == 0) {
+        return -1;
+    }
+
+    s->reqNumEntriesLog2 = floor_log2(s->rings.reqRingNumPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingReqDesc));
+    s->cmpNumEntriesLog2 = floor_log2(s->rings.cmpRingNumPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingCmpDesc));
+
+    trace_pvscsi_setup_req_ring(s->rings.reqRingNumPages,
+                                1 << s->reqNumEntriesLog2);
+    trace_pvscsi_setup_cmp_ring(s->rings.cmpRingNumPages,
+                                1 << s->cmpNumEntriesLog2);
+
+    pvscsi_st_ring_state(s, reqNumEntriesLog2, s->reqNumEntriesLog2);
+    pvscsi_st_ring_state(s, cmpNumEntriesLog2, s->cmpNumEntriesLog2);
+    pvscsi_st_ring_state(s, cmpProdIdx, 0);
+    pvscsi_st_ring_state(s, cmpConsIdx, 0);
+    pvscsi_st_ring_state(s, reqProdIdx, 0);
+    pvscsi_st_ring_state(s, reqConsIdx, 0);
+    return 0;
+}
+
+static int32_t pvscsi_cmd_reset_bus(PVSCSIState *s)
+{
+    qbus_reset_all_fn(&s->bus);
+    return 0;
+}
+
+static int32_t pvscsi_cmd_reset_device(PVSCSIState *s)
+{
+    struct PVSCSICmdDescResetDevice *cmd =
+        (struct PVSCSICmdDescResetDevice *) &s->cmd_buffer;
+    SCSIDevice *sdev;
+
+    pvscsi_get_dev_lun(s, cmd->lun, cmd->target, &sdev);
+    if (sdev != NULL && sdev->info->qdev.reset) {
+        sdev->info->qdev.reset(&sdev->qdev);
+    }
+
+    return 0;
+}
+
+static int32_t pvscsi_cmd_abort_cmd(PVSCSIState *s)
+{
+    return 0;
+}
+
+static int32_t pvscsi_cmd_setup_msg_ring(PVSCSIState *s)
+{
+    memcpy(&s->msgRing, s->cmd_buffer, sizeof(s->msgRing));
+    if (s->msgRing.numPages == 0) {
+        return -1;
+    }
+
+    s->msgNumEntriesLog2 = floor_log2(s->msgRing.numPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingMsgDesc));
+
+    trace_pvscsi_setup_msg_ring(s->msgRing.numPages,
+                                1 << s->msgNumEntriesLog2);
+
+    pvscsi_st_ring_state(s, msgNumEntriesLog2, s->msgNumEntriesLog2);
+    pvscsi_st_ring_state(s, msgProdIdx, 0);
+    pvscsi_st_ring_state(s, msgConsIdx, 0);
+    return 0;
+}
+
+typedef struct {
+    int nargs;
+    int32_t (*fn)(PVSCSIState *);
+} PVSCSICmd;
+
+static const PVSCSICmd pvscsi_commands[PVSCSI_CMD_LAST] = {
+    [PVSCSI_CMD_FIRST] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_bad,
+    },
+    [PVSCSI_CMD_ADAPTER_RESET] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_adapter_reset
+    },
+    [PVSCSI_CMD_ISSUE_SCSI] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    },
+    [PVSCSI_CMD_SETUP_RINGS] = {
+        .nargs = sizeof(struct PVSCSICmdDescSetupRings) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_setup_rings
+    },
+    [PVSCSI_CMD_RESET_BUS] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_reset_bus
+    },
+    [PVSCSI_CMD_RESET_DEVICE] = {
+        .nargs = sizeof(struct PVSCSICmdDescResetDevice) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_reset_device
+    },
+    [PVSCSI_CMD_ABORT_CMD] = {
+        .nargs = sizeof(struct PVSCSICmdDescAbortCmd) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_abort_cmd
+    },
+    [PVSCSI_CMD_CONFIG] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    },
+    [PVSCSI_CMD_SETUP_MSG_RING] = {
+        .nargs = sizeof(struct PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_setup_msg_ring
+    },
+    [PVSCSI_CMD_DEVICE_UNPLUG] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    }
+};
+
+\f
+static void pvscsi_maybe_do_cmd(PVSCSIState *s)
+{
+    int cmd = s->cmd_latch >= PVSCSI_CMD_LAST ? PVSCSI_CMD_FIRST : s->cmd_latch;
+    const PVSCSICmd *cmd_info = &pvscsi_commands[cmd];
+
+    if (s->cmd_ptr >= cmd_info->nargs) {
+        s->cmd_status = cmd_info->fn(s);
+        s->cmd_latch = 0;
+        s->cmd_ptr = 0;
+    }
+}
+
+static uint32_t pvscsi_reg_readl(PVSCSIState *s, int offset)
+{
+    switch (offset) {
+    case PVSCSI_REG_OFFSET_COMMAND:
+    case PVSCSI_REG_OFFSET_COMMAND_DATA:
+    case PVSCSI_REG_OFFSET_KICK_NON_RW_IO:
+    case PVSCSI_REG_OFFSET_KICK_RW_IO:
+        fprintf(stderr, "vmw_pvscsi: read to write-only register %x\n", offset);
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        return s->cmd_status;
+    case PVSCSI_REG_OFFSET_INTR_STATUS:
+        return s->intr_status;
+    case PVSCSI_REG_OFFSET_INTR_MASK:
+        return s->intr_mask;
+    case PVSCSI_REG_OFFSET_LAST_STS_0:
+    case PVSCSI_REG_OFFSET_LAST_STS_1:
+    case PVSCSI_REG_OFFSET_LAST_STS_2:
+    case PVSCSI_REG_OFFSET_LAST_STS_3:
+    case PVSCSI_REG_OFFSET_DEBUG:
+        fprintf(stderr, "vmw_pvscsi: read from unsupported register %x\n", offset);
+        break;
+    default:
+        break;
+    }
+    return 0;
+}
+
+static void pvscsi_reg_write(PVSCSIState *s, int offset, uint32_t val, int size)
+{
+    if (size != 4) {
+        switch (offset) {
+        case PVSCSI_REG_OFFSET_COMMAND:
+        case PVSCSI_REG_OFFSET_COMMAND_DATA:
+        case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        case PVSCSI_REG_OFFSET_INTR_STATUS:
+        case PVSCSI_REG_OFFSET_INTR_MASK:
+            abort();
+        default:
+            break;
+        }
+    }
+
+    switch (offset) {
+    case PVSCSI_REG_OFFSET_COMMAND:
+        trace_pvscsi_cmd(val);
+        s->cmd_latch = val;
+        s->cmd_ptr = 0;
+        pvscsi_maybe_do_cmd(s);
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND_DATA:
+        s->cmd_buffer[s->cmd_ptr++] = val;
+        pvscsi_maybe_do_cmd(s);
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        fprintf(stderr, "vmw_pvscsi: write to read-only register %x\n", offset);
+        break;
+    case PVSCSI_REG_OFFSET_INTR_STATUS:
+        pvscsi_acknowledge_intr(s, val);
+        break;
+    case PVSCSI_REG_OFFSET_INTR_MASK:
+        pvscsi_set_intr_mask(s, val);
+        break;
+    case PVSCSI_REG_OFFSET_KICK_NON_RW_IO:
+    case PVSCSI_REG_OFFSET_KICK_RW_IO:
+        pvscsi_process_req_ring(s);
+        break;
+
+    case PVSCSI_REG_OFFSET_LAST_STS_0:
+    case PVSCSI_REG_OFFSET_LAST_STS_1:
+    case PVSCSI_REG_OFFSET_LAST_STS_2:
+    case PVSCSI_REG_OFFSET_LAST_STS_3:
+    case PVSCSI_REG_OFFSET_DEBUG:
+        fprintf(stderr, "vmw_pvscsi: write to unsupported register %x\n", offset);
+        break;
+    default:
+        break;
+    }
+}
+
+static void pvscsi_mmio_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    PVSCSIState *s = opaque;
+
+    addr &= PVSCSI_MEM_SPACE_SIZE - 1;
+    pvscsi_reg_write(s, addr, val, 1);
+}
+
+static void pvscsi_mmio_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    PVSCSIState *s = opaque;
+
+    addr &= PVSCSI_MEM_SPACE_SIZE - 1;
+    pvscsi_reg_write(s, addr, val, 2);
+}
+
+static void pvscsi_mmio_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    PVSCSIState *s = opaque;
+
+    addr &= PVSCSI_MEM_SPACE_SIZE - 1;
+    pvscsi_reg_write(s, addr, val, 4);
+}
+
+static uint32_t pvscsi_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    abort();
+}
+
+static uint32_t pvscsi_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    abort();
+}
+
+static uint32_t pvscsi_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    PVSCSIState *s = opaque;
+
+    addr &= PVSCSI_MEM_SPACE_SIZE - 1;
+    return pvscsi_reg_readl(s, addr);
+}
+
+static CPUReadMemoryFunc * const pvscsi_mmio_readfn[3] = {
+    pvscsi_mmio_readb,
+    pvscsi_mmio_readw,
+    pvscsi_mmio_readl,
+};
+
+static CPUWriteMemoryFunc * const pvscsi_mmio_writefn[3] = {
+    pvscsi_mmio_writeb,
+    pvscsi_mmio_writew,
+    pvscsi_mmio_writel,
+};
+
+static void pvscsi_mmio_mapfunc(PCIDevice *pci_dev, int region_num,
+                             pcibus_t addr, pcibus_t size, int type)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev, pci_dev);
+
+    cpu_register_physical_memory(addr, PVSCSI_MEM_SPACE_SIZE, s->mmio_io_addr);
+}
+
+static void pvscsi_reset(DeviceState *dev)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, dev);
+
+    pvscsi_soft_reset(s);
+}
+
+static int pvscsi_uninit(PCIDevice *d)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev, d);
+
+    cpu_unregister_io_memory(s->mmio_io_addr);
+
+    return 0;
+}
+
+static int pvscsi_init(PCIDevice *dev)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev, dev);
+    uint8_t *pci_conf;
+
+    pci_conf = s->dev.config;
+
+    pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_VMWARE);
+    pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_VMWARE_PVSCSI);
+    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_SCSI);
+
+    /* PCI subsystem ID */
+    pci_conf[PCI_SUBSYSTEM_ID] = 0x00;
+    pci_conf[PCI_SUBSYSTEM_ID + 1] = 0x10;
+
+    /* PCI latency timer = 255 */
+    pci_conf[PCI_LATENCY_TIMER] = 0xff;
+
+    /* Interrupt pin 1 */
+    pci_conf[PCI_INTERRUPT_PIN] = 0x01;
+
+    s->mmio_io_addr = cpu_register_io_memory(pvscsi_mmio_readfn,
+                                             pvscsi_mmio_writefn, s,
+                                             DEVICE_NATIVE_ENDIAN);
+    pci_register_bar(&s->dev, 0, PVSCSI_MEM_SPACE_SIZE,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, pvscsi_mmio_mapfunc);
+
+#if 0
+    s->pio_io_addr = cpu_register_io_memory(pvscsi_mmio_readfn,
+                                             pvscsi_mmio_writefn, s,
+                                             DEVICE_NATIVE_ENDIAN);
+    pci_register_bar(&s->dev, 1, 256,
+                           PCI_BASE_ADDRESS_SPACE_IO, pvscsi_io_mapfunc);
+#endif
+
+    s->complete_reqs_bh = qemu_bh_new(pvscsi_complete_reqs, s);
+
+    scsi_bus_new(&s->bus, &dev->qdev, 1, PVSCSI_MAX_DEVS,
+                 pvscsi_command_complete);
+    if (!dev->qdev.hotplugged) {
+        return scsi_bus_legacy_handle_cmdline(&s->bus);
+    }
+    return 0;
+}
+
+static PCIDeviceInfo pvscsi_info = {
+    .qdev.name  = "vmw_pvscsi",
+    .qdev.size  = sizeof(PVSCSIState),
+    .qdev.reset = pvscsi_reset,
+    .init       = pvscsi_init,
+    .exit       = pvscsi_uninit,
+};
+
+static void vmw_pvscsi_register_devices(void)
+{
+    pci_qdev_register(&pvscsi_info);
+}
+
+device_init(vmw_pvscsi_register_devices);
diff --git a/hw/vmw_pvscsi.h b/hw/vmw_pvscsi.h
new file mode 100644
index 0000000..b7fa3f6
--- /dev/null
+++ b/hw/vmw_pvscsi.h
@@ -0,0 +1,389 @@
+/*
+ * VMware PVSCSI header file
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _VMW_PVSCSI_H_
+#define _VMW_PVSCSI_H_
+
+#define PVSCSI_MAX_NUM_SG_ENTRIES_PER_SEGMENT 128
+
+#define MASK(n)        ((1 << (n)) - 1)        /* make an n-bit mask */
+
+#define __packed __attribute__((packed))
+
+/*
+ * host adapter status/error codes
+ */
+enum HostBusAdapterStatus {
+   BTSTAT_SUCCESS       = 0x00,  /* CCB complete normally with no errors */
+   BTSTAT_LINKED_COMMAND_COMPLETED           = 0x0a,
+   BTSTAT_LINKED_COMMAND_COMPLETED_WITH_FLAG = 0x0b,
+   BTSTAT_DATA_UNDERRUN = 0x0c,
+   BTSTAT_SELTIMEO      = 0x11,  /* SCSI selection timeout */
+   BTSTAT_DATARUN       = 0x12,  /* data overrun/underrun */
+   BTSTAT_BUSFREE       = 0x13,  /* unexpected bus free */
+   BTSTAT_INVPHASE      = 0x14,  /* invalid bus phase or sequence requested by target */
+   BTSTAT_LUNMISMATCH   = 0x17,  /* linked CCB has different LUN from first CCB */
+   BTSTAT_SENSFAILED    = 0x1b,  /* auto request sense failed */
+   BTSTAT_TAGREJECT     = 0x1c,  /* SCSI II tagged queueing message rejected by target */
+   BTSTAT_BADMSG        = 0x1d,  /* unsupported message received by the host adapter */
+   BTSTAT_HAHARDWARE    = 0x20,  /* host adapter hardware failed */
+   BTSTAT_NORESPONSE    = 0x21,  /* target did not respond to SCSI ATN, sent a SCSI RST */
+   BTSTAT_SENTRST       = 0x22,  /* host adapter asserted a SCSI RST */
+   BTSTAT_RECVRST       = 0x23,  /* other SCSI devices asserted a SCSI RST */
+   BTSTAT_DISCONNECT    = 0x24,  /* target device reconnected improperly (w/o tag) */
+   BTSTAT_BUSRESET      = 0x25,  /* host adapter issued BUS device reset */
+   BTSTAT_ABORTQUEUE    = 0x26,  /* abort queue generated */
+   BTSTAT_HASOFTWARE    = 0x27,  /* host adapter software error */
+   BTSTAT_HATIMEOUT     = 0x30,  /* host adapter hardware timeout error */
+   BTSTAT_SCSIPARITY    = 0x34,  /* SCSI parity error detected */
+};
+
+/*
+ * Register offsets.
+ *
+ * These registers are accessible both via i/o space and mm i/o.
+ */
+
+enum PVSCSIRegOffset {
+	PVSCSI_REG_OFFSET_COMMAND        =    0x0,
+	PVSCSI_REG_OFFSET_COMMAND_DATA   =    0x4,
+	PVSCSI_REG_OFFSET_COMMAND_STATUS =    0x8,
+	PVSCSI_REG_OFFSET_LAST_STS_0     =  0x100,
+	PVSCSI_REG_OFFSET_LAST_STS_1     =  0x104,
+	PVSCSI_REG_OFFSET_LAST_STS_2     =  0x108,
+	PVSCSI_REG_OFFSET_LAST_STS_3     =  0x10c,
+	PVSCSI_REG_OFFSET_INTR_STATUS    = 0x100c,
+	PVSCSI_REG_OFFSET_INTR_MASK      = 0x2010,
+	PVSCSI_REG_OFFSET_KICK_NON_RW_IO = 0x3014,
+	PVSCSI_REG_OFFSET_DEBUG          = 0x3018,
+	PVSCSI_REG_OFFSET_KICK_RW_IO     = 0x4018,
+};
+
+/*
+ * Virtual h/w commands.
+ */
+
+enum PVSCSICommands {
+	PVSCSI_CMD_FIRST             = 0, /* has to be first */
+
+	PVSCSI_CMD_ADAPTER_RESET     = 1,
+	PVSCSI_CMD_ISSUE_SCSI        = 2,
+	PVSCSI_CMD_SETUP_RINGS       = 3,
+	PVSCSI_CMD_RESET_BUS         = 4,
+	PVSCSI_CMD_RESET_DEVICE      = 5,
+	PVSCSI_CMD_ABORT_CMD         = 6,
+	PVSCSI_CMD_CONFIG            = 7,
+	PVSCSI_CMD_SETUP_MSG_RING    = 8,
+	PVSCSI_CMD_DEVICE_UNPLUG     = 9,
+
+	PVSCSI_CMD_LAST              = 10  /* has to be last */
+};
+
+/*
+ * Command descriptor for PVSCSI_CMD_RESET_DEVICE --
+ */
+
+struct PVSCSICmdDescResetDevice {
+	uint32_t	target;
+	uint8_t		lun[8];
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_ABORT_CMD --
+ *
+ * - currently does not support specifying the LUN.
+ * - _pad should be 0.
+ */
+
+struct PVSCSICmdDescAbortCmd {
+	uint64_t	context;
+	uint32_t	target;
+	uint32_t	_pad;
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_SETUP_RINGS --
+ *
+ * Notes:
+ * - reqRingNumPages and cmpRingNumPages need to be power of two.
+ * - reqRingNumPages and cmpRingNumPages need to be different from 0,
+ * - reqRingNumPages and cmpRingNumPages need to be inferior to
+ *   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES.
+ */
+
+#define PVSCSI_SETUP_RINGS_MAX_NUM_PAGES        32
+struct PVSCSICmdDescSetupRings {
+	uint32_t	reqRingNumPages;
+	uint32_t	cmpRingNumPages;
+	uint64_t	ringsStatePPN;
+	uint64_t	reqRingPPNs[PVSCSI_SETUP_RINGS_MAX_NUM_PAGES];
+	uint64_t	cmpRingPPNs[PVSCSI_SETUP_RINGS_MAX_NUM_PAGES];
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_SETUP_MSG_RING --
+ *
+ * Notes:
+ * - this command was not supported in the initial revision of the h/w
+ *   interface. Before using it, you need to check that it is supported by
+ *   writing PVSCSI_CMD_SETUP_MSG_RING to the 'command' register, then
+ *   immediately after read the 'command status' register:
+ *       * a value of -1 means that the cmd is NOT supported,
+ *       * a value != -1 means that the cmd IS supported.
+ *   If it's supported the 'command status' register should return:
+ *      sizeof(PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t).
+ * - this command should be issued _after_ the usual SETUP_RINGS so that the
+ *   RingsState page is already setup. If not, the command is a nop.
+ * - numPages needs to be a power of two,
+ * - numPages needs to be different from 0,
+ * - _pad should be zero.
+ */
+
+#define PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES  16
+
+struct PVSCSICmdDescSetupMsgRing {
+	uint32_t	numPages;
+	uint32_t	_pad;
+	uint64_t	ringPPNs[PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES];
+} __packed;
+
+enum PVSCSIMsgType {
+	PVSCSI_MSG_DEV_ADDED          = 0,
+	PVSCSI_MSG_DEV_REMOVED        = 1,
+	PVSCSI_MSG_LAST               = 2,
+};
+
+/*
+ * Msg descriptor.
+ *
+ * sizeof(struct PVSCSIRingMsgDesc) == 128.
+ *
+ * - type is of type enum PVSCSIMsgType.
+ * - the content of args depend on the type of event being delivered.
+ */
+
+struct PVSCSIRingMsgDesc {
+	uint32_t	type;
+	uint32_t	args[31];
+} __packed;
+
+struct PVSCSIMsgDescDevStatusChanged {
+	uint32_t	type;  /* PVSCSI_MSG_DEV _ADDED / _REMOVED */
+	uint32_t	bus;
+	uint32_t	target;
+	uint8_t		lun[8];
+	uint32_t	pad[27];
+} __packed;
+
+/*
+ * Rings state.
+ *
+ * - the fields:
+ *    . msgProdIdx,
+ *    . msgConsIdx,
+ *    . msgNumEntriesLog2,
+ *   .. are only used once the SETUP_MSG_RING cmd has been issued.
+ * - '_pad' helps to ensure that the msg related fields are on their own
+ *   cache-line.
+ */
+
+struct PVSCSIRingsState {
+	uint32_t	reqProdIdx;
+	uint32_t	reqConsIdx;
+	uint32_t	reqNumEntriesLog2;
+
+	uint32_t	cmpProdIdx;
+	uint32_t	cmpConsIdx;
+	uint32_t	cmpNumEntriesLog2;
+
+	uint8_t		_pad[104];
+
+	uint32_t	msgProdIdx;
+	uint32_t	msgConsIdx;
+	uint32_t	msgNumEntriesLog2;
+} __packed;
+
+/*
+ * Request descriptor.
+ *
+ * sizeof(RingReqDesc) = 128
+ *
+ * - context: is a unique identifier of a command. It could normally be any
+ *   64bit value, however we currently store it in the serialNumber variable
+ *   of struct SCSI_Command, so we have the following restrictions due to the
+ *   way this field is handled in the vmkernel storage stack:
+ *    * this value can't be 0,
+ *    * the upper 32bit need to be 0 since serialNumber is as a uint32_t.
+ *   Currently tracked as PR 292060.
+ * - dataLen: contains the total number of bytes that need to be transferred.
+ * - dataAddr:
+ *   * if PVSCSI_FLAG_CMD_WITH_SG_LIST is set: dataAddr is the PA of the first
+ *     s/g table segment, each s/g segment is entirely contained on a single
+ *     page of physical memory,
+ *   * if PVSCSI_FLAG_CMD_WITH_SG_LIST is NOT set, then dataAddr is the PA of
+ *     the buffer used for the DMA transfer,
+ * - flags:
+ *   * PVSCSI_FLAG_CMD_WITH_SG_LIST: see dataAddr above,
+ *   * PVSCSI_FLAG_CMD_DIR_NONE: no DMA involved,
+ *   * PVSCSI_FLAG_CMD_DIR_TOHOST: transfer from device to main memory,
+ *   * PVSCSI_FLAG_CMD_DIR_TODEVICE: transfer from main memory to device,
+ *   * PVSCSI_FLAG_CMD_OUT_OF_BAND_CDB: reserved to handle CDBs larger than
+ *     16bytes. To be specified.
+ * - vcpuHint: vcpuId of the processor that will be most likely waiting for the
+ *   completion of the i/o. For guest OSes that use lowest priority message
+ *   delivery mode (such as windows), we use this "hint" to deliver the
+ *   completion action to the proper vcpu. For now, we can use the vcpuId of
+ *   the processor that initiated the i/o as a likely candidate for the vcpu
+ *   that will be waiting for the completion..
+ * - bus should be 0: we currently only support bus 0 for now.
+ * - unused should be zero'd.
+ */
+
+#define PVSCSI_FLAG_CMD_WITH_SG_LIST        (1 << 0)
+#define PVSCSI_FLAG_CMD_OUT_OF_BAND_CDB     (1 << 1)
+#define PVSCSI_FLAG_CMD_DIR_NONE            (1 << 2)
+#define PVSCSI_FLAG_CMD_DIR_TOHOST          (1 << 3)
+#define PVSCSI_FLAG_CMD_DIR_TODEVICE        (1 << 4)
+
+struct PVSCSIRingReqDesc {
+	uint64_t	context;
+	uint64_t	dataAddr;
+	uint64_t	dataLen;
+	uint64_t	senseAddr;
+	uint32_t	senseLen;
+	uint32_t	flags;
+	uint8_t		cdb[16];
+	uint8_t		cdbLen;
+	uint8_t		lun[8];
+	uint8_t		tag;
+	uint8_t		bus;
+	uint8_t		target;
+	uint8_t		vcpuHint;
+	uint8_t		unused[59];
+} __packed;
+
+/*
+ * Scatter-gather list management.
+ *
+ * As described above, when PVSCSI_FLAG_CMD_WITH_SG_LIST is set in the
+ * RingReqDesc.flags, then RingReqDesc.dataAddr is the PA of the first s/g
+ * table segment.
+ *
+ * - each segment of the s/g table contain a succession of struct
+ *   PVSCSISGElement.
+ * - each segment is entirely contained on a single physical page of memory.
+ * - a "chain" s/g element has the flag PVSCSI_SGE_FLAG_CHAIN_ELEMENT set in
+ *   PVSCSISGElement.flags and in this case:
+ *     * addr is the PA of the next s/g segment,
+ *     * length is undefined, assumed to be 0.
+ */
+
+struct PVSCSISGElement {
+	uint64_t	addr;
+	uint32_t	length;
+	uint32_t	flags;
+} __packed;
+
+/*
+ * Completion descriptor.
+ *
+ * sizeof(RingCmpDesc) = 32
+ *
+ * - context: identifier of the command. The same thing that was specified
+ *   under "context" as part of struct RingReqDesc at initiation time,
+ * - dataLen: number of bytes transferred for the actual i/o operation,
+ * - senseLen: number of bytes written into the sense buffer,
+ * - hostStatus: adapter status,
+ * - scsiStatus: device status,
+ * - _pad should be zero.
+ */
+
+struct PVSCSIRingCmpDesc {
+	uint64_t	context;
+	uint64_t	dataLen;
+	uint32_t	senseLen;
+	uint16_t	hostStatus;
+	uint16_t	scsiStatus;
+	uint32_t	_pad[2];
+} __packed;
+
+/*
+ * Interrupt status / IRQ bits.
+ */
+
+#define PVSCSI_INTR_CMPL_0                 (1 << 0)
+#define PVSCSI_INTR_CMPL_1                 (1 << 1)
+#define PVSCSI_INTR_CMPL_MASK              MASK(2)
+
+#define PVSCSI_INTR_MSG_0                  (1 << 2)
+#define PVSCSI_INTR_MSG_1                  (1 << 3)
+#define PVSCSI_INTR_MSG_MASK               (MASK(2) << 2)
+
+#define PVSCSI_INTR_ALL_SUPPORTED          MASK(4)
+
+/*
+ * Number of MSI-X vectors supported.
+ */
+#define PVSCSI_MAX_INTRS        24
+
+/*
+ * Enumeration of supported MSI-X vectors
+ */
+#define PVSCSI_VECTOR_COMPLETION   0
+
+/*
+ * Misc constants for the rings.
+ */
+
+#define PVSCSI_MAX_NUM_PAGES_REQ_RING   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES
+#define PVSCSI_MAX_NUM_PAGES_CMP_RING   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES
+#define PVSCSI_MAX_NUM_PAGES_MSG_RING   PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES
+
+#define PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE \
+				(PAGE_SIZE / sizeof(struct PVSCSIRingReqDesc))
+
+#define PVSCSI_MAX_REQ_QUEUE_DEPTH \
+	(PVSCSI_MAX_NUM_PAGES_REQ_RING * PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE)
+
+#define PVSCSI_MEM_SPACE_COMMAND_NUM_PAGES     1
+#define PVSCSI_MEM_SPACE_INTR_STATUS_NUM_PAGES 1
+#define PVSCSI_MEM_SPACE_MISC_NUM_PAGES        2
+#define PVSCSI_MEM_SPACE_KICK_IO_NUM_PAGES     2
+#define PVSCSI_MEM_SPACE_MSIX_NUM_PAGES        2
+
+enum PVSCSIMemSpace {
+	PVSCSI_MEM_SPACE_COMMAND_PAGE		= 0,
+	PVSCSI_MEM_SPACE_INTR_STATUS_PAGE	= 1,
+	PVSCSI_MEM_SPACE_MISC_PAGE		= 2,
+	PVSCSI_MEM_SPACE_KICK_IO_PAGE		= 4,
+	PVSCSI_MEM_SPACE_MSIX_TABLE_PAGE	= 6,
+	PVSCSI_MEM_SPACE_MSIX_PBA_PAGE		= 7,
+};
+
+#define PVSCSI_MEM_SPACE_NUM_PAGES \
+	(PVSCSI_MEM_SPACE_COMMAND_NUM_PAGES +       \
+	 PVSCSI_MEM_SPACE_INTR_STATUS_NUM_PAGES +   \
+	 PVSCSI_MEM_SPACE_MISC_NUM_PAGES +          \
+	 PVSCSI_MEM_SPACE_KICK_IO_NUM_PAGES +       \
+	 PVSCSI_MEM_SPACE_MSIX_NUM_PAGES)
+
+#define PVSCSI_MEM_SPACE_SIZE        (PVSCSI_MEM_SPACE_NUM_PAGES * PAGE_SIZE)
+
+#endif /* _VMW_PVSCSI_H_ */
diff --git a/trace-events b/trace-events
index 51e2497..7126c07 100644
--- a/trace-events
+++ b/trace-events
@@ -211,6 +211,21 @@ disable scsi_req_dequeue(int target, int lun, int tag) "target %d lun %d tag %d"
 disable scsi_req_parsed(int target, int lun, int tag, int cmd, const char *cmdname, int mode, int xfer, uint64_t lba) "target %d lun %d tag %d command %d (%s) dir %d length %d lba %"PRIu64""
 disable scsi_req_parse_bad(int target, int lun, int tag, int cmd) "target %d lun %d tag %d command %d"
 
+# hw/vmw_pvscsi.c
+disable pvscsi_queue_request(uint64_t context, uint8_t command, uint64_t dataLen) "context %"PRIu64" command %d length %"PRIu64""
+disable pvscsi_sg_elem(uint64_t context, uint64_t addr, uint64_t length) "context %"PRIu64" addr %"PRIu64" length %"PRIu64""
+disable pvscsi_transfer_data(uint64_t context, uint64_t length) "context %"PRIu64" length %"PRIu64""
+disable pvscsi_request_sense(uint64_t context, int lun) "context %"PRIu64" lun %d"
+disable pvscsi_kick_io(void) "kick request ring"
+disable pvscsi_complete_req(uint64_t context, uint64_t length, uint8_t sense) "context %"PRIu64" length %"PRIu64" sense %d"
+disable pvscsi_cmp_ring_put(uint64_t context) "context %"PRIu64""
+disable pvscsi_raise_intr(uint32_t intr, const char *state) "raised intr %d %s"
+disable pvscsi_acknowledge_intr(uint32_t intr) "acknowledged intr %d"
+disable pvscsi_setup_req_ring(uint32_t pages, uint32_t entries) "req ring - %d pages %d entries"
+disable pvscsi_setup_cmp_ring(uint32_t pages, uint32_t entries) "cmp ring - %d pages %d entries"
+disable pvscsi_setup_msg_ring(uint32_t pages, uint32_t entries) "msg ring - %d pages %d entries"
+disable pvscsi_cmd(int cmd) "command %d"
+
 # vl.c
 disable vm_state_notify(int running, int reason) "running %d reason %d"
 
-- 
1.7.4
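
As an aside, the free-running producer/consumer indices used by
pvscsi_cmp_free() and pvscsi_req_pending() above are worth spelling out;
here is a minimal standalone sketch (not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    #define RING_ENTRIES_LOG2 3   /* an 8-entry ring, for illustration */

    static uint32_t ring_pending(uint32_t prod, uint32_t cons)
    {
        return prod - cons;       /* entries not yet consumed */
    }

    static uint32_t ring_free(uint32_t prod, uint32_t cons)
    {
        /* mirrors pvscsi_cmp_free(): one slot is kept unused so a
         * full ring is distinguishable from an empty one */
        return (1u << RING_ENTRIES_LOG2) - 1 - (prod - cons);
    }

    int main(void)
    {
        uint32_t cons = 0xfffffffc, prod = 0xfffffffe; /* about to wrap */

        assert(ring_pending(prod, cons) == 2);
        assert(ring_free(prod, cons) == 5);

        prod += 4;                              /* wraps past 2^32 */
        assert(ring_pending(prod, cons) == 6);  /* still correct */
        /* the low bits select the slot, as in pvscsi_cmp_ring_put() */
        assert((prod & ((1u << RING_ENTRIES_LOG2) - 1)) == 2);
        return 0;
    }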

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 13:42 [Qemu-devel] [RFC PATCH] implement vmware pvscsi device Paolo Bonzini
@ 2011-04-15 14:01 ` Stefan Hajnoczi
  2011-04-15 14:17   ` Paolo Bonzini
From: Stefan Hajnoczi @ 2011-04-15 14:01 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Hannes Reinecke, Zachary Amsden, qemu-devel,
	Nicholas A. Bellinger, Michael S. Tsirkin

On Fri, Apr 15, 2011 at 2:42 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Lightly tested with Linux guests; at least it can successfully partition
> and format a disk.  scsi-generic also lightly tested.
>
> Doesn't do migration, doesn't do hotplug (the device would support that,
> but it is not 100% documented and the Linux driver in particular cannot
> initiate hot-unplug).  I did it as a quick one-day hack to study the SCSI
> subsystem, and it is my first real foray into device model land, so please
> be gentle. :)
>
> vmw_pvscsi.h is taken from Linux, so it doesn't fully respect coding
> standards.  I think that's fair.
>
> Size is curiously close to the recently added sPAPR adapter:
>
>  911  2354 25553 hw/vmw_pvscsi.c
>  988  3177 29628 hw/spapr_vscsi.c
>
> Sounds like that's just the amount of code it takes to implement a SCSI
> HBA in QEMU. :)

Interesting, thanks for posting this.  I've been playing with virtio
SCSI and it is still in the early stages.  Nicholas A. Bellinger and I
have been wiring the in-kernel SCSI target up to KVM using vhost.
Feel free to take a peek at the work-in-progress:

http://repo.or.cz/w/qemu/stefanha.git/shortlog/refs/heads/virtio-scsi
http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=shortlog;h=refs/heads/tcm_vhost

I think SCSI brings many benefits.  Guests can deal with it better
than these alien vdX virtio-blk devices, which makes migration easier.
It becomes possible to attach many disks without burning through free
PCI slots.  We don't need to update guests to add cache control,
discard, and other commands because they are part of SCSI.  We can
pass through more exotic devices.  The list goes on...

Stefan

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:01 ` Stefan Hajnoczi
@ 2011-04-15 14:17   ` Paolo Bonzini
  2011-04-15 14:28     ` Stefan Hajnoczi
  2011-04-15 14:55     ` Hannes Reinecke
From: Paolo Bonzini @ 2011-04-15 14:17 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Hannes Reinecke, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/15/2011 04:01 PM, Stefan Hajnoczi wrote:
> I think SCSI brings many benefits.  Guests can deal with it better
> than these alien vdX virtio-blk devices, which makes migration easier.
> It becomes possible to attach many disks without burning through free
> PCI slots.  We don't need to update guests to add cache control,
> discard, and other commands because they are part of SCSI.  We can
> pass through more exotic devices.  The list goes on...

And we also have to reimplement all of MMC. :)

A few questions:

1) Do you have anything posted for the virtio-scsi spec?  I had started 
working on one, but I haven't yet made it final.  It included also 
hotplug/unplug.  I can send it out on Monday.

2) Have you thought about making scsi-disk and scsi-generic provide a 
logical unit rather than a target?  Otherwise passthrough of a whole 
host or target becomes hard or messy.

3) Since I noticed Hannes is CCed, my next step for vmw_pvscsi would be 
to dust off his patches to remove the bounce buffers, and see how they 
apply to vmw_pvscsi.  But I'd like to avoid duplicated work if possible.

Paolo

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:17   ` Paolo Bonzini
@ 2011-04-15 14:28     ` Stefan Hajnoczi
  2011-04-15 14:37       ` Paolo Bonzini
  2011-04-15 14:55     ` Hannes Reinecke
From: Stefan Hajnoczi @ 2011-04-15 14:28 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Hannes Reinecke, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On Fri, Apr 15, 2011 at 3:17 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 04/15/2011 04:01 PM, Stefan Hajnoczi wrote:
>>
>> I think SCSI brings many benefits.  Guests can deal with it better
>> than these alien vdX virtio-blk devices, which makes migration easier.
>> It becomes possible to attach many disks without burning through free
>> PCI slots.  We don't need to update guests to add cache control,
>> discard, and other commands because they are part of SCSI.  We can
>> pass through more exotic devices.  The list goes on...
>
> And we also have to reimplement all of MMC. :)
>
> A few questions:
>
> 1) Do you have anything posted for the virtio-scsi spec?  I had started
> working on one, but I haven't yet made it final.  It included also
> hotplug/unplug.  I can send it out on Monday.

Nothing formal.  I'm trying to learn SCSI as I go along:

http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=blob;f=include/linux/virtio_scsi.h;hb=refs/heads/tcm_vhost

That's the interface I'm using.  Requests are:

[Header][CDB][Data-out buffers*][Data-in buffers*][Footer]

The footer gets filled in with the response.
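
To make that ordering concrete (the names below are made up for
illustration; the real definitions are in the virtio_scsi.h linked
above), a request would be assembled as a chain of buffers,
driver-written ones first and device-written ones last:

    #include <stddef.h>

    struct sg_entry { void *buf; size_t len; };

    /* Hypothetical sketch of the [Header][CDB][Data-out][Data-in]
     * [Footer] layout; returns the number of entries used. */
    static size_t build_request(struct sg_entry *sg,
                                void *hdr, size_t hdr_len,
                                void *cdb, size_t cdb_len,
                                void *out, size_t out_len,  /* may be 0 */
                                void *in, size_t in_len,    /* may be 0 */
                                void *ftr, size_t ftr_len)
    {
        size_t n = 0;
        sg[n++] = (struct sg_entry){ hdr, hdr_len };  /* header */
        sg[n++] = (struct sg_entry){ cdb, cdb_len };  /* CDB */
        if (out_len) {
            sg[n++] = (struct sg_entry){ out, out_len };  /* data-out */
        }
        if (in_len) {
            sg[n++] = (struct sg_entry){ in, in_len };    /* data-in */
        }
        sg[n++] = (struct sg_entry){ ftr, ftr_len };  /* response footer */
        return n;
    }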

> 2) Have you thought about making scsi-disk and scsi-generic provide a
> logical unit rather than a target?  Otherwise passthrough of a whole host or
> target becomes hard or messy.

I haven't been working at the QEMU SCSI bus level.  I want to wire up
the Linux-iSCSI.org target stack straight to the guest.  This bypasses
the QEMU SCSI and block layers completely.

I agree that the BlockDriverState in QEMU is more of a LUN than a
target and passing through multiple block devices as LUNs should be
possible.  So we probably need to restructure as you suggested and/or
provide an indirection for LUN mapping.

Stefan

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:28     ` Stefan Hajnoczi
@ 2011-04-15 14:37       ` Paolo Bonzini
  2011-04-15 15:04         ` Stefan Hajnoczi
From: Paolo Bonzini @ 2011-04-15 14:37 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Hannes Reinecke, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/15/2011 04:28 PM, Stefan Hajnoczi wrote:
> Nothing formal.  I'm trying to learn SCSI as I go along:
> 
> http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=blob;f=include/linux/virtio_scsi.h;hb=refs/heads/tcm_vhost
> 
> That's the interface I'm using.  Requests are:
> 
> [Header][CDB][Data-out buffers*][Data-in buffers*][Footer]
> 
> The footer gets filled in with the response.

My interface is exactly the same as virtio-blk's SCSI passthrough requests:

------------------------------ 8<-- ----------------------------

Device operation: request queue
-------------------------------

The driver queues requests to the virtqueue, and they are consumed by the
device (not necessarily in order).  Requests have the following format:

    struct virtio_scsi_req {
        u32 type;
        u32 ioprio;
        char cmd[];
        char data[][512];
        u8 sense[SCSI_SENSE_BUFFERSIZE];
        u32 sense_len;
        u32 residual;
        u8 status;
        u8 response;
    };

    #define VIRTIO_SCSI_T_CMD             2
    #define VIRTIO_SCSI_T_BARRIER         0x80000000

    /* status values */
    #define VIRTIO_SCSI_S_OK              0
    #define VIRTIO_SCSI_S_FAILURE         1
    #define VIRTIO_SCSI_S_CLOSED          128

The type of the request must currently be VIRTIO_SCSI_T_CMD.
The VIRTIO_SCSI_T_BARRIER flag indicates that this request acts
as a barrier: all preceding requests must complete before this one
starts, and no following request may start until this one is
complete.  Note that a barrier does not flush caches in the
underlying backend device on the host, and thus does not serve as
a data consistency guarantee.  The driver must send a SYNCHRONIZE
CACHE command to flush the host cache.
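
For example, such a durability point can be requested with a standard
SYNCHRONIZE CACHE(10) command; a minimal sketch of building the CDB
(the helper name is made up for illustration):

    #include <stdint.h>
    #include <string.h>

    /* Build a SYNCHRONIZE CACHE(10) CDB (opcode 0x35).  Leaving the
     * LBA and "number of blocks" fields zero asks the device to
     * flush cached data for the whole medium. */
    static void build_sync_cache_cdb(uint8_t cdb[10])
    {
        memset(cdb, 0, 10);
        cdb[0] = 0x35;  /* SYNCHRONIZE CACHE(10) */
        /* bytes 2-5: starting LBA; bytes 7-8: number of blocks */
    }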

The ioprio field will indicate the priority of this request, with
higher values corresponding to higher priorities.

The cmd and data fields must reside in separate buffers.  The cmd field
indicates the command to perform and is always read-only.  The data field
may be either read-only or write-only, depending on the request.

Remaining fields are filled in by the device.  The sense_len field
indicates the number of bytes actually written to the sense buffer,
while the residual field indicates the residual size, calculated as
data_length - number_of_transferred_bytes.

The status byte is written by the device to be the SCSI status code.

The response byte is written by the device to be one of the following:

- VIRTIO_SCSI_S_OK when the request was completed and the status byte
  is filled with a SCSI status code (not necessarily "GOOD").

- VIRTIO_SCSI_S_FAILURE for host or guest error.

- VIRTIO_SCSI_S_CLOSED if the virtqueue is not currently associated
  to a LU.

----------------------------------------------------------------
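
As a usage sketch against the request structure above (fixed buffer
sizes are chosen here only for illustration; in the real layout cmd and
data are separate buffers, as noted):

    #include <stdint.h>
    #include <string.h>

    #define SCSI_SENSE_BUFFERSIZE 96
    #define VIRTIO_SCSI_T_CMD     2
    #define VIRTIO_SCSI_S_OK      0

    /* Fixed-size stand-in for the variable-length request above. */
    struct vscsi_req_sketch {
        uint32_t type;
        uint32_t ioprio;
        uint8_t  cmd[16];
        uint8_t  data[512];
        uint8_t  sense[SCSI_SENSE_BUFFERSIZE];
        uint32_t sense_len;
        uint32_t residual;
        uint8_t  status;
        uint8_t  response;
    };

    /* Prepare a READ(10) of one 512-byte block at LBA 0. */
    static void prepare_read10(struct vscsi_req_sketch *req)
    {
        memset(req, 0, sizeof(*req));
        req->type = VIRTIO_SCSI_T_CMD;
        req->cmd[0] = 0x28;   /* READ(10) */
        req->cmd[8] = 1;      /* transfer length: 1 block */
    }

    /* After completion: response says whether the command ran at all,
     * status is the SCSI status byte, and residual is
     * data_length - number_of_transferred_bytes. */
    static int read_succeeded(const struct vscsi_req_sketch *req)
    {
        return req->response == VIRTIO_SCSI_S_OK &&
               req->status == 0 /* GOOD */ &&
               req->residual == 0;
    }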

There is more meat to handle hotplug/hotunplug and to choose which
LUNs maps to which virtqueue, but you can wait a few days to know
the details.

Paolo

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:17   ` Paolo Bonzini
  2011-04-15 14:28     ` Stefan Hajnoczi
@ 2011-04-15 14:55     ` Hannes Reinecke
  2011-04-15 14:59       ` Paolo Bonzini
From: Hannes Reinecke @ 2011-04-15 14:55 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Stefan Hajnoczi, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/15/2011 04:17 PM, Paolo Bonzini wrote:
> On 04/15/2011 04:01 PM, Stefan Hajnoczi wrote:
>> I think SCSI brings many benefits.  Guests can deal with it better
>> than these alien vdX virtio-blk devices, which makes migration easier.
>> It becomes possible to attach many disks without burning through free
>> PCI slots.  We don't need to update guests to add cache control,
>> discard, and other commands because they are part of SCSI.  We can
>> pass through more exotic devices.  The list goes on...
> 
> And we also have to reimplement all of MMC. :)
> 
> A few questions:
> 
> 1) Do you have anything posted for the virtio-scsi spec?  I had started
> working on one, but I haven't yet made it final.  It included also
> hotplug/unplug.  I can send it out on Monday.
> 
> 2) Have you thought about making scsi-disk and scsi-generic provide a
> logical unit rather than a target?  Otherwise passthrough of a whole
> host or target becomes hard or messy.
> 
> 3) Since I noticed Hannes is CCed, my next step for vmw_pvscsi would be
> to dust off his patches to remove the bounce buffers, and see how they
> apply to vmw_pvscsi.  But I'd like to avoid duplicated work if possible.
> 

Argl.

Why vmw_pvscsi? Another paravirtualized driver doesn't improve the situation
here; we still wouldn't have a driver for unmodified guests.
So either emulate existing drivers (like megasas :-) or go the full
route and do a proper virtio-scsi.

As for the bounce buffers thing:
Good luck. Paul Brook absolutely insists on having them, but they kill
performance for any sane backend. And the two are basically impossible to
reconcile; I tried it once but got pushed back.

And after about the third attempt I gave up. Let me know if you have
more luck here.

But keep me in the loop for the virtio-scsi spec. I do have some ideas
about what needs to get in there, as I think hch does.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke              zSeries & Storage
hare@suse.de                  +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Markus Rex, HRB 16746 (AG Nürnberg)

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:55     ` Hannes Reinecke
@ 2011-04-15 14:59       ` Paolo Bonzini
From: Paolo Bonzini @ 2011-04-15 14:59 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Stefan Hajnoczi, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

> Why vmw_pvscsi?

Because all I wanted to do was to learn qemu's SCSI, and vmw_pvscsi is
pretty much the simplest device I could pick...  It's just an exercise,
but since it works I thought I'd post it.

> Good luck.  Paul Brook absolutely insists on having them, but they
> kill performance for any sane backend, and the two positions are
> basically impossible to reconcile; I tried it once but got pushed back.
> 
> After about the third attempt I gave up.  Let me know if you have
> more luck here.

Thanks. :)

> But keep me in the loop on the virtio-scsi spec. I do have some ideas
> about what needs to go in there, as I think hch does.

I certainly will, thanks.

Paolo

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:37       ` Paolo Bonzini
@ 2011-04-15 15:04         ` Stefan Hajnoczi
  2011-04-15 20:56           ` Paolo Bonzini
  0 siblings, 1 reply; 12+ messages in thread
From: Stefan Hajnoczi @ 2011-04-15 15:04 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Hannes Reinecke, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On Fri, Apr 15, 2011 at 3:37 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 04/15/2011 04:28 PM, Stefan Hajnoczi wrote:
>> Nothing formal.  I'm trying to learn SCSI as I go along:
>>
>> http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=blob;f=include/linux/virtio_scsi.h;hb=refs/heads/tcm_vhost
>>
>> That's the interface I'm using.  Requests are:
>>
>> [Header][CDB][Data-out buffers*][Data-in buffers*][Footer]
>>
>> The footer gets filled in with the response.
>
> My interface is exactly the same as virtio-blk's SCSI passthrough requests:
>
> ------------------------------ 8<-- ----------------------------
>
> Device operation: request queue
> -------------------------------
>
> The driver queues requests to the virtqueue, and they are consumed by
> the device (not necessarily in order).  Requests have the following
> format:
>
>    struct virtio_scsi_req {
>        u32 type;
>        u32 ioprio;
>        char cmd[];
>        char data[][512];
>        u8 sense[SCSI_SENSE_BUFFERSIZE];
>        u32 sense_len;
>        u32 residual;
>        u8 status;
>        u8 response;
>    };
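
For reference, a guess at which side writes each field in that layout
(the annotations are inferred from virtio-blk's SCSI pass-through, not
part of the proposal):

    struct virtio_scsi_req {
        u32 type;                          /* driver: request type/flags */
        u32 ioprio;                        /* driver: I/O priority hint */
        char cmd[];                        /* driver: the CDB */
        char data[][512];                  /* data buffers, out and/or in */
        u8 sense[SCSI_SENSE_BUFFERSIZE];   /* device: sense data */
        u32 sense_len;                     /* device: valid bytes in sense[] */
        u32 residual;                      /* device: bytes not transferred */
        u8 status;                         /* device: SCSI status byte */
        u8 response;                       /* device: transport-level result */
    };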

The way I approached virtio-scsi was to look at the SCSI Architecture
Model document and some of the Linux SCSI code.  I'm not sure if
letting virtio-blk SCSI pass-through or scsi-generic guide us is a
good approach.

How do your ioprio and barrier relate to SCSI?

There seem to be recent/exotic commands that can have both data-in and
data-out buffers.  The sense buffer length is also not necessarily 96
bytes max, I believe.  I haven't looked into these two issues, but a
proper virtio-scsi design should be future-proof and accommodate them,
given the fancy commands being added to SCSI.
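
Something along these lines would cover both points, at the cost of two
extra fields (a sketch only; the names are mine, not from any spec):

    struct virtio_scsi_req {
        u32 type;
        u32 ioprio;
        u32 sense_capacity;      /* driver: size of sense[] below */
        char cmd[];
        char data_out[][512];    /* may be empty */
        char data_in[][512];     /* may be empty; both present for bidi */
        u8 sense[];              /* device: up to sense_capacity bytes */
        u32 sense_len;
        u32 residual;
        u8 status;
        u8 response;
    };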

Stefan

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 15:04         ` Stefan Hajnoczi
@ 2011-04-15 20:56           ` Paolo Bonzini
  2011-04-18 14:05             ` Hannes Reinecke
  0 siblings, 1 reply; 12+ messages in thread
From: Paolo Bonzini @ 2011-04-15 20:56 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Michael S. Tsirkin, Hannes Reinecke, Nicholas A. Bellinger, qemu-devel

On 04/15/2011 05:04 PM, Stefan Hajnoczi wrote:
> The way I approached virtio-scsi was to look at the SCSI Architecture
> Model document and some of the Linux SCSI code.  I'm not sure if
> letting virtio-blk SCSI pass-through or scsi-generic guide us is a
> good approach.
>
> How do your ioprio and barrier relate to SCSI?

Both are part of the transport protocol, which can provide additional
features beyond what SAM specifies.  For example, SCSI doesn't provide
the full details of hotplug/hot-unplug, nor does it have a way for the
guest to trigger a drive unplug on the host, but these are all desirable
features for virtio-scsi (and they are supported by vmw_pvscsi, by the
way).
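
For instance, hotplug notifications could travel on a dedicated event
virtqueue, roughly like this (a sketch only, not from any posted spec;
the names are invented):

    struct virtio_scsi_event {
        u32 event;     /* e.g. LUN added, LUN removed, reset occurred */
        u8 lun[8];     /* address of the affected logical unit */
        u32 reason;    /* transport-specific detail */
    };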

> There seem to be recent/exotic commands that can have both data-in and
> data-out buffers.

Those can fit by adding more fields at the end of the buffer.  They can
be in the first version, or become an extra feature later.  Since QEMU
currently cannot handle bidirectional commands, they would probably need
negotiation even if they were in the first version.
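
In virtio terms that negotiation would just be a feature bit, along
these lines (name and bit number invented for illustration):

    #define VIRTIO_SCSI_F_BIDIRECTIONAL  1   /* device handles bidi commands */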

> The sense buffer length is also not necessarily 96
> bytes max, I believe.

Indeed, I couldn't find that limit in either SPC or SAM.  It seems like
a pretty widespread assumption, though.  Perhaps Nicholas or Hannes
knows where it comes from.

Paolo

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 20:56           ` Paolo Bonzini
@ 2011-04-18 14:05             ` Hannes Reinecke
  2011-04-18 15:27               ` Stefan Hajnoczi
  2011-04-18 16:09               ` Paolo Bonzini
  0 siblings, 2 replies; 12+ messages in thread
From: Hannes Reinecke @ 2011-04-18 14:05 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Stefan Hajnoczi, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/15/2011 10:56 PM, Paolo Bonzini wrote:
> On 04/15/2011 05:04 PM, Stefan Hajnoczi wrote:
>> The way I approached virtio-scsi was to look at the SCSI Architecture
>> Model document and some of the Linux SCSI code. I'm not sure if
>> letting virtio-blk SCSI pass-through or scsi-generic guide us is a
>> good approach.
>>
>> How do your ioprio and barrier relate to SCSI?
>
> Both are part of the transport protocol, which can provide
> additional features beyond what SAM specifies. For example, SCSI
> doesn't provide the full details of hotplug/hot-unplug, nor does it
> have a way for the guest to trigger a drive unplug on the host, but
> these are all desirable features for virtio-scsi (and they are
> supported by vmw_pvscsi, by the way).
>
And this is something I really miss in the current proposals, namely
a working transport layer.

The SCSI spec (SPC etc.) itself just handles command delivery between
initiator and target.  Anything else (like hotplug, error recovery,
target addressing, etc.) is outside the scope of the spec and needs to
be implemented on another layer (that's the ominous transport layer).

Hence any protocol implemented to the above spec would be missing
those parts, and they would need to be implemented separately.  That
also explains why these features are missing when just using SCSI
CDBs as the main command container.

My proposal would be to implement a full virtio-scsi _host_, and 
extend the proposal to be able to handle the transport layer too.
At the very least we would need to include a LUN address before the
CDB, and define TMF command values for proper error recovery.

That way we could handle hotplug / -unplug via a simple host rescan,
and would even be able to pass in NPIV hosts.

>> There seem to be recent/exotic commands that can have both data-in
>> and data-out buffers.
>
These are bidirectional commands, which are required for OSD
(object-based storage devices).

> Those can fit by adding more fields at the end of the buffer. They
> can be in the first version, or become an extra feature later. Since
> QEMU currently cannot handle bidirectional commands, they would
> probably need negotiation even if they were in the first version.
>
>> The sense buffer length is also not necessarily 96
>> bytes max, I believe.
>
> Indeed, I couldn't find that limit in either SPC or SAM. It seems
> like a pretty widespread assumption, though. Perhaps Nicholas or
> Hannes knows where it comes from.
>
96 bytes is a carry-over from parallel SCSI.  We shouldn't rely
on a fixed length here, but rather use an additional pointer/iovec
and a length field.

Check the SG_IO header for how it's done.
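
There the sense buffer is a caller-supplied pointer plus two length
fields rather than a fixed array; from struct sg_io_hdr in <scsi/sg.h>:

    unsigned char mx_sb_len;    /* [i] max length to write to sbp */
    unsigned char *sbp;         /* [i], [*o] points to sense buffer */
    unsigned char sb_len_wr;    /* [o] sense bytes actually written */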

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Markus Rex, HRB 16746 (AG Nürnberg)

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-18 14:05             ` Hannes Reinecke
@ 2011-04-18 15:27               ` Stefan Hajnoczi
  2011-04-18 16:09               ` Paolo Bonzini
  1 sibling, 0 replies; 12+ messages in thread
From: Stefan Hajnoczi @ 2011-04-18 15:27 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Paolo Bonzini, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On Mon, Apr 18, 2011 at 3:05 PM, Hannes Reinecke <hare@suse.de> wrote:
> On 04/15/2011 10:56 PM, Paolo Bonzini wrote:
>>
>> On 04/15/2011 05:04 PM, Stefan Hajnoczi wrote:
>>>
>>> The way I approached virtio-scsi was to look at the SCSI Architecture
>>> Model document and some of the Linux SCSI code. I'm not sure if
>>> letting virtio-blk SCSI pass-through or scsi-generic guide us is a
>>> good approach.
>>>
>>> How do your ioprio and barrier relate to SCSI?
>>
>> Both are part of the transport protocol, which can provide
>> additional features beyond what SAM specifies. For example, SCSI
>> doesn't provide the full details of hotplug/hot-unplug, nor does it
>> have a way for the guest to trigger a drive unplug on the host, but
>> these are all desirable features for virtio-scsi (and they are
>> supported by vmw_pvscsi, by the way).
>>
> And this is something I really miss in the current proposals, namely
> a working transport layer.
>
> The SCSI spec (SPC etc.) itself just handles command delivery between
> initiator and target. Anything else (like hotplug, error recovery, target
> addressing, etc.) is outside the scope of the spec and needs to be
> implemented on another layer (that's the ominous transport layer).
>
> Hence any protocol implemented to the above spec would be missing those
> parts, and they would need to be implemented separately. That also
> explains why these features are missing when just using SCSI CDBs
> as the main command container.
>
> My proposal would be to implement a full virtio-scsi _host_, and extend the
> proposal to be able to handle the transport layer too.
> At the very least we would need to include a LUN address before the CDB, and
> define TMF command values for proper error recovery.
>
> That way we could handle hotplug / -unplug via a simple host rescan, and
> would even be able to pass in NPIV hosts.

In my prototype there is a header and a footer for the request and
response, respectively:
http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=blob;f=include/linux/virtio_scsi.h;hb=refs/heads/tcm_vhost

We definitely need more than plain CDB pass-through.
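
The gist is a small header before the CDB and a footer the device fills
in; from memory it is shaped like this (see the link above for the real
definitions, the names here may differ):

    struct virtio_scsi_cmd_hdr {   /* driver -> device */
        u8 lun[8];
        u64 tag;
    };

    struct virtio_scsi_cmd_ftr {   /* device -> driver */
        u32 resid;
        u8 status;
        u8 response;
    };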

Stefan

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-18 14:05             ` Hannes Reinecke
  2011-04-18 15:27               ` Stefan Hajnoczi
@ 2011-04-18 16:09               ` Paolo Bonzini
  1 sibling, 0 replies; 12+ messages in thread
From: Paolo Bonzini @ 2011-04-18 16:09 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Stefan Hajnoczi, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/18/2011 04:05 PM, Hannes Reinecke wrote:
> My proposal would be to implement a full virtio-scsi _host_, and extend
> the proposal to be able to handle the transport layer too.

Yes, I added this independently between Friday and today, and it is why
I haven't sent the proposal yet.

> At the very least we would need to include a LUN address before the CDB,
> and define TMF command values for proper error recovery.

I haven't worked out TMFs yet, but I did add a LUN field.

> That way we could handle hotplug / -unplug via a simple host rescan

It's a bit more complicated because you also want guest-initiated 
unplug, and SAM transport reset events include more than hotplug/unplug.

>> Indeed, I couldn't find that limit in either SPC or SAM. It seems
>> like a pretty widespread assumption, though. Perhaps Nicholas or
>> Hannes knows where it comes from.
>>
> 96 bytes is a carry-over from parallel SCSI. We shouldn't rely
> on a fixed length here, but rather use an additional pointer/iovec and
> a length field.
>
> Check the SG_IO header for how it's done.

Will do.

Paolo

end of thread

Thread overview: 12+ messages
2011-04-15 13:42 [Qemu-devel] [RFC PATCH] implement vmware pvscsi device Paolo Bonzini
2011-04-15 14:01 ` Stefan Hajnoczi
2011-04-15 14:17   ` Paolo Bonzini
2011-04-15 14:28     ` Stefan Hajnoczi
2011-04-15 14:37       ` Paolo Bonzini
2011-04-15 15:04         ` Stefan Hajnoczi
2011-04-15 20:56           ` Paolo Bonzini
2011-04-18 14:05             ` Hannes Reinecke
2011-04-18 15:27               ` Stefan Hajnoczi
2011-04-18 16:09               ` Paolo Bonzini
2011-04-15 14:55     ` Hannes Reinecke
2011-04-15 14:59       ` Paolo Bonzini
