* [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
@ 2011-04-15 13:42 Paolo Bonzini
  2011-04-15 14:01 ` Stefan Hajnoczi
From: Paolo Bonzini @ 2011-04-15 13:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: Zachary Amsden

Lightly tested with Linux guests; at least it can successfully partition
and format a disk.  scsi-generic also lightly tested.

Doesn't do migration, doesn't do hotplug (the device would support that,
but it is not 100% documented and the Linux driver in particular cannot
initiate hot-unplug).  I did it as a quick one-day hack to study the SCSI
subsystem, and it is my first real foray into device model land, so please
be gentle. :)

vmw_pvscsi.h is taken from Linux, so it doesn't fully respect coding
standards.  I think that's fair.

Size is curiously close to the recently added sPAPR adapter:

  911  2354 25553 hw/vmw_pvscsi.c
  988  3177 29628 hw/spapr_vscsi.c

Sounds like that's just the amount of code it takes to implement a SCSI
HBA in QEMU. :)

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Zachary Amsden <zamsden@redhat.com>
---
 Makefile.objs           |    1 +
 default-configs/pci.mak |    1 +
 hw/pci.h                |    1 +
 hw/vmw_pvscsi.c         |  911 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/vmw_pvscsi.h         |  389 ++++++++++++++++++++
 trace-events            |   15 +
 6 files changed, 1318 insertions(+), 0 deletions(-)
 create mode 100644 hw/vmw_pvscsi.c
 create mode 100644 hw/vmw_pvscsi.h

diff --git a/Makefile.objs b/Makefile.objs
index 44ce368..f056502 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -255,6 +255,7 @@ hw-obj-$(CONFIG_AHCI) += ide/ich.o
 
 # SCSI layer
 hw-obj-$(CONFIG_LSI_SCSI_PCI) += lsi53c895a.o
+hw-obj-$(CONFIG_VMWARE_PVSCSI_PCI) += vmw_pvscsi.o
 hw-obj-$(CONFIG_ESP) += esp.o
 
 hw-obj-y += dma-helpers.o sysbus.o isa-bus.o
diff --git a/default-configs/pci.mak b/default-configs/pci.mak
index 0471efb..b1817f5 100644
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -8,6 +8,7 @@ CONFIG_EEPRO100_PCI=y
 CONFIG_PCNET_PCI=y
 CONFIG_PCNET_COMMON=y
 CONFIG_LSI_SCSI_PCI=y
+CONFIG_VMWARE_PVSCSI_PCI=y
 CONFIG_RTL8139_PCI=y
 CONFIG_E1000_PCI=y
 CONFIG_IDE_CORE=y
diff --git a/hw/pci.h b/hw/pci.h
index 52ee8c9..26ce6d7 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -59,6 +59,7 @@
 #define PCI_DEVICE_ID_VMWARE_NET         0x0720
 #define PCI_DEVICE_ID_VMWARE_SCSI        0x0730
 #define PCI_DEVICE_ID_VMWARE_IDE         0x1729
+#define PCI_DEVICE_ID_VMWARE_PVSCSI      0x07c0
 
 /* Intel (0x8086) */
 #define PCI_DEVICE_ID_INTEL_82551IT      0x1209
diff --git a/hw/vmw_pvscsi.c b/hw/vmw_pvscsi.c
new file mode 100644
index 0000000..fdda652
--- /dev/null
+++ b/hw/vmw_pvscsi.c
@@ -0,0 +1,911 @@
+/*
+ * VMware Paravirtualized SCSI Host Bus Adapter emulation
+ *
+ * Copyright (c) 2011 Red Hat, Inc.
+ * Written by Paolo Bonzini
+ *
+ * This code is licensed under GPLv2 or later.
+ */
+
+#include <assert.h>
+
+#include "hw.h"
+#include "pci.h"
+#include "scsi.h"
+#include "scsi-defs.h"
+#include "vmw_pvscsi.h"
+#include "block_int.h"
+#include "host-utils.h"
+#include "trace.h"
+
+#define PVSCSI_MAX_DEVS 127
+#define PAGE_SIZE       4096
+#define PAGE_SHIFT      12
+
+typedef struct PVSCSIRequest {
+    SCSIDevice *sdev;
+    uint8_t sensing;
+    uint8_t sense_key;
+    uint8_t completed;
+    int lun;
+    target_phys_addr_t sg_current_addr;
+    target_phys_addr_t sg_current_dataAddr;
+    uint32_t sg_current_resid;
+    uint64_t resid;
+    struct PVSCSIRingReqDesc req;
+    struct PVSCSIRingCmpDesc cmp;
+    QTAILQ_ENTRY(PVSCSIRequest) next;
+} PVSCSIRequest;
+
+typedef QTAILQ_HEAD(, PVSCSIRequest) PVSCSIRequestList;
+
+typedef struct {
+    PCIDevice dev;
+    SCSIBus bus;
+    QEMUBH *complete_reqs_bh;
+
+    int mmio_io_addr;
+
+    /* zeroed on reset */
+    uint32_t cmd_latch;
+    uint32_t cmd_buffer[sizeof(struct PVSCSICmdDescSetupRings)
+                        / sizeof(uint32_t)];
+    uint32_t cmd_ptr;
+    uint32_t cmd_status;
+    uint32_t intr_status;
+    uint32_t intr_mask;
+    uint32_t intr_cmpl;
+    uint32_t intr_msg;
+    struct PVSCSICmdDescSetupRings rings;
+    struct PVSCSICmdDescSetupMsgRing msgRing;
+    uint32_t reqNumEntriesLog2;
+    uint32_t cmpNumEntriesLog2;
+    uint32_t msgNumEntriesLog2;
+
+    PVSCSIRequestList pending_queue;
+    PVSCSIRequestList complete_queue;
+} PVSCSIState;
+
+\f
+static inline int pvscsi_get_lun(uint8_t *lun)
+{
+    uint64_t lunval;
+    lunval = ((uint64_t)lun[0] << 56) | ((uint64_t)lun[1] << 48) |
+             ((uint64_t)lun[2] << 40) | ((uint64_t)lun[3] << 32) |
+             ((uint64_t)lun[4] << 24) | ((uint64_t)lun[5] << 16) |
+             ((uint64_t)lun[6] <<  8) |  (uint64_t)lun[7];
+    if ((lunval & ~(uint64_t) 255) != 0) {
+        return -1;
+    }
+    return lunval & 255;
+}
+
+static inline int pvscsi_get_dev_lun(PVSCSIState *s,
+                                     uint8_t *lun, uint32_t target,
+                                     SCSIDevice **sdev)
+{
+    SCSIBus *bus = &s->bus;
+    int lunval;
+    *sdev = NULL;
+    if (target >= PVSCSI_MAX_DEVS) {
+        return -1;
+    }
+    lunval = pvscsi_get_lun(lun);
+    if (lunval < 0) {
+        return -1;
+    }
+    *sdev = bus->devs[target];
+    if (!*sdev) {
+        return -1;
+    }
+    return lunval;
+}
+
+\f
+/* Add a command to the pending queue.  */
+static PVSCSIRequest *pvscsi_queue_request(PVSCSIState *s,
+                                           struct PVSCSIRingReqDesc *req)
+{
+    SCSIDevice *sdev;
+    PVSCSIRequest *p;
+    int lun;
+
+    trace_pvscsi_queue_request(req->context, req->cdb[0], req->dataLen);
+
+    p = qemu_mallocz(sizeof(*p));
+    p->req = *req;
+    p->cmp.context = p->req.context;
+    QTAILQ_INSERT_TAIL(&s->pending_queue, p, next);
+
+    lun = pvscsi_get_dev_lun(s, req->lun, req->target, &sdev);
+    if (!sdev) {
+        return p;
+    }
+
+    p->lun = lun;
+    p->sdev = sdev;
+    return p;
+}
+
+/* Get PVSCSIRequest for this tag.  */
+static PVSCSIRequest *pvscsi_find_request(PVSCSIState *s, uint32_t tag)
+{
+    PVSCSIRequest *p;
+
+    QTAILQ_FOREACH(p, &s->pending_queue, next) {
+        if (p->req.context == tag) {
+            return p;
+        }
+    }
+    return NULL;
+}
+
+static void pvscsi_free_queue(PVSCSIRequestList *q)
+{
+    PVSCSIRequest *p;
+
+    while (!QTAILQ_EMPTY(q)) {
+        p = QTAILQ_FIRST(q);
+        QTAILQ_REMOVE(q, p, next);
+        qemu_free(p);
+    }
+}
+
+static void pvscsi_soft_reset(PVSCSIState *s)
+{
+    qbus_reset_all_fn(&s->bus);
+    pvscsi_free_queue(&s->complete_queue);
+    assert(QTAILQ_EMPTY(&s->pending_queue));
+    memset(&s->cmd_latch, 0, sizeof(*s) - offsetof(PVSCSIState, cmd_latch));
+    s->intr_cmpl = PVSCSI_INTR_CMPL_0;
+    s->intr_msg = PVSCSI_INTR_MSG_0;
+    QTAILQ_INIT(&s->pending_queue);
+    QTAILQ_INIT(&s->complete_queue);
+}
+
+\f
+static void pvscsi_raise_intr(PVSCSIState *s, int mask)
+{
+    int intr_raised = mask & ~s->intr_status;
+    s->intr_status |= mask;
+    trace_pvscsi_raise_intr(intr_raised,
+                            (intr_raised & s->intr_mask) == 0 ? "masked" : "");
+    if (intr_raised & s->intr_mask) {
+        qemu_set_irq(s->dev.irq[0], 1);
+    }
+}
+
+static void pvscsi_acknowledge_intr(PVSCSIState *s, int mask)
+{
+    trace_pvscsi_acknowledge_intr(mask);
+    s->intr_status &= ~mask;
+    if (mask == s->intr_cmpl) {
+        s->intr_cmpl ^= PVSCSI_INTR_CMPL_MASK;
+
+        /* Try putting more complete requests on the ring.  */
+        if (!QTAILQ_EMPTY(&s->complete_queue)) {
+            qemu_bh_schedule(s->complete_reqs_bh);
+        }
+    }
+    if (mask == s->intr_msg) {
+        s->intr_msg ^= PVSCSI_INTR_MSG_MASK;
+    }
+    if ((s->intr_status & s->intr_mask) == 0) {
+        qemu_set_irq(s->dev.irq[0], 0);
+    }
+}
+
+static void pvscsi_set_intr_mask(PVSCSIState *s, int mask)
+{
+    int intr_enabled = mask & ~s->intr_mask;
+    s->intr_mask = mask;
+    if (s->intr_status & intr_enabled) {
+        qemu_set_irq(s->dev.irq[0], 1);
+    }
+    if ((s->intr_status & mask) == 0) {
+        qemu_set_irq(s->dev.irq[0], 0);
+    }
+}
+
+\f
+#define pvscsi_ld_ring_state(s, field) \
+    ldl_phys(s->rings.ringsStatePPN * PAGE_SIZE + offsetof(struct PVSCSIRingsState, field))
+
+#define pvscsi_st_ring_state(s, field, val) \
+    stl_phys(s->rings.ringsStatePPN * PAGE_SIZE + offsetof(struct PVSCSIRingsState, field), \
+             val)
+
+/* Return number of free elements in the completion ring.  */
+static inline int pvscsi_cmp_free(PVSCSIState *s)
+{
+    return ((1 << s->cmpNumEntriesLog2) - 1 -
+            (pvscsi_ld_ring_state(s, cmpProdIdx) - pvscsi_ld_ring_state(s, cmpConsIdx)));
+}
+
+/* Return number of pending elements in the request ring.  */
+static inline int pvscsi_req_pending(PVSCSIState *s)
+{
+    return pvscsi_ld_ring_state(s, reqProdIdx) - pvscsi_ld_ring_state(s, reqConsIdx);
+}
+
+/* Return the physical address of the idx-th element in the ring
+ * whose physical page numbers are given by ppn.  Each element in
+ * the ring has size bytes.  */
+static target_phys_addr_t pvscsi_get_ring_addr(PVSCSIState *s, int idx,
+                                               int size, uint64_t *ppn)
+{
+    uint32_t ofs = idx * size;
+    return (ppn[ofs >> PAGE_SHIFT] * PAGE_SIZE) | (ofs & (PAGE_SIZE - 1));
+}
+\f
+
+#define barrier()
+
+/* Copy cmp_desc on the completion ring, assuming there is a free entry.  */
+static void pvscsi_cmp_ring_put(PVSCSIState *s,
+                                struct PVSCSIRingCmpDesc *cmp_desc)
+{
+    uint32_t cmp_entries = s->cmpNumEntriesLog2;
+    uint32_t val = pvscsi_ld_ring_state(s, cmpProdIdx);
+    uint32_t idx = val & MASK(cmp_entries);
+    target_phys_addr_t addr;
+
+    trace_pvscsi_cmp_ring_put(cmp_desc->context);
+    addr = pvscsi_get_ring_addr(s, idx, sizeof(struct PVSCSIRingCmpDesc),
+                                s->rings.cmpRingPPNs);
+
+    barrier();
+    cpu_physical_memory_write(addr, (void *)cmp_desc, sizeof(*cmp_desc));
+    barrier();
+    pvscsi_st_ring_state(s, cmpProdIdx, val + 1);
+}
+
+/* Put all completed requests on the completion ring.  */
+static void pvscsi_complete_reqs(void *opaque)
+{
+    PVSCSIState *s = opaque;
+    PVSCSIRequest *p;
+    int n = pvscsi_cmp_free(s);
+    int done = 0;
+    while (n > 0 && !QTAILQ_EMPTY(&s->complete_queue)) {
+        p = QTAILQ_FIRST(&s->complete_queue);
+        QTAILQ_REMOVE(&s->complete_queue, p, next);
+        pvscsi_cmp_ring_put(s, &p->cmp);
+        qemu_free(p);
+        n--;
+        done++;
+    }
+    if (done) {
+        pvscsi_raise_intr(s, s->intr_cmpl);
+    }
+}
+
+/* Prepare to put r on the completion ring.  */
+static void pvscsi_complete_req(PVSCSIState *s, PVSCSIRequest *p)
+{
+    assert(!p->completed);
+    trace_pvscsi_complete_req(p->cmp.context, p->cmp.dataLen, p->sense_key);
+    p->completed = 1;
+    QTAILQ_REMOVE(&s->pending_queue, p, next);
+    QTAILQ_INSERT_TAIL(&s->complete_queue, p, next);
+    qemu_bh_schedule(s->complete_reqs_bh);
+}
+
+/* Fetch sense data for a completed request.  */
+static bool pvscsi_send_request_sense(SCSIDevice *sdev, int tag, int lun)
+{
+    uint8_t cdb[6] = { REQUEST_SENSE, lun << 5, 0, 0, 96, 0 };
+    int n;
+
+    trace_pvscsi_request_sense(tag, lun);
+    n = sdev->info->send_command(sdev, tag, cdb, lun);
+    if (n < 0) {
+        /* should not happen, just leave sense data empty in this case. */
+        sdev->info->cancel_io(sdev, tag);
+    } else if (n > 0) {
+        sdev->info->read_data(sdev, tag);
+        return true;
+    }
+    return false;
+}
+
+/* Write sense data for a completed request.  */
+static void pvscsi_write_sense(PVSCSIRequest *p, uint8_t *buf, int len)
+{
+    p->cmp.senseLen = MIN(p->req.senseLen, len);
+    p->sense_key = buf[2];
+    cpu_physical_memory_write(p->req.senseAddr, buf, p->cmp.senseLen);
+}
+
+static void pvscsi_transfer_data_with_buffer(PVSCSIRequest *p, bool to_host,
+                                             uint8_t *buf, int len)
+{
+    if (len) {
+        cpu_physical_memory_rw(p->req.dataAddr, buf, len, to_host);
+        p->cmp.dataLen += len;
+        p->req.dataAddr += len;
+        p->resid -= len;
+    }
+}
+
+static void pvscsi_get_next_sg_elem(struct PVSCSIRequest *p)
+{
+    struct PVSCSISGElement elem;
+
+    for (;; p->sg_current_addr = elem.addr) {
+        cpu_physical_memory_read(p->sg_current_addr, (void *)&elem,
+                                 sizeof(elem));
+#if 0
+        /* PVSCSI_SGE_FLAG_CHAIN_ELEMENT not in the header file! */
+        if ((elem.flags & PVSCSI_SGE_FLAG_CHAIN_ELEMENT) == 0) {
+            break;
+        }
+#else
+        break;
+#endif
+    }
+
+    p->sg_current_addr += sizeof(elem);
+    p->sg_current_dataAddr = elem.addr;
+    p->sg_current_resid = elem.length;
+    trace_pvscsi_sg_elem(p->req.context, elem.addr, elem.length);
+}
+
+static void pvscsi_transfer_data_with_sg_list(PVSCSIRequest *p, bool to_host,
+                                              uint8_t *buf, int len)
+{
+    int n;
+    while (len) {
+        while (!p->sg_current_resid) {
+            pvscsi_get_next_sg_elem(p);
+        }
+        assert(len > 0);
+        n = MIN((unsigned) len, p->sg_current_resid);
+        if (n) {
+            cpu_physical_memory_rw(p->sg_current_dataAddr, buf, n, to_host);
+        }
+
+        buf += n;
+        p->cmp.dataLen += n;
+        p->sg_current_dataAddr += n;
+
+        len -= n;
+        p->resid -= n;
+        p->sg_current_resid -= n;
+    }
+}
+
+static bool pvscsi_transfer_data(PVSCSIRequest *p, void *buf, int len)
+{
+    int to_host = (p->req.flags & PVSCSI_FLAG_CMD_DIR_TOHOST) != 0;
+    if (len > p->resid) {
+        /* Do nothing upon underrun.  */
+        return false;
+    }
+
+    trace_pvscsi_transfer_data(p->req.context, len);
+    if (p->req.flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+        pvscsi_transfer_data_with_sg_list(p, to_host, buf, len);
+    } else {
+        pvscsi_transfer_data_with_buffer(p, to_host, buf, len);
+    }
+    return true;
+}
+
+static void pvscsi_kick_device(PVSCSIRequest *p)
+{
+    if (p->req.flags & PVSCSI_FLAG_CMD_DIR_TODEVICE) {
+        p->sdev->info->write_data(p->sdev, p->req.context);
+    } else {
+        p->sdev->info->read_data(p->sdev, p->req.context);
+    }
+}
+
+/* Callback to indicate that the SCSI layer has completed a transfer.  */
+static void pvscsi_command_complete(SCSIBus *bus, int reason, uint32_t tag,
+                                    uint32_t arg)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, bus->qbus.parent);
+    PVSCSIRequest *p = pvscsi_find_request(s, tag);
+    SCSIDevice *sdev;
+    uint8_t *buf;
+
+    if (!p) {
+        fprintf(stderr, "PVSCSI: Can't find request for tag 0x%x\n", tag);
+        return;
+    }
+
+    sdev = p->sdev;
+    if (reason == SCSI_REASON_DATA && !p->sensing) {
+        assert(p->resid);
+        if (!arg) {
+            /* Short transfer.  */
+            sdev->info->cancel_io(sdev, tag);
+            p->cmp.hostStatus = BTSTAT_DATARUN;
+            p->cmp.scsiStatus = CHECK_CONDITION;
+            goto complete;
+        }
+
+        buf = sdev->info->get_buf(sdev, tag);
+        if (!pvscsi_transfer_data(p, buf, arg)) {
+            /* Small buffer.  */
+            sdev->info->cancel_io(sdev, tag);
+            p->cmp.hostStatus = BTSTAT_DATARUN;
+            p->cmp.scsiStatus = CHECK_CONDITION;
+            goto complete;
+        }
+
+        pvscsi_kick_device(p);
+
+        /* We'll be called back asynchronously, exit.  */
+        return;
+    }
+
+    /* Here to complete the request.  */
+    if (reason == SCSI_REASON_DONE) {
+        p->cmp.scsiStatus = arg;
+    }
+
+complete:
+    if (p->sensing == 0 && p->cmp.scsiStatus == CHECK_CONDITION) {
+        p->sensing = 1;
+        if (pvscsi_send_request_sense(sdev, tag, p->lun)) {
+            return;
+        }
+
+    } else if (p->sensing == 1 && reason == SCSI_REASON_DATA) {
+        /* Got sense data.  Write it back and kick the device to complete
+         * the request.  */
+        if (arg) {
+            buf = sdev->info->get_buf(sdev, tag);
+            pvscsi_write_sense(p, buf, arg);
+            if (buf[2] == NO_SENSE) {
+                p->cmp.scsiStatus = GOOD;
+            }
+        }
+        p->sensing = 2;
+        pvscsi_kick_device(p);
+        return;
+    }
+
+    pvscsi_complete_req(s, p);
+}
+\f
+
+/* Process a request from the request ring.  */
+static void pvscsi_process_req(PVSCSIState *s, struct PVSCSIRingReqDesc *r)
+{
+    PVSCSIRequest *p = pvscsi_queue_request(s, r);
+    int64_t datalen, n;
+
+    if (!p->sdev) {
+        p->cmp.hostStatus = BTSTAT_SELTIMEO;
+        goto fail_nocancel;
+    }
+
+    if (r->flags & PVSCSI_FLAG_CMD_WITH_SG_LIST) {
+        p->sg_current_addr = r->dataAddr;
+    }
+
+    n = p->sdev->info->send_command(p->sdev, r->context, r->cdb, p->lun);
+    if ((n > 0) && (r->flags & PVSCSI_FLAG_CMD_DIR_TODEVICE)) {
+        p->cmp.hostStatus = BTSTAT_BADMSG;
+        goto fail;
+    }
+    if ((n < 0) && (r->flags & PVSCSI_FLAG_CMD_DIR_TOHOST)) {
+        p->cmp.hostStatus = BTSTAT_BADMSG;
+        goto fail;
+    }
+
+    datalen = (n < 0 ? -n : n);
+    p->resid = MIN(datalen, r->dataLen);
+    if (n) {
+        pvscsi_kick_device(p);
+    }
+    return;
+
+fail:
+    p->sdev->info->cancel_io(p->sdev, r->context);
+fail_nocancel:
+    pvscsi_complete_req(s, p);
+}
+
+/* Process pending requests on the request ring.  */
+static void pvscsi_process_req_ring(PVSCSIState *s)
+{
+    uint32_t req_entries = s->reqNumEntriesLog2;
+
+    trace_pvscsi_kick_io();
+    while (pvscsi_req_pending(s)) {
+        uint32_t val = pvscsi_ld_ring_state(s, reqConsIdx);
+        uint32_t idx = val & MASK(req_entries);
+        target_phys_addr_t addr;
+        struct PVSCSIRingReqDesc req_desc;
+
+        addr = pvscsi_get_ring_addr(s, idx, sizeof(struct PVSCSIRingReqDesc),
+                                    s->rings.reqRingPPNs);
+
+        barrier();
+        cpu_physical_memory_read(addr, (void *)&req_desc, sizeof(req_desc));
+        pvscsi_process_req(s, &req_desc);
+        barrier();
+        pvscsi_st_ring_state(s, reqConsIdx, val + 1);
+    }
+}
+
+\f
+static int32_t pvscsi_cmd_bad(PVSCSIState *s)
+{
+    fprintf(stderr, "vmw_pvscsi: bad command %d\n", s->cmd_latch);
+    return -1;
+}
+
+static int32_t pvscsi_cmd_unimpl(PVSCSIState *s)
+{
+    fprintf(stderr, "vmw_pvscsi: unimplemented command %d\n", s->cmd_latch);
+    return -1;
+}
+
+static int32_t pvscsi_cmd_adapter_reset(PVSCSIState *s)
+{
+    pvscsi_soft_reset(s);
+    return 0;
+}
+
+static int floor_log2(int x)
+{
+    assert(x);
+    return 31 - clz32(x);
+}
+
+/* Setup ring buffers and initialize the ring state page.  */
+static int32_t pvscsi_cmd_setup_rings(PVSCSIState *s)
+{
+    memcpy(&s->rings, s->cmd_buffer, sizeof(s->rings));
+    if (s->rings.reqRingNumPages == 0 ||
+        s->rings.cmpRingNumPages == 0) {
+        return -1;
+    }
+
+    s->reqNumEntriesLog2 = floor_log2(s->rings.reqRingNumPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingReqDesc));
+    s->cmpNumEntriesLog2 = floor_log2(s->rings.cmpRingNumPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingCmpDesc));
+
+    trace_pvscsi_setup_req_ring(s->rings.reqRingNumPages,
+                                1 << s->reqNumEntriesLog2);
+    trace_pvscsi_setup_cmp_ring(s->rings.cmpRingNumPages,
+                                1 << s->cmpNumEntriesLog2);
+
+    pvscsi_st_ring_state(s, reqNumEntriesLog2, s->reqNumEntriesLog2);
+    pvscsi_st_ring_state(s, cmpNumEntriesLog2, s->cmpNumEntriesLog2);
+    pvscsi_st_ring_state(s, cmpProdIdx, 0);
+    pvscsi_st_ring_state(s, cmpConsIdx, 0);
+    pvscsi_st_ring_state(s, reqProdIdx, 0);
+    pvscsi_st_ring_state(s, reqConsIdx, 0);
+    return 0;
+}
+
+static int32_t pvscsi_cmd_reset_bus(PVSCSIState *s)
+{
+    qbus_reset_all_fn(&s->bus);
+    return 0;
+}
+
+static int32_t pvscsi_cmd_reset_device(PVSCSIState *s)
+{
+    struct PVSCSICmdDescResetDevice *cmd =
+        (struct PVSCSICmdDescResetDevice *) &s->cmd_buffer;
+    SCSIDevice *sdev;
+
+    pvscsi_get_dev_lun(s, cmd->lun, cmd->target, &sdev);
+    if (sdev != NULL && sdev->info->qdev.reset) {
+        sdev->info->qdev.reset(&sdev->qdev);
+    }
+
+    return 0;
+}
+
+static int32_t pvscsi_cmd_abort_cmd(PVSCSIState *s)
+{
+    return 0;
+}
+
+static int32_t pvscsi_cmd_setup_msg_ring(PVSCSIState *s)
+{
+    memcpy(&s->msgRing, s->cmd_buffer, sizeof(s->msgRing));
+    if (s->msgRing.numPages == 0) {
+        return -1;
+    }
+
+    s->msgNumEntriesLog2 = floor_log2(s->msgRing.numPages * PAGE_SIZE
+                                      / sizeof(struct PVSCSIRingMsgDesc));
+
+    trace_pvscsi_setup_msg_ring(s->msgRing.numPages,
+                                1 << s->msgNumEntriesLog2);
+
+    pvscsi_st_ring_state(s, msgNumEntriesLog2, s->msgNumEntriesLog2);
+    pvscsi_st_ring_state(s, msgProdIdx, 0);
+    pvscsi_st_ring_state(s, msgConsIdx, 0);
+    return 0;
+}
+
+typedef struct {
+    int nargs;
+    int32_t (*fn)(PVSCSIState *);
+} PVSCSICmd;
+
+static const PVSCSICmd pvscsi_commands[PVSCSI_CMD_LAST] = {
+    [PVSCSI_CMD_FIRST] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_bad,
+    },
+    [PVSCSI_CMD_ADAPTER_RESET] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_adapter_reset
+    },
+    [PVSCSI_CMD_ISSUE_SCSI] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    },
+    [PVSCSI_CMD_SETUP_RINGS] = {
+        .nargs = sizeof(struct PVSCSICmdDescSetupRings) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_setup_rings
+    },
+    [PVSCSI_CMD_RESET_BUS] = {
+        .nargs = 0,
+        .fn = pvscsi_cmd_reset_bus
+    },
+    [PVSCSI_CMD_RESET_DEVICE] = {
+        .nargs = sizeof(struct PVSCSICmdDescResetDevice) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_reset_device
+    },
+    [PVSCSI_CMD_ABORT_CMD] = {
+        .nargs = sizeof(struct PVSCSICmdDescAbortCmd) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_abort_cmd
+    },
+    [PVSCSI_CMD_CONFIG] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    },
+    [PVSCSI_CMD_SETUP_MSG_RING] = {
+        .nargs = sizeof(struct PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t),
+        .fn = pvscsi_cmd_setup_msg_ring
+    },
+    [PVSCSI_CMD_DEVICE_UNPLUG] = {
+        .nargs = 0, /* unknown */
+        .fn = pvscsi_cmd_unimpl
+    }
+};
+
+\f
+static void pvscsi_maybe_do_cmd(PVSCSIState *s)
+{
+    int cmd = s->cmd_latch >= PVSCSI_CMD_LAST ? PVSCSI_CMD_FIRST : s->cmd_latch;
+    const PVSCSICmd *cmd_info = &pvscsi_commands[cmd];
+
+    if (s->cmd_ptr >= cmd_info->nargs) {
+        s->cmd_status = cmd_info->fn(s);
+        s->cmd_latch = 0;
+        s->cmd_ptr = 0;
+    }
+}
+
+static uint32_t pvscsi_reg_readl(PVSCSIState *s, int offset)
+{
+    switch (offset) {
+    case PVSCSI_REG_OFFSET_COMMAND:
+    case PVSCSI_REG_OFFSET_COMMAND_DATA:
+    case PVSCSI_REG_OFFSET_KICK_NON_RW_IO:
+    case PVSCSI_REG_OFFSET_KICK_RW_IO:
+        fprintf(stderr, "vmw_pvscsi: read to write-only register %x\n", offset);
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        return s->cmd_status;
+    case PVSCSI_REG_OFFSET_INTR_STATUS:
+        return s->intr_status;
+    case PVSCSI_REG_OFFSET_INTR_MASK:
+        return s->intr_mask;
+    case PVSCSI_REG_OFFSET_LAST_STS_0:
+    case PVSCSI_REG_OFFSET_LAST_STS_1:
+    case PVSCSI_REG_OFFSET_LAST_STS_2:
+    case PVSCSI_REG_OFFSET_LAST_STS_3:
+    case PVSCSI_REG_OFFSET_DEBUG:
+        fprintf(stderr, "vmw_pvscsi: read from unsupported register %x\n", offset);
+        break;
+    default:
+        break;
+    }
+    return 0;
+}
+
+static void pvscsi_reg_write(PVSCSIState *s, int offset, uint32_t val, int size)
+{
+    if (size != 4) {
+        switch (offset) {
+        case PVSCSI_REG_OFFSET_COMMAND:
+        case PVSCSI_REG_OFFSET_COMMAND_DATA:
+        case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        case PVSCSI_REG_OFFSET_INTR_STATUS:
+        case PVSCSI_REG_OFFSET_INTR_MASK:
+            abort();
+        default:
+            break;
+        }
+    }
+
+    switch (offset) {
+    case PVSCSI_REG_OFFSET_COMMAND:
+        trace_pvscsi_cmd(val);
+        s->cmd_latch = val;
+        s->cmd_ptr = 0;
+        pvscsi_maybe_do_cmd(s);
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND_DATA:
+        s->cmd_buffer[s->cmd_ptr++] = val;
+        pvscsi_maybe_do_cmd(s);
+        break;
+    case PVSCSI_REG_OFFSET_COMMAND_STATUS:
+        fprintf(stderr, "vmw_pvscsi: write to read-only register %x\n", offset);
+        break;
+    case PVSCSI_REG_OFFSET_INTR_STATUS:
+        pvscsi_acknowledge_intr(s, val);
+        break;
+    case PVSCSI_REG_OFFSET_INTR_MASK:
+        pvscsi_set_intr_mask(s, val);
+        break;
+    case PVSCSI_REG_OFFSET_KICK_NON_RW_IO:
+    case PVSCSI_REG_OFFSET_KICK_RW_IO:
+        pvscsi_process_req_ring(s);
+        break;
+
+    case PVSCSI_REG_OFFSET_LAST_STS_0:
+    case PVSCSI_REG_OFFSET_LAST_STS_1:
+    case PVSCSI_REG_OFFSET_LAST_STS_2:
+    case PVSCSI_REG_OFFSET_LAST_STS_3:
+    case PVSCSI_REG_OFFSET_DEBUG:
+        fprintf(stderr, "vmw_pvscsi: write to unsupported register %x\n", offset);
+        break;
+    default:
+        break;
+    }
+}
+
+static void pvscsi_mmio_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    PVSCSIState *s = opaque;
+
+    addr &= PVSCSI_MEM_SPACE_SIZE - 1;
+    pvscsi_reg_write(s, addr, val, 1);
+}
+
+static void pvscsi_mmio_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    PVSCSIState *s = opaque;
+
+    addr &= PVSCSI_MEM_SPACE_SIZE - 1;
+    pvscsi_reg_write(s, addr, val, 2);
+}
+
+static void pvscsi_mmio_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    PVSCSIState *s = opaque;
+
+    addr &= PVSCSI_MEM_SPACE_SIZE - 1;
+    pvscsi_reg_write(s, addr, val, 4);
+}
+
+static uint32_t pvscsi_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    abort();
+}
+
+static uint32_t pvscsi_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    abort();
+}
+
+static uint32_t pvscsi_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    PVSCSIState *s = opaque;
+
+    addr &= PVSCSI_MEM_SPACE_SIZE - 1;
+    return pvscsi_reg_readl(s, addr);
+}
+
+static CPUReadMemoryFunc * const pvscsi_mmio_readfn[3] = {
+    pvscsi_mmio_readb,
+    pvscsi_mmio_readw,
+    pvscsi_mmio_readl,
+};
+
+static CPUWriteMemoryFunc * const pvscsi_mmio_writefn[3] = {
+    pvscsi_mmio_writeb,
+    pvscsi_mmio_writew,
+    pvscsi_mmio_writel,
+};
+
+static void pvscsi_mmio_mapfunc(PCIDevice *pci_dev, int region_num,
+                             pcibus_t addr, pcibus_t size, int type)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev, pci_dev);
+
+    cpu_register_physical_memory(addr, PVSCSI_MEM_SPACE_SIZE, s->mmio_io_addr);
+}
+
+static void pvscsi_reset(DeviceState *dev)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev.qdev, dev);
+
+    pvscsi_soft_reset(s);
+}
+
+static int pvscsi_uninit(PCIDevice *d)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev, d);
+
+    cpu_unregister_io_memory(s->mmio_io_addr);
+
+    return 0;
+}
+
+static int pvscsi_init(PCIDevice *dev)
+{
+    PVSCSIState *s = DO_UPCAST(PVSCSIState, dev, dev);
+    uint8_t *pci_conf;
+
+    pci_conf = s->dev.config;
+
+    pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_VMWARE);
+    pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_VMWARE_PVSCSI);
+    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_SCSI);
+
+    /* PCI subsystem ID */
+    pci_conf[PCI_SUBSYSTEM_ID] = 0x00;
+    pci_conf[PCI_SUBSYSTEM_ID + 1] = 0x10;
+
+    /* PCI latency timer = 255 */
+    pci_conf[PCI_LATENCY_TIMER] = 0xff;
+
+    /* Interrupt pin 1 */
+    pci_conf[PCI_INTERRUPT_PIN] = 0x01;
+
+    s->mmio_io_addr = cpu_register_io_memory(pvscsi_mmio_readfn,
+                                             pvscsi_mmio_writefn, s,
+                                             DEVICE_NATIVE_ENDIAN);
+    pci_register_bar(&s->dev, 0, PVSCSI_MEM_SPACE_SIZE,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, pvscsi_mmio_mapfunc);
+
+#if 0
+    s->pio_io_addr = cpu_register_io_memory(pvscsi_mmio_readfn,
+                                             pvscsi_mmio_writefn, s,
+                                             DEVICE_NATIVE_ENDIAN);
+    pci_register_bar(&s->dev, 1, 256,
+                           PCI_BASE_ADDRESS_SPACE_IO, pvscsi_io_mapfunc);
+#endif
+
+    s->complete_reqs_bh = qemu_bh_new(pvscsi_complete_reqs, s);
+
+    scsi_bus_new(&s->bus, &dev->qdev, 1, PVSCSI_MAX_DEVS,
+                 pvscsi_command_complete);
+    if (!dev->qdev.hotplugged) {
+        return scsi_bus_legacy_handle_cmdline(&s->bus);
+    }
+    return 0;
+}
+
+static PCIDeviceInfo pvscsi_info = {
+    .qdev.name  = "vmw_pvscsi",
+    .qdev.size  = sizeof(PVSCSIState),
+    .qdev.reset = pvscsi_reset,
+    .init       = pvscsi_init,
+    .exit       = pvscsi_uninit,
+};
+
+static void vmw_pvscsi_register_devices(void)
+{
+    pci_qdev_register(&pvscsi_info);
+}
+
+device_init(vmw_pvscsi_register_devices);
diff --git a/hw/vmw_pvscsi.h b/hw/vmw_pvscsi.h
new file mode 100644
index 0000000..b7fa3f6
--- /dev/null
+++ b/hw/vmw_pvscsi.h
@@ -0,0 +1,389 @@
+/*
+ * VMware PVSCSI header file
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _VMW_PVSCSI_H_
+#define _VMW_PVSCSI_H_
+
+#define PVSCSI_MAX_NUM_SG_ENTRIES_PER_SEGMENT 128
+
+#define MASK(n)        ((1 << (n)) - 1)        /* make an n-bit mask */
+
+#define __packed __attribute__((packed))
+
+/*
+ * host adapter status/error codes
+ */
+enum HostBusAdapterStatus {
+   BTSTAT_SUCCESS       = 0x00,  /* CCB complete normally with no errors */
+   BTSTAT_LINKED_COMMAND_COMPLETED           = 0x0a,
+   BTSTAT_LINKED_COMMAND_COMPLETED_WITH_FLAG = 0x0b,
+   BTSTAT_DATA_UNDERRUN = 0x0c,
+   BTSTAT_SELTIMEO      = 0x11,  /* SCSI selection timeout */
+   BTSTAT_DATARUN       = 0x12,  /* data overrun/underrun */
+   BTSTAT_BUSFREE       = 0x13,  /* unexpected bus free */
+   BTSTAT_INVPHASE      = 0x14,  /* invalid bus phase or sequence requested by target */
+   BTSTAT_LUNMISMATCH   = 0x17,  /* linked CCB has different LUN from first CCB */
+   BTSTAT_SENSFAILED    = 0x1b,  /* auto request sense failed */
+   BTSTAT_TAGREJECT     = 0x1c,  /* SCSI II tagged queueing message rejected by target */
+   BTSTAT_BADMSG        = 0x1d,  /* unsupported message received by the host adapter */
+   BTSTAT_HAHARDWARE    = 0x20,  /* host adapter hardware failed */
+   BTSTAT_NORESPONSE    = 0x21,  /* target did not respond to SCSI ATN, sent a SCSI RST */
+   BTSTAT_SENTRST       = 0x22,  /* host adapter asserted a SCSI RST */
+   BTSTAT_RECVRST       = 0x23,  /* other SCSI devices asserted a SCSI RST */
+   BTSTAT_DISCONNECT    = 0x24,  /* target device reconnected improperly (w/o tag) */
+   BTSTAT_BUSRESET      = 0x25,  /* host adapter issued BUS device reset */
+   BTSTAT_ABORTQUEUE    = 0x26,  /* abort queue generated */
+   BTSTAT_HASOFTWARE    = 0x27,  /* host adapter software error */
+   BTSTAT_HATIMEOUT     = 0x30,  /* host adapter hardware timeout error */
+   BTSTAT_SCSIPARITY    = 0x34,  /* SCSI parity error detected */
+};
+
+/*
+ * Register offsets.
+ *
+ * These registers are accessible both via i/o space and mm i/o.
+ */
+
+enum PVSCSIRegOffset {
+	PVSCSI_REG_OFFSET_COMMAND        =    0x0,
+	PVSCSI_REG_OFFSET_COMMAND_DATA   =    0x4,
+	PVSCSI_REG_OFFSET_COMMAND_STATUS =    0x8,
+	PVSCSI_REG_OFFSET_LAST_STS_0     =  0x100,
+	PVSCSI_REG_OFFSET_LAST_STS_1     =  0x104,
+	PVSCSI_REG_OFFSET_LAST_STS_2     =  0x108,
+	PVSCSI_REG_OFFSET_LAST_STS_3     =  0x10c,
+	PVSCSI_REG_OFFSET_INTR_STATUS    = 0x100c,
+	PVSCSI_REG_OFFSET_INTR_MASK      = 0x2010,
+	PVSCSI_REG_OFFSET_KICK_NON_RW_IO = 0x3014,
+	PVSCSI_REG_OFFSET_DEBUG          = 0x3018,
+	PVSCSI_REG_OFFSET_KICK_RW_IO     = 0x4018,
+};
+
+/*
+ * Virtual h/w commands.
+ */
+
+enum PVSCSICommands {
+	PVSCSI_CMD_FIRST             = 0, /* has to be first */
+
+	PVSCSI_CMD_ADAPTER_RESET     = 1,
+	PVSCSI_CMD_ISSUE_SCSI        = 2,
+	PVSCSI_CMD_SETUP_RINGS       = 3,
+	PVSCSI_CMD_RESET_BUS         = 4,
+	PVSCSI_CMD_RESET_DEVICE      = 5,
+	PVSCSI_CMD_ABORT_CMD         = 6,
+	PVSCSI_CMD_CONFIG            = 7,
+	PVSCSI_CMD_SETUP_MSG_RING    = 8,
+	PVSCSI_CMD_DEVICE_UNPLUG     = 9,
+
+	PVSCSI_CMD_LAST              = 10  /* has to be last */
+};
+
+/*
+ * Command descriptor for PVSCSI_CMD_RESET_DEVICE --
+ */
+
+struct PVSCSICmdDescResetDevice {
+	uint32_t	target;
+	uint8_t		lun[8];
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_ABORT_CMD --
+ *
+ * - currently does not support specifying the LUN.
+ * - _pad should be 0.
+ */
+
+struct PVSCSICmdDescAbortCmd {
+	uint64_t	context;
+	uint32_t	target;
+	uint32_t	_pad;
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_SETUP_RINGS --
+ *
+ * Notes:
+ * - reqRingNumPages and cmpRingNumPages need to be power of two.
+ * - reqRingNumPages and cmpRingNumPages need to be different from 0,
+ * - reqRingNumPages and cmpRingNumPages need to be inferior to
+ *   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES.
+ */
+
+#define PVSCSI_SETUP_RINGS_MAX_NUM_PAGES        32
+struct PVSCSICmdDescSetupRings {
+	uint32_t	reqRingNumPages;
+	uint32_t	cmpRingNumPages;
+	uint64_t	ringsStatePPN;
+	uint64_t	reqRingPPNs[PVSCSI_SETUP_RINGS_MAX_NUM_PAGES];
+	uint64_t	cmpRingPPNs[PVSCSI_SETUP_RINGS_MAX_NUM_PAGES];
+} __packed;
+
+/*
+ * Command descriptor for PVSCSI_CMD_SETUP_MSG_RING --
+ *
+ * Notes:
+ * - this command was not supported in the initial revision of the h/w
+ *   interface. Before using it, you need to check that it is supported by
+ *   writing PVSCSI_CMD_SETUP_MSG_RING to the 'command' register, then
+ *   immediately after read the 'command status' register:
+ *       * a value of -1 means that the cmd is NOT supported,
+ *       * a value != -1 means that the cmd IS supported.
+ *   If it's supported the 'command status' register should return:
+ *      sizeof(PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t).
+ * - this command should be issued _after_ the usual SETUP_RINGS so that the
+ *   RingsState page is already setup. If not, the command is a nop.
+ * - numPages needs to be a power of two,
+ * - numPages needs to be different from 0,
+ * - _pad should be zero.
+ */
+
+#define PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES  16
+
+struct PVSCSICmdDescSetupMsgRing {
+	uint32_t	numPages;
+	uint32_t	_pad;
+	uint64_t	ringPPNs[PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES];
+} __packed;
+
+enum PVSCSIMsgType {
+	PVSCSI_MSG_DEV_ADDED          = 0,
+	PVSCSI_MSG_DEV_REMOVED        = 1,
+	PVSCSI_MSG_LAST               = 2,
+};
+
+/*
+ * Msg descriptor.
+ *
+ * sizeof(struct PVSCSIRingMsgDesc) == 128.
+ *
+ * - type is of type enum PVSCSIMsgType.
+ * - the content of args depend on the type of event being delivered.
+ */
+
+struct PVSCSIRingMsgDesc {
+	uint32_t	type;
+	uint32_t	args[31];
+} __packed;
+
+struct PVSCSIMsgDescDevStatusChanged {
+	uint32_t	type;  /* PVSCSI_MSG_DEV _ADDED / _REMOVED */
+	uint32_t	bus;
+	uint32_t	target;
+	uint8_t		lun[8];
+	uint32_t	pad[27];
+} __packed;
+
+/*
+ * Rings state.
+ *
+ * - the fields:
+ *    . msgProdIdx,
+ *    . msgConsIdx,
+ *    . msgNumEntriesLog2,
+ *   .. are only used once the SETUP_MSG_RING cmd has been issued.
+ * - '_pad' helps to ensure that the msg related fields are on their own
+ *   cache-line.
+ */
+
+struct PVSCSIRingsState {
+	uint32_t	reqProdIdx;
+	uint32_t	reqConsIdx;
+	uint32_t	reqNumEntriesLog2;
+
+	uint32_t	cmpProdIdx;
+	uint32_t	cmpConsIdx;
+	uint32_t	cmpNumEntriesLog2;
+
+	uint8_t		_pad[104];
+
+	uint32_t	msgProdIdx;
+	uint32_t	msgConsIdx;
+	uint32_t	msgNumEntriesLog2;
+} __packed;
+
+/*
+ * Request descriptor.
+ *
+ * sizeof(RingReqDesc) = 128
+ *
+ * - context: is a unique identifier of a command. It could normally be any
+ *   64bit value, however we currently store it in the serialNumber variable
+ *   of struct SCSI_Command, so we have the following restrictions due to the
+ *   way this field is handled in the vmkernel storage stack:
+ *    * this value can't be 0,
+ *    * the upper 32bit need to be 0 since serialNumber is as a uint32_t.
+ *   Currently tracked as PR 292060.
+ * - dataLen: contains the total number of bytes that need to be transferred.
+ * - dataAddr:
+ *   * if PVSCSI_FLAG_CMD_WITH_SG_LIST is set: dataAddr is the PA of the first
+ *     s/g table segment, each s/g segment is entirely contained on a single
+ *     page of physical memory,
+ *   * if PVSCSI_FLAG_CMD_WITH_SG_LIST is NOT set, then dataAddr is the PA of
+ *     the buffer used for the DMA transfer,
+ * - flags:
+ *   * PVSCSI_FLAG_CMD_WITH_SG_LIST: see dataAddr above,
+ *   * PVSCSI_FLAG_CMD_DIR_NONE: no DMA involved,
+ *   * PVSCSI_FLAG_CMD_DIR_TOHOST: transfer from device to main memory,
+ *   * PVSCSI_FLAG_CMD_DIR_TODEVICE: transfer from main memory to device,
+ *   * PVSCSI_FLAG_CMD_OUT_OF_BAND_CDB: reserved to handle CDBs larger than
+ *     16bytes. To be specified.
+ * - vcpuHint: vcpuId of the processor that will be most likely waiting for the
+ *   completion of the i/o. For guest OSes that use lowest priority message
+ *   delivery mode (such as windows), we use this "hint" to deliver the
+ *   completion action to the proper vcpu. For now, we can use the vcpuId of
+ *   the processor that initiated the i/o as a likely candidate for the vcpu
+ *   that will be waiting for the completion..
+ * - bus should be 0: we currently only support bus 0 for now.
+ * - unused should be zero'd.
+ */
+
+#define PVSCSI_FLAG_CMD_WITH_SG_LIST        (1 << 0)
+#define PVSCSI_FLAG_CMD_OUT_OF_BAND_CDB     (1 << 1)
+#define PVSCSI_FLAG_CMD_DIR_NONE            (1 << 2)
+#define PVSCSI_FLAG_CMD_DIR_TOHOST          (1 << 3)
+#define PVSCSI_FLAG_CMD_DIR_TODEVICE        (1 << 4)
+
+struct PVSCSIRingReqDesc {
+	uint64_t	context;
+	uint64_t	dataAddr;
+	uint64_t	dataLen;
+	uint64_t	senseAddr;
+	uint32_t	senseLen;
+	uint32_t	flags;
+	uint8_t		cdb[16];
+	uint8_t		cdbLen;
+	uint8_t		lun[8];
+	uint8_t		tag;
+	uint8_t		bus;
+	uint8_t		target;
+	uint8_t		vcpuHint;
+	uint8_t		unused[59];
+} __packed;
+
+/*
+ * Scatter-gather list management.
+ *
+ * As described above, when PVSCSI_FLAG_CMD_WITH_SG_LIST is set in the
+ * RingReqDesc.flags, then RingReqDesc.dataAddr is the PA of the first s/g
+ * table segment.
+ *
+ * - each segment of the s/g table contain a succession of struct
+ *   PVSCSISGElement.
+ * - each segment is entirely contained on a single physical page of memory.
+ * - a "chain" s/g element has the flag PVSCSI_SGE_FLAG_CHAIN_ELEMENT set in
+ *   PVSCSISGElement.flags and in this case:
+ *     * addr is the PA of the next s/g segment,
+ *     * length is undefined, assumed to be 0.
+ */
+
+struct PVSCSISGElement {
+	uint64_t	addr;
+	uint32_t	length;
+	uint32_t	flags;
+} __packed;
+
+/*
+ * Completion descriptor.
+ *
+ * sizeof(RingCmpDesc) = 32
+ *
+ * - context: identifier of the command. The same thing that was specified
+ *   under "context" as part of struct RingReqDesc at initiation time,
+ * - dataLen: number of bytes transferred for the actual i/o operation,
+ * - senseLen: number of bytes written into the sense buffer,
+ * - hostStatus: adapter status,
+ * - scsiStatus: device status,
+ * - _pad should be zero.
+ */
+
+struct PVSCSIRingCmpDesc {
+	uint64_t	context;
+	uint64_t	dataLen;
+	uint32_t	senseLen;
+	uint16_t	hostStatus;
+	uint16_t	scsiStatus;
+	uint32_t	_pad[2];
+} __packed;
+
+/*
+ * Interrupt status / IRQ bits.
+ */
+
+#define PVSCSI_INTR_CMPL_0                 (1 << 0)
+#define PVSCSI_INTR_CMPL_1                 (1 << 1)
+#define PVSCSI_INTR_CMPL_MASK              MASK(2)
+
+#define PVSCSI_INTR_MSG_0                  (1 << 2)
+#define PVSCSI_INTR_MSG_1                  (1 << 3)
+#define PVSCSI_INTR_MSG_MASK               (MASK(2) << 2)
+
+#define PVSCSI_INTR_ALL_SUPPORTED          MASK(4)
+
+/*
+ * Number of MSI-X vectors supported.
+ */
+#define PVSCSI_MAX_INTRS        24
+
+/*
+ * Enumeration of supported MSI-X vectors
+ */
+#define PVSCSI_VECTOR_COMPLETION   0
+
+/*
+ * Misc constants for the rings.
+ */
+
+#define PVSCSI_MAX_NUM_PAGES_REQ_RING   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES
+#define PVSCSI_MAX_NUM_PAGES_CMP_RING   PVSCSI_SETUP_RINGS_MAX_NUM_PAGES
+#define PVSCSI_MAX_NUM_PAGES_MSG_RING   PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES
+
+#define PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE \
+				(PAGE_SIZE / sizeof(struct PVSCSIRingReqDesc))
+
+#define PVSCSI_MAX_REQ_QUEUE_DEPTH \
+	(PVSCSI_MAX_NUM_PAGES_REQ_RING * PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE)
+
+#define PVSCSI_MEM_SPACE_COMMAND_NUM_PAGES     1
+#define PVSCSI_MEM_SPACE_INTR_STATUS_NUM_PAGES 1
+#define PVSCSI_MEM_SPACE_MISC_NUM_PAGES        2
+#define PVSCSI_MEM_SPACE_KICK_IO_NUM_PAGES     2
+#define PVSCSI_MEM_SPACE_MSIX_NUM_PAGES        2
+
+enum PVSCSIMemSpace {
+	PVSCSI_MEM_SPACE_COMMAND_PAGE		= 0,
+	PVSCSI_MEM_SPACE_INTR_STATUS_PAGE	= 1,
+	PVSCSI_MEM_SPACE_MISC_PAGE		= 2,
+	PVSCSI_MEM_SPACE_KICK_IO_PAGE		= 4,
+	PVSCSI_MEM_SPACE_MSIX_TABLE_PAGE	= 6,
+	PVSCSI_MEM_SPACE_MSIX_PBA_PAGE		= 7,
+};
+
+#define PVSCSI_MEM_SPACE_NUM_PAGES \
+	(PVSCSI_MEM_SPACE_COMMAND_NUM_PAGES +       \
+	 PVSCSI_MEM_SPACE_INTR_STATUS_NUM_PAGES +   \
+	 PVSCSI_MEM_SPACE_MISC_NUM_PAGES +          \
+	 PVSCSI_MEM_SPACE_KICK_IO_NUM_PAGES +       \
+	 PVSCSI_MEM_SPACE_MSIX_NUM_PAGES)
+
+#define PVSCSI_MEM_SPACE_SIZE        (PVSCSI_MEM_SPACE_NUM_PAGES * PAGE_SIZE)
+
+#endif /* _VMW_PVSCSI_H_ */
diff --git a/trace-events b/trace-events
index 51e2497..7126c07 100644
--- a/trace-events
+++ b/trace-events
@@ -211,6 +211,21 @@ disable scsi_req_dequeue(int target, int lun, int tag) "target %d lun %d tag %d"
 disable scsi_req_parsed(int target, int lun, int tag, int cmd, const char *cmdname, int mode, int xfer, uint64_t lba) "target %d lun %d tag %d command %d (%s) dir %d length %d lba %"PRIu64""
 disable scsi_req_parse_bad(int target, int lun, int tag, int cmd) "target %d lun %d tag %d command %d"
 
+# hw/vmw_pvscsi.c
+disable pvscsi_queue_request(uint64_t context, uint8_t command, uint64_t dataLen) "context %"PRIu64" command %d length %"PRIu64""
+disable pvscsi_sg_elem(uint64_t context, uint64_t addr, uint64_t length) "context %"PRIu64" addr %"PRIu64" length %"PRIu64""
+disable pvscsi_transfer_data(uint64_t context, uint64_t length) "context %"PRIu64" length %"PRIu64""
+disable pvscsi_request_sense(uint64_t context, int lun) "context %"PRIu64" lun %d"
+disable pvscsi_kick_io(void) "kick request ring"
+disable pvscsi_complete_req(uint64_t context, uint64_t length, uint8_t sense) "context %"PRIu64" length %"PRIu64" sense %d"
+disable pvscsi_cmp_ring_put(uint64_t context) "context %"PRIu64""
+disable pvscsi_raise_intr(uint32_t intr, const char *state) "raised intr %d %s"
+disable pvscsi_acknowledge_intr(uint32_t intr) "acknowledged intr %d"
+disable pvscsi_setup_req_ring(uint32_t pages, uint32_t entries) "req ring - %d pages %d entries"
+disable pvscsi_setup_cmp_ring(uint32_t pages, uint32_t entries) "cmp ring - %d pages %d entries"
+disable pvscsi_setup_msg_ring(uint32_t pages, uint32_t entries) "msg ring - %d pages %d entries"
+disable pvscsi_cmd(int cmd) "command %d"
+
 # vl.c
 disable vm_state_notify(int running, int reason) "running %d reason %d"
 
-- 
1.7.4
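
As an aside, the free-running producer/consumer indices used by
pvscsi_cmp_free() and pvscsi_req_pending() above are worth spelling out;
here is a minimal standalone sketch (not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    #define RING_ENTRIES_LOG2 3   /* an 8-entry ring, for illustration */

    static uint32_t ring_pending(uint32_t prod, uint32_t cons)
    {
        return prod - cons;       /* entries not yet consumed */
    }

    static uint32_t ring_free(uint32_t prod, uint32_t cons)
    {
        /* mirrors pvscsi_cmp_free(): one slot is kept unused so a
         * full ring is distinguishable from an empty one */
        return (1u << RING_ENTRIES_LOG2) - 1 - (prod - cons);
    }

    int main(void)
    {
        uint32_t cons = 0xfffffffc, prod = 0xfffffffe; /* about to wrap */

        assert(ring_pending(prod, cons) == 2);
        assert(ring_free(prod, cons) == 5);

        prod += 4;                              /* wraps past 2^32 */
        assert(ring_pending(prod, cons) == 6);  /* still correct */
        /* the low bits select the slot, as in pvscsi_cmp_ring_put() */
        assert((prod & ((1u << RING_ENTRIES_LOG2) - 1)) == 2);
        return 0;
    }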

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 13:42 [Qemu-devel] [RFC PATCH] implement vmware pvscsi device Paolo Bonzini
@ 2011-04-15 14:01 ` Stefan Hajnoczi
  2011-04-15 14:17   ` Paolo Bonzini
From: Stefan Hajnoczi @ 2011-04-15 14:01 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Hannes Reinecke, Zachary Amsden, qemu-devel,
	Nicholas A. Bellinger, Michael S. Tsirkin

On Fri, Apr 15, 2011 at 2:42 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Lightly tested with Linux guests; at least it can successfully partition
> and format a disk.  scsi-generic also lightly tested.
>
> Doesn't do migration, doesn't do hotplug (the device would support that,
> but it is not 100% documented and the Linux driver in particular cannot
> initiate hot-unplug).  I did it as a quick one-day hack to study the SCSI
> subsystem, and it is my first real foray into device model land, so please
> be gentle. :)
>
> vmw_pvscsi.h is taken from Linux, so it doesn't fully respect coding
> standards.  I think that's fair.
>
> Size is curiously close to the recently added sPAPR adapter:
>
>  911  2354 25553 hw/vmw_pvscsi.c
>  988  3177 29628 hw/spapr_vscsi.c
>
> Sounds like that's just the amount of code it takes to implement a SCSI
> HBA in QEMU. :)

Interesting, thanks for posting this.  I've been playing with virtio
SCSI and it is still in the early stages.  Nicholas A. Bellinger and I
have been wiring the in-kernel SCSI target up to KVM using vhost.
Feel free to take a peek at the work-in-progress:

http://repo.or.cz/w/qemu/stefanha.git/shortlog/refs/heads/virtio-scsi
http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=shortlog;h=refs/heads/tcm_vhost

I think SCSI brings many benefits.  Guests can deal with it better
than these alien vdX virtio-blk devices, which makes migration easier.
It becomes possible to attach many disks without burning through free
PCI slots.  We don't need to update guests to add cache control,
discard, and other commands because they are part of SCSI.  We can
pass through more exotic devices.  The list goes on...

Stefan

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:01 ` Stefan Hajnoczi
@ 2011-04-15 14:17   ` Paolo Bonzini
  2011-04-15 14:28     ` Stefan Hajnoczi
  2011-04-15 14:55     ` Hannes Reinecke
From: Paolo Bonzini @ 2011-04-15 14:17 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Hannes Reinecke, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/15/2011 04:01 PM, Stefan Hajnoczi wrote:
> I think SCSI brings many benefits.  Guests can deal with it better
> than these alien vdX virtio-blk devices, which makes migration easier.
> It becomes possible to attach many disks without burning through free
> PCI slots.  We don't need to update guests to add cache control,
> discard, and other commands because they are part of SCSI.  We can
> pass through more exotic devices.  The list goes on...

And we also have to reimplement all of MMC. :)

A few questions:

1) Do you have anything posted for the virtio-scsi spec?  I had started 
working on one, but I haven't yet made it final.  It included also 
hotplug/unplug.  I can send it out on Monday.

2) Have you thought about making scsi-disk and scsi-generic provide a 
logical unit rather than a target?  Otherwise passthrough of a whole 
host or target becomes hard or messy.

3) Since I noticed Hannes is CCed, my next step for vmw_pvscsi would be 
to dust off his patches to remove the bounce buffers, and see how they 
apply to vmw_pvscsi.  But I'd like to avoid duplicated work if possible.

Paolo

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:17   ` Paolo Bonzini
@ 2011-04-15 14:28     ` Stefan Hajnoczi
  2011-04-15 14:37       ` Paolo Bonzini
  2011-04-15 14:55     ` Hannes Reinecke
From: Stefan Hajnoczi @ 2011-04-15 14:28 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Hannes Reinecke, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On Fri, Apr 15, 2011 at 3:17 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 04/15/2011 04:01 PM, Stefan Hajnoczi wrote:
>>
>> I think SCSI brings many benefits.  Guests can deal with it better
>> than these alien vdX virtio-blk devices, which makes migration easier.
>> It becomes possible to attach many disks without burning through free
>> PCI slots.  We don't need to update guests to add cache control,
>> discard, and other commands because they are part of SCSI.  We can
>> pass through more exotic devices.  The list goes on...
>
> And we also have to reimplement all of MMC. :)
>
> A few questions:
>
> 1) Do you have anything posted for the virtio-scsi spec?  I had started
> working on one, but I haven't yet made it final.  It included also
> hotplug/unplug.  I can send it out on Monday.

Nothing formal.  I'm trying to learn SCSI as I go along:

http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=blob;f=include/linux/virtio_scsi.h;hb=refs/heads/tcm_vhost

That's the interface I'm using.  Requests are:

[Header][CDB][Data-out buffers*][Data-in buffers*][Footer]

The footer gets filled in with the response.
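
To make that ordering concrete (the names below are made up for
illustration; the real definitions are in the virtio_scsi.h linked
above), a request would be assembled as a chain of buffers,
driver-written ones first and device-written ones last:

    #include <stddef.h>

    struct sg_entry { void *buf; size_t len; };

    /* Hypothetical sketch of the [Header][CDB][Data-out][Data-in]
     * [Footer] layout; returns the number of entries used. */
    static size_t build_request(struct sg_entry *sg,
                                void *hdr, size_t hdr_len,
                                void *cdb, size_t cdb_len,
                                void *out, size_t out_len,  /* may be 0 */
                                void *in, size_t in_len,    /* may be 0 */
                                void *ftr, size_t ftr_len)
    {
        size_t n = 0;
        sg[n++] = (struct sg_entry){ hdr, hdr_len };  /* header */
        sg[n++] = (struct sg_entry){ cdb, cdb_len };  /* CDB */
        if (out_len) {
            sg[n++] = (struct sg_entry){ out, out_len };  /* data-out */
        }
        if (in_len) {
            sg[n++] = (struct sg_entry){ in, in_len };    /* data-in */
        }
        sg[n++] = (struct sg_entry){ ftr, ftr_len };  /* response footer */
        return n;
    }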

> 2) Have you thought about making scsi-disk and scsi-generic provide a
> logical unit rather than a target?  Otherwise passthrough of a whole host or
> target becomes hard or messy.

I haven't been working at the QEMU SCSI bus level.  I want to wire up
the Linux-iSCSI.org target stack straight to the guest.  This bypasses
the QEMU SCSI and block layers completely.

I agree that the BlockDriverState in QEMU is more of a LUN than a
target and passing through multiple block devices as LUNs should be
possible.  So we probably need to restructure as you suggested and/or
provide an indirection for LUN mapping.

Stefan

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:28     ` Stefan Hajnoczi
@ 2011-04-15 14:37       ` Paolo Bonzini
  2011-04-15 15:04         ` Stefan Hajnoczi
From: Paolo Bonzini @ 2011-04-15 14:37 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Hannes Reinecke, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/15/2011 04:28 PM, Stefan Hajnoczi wrote:
> Nothing formal.  I'm trying to learn SCSI as I go along:
> 
> http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=blob;f=include/linux/virtio_scsi.h;hb=refs/heads/tcm_vhost
> 
> That's the interface I'm using.  Requests are:
> 
> [Header][CDB][Data-out buffers*][Data-in buffers*][Footer]
> 
> The footer gets filled in with the response.

My interface is exactly the same as virtio-blk's SCSI passthrough requests:

------------------------------ 8<-- ----------------------------

Device operation: request queue
-------------------------------

The driver queues requests to the virtqueue, and they are consumed by the
device (not necessarily in order).  Requests have the following format:

    struct virtio_scsi_req {
        u32 type;
        u32 ioprio;
        char cmd[];
        char data[][512];
        u8 sense[SCSI_SENSE_BUFFERSIZE];
        u32 sense_len;
        u32 residual;
        u8 status;
        u8 response;
    };

    #define VIRTIO_SCSI_T_CMD             2
    #define VIRTIO_SCSI_T_BARRIER         0x80000000

    /* status values */
    #define VIRTIO_SCSI_S_OK              0
    #define VIRTIO_SCSI_S_FAILURE         1
    #define VIRTIO_SCSI_S_CLOSED          128

The type of the request must currently be VIRTIO_SCSI_T_CMD.
The VIRTIO_SCSI_T_BARRIER flag indicates that this request acts
as a barrier: all preceding requests must complete before this one
starts, and no following request may start until this one is
complete.  Note that a barrier does not flush caches in the
underlying backend device on the host, and thus does not serve as
a data consistency guarantee.  The driver must send a SYNCHRONIZE
CACHE command to flush the host cache.
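
For example, such a durability point can be requested with a standard
SYNCHRONIZE CACHE(10) command; a minimal sketch of building the CDB
(the helper name is made up for illustration):

    #include <stdint.h>
    #include <string.h>

    /* Build a SYNCHRONIZE CACHE(10) CDB (opcode 0x35).  Leaving the
     * LBA and "number of blocks" fields zero asks the device to
     * flush cached data for the whole medium. */
    static void build_sync_cache_cdb(uint8_t cdb[10])
    {
        memset(cdb, 0, 10);
        cdb[0] = 0x35;  /* SYNCHRONIZE CACHE(10) */
        /* bytes 2-5: starting LBA; bytes 7-8: number of blocks */
    }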

The ioprio field will indicate the priority of this request, with
higher values corresponding to higher priorities.

The cmd and data fields must reside in separate buffers.  The cmd field
indicates the command to perform and is always read-only.  The data field
may be either read-only or write-only, depending on the request.

Remaining fields are filled in by the device.  The sense_len field
indicates the number of bytes actually written to the sense buffer,
while the residual field indicates the residual size, calculated as
data_length - number_of_transferred_bytes.

The status byte is written by the device to be the SCSI status code.

The response byte is written by the device to be one of the following:

- VIRTIO_SCSI_S_OK when the request was completed and the status byte
  is filled with a SCSI status code (not necessarily "GOOD").

- VIRTIO_SCSI_S_FAILURE for host or guest error.

- VIRTIO_SCSI_S_CLOSED if the virtqueue is not currently associated
  to a LU.

----------------------------------------------------------------
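
As a usage sketch against the request structure above (fixed buffer
sizes are chosen here only for illustration; in the real layout cmd and
data are separate buffers, as noted):

    #include <stdint.h>
    #include <string.h>

    #define SCSI_SENSE_BUFFERSIZE 96
    #define VIRTIO_SCSI_T_CMD     2
    #define VIRTIO_SCSI_S_OK      0

    /* Fixed-size stand-in for the variable-length request above. */
    struct vscsi_req_sketch {
        uint32_t type;
        uint32_t ioprio;
        uint8_t  cmd[16];
        uint8_t  data[512];
        uint8_t  sense[SCSI_SENSE_BUFFERSIZE];
        uint32_t sense_len;
        uint32_t residual;
        uint8_t  status;
        uint8_t  response;
    };

    /* Prepare a READ(10) of one 512-byte block at LBA 0. */
    static void prepare_read10(struct vscsi_req_sketch *req)
    {
        memset(req, 0, sizeof(*req));
        req->type = VIRTIO_SCSI_T_CMD;
        req->cmd[0] = 0x28;   /* READ(10) */
        req->cmd[8] = 1;      /* transfer length: 1 block */
    }

    /* After completion: response says whether the command ran at all,
     * status is the SCSI status byte, and residual is
     * data_length - number_of_transferred_bytes. */
    static int read_succeeded(const struct vscsi_req_sketch *req)
    {
        return req->response == VIRTIO_SCSI_S_OK &&
               req->status == 0 /* GOOD */ &&
               req->residual == 0;
    }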

There is more meat to handle hotplug/hotunplug and to choose which
LUNs maps to which virtqueue, but you can wait a few days to know
the details.

Paolo

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:17   ` Paolo Bonzini
  2011-04-15 14:28     ` Stefan Hajnoczi
@ 2011-04-15 14:55     ` Hannes Reinecke
  2011-04-15 14:59       ` Paolo Bonzini
From: Hannes Reinecke @ 2011-04-15 14:55 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Stefan Hajnoczi, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/15/2011 04:17 PM, Paolo Bonzini wrote:
> On 04/15/2011 04:01 PM, Stefan Hajnoczi wrote:
>> I think SCSI brings many benefits.  Guests can deal with it better
>> than these alien vdX virtio-blk devices, which makes migration easier.
>> It becomes possible to attach many disks without burning through free
>> PCI slots.  We don't need to update guests to add cache control,
>> discard, and other commands because they are part of SCSI.  We can
>> pass through more exotic devices.  The list goes on...
> 
> And we also have to reimplement all of MMC. :)
> 
> A few questions:
> 
> 1) Do you have anything posted for the virtio-scsi spec?  I had started
> working on one, but I haven't yet made it final.  It included also
> hotplug/unplug.  I can send it out on Monday.
> 
> 2) Have you thought about making scsi-disk and scsi-generic provide a
> logical unit rather than a target?  Otherwise passthrough of a whole
> host or target becomes hard or messy.
> 
> 3) Since I noticed Hannes is CCed, my next step for vmw_pvscsi would be
> to dust off his patches to remove the bounce buffers, and see how they
> apply to vmw_pvscsi.  But I'd like to avoid duplicated work if possible.
> 

Argl.

Why vmw_pvscsi? Another paravirtualized driver doesn't improve the situation
here; we still wouldn't have a driver for unmodified guests.
So either emulate existing drivers (like megasas :-) or go the full
route and do a proper virtio-scsi.

As for the bounce buffers thing:
Good luck. Paul Brook absolutely insists on having them, but they kill
performance for any sane backend. And the two are basically impossible to
reconcile; I tried it once but got pushed back.

And after about the third attempt I gave up. Let me know if you have
more luck here.

But keep me in the loop for the virtio-scsi spec. I do have some ideas
about what needs to get in there, as I think hch does.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke              zSeries & Storage
hare@suse.de                  +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Markus Rex, HRB 16746 (AG Nürnberg)

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:55     ` Hannes Reinecke
@ 2011-04-15 14:59       ` Paolo Bonzini
From: Paolo Bonzini @ 2011-04-15 14:59 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Stefan Hajnoczi, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

> Why vmw_pvscsi?

Because all I wanted to do was to learn qemu's SCSI, and vmw_pvscsi is
pretty much the simplest device I could pick...  It's just an exercise,
but since it works I thought I'd post it.

> Good luck.  Paul Brook absolutely insists on having them, but they
> kill performance for any sane backend, and the two positions are
> basically impossible to reconcile; I tried it once but got pushed back.
> 
> After about the third attempt I gave up.  Let me know if you have
> more luck here.

Thanks. :)

> But keep me in the loop on the virtio-scsi spec. I do have some ideas
> about what needs to go in there, as I think hch does.

I certainly will, thanks.

Paolo

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 14:37       ` Paolo Bonzini
@ 2011-04-15 15:04         ` Stefan Hajnoczi
  2011-04-15 20:56           ` Paolo Bonzini
  0 siblings, 1 reply; 12+ messages in thread
From: Stefan Hajnoczi @ 2011-04-15 15:04 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Hannes Reinecke, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On Fri, Apr 15, 2011 at 3:37 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 04/15/2011 04:28 PM, Stefan Hajnoczi wrote:
>> Nothing formal.  I'm trying to learn SCSI as I go along:
>>
>> http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=blob;f=include/linux/virtio_scsi.h;hb=refs/heads/tcm_vhost
>>
>> That's the interface I'm using.  Requests are:
>>
>> [Header][CDB][Data-out buffers*][Data-in buffers*][Footer]
>>
>> The footer gets filled in with the response.
>
> My interface is exactly the same as virtio-blk's SCSI passthrough requests:
>
> ------------------------------ 8<-- ----------------------------
>
> Device operation: request queue
> -------------------------------
>
> The driver queues requests to the virtqueue, and they are consumed by
> the device (not necessarily in order).  Requests have the following
> format:
>
>    struct virtio_scsi_req {
>        u32 type;
>        u32 ioprio;
>        char cmd[];
>        char data[][512];
>        u8 sense[SCSI_SENSE_BUFFERSIZE];
>        u32 sense_len;
>        u32 residual;
>        u8 status;
>        u8 response;
>    };
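
For reference, a guess at which side writes each field in that layout
(the annotations are inferred from virtio-blk's SCSI pass-through, not
part of the proposal):

    struct virtio_scsi_req {
        u32 type;                          /* driver: request type/flags */
        u32 ioprio;                        /* driver: I/O priority hint */
        char cmd[];                        /* driver: the CDB */
        char data[][512];                  /* data buffers, out and/or in */
        u8 sense[SCSI_SENSE_BUFFERSIZE];   /* device: sense data */
        u32 sense_len;                     /* device: valid bytes in sense[] */
        u32 residual;                      /* device: bytes not transferred */
        u8 status;                         /* device: SCSI status byte */
        u8 response;                       /* device: transport-level result */
    };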

The way I approached virtio-scsi was to look at the SCSI Architecture
Model document and some of the Linux SCSI code.  I'm not sure if
letting virtio-blk SCSI pass-through or scsi-generic guide us is a
good approach.

How do your ioprio and barrier relate to SCSI?

There seem to be recent/exotic commands that can have both data-in and
data-out buffers.  The sense buffer length is also not necessarily 96
bytes max, I believe.  I haven't looked into these two issues, but a
proper virtio-scsi design should be future-proof and accommodate them,
given the fancy commands being added to SCSI.
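
Something along these lines would cover both points, at the cost of two
extra fields (a sketch only; the names are mine, not from any spec):

    struct virtio_scsi_req {
        u32 type;
        u32 ioprio;
        u32 sense_capacity;      /* driver: size of sense[] below */
        char cmd[];
        char data_out[][512];    /* may be empty */
        char data_in[][512];     /* may be empty; both present for bidi */
        u8 sense[];              /* device: up to sense_capacity bytes */
        u32 sense_len;
        u32 residual;
        u8 status;
        u8 response;
    };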

Stefan

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 15:04         ` Stefan Hajnoczi
@ 2011-04-15 20:56           ` Paolo Bonzini
  2011-04-18 14:05             ` Hannes Reinecke
  0 siblings, 1 reply; 12+ messages in thread
From: Paolo Bonzini @ 2011-04-15 20:56 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Michael S. Tsirkin, Hannes Reinecke, Nicholas A. Bellinger, qemu-devel

On 04/15/2011 05:04 PM, Stefan Hajnoczi wrote:
> The way I approached virtio-scsi was to look at the SCSI Architecture
> Model document and some of the Linux SCSI code.  I'm not sure if
> letting virtio-blk SCSI pass-through or scsi-generic guide us is a
> good approach.
>
> How do your ioprio and barrier relate to SCSI?

Both are part of the transport protocol, which can provide additional
features beyond what SAM specifies.  For example, SCSI doesn't provide
the full details of hotplug/hot-unplug, nor does it have a way for the
guest to trigger a drive unplug on the host, but these are all desirable
features for virtio-scsi (and they are supported by vmw_pvscsi, by the
way).
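
For instance, hotplug notifications could travel on a dedicated event
virtqueue, roughly like this (a sketch only, not from any posted spec;
the names are invented):

    struct virtio_scsi_event {
        u32 event;     /* e.g. LUN added, LUN removed, reset occurred */
        u8 lun[8];     /* address of the affected logical unit */
        u32 reason;    /* transport-specific detail */
    };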

> There seem to be recent/exotic commands that can have both data-in and
> data-out buffers.

Those can fit by adding more fields at the end of the buffer.  They can
be in the first version, or become an extra feature later.  Since QEMU
currently cannot handle bidirectional commands, they would probably need
negotiation even if they were in the first version.
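
In virtio terms that negotiation would just be a feature bit, along
these lines (name and bit number invented for illustration):

    #define VIRTIO_SCSI_F_BIDIRECTIONAL  1   /* device handles bidi commands */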

> The sense buffer length is also not necessarily 96
> bytes max, I believe.

Indeed, I couldn't find that limit in either SPC or SAM.  It seems like
a pretty widespread assumption, though.  Perhaps Nicholas or Hannes
knows where it comes from.

Paolo

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-15 20:56           ` Paolo Bonzini
@ 2011-04-18 14:05             ` Hannes Reinecke
  2011-04-18 15:27               ` Stefan Hajnoczi
  2011-04-18 16:09               ` Paolo Bonzini
  0 siblings, 2 replies; 12+ messages in thread
From: Hannes Reinecke @ 2011-04-18 14:05 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Stefan Hajnoczi, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/15/2011 10:56 PM, Paolo Bonzini wrote:
> On 04/15/2011 05:04 PM, Stefan Hajnoczi wrote:
>> The way I approached virtio-scsi was to look at the SCSI Architecture
>> Model document and some of the Linux SCSI code. I'm not sure if
>> letting virtio-blk SCSI pass-through or scsi-generic guide us is a
>> good approach.
>>
>> How do your ioprio and barrier relate to SCSI?
>
> Both are part of the transport protocol, which can provide
> additional features beyond what SAM specifies. For example, SCSI
> doesn't provide the full details of hotplug/hot-unplug, nor does it
> have a way for the guest to trigger a drive unplug on the host, but
> these are all desirable features for virtio-scsi (and they are
> supported by vmw_pvscsi, by the way).
>
And this is something I really miss in the current proposals, namely
a working transport layer.

The SCSI spec (SPC etc.) itself just handles command delivery between
initiator and target.  Anything else (like hotplug, error recovery,
target addressing, etc.) is outside the scope of the spec and needs to
be implemented on another layer (that's the ominous transport layer).

Hence any protocol implemented to the above spec would be missing
those parts, and they would need to be implemented separately.  That
also explains why these features are missing when just using SCSI
CDBs as the main command container.

My proposal would be to implement a full virtio-scsi _host_, and 
extend the proposal to be able to handle the transport layer too.
At the very least we would need to include a LUN address before the
CDB, and define TMF command values for proper error recovery.

That way we could handle hotplug / -unplug via a simple host rescan,
and would even be able to pass in NPIV hosts.

>> There seem to be recent/exotic commands that can have both data-in
>> and data-out buffers.
>
These are bidirectional commands, which are required for OSD
(object-based storage devices).

> Those can fit by adding more fields at the end of the buffer. They
> can be in the first version, or become an extra feature later. Since
> QEMU currently cannot handle bidirectional commands, they would
> probably need negotiation even if they were in the first version.
>
>> The sense buffer length is also not necessarily 96
>> bytes max, I believe.
>
> Indeed, I couldn't find that limit in either SPC or SAM. It seems
> like a pretty widespread assumption, though. Perhaps Nicholas or
> Hannes knows where it comes from.
>
96 bytes is a carry-over from parallel SCSI.  We shouldn't rely
on a fixed length here, but rather use an additional pointer/iovec
and a length field.

Check the SG_IO header for how it's done.
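
There the sense buffer is a caller-supplied pointer plus two length
fields rather than a fixed array; from struct sg_io_hdr in <scsi/sg.h>:

    unsigned char mx_sb_len;    /* [i] max length to write to sbp */
    unsigned char *sbp;         /* [i], [*o] points to sense buffer */
    unsigned char sb_len_wr;    /* [o] sense bytes actually written */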

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Markus Rex, HRB 16746 (AG Nürnberg)

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-18 14:05             ` Hannes Reinecke
@ 2011-04-18 15:27               ` Stefan Hajnoczi
  2011-04-18 16:09               ` Paolo Bonzini
  1 sibling, 0 replies; 12+ messages in thread
From: Stefan Hajnoczi @ 2011-04-18 15:27 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Paolo Bonzini, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On Mon, Apr 18, 2011 at 3:05 PM, Hannes Reinecke <hare@suse.de> wrote:
> On 04/15/2011 10:56 PM, Paolo Bonzini wrote:
>>
>> On 04/15/2011 05:04 PM, Stefan Hajnoczi wrote:
>>>
>>> The way I approached virtio-scsi was to look at the SCSI Architecture
>>> Model document and some of the Linux SCSI code. I'm not sure if
>>> letting virtio-blk SCSI pass-through or scsi-generic guide us is a
>>> good approach.
>>>
>>> How do your ioprio and barrier relate to SCSI?
>>
>> Both are part of the transport protocol, which can provide
>> additional features beyond what SAM specifies. For example, SCSI
>> doesn't provide the full details of hotplug/hot-unplug, nor does it
>> have a way for the guest to trigger a drive unplug on the host, but
>> these are all desirable features for virtio-scsi (and they are
>> supported by vmw_pvscsi, by the way).
>>
> And this is something I really miss in the current proposals, namely
> a working transport layer.
>
> The SCSI spec (SPC etc.) itself just handles command delivery between
> initiator and target. Anything else (like hotplug, error recovery, target
> addressing, etc.) is outside the scope of the spec and needs to be
> implemented on another layer (that's the ominous transport layer).
>
> Hence any protocol implemented to the above spec would be missing those
> parts, and they would need to be implemented separately. That also
> explains why these features are missing when just using SCSI CDBs
> as the main command container.
>
> My proposal would be to implement a full virtio-scsi _host_, and extend the
> proposal to be able to handle the transport layer too.
> At the very least we would need to include a LUN address before the CDB, and
> define TMF command values for proper error recovery.
>
> That way we could handle hotplug / -unplug via a simple host rescan, and
> would even be able to pass in NPIV hosts.

In my prototype there is a header and a footer for the request and
response, respectively:
http://git.kernel.org/?p=linux/kernel/git/nab/lio-core-2.6.git;a=blob;f=include/linux/virtio_scsi.h;hb=refs/heads/tcm_vhost

We definitely need more than plain CDB pass-through.
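
The gist is a small header before the CDB and a footer the device fills
in; from memory it is shaped like this (see the link above for the real
definitions, the names here may differ):

    struct virtio_scsi_cmd_hdr {   /* driver -> device */
        u8 lun[8];
        u64 tag;
    };

    struct virtio_scsi_cmd_ftr {   /* device -> driver */
        u32 resid;
        u8 status;
        u8 response;
    };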

Stefan

* Re: [Qemu-devel] [RFC PATCH] implement vmware pvscsi device
  2011-04-18 14:05             ` Hannes Reinecke
  2011-04-18 15:27               ` Stefan Hajnoczi
@ 2011-04-18 16:09               ` Paolo Bonzini
  1 sibling, 0 replies; 12+ messages in thread
From: Paolo Bonzini @ 2011-04-18 16:09 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Stefan Hajnoczi, qemu-devel, Nicholas A. Bellinger, Michael S. Tsirkin

On 04/18/2011 04:05 PM, Hannes Reinecke wrote:
> My proposal would be to implement a full virtio-scsi _host_, and extend
> the proposal to be able to handle the transport layer too.

Yes, I added this independently between Friday and today, and it is why
I haven't sent the proposal yet.

> At the very least we would need to include a LUN address before the CDB,
> and define TMF command values for proper error recovery.

I haven't worked out TMFs yet, but I did add a LUN field.

> That way we could handle hotplug / -unplug via a simple host rescan

It's a bit more complicated because you also want guest-initiated 
unplug, and SAM transport reset events include more than hotplug/unplug.

>> Indeed, I couldn't find that limit in either SPC or SAM. It seems
>> like a pretty widespread assumption, though. Perhaps Nicholas or
>> Hannes knows where it comes from.
>>
> 96 bytes is a carry-over from parallel SCSI. We shouldn't rely
> on a fixed length here, but rather use an additional pointer/iovec and
> a length field.
>
> Check the SG_IO header for how it's done.

Will do.

Paolo

end of thread

Thread overview: 12+ messages
2011-04-15 13:42 [Qemu-devel] [RFC PATCH] implement vmware pvscsi device Paolo Bonzini
2011-04-15 14:01 ` Stefan Hajnoczi
2011-04-15 14:17   ` Paolo Bonzini
2011-04-15 14:28     ` Stefan Hajnoczi
2011-04-15 14:37       ` Paolo Bonzini
2011-04-15 15:04         ` Stefan Hajnoczi
2011-04-15 20:56           ` Paolo Bonzini
2011-04-18 14:05             ` Hannes Reinecke
2011-04-18 15:27               ` Stefan Hajnoczi
2011-04-18 16:09               ` Paolo Bonzini
2011-04-15 14:55     ` Hannes Reinecke
2011-04-15 14:59       ` Paolo Bonzini
