On Feb 9 03:59, Keith Busch wrote: > On Mon, Jan 25, 2021 at 09:42:31PM +0100, Klaus Jensen wrote: > > @@ -1644,10 +1679,56 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) > > > > static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) > > { > > - block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, > > - BLOCK_ACCT_FLUSH); > > - req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); > > - return NVME_NO_COMPLETE; > > + uint32_t nsid = le32_to_cpu(req->cmd.nsid); > > + uintptr_t *num_flushes = (uintptr_t *)&req->opaque; > > + uint16_t status; > > + struct nvme_aio_flush_ctx *ctx; > > + NvmeNamespace *ns; > > + > > + trace_pci_nvme_flush(nvme_cid(req), nsid); > > + > > + if (nsid != NVME_NSID_BROADCAST) { > > + req->ns = nvme_ns(n, nsid); > > + if (unlikely(!req->ns)) { > > + return NVME_INVALID_FIELD | NVME_DNR; > > + } > > + > > + block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, > > + BLOCK_ACCT_FLUSH); > > + req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); > > + return NVME_NO_COMPLETE; > > + } > > + > > + /* 1-initialize; see comment in nvme_dsm */ > > + *num_flushes = 1; > > + > > + for (int i = 1; i <= n->num_namespaces; i++) { > > + ns = nvme_ns(n, i); > > + if (!ns) { > > + continue; > > + } > > + > > + ctx = g_new(struct nvme_aio_flush_ctx, 1); > > + ctx->req = req; > > + ctx->ns = ns; > > + > > + (*num_flushes)++; > > + > > + block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0, > > + BLOCK_ACCT_FLUSH); > > + req->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx); > > + } > > Overwriting req->aiocb with the most recent flush request doesn't seem > right. > It doesn't really matter if this[1] patch is merged, but it is wrong and to align with the other multi-aio commands, this should just ignore the returned aiocb. > This whole implementation would be much simpler with the synchronous > blk_flush() routine instead of the AIO equivalent. 
> This is not really a > performant feature, so I don't think it's critical to get these > operations happening in parallel. What do you think? It would definitely be simpler, but I believe that if there is a lot to flush, then we won't just be blocking the nvme device: since we are holding the Big QEMU Lock, we will block most other devices as well. [1]: hw/block/nvme: drain namespaces on sq deletion