From: Stefan Hajnoczi <stefanha@redhat.com>
To: qemu-devel@nongnu.org
Cc: "Kevin Wolf" <kwolf@redhat.com>, "Fam Zheng" <fam@euphon.net>,
qemu-block@nongnu.org, "Max Reitz" <mreitz@redhat.com>,
"Stefan Hajnoczi" <stefanha@redhat.com>,
"Philippe Mathieu-Daudé" <philmd@redhat.com>
Subject: [PATCH 4/7] block/nvme: switch to a NVMeRequest freelist
Date: Tue, 19 May 2020 18:11:35 +0100 [thread overview]
Message-ID: <20200519171138.201667-5-stefanha@redhat.com> (raw)
In-Reply-To: <20200519171138.201667-1-stefanha@redhat.com>
There are three issues with the current NVMeRequest->busy field:
1. The busy field is accidentally accessed outside q->lock when request
submission fails.
2. Waiters on free_req_queue are not woken when a request is returned
early due to submission failure.
2. Finding a free request involves scanning all requests. This makes
request submission O(n^2).
Switch to an O(1) freelist that is always accessed under the lock.
Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
NVME_NUM_REQS, the number of usable requests. This makes the code
simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind
that one slot is reserved.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 54 insertions(+), 27 deletions(-)
diff --git a/block/nvme.c b/block/nvme.c
index 6bf58bc6aa..3ad4f27e1c 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -33,6 +33,12 @@
#define NVME_QUEUE_SIZE 128
#define NVME_BAR_SIZE 8192
+/*
+ * We have to leave one slot empty as that is the full queue case where
+ * head == tail + 1.
+ */
+#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
+
typedef struct {
int32_t head, tail;
uint8_t *queue;
@@ -47,7 +53,7 @@ typedef struct {
int cid;
void *prp_list_page;
uint64_t prp_list_iova;
- bool busy;
+ int free_req_next; /* q->reqs[] index of next free req */
} NVMeRequest;
typedef struct {
@@ -61,7 +67,8 @@ typedef struct {
/* Fields protected by @lock */
NVMeQueue sq, cq;
int cq_phase;
- NVMeRequest reqs[NVME_QUEUE_SIZE];
+ int free_req_head;
+ NVMeRequest reqs[NVME_NUM_REQS];
bool busy;
int need_kick;
int inflight;
@@ -200,19 +207,23 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
qemu_mutex_init(&q->lock);
q->index = idx;
qemu_co_queue_init(&q->free_req_queue);
- q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
+ q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
- s->page_size * NVME_QUEUE_SIZE,
+ s->page_size * NVME_NUM_REQS,
false, &prp_list_iova);
if (r) {
goto fail;
}
- for (i = 0; i < NVME_QUEUE_SIZE; i++) {
+ q->free_req_head = -1;
+ for (i = 0; i < NVME_NUM_REQS; i++) {
NVMeRequest *req = &q->reqs[i];
req->cid = i + 1;
+ req->free_req_next = q->free_req_head;
+ q->free_req_head = i;
req->prp_list_page = q->prp_list_pages + i * s->page_size;
req->prp_list_iova = prp_list_iova + i * s->page_size;
}
+
nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
if (local_err) {
error_propagate(errp, local_err);
@@ -254,13 +265,11 @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
*/
static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
{
- int i;
- NVMeRequest *req = NULL;
+ NVMeRequest *req;
qemu_mutex_lock(&q->lock);
- while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
- /* We have to leave one slot empty as that is the full queue case (head
- * == tail + 1). */
+
+ while (q->free_req_head == -1) {
if (qemu_in_coroutine()) {
trace_nvme_free_req_queue_wait(q);
qemu_co_queue_wait(&q->free_req_queue, &q->lock);
@@ -269,20 +278,41 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
return NULL;
}
}
- for (i = 0; i < NVME_QUEUE_SIZE; i++) {
- if (!q->reqs[i].busy) {
- q->reqs[i].busy = true;
- req = &q->reqs[i];
- break;
- }
- }
- /* We have checked inflight and need_kick while holding q->lock, so one
- * free req must be available. */
- assert(req);
+
+ req = &q->reqs[q->free_req_head];
+ q->free_req_head = req->free_req_next;
+ req->free_req_next = -1;
+
qemu_mutex_unlock(&q->lock);
return req;
}
+/* With q->lock */
+static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
+{
+ req->free_req_next = q->free_req_head;
+ q->free_req_head = req - q->reqs;
+}
+
+/* With q->lock */
+static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+{
+ if (!qemu_co_queue_empty(&q->free_req_queue)) {
+ replay_bh_schedule_oneshot_event(s->aio_context,
+ nvme_free_req_queue_cb, q);
+ }
+}
+
+/* Insert a request in the freelist and wake waiters */
+static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
+ NVMeRequest *req)
+{
+ qemu_mutex_lock(&q->lock);
+ nvme_put_free_req_locked(q, req);
+ nvme_wake_free_req_locked(s, q);
+ qemu_mutex_unlock(&q->lock);
+}
+
static inline int nvme_translate_error(const NvmeCqe *c)
{
uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
@@ -344,7 +374,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
req = *preq;
assert(req.cid == cid);
assert(req.cb);
- preq->busy = false;
+ nvme_put_free_req_locked(q, preq);
preq->cb = preq->opaque = NULL;
qemu_mutex_unlock(&q->lock);
req.cb(req.opaque, ret);
@@ -356,10 +386,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
/* Notify the device so it can post more completions. */
smp_mb_release();
*q->cq.doorbell = cpu_to_le32(q->cq.head);
- if (!qemu_co_queue_empty(&q->free_req_queue)) {
- replay_bh_schedule_oneshot_event(s->aio_context,
- nvme_free_req_queue_cb, q);
- }
+ nvme_wake_free_req_locked(s, q);
}
q->busy = false;
return progress;
@@ -1001,7 +1028,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
qemu_co_mutex_unlock(&s->dma_map_lock);
if (r) {
- req->busy = false;
+ nvme_put_free_req_and_wake(s, ioq, req);
return r;
}
nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
@@ -1218,7 +1245,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
qemu_co_mutex_unlock(&s->dma_map_lock);
if (ret) {
- req->busy = false;
+ nvme_put_free_req_and_wake(s, ioq, req);
goto out;
}
--
2.25.3
next prev parent reply other threads:[~2020-05-19 17:15 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-05-19 17:11 [PATCH 0/7] block/nvme: support nested aio_poll() Stefan Hajnoczi
2020-05-19 17:11 ` [PATCH 1/7] block/nvme: poll queues without q->lock Stefan Hajnoczi
2020-05-25 8:07 ` Sergio Lopez
2020-05-28 15:23 ` Stefan Hajnoczi
2020-05-29 7:49 ` Sergio Lopez
2020-06-17 12:52 ` Stefan Hajnoczi
2020-05-19 17:11 ` [PATCH 2/7] block/nvme: drop tautologous assertion Stefan Hajnoczi
2020-05-25 8:08 ` Sergio Lopez
2020-05-26 12:00 ` Philippe Mathieu-Daudé
2020-05-19 17:11 ` [PATCH 3/7] block/nvme: don't access CQE after moving cq.head Stefan Hajnoczi
2020-05-25 8:12 ` Sergio Lopez
2020-05-26 12:03 ` Philippe Mathieu-Daudé
2020-05-19 17:11 ` Stefan Hajnoczi [this message]
2020-05-25 8:10 ` [PATCH 4/7] block/nvme: switch to a NVMeRequest freelist Sergio Lopez
2020-05-19 17:11 ` [PATCH 5/7] block/nvme: clarify that free_req_queue is protected by q->lock Stefan Hajnoczi
2020-05-25 8:13 ` Sergio Lopez
2020-05-26 12:04 ` Philippe Mathieu-Daudé
2020-05-19 17:11 ` [PATCH 6/7] block/nvme: keep BDRVNVMeState pointer in NVMeQueuePair Stefan Hajnoczi
2020-05-25 8:22 ` Sergio Lopez
2020-05-26 14:55 ` Philippe Mathieu-Daudé
2020-05-26 15:20 ` Philippe Mathieu-Daudé
2020-05-28 15:25 ` Stefan Hajnoczi
2020-05-19 17:11 ` [PATCH 7/7] block/nvme: support nested aio_poll() Stefan Hajnoczi
2020-05-25 8:26 ` Sergio Lopez
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200519171138.201667-5-stefanha@redhat.com \
--to=stefanha@redhat.com \
--cc=fam@euphon.net \
--cc=kwolf@redhat.com \
--cc=mreitz@redhat.com \
--cc=philmd@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).