From: Stefan Hajnoczi <stefanha@redhat.com>
To: qemu-devel@nongnu.org, Peter Maydell <peter.maydell@linaro.org>
Cc: Kevin Wolf <kwolf@redhat.com>, Fam Zheng <fam@euphon.net>,
Sergio Lopez <slp@redhat.com>,
Eduardo Habkost <ehabkost@redhat.com>,
qemu-block@nongnu.org, Max Reitz <mreitz@redhat.com>,
Stefan Hajnoczi <stefanha@redhat.com>,
Cleber Rosa <crosa@redhat.com>
Subject: [PULL 09/12] block/nvme: switch to a NVMeRequest freelist
Date: Wed, 24 Jun 2020 11:02:07 +0100 [thread overview]
Message-ID: <20200624100210.59975-10-stefanha@redhat.com> (raw)
In-Reply-To: <20200624100210.59975-1-stefanha@redhat.com>
There are three issues with the current NVMeRequest->busy field:
1. The busy field is accidentally accessed outside q->lock when request
submission fails.
2. Waiters on free_req_queue are not woken when a request is returned
early due to submission failure.
2. Finding a free request involves scanning all requests. This makes
request submission O(n^2).
Switch to an O(1) freelist that is always accessed under the lock.
Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
NVME_NUM_REQS, the number of usable requests. This makes the code
simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind
that one slot is reserved.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-5-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 54 insertions(+), 27 deletions(-)
diff --git a/block/nvme.c b/block/nvme.c
index 344893811a..8e60882af3 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -33,6 +33,12 @@
#define NVME_QUEUE_SIZE 128
#define NVME_BAR_SIZE 8192
+/*
+ * We have to leave one slot empty as that is the full queue case where
+ * head == tail + 1.
+ */
+#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
+
typedef struct {
int32_t head, tail;
uint8_t *queue;
@@ -47,7 +53,7 @@ typedef struct {
int cid;
void *prp_list_page;
uint64_t prp_list_iova;
- bool busy;
+ int free_req_next; /* q->reqs[] index of next free req */
} NVMeRequest;
typedef struct {
@@ -61,7 +67,8 @@ typedef struct {
/* Fields protected by @lock */
NVMeQueue sq, cq;
int cq_phase;
- NVMeRequest reqs[NVME_QUEUE_SIZE];
+ int free_req_head;
+ NVMeRequest reqs[NVME_NUM_REQS];
bool busy;
int need_kick;
int inflight;
@@ -200,19 +207,23 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
qemu_mutex_init(&q->lock);
q->index = idx;
qemu_co_queue_init(&q->free_req_queue);
- q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
+ q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
- s->page_size * NVME_QUEUE_SIZE,
+ s->page_size * NVME_NUM_REQS,
false, &prp_list_iova);
if (r) {
goto fail;
}
- for (i = 0; i < NVME_QUEUE_SIZE; i++) {
+ q->free_req_head = -1;
+ for (i = 0; i < NVME_NUM_REQS; i++) {
NVMeRequest *req = &q->reqs[i];
req->cid = i + 1;
+ req->free_req_next = q->free_req_head;
+ q->free_req_head = i;
req->prp_list_page = q->prp_list_pages + i * s->page_size;
req->prp_list_iova = prp_list_iova + i * s->page_size;
}
+
nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
if (local_err) {
error_propagate(errp, local_err);
@@ -254,13 +265,11 @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
*/
static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
{
- int i;
- NVMeRequest *req = NULL;
+ NVMeRequest *req;
qemu_mutex_lock(&q->lock);
- while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
- /* We have to leave one slot empty as that is the full queue case (head
- * == tail + 1). */
+
+ while (q->free_req_head == -1) {
if (qemu_in_coroutine()) {
trace_nvme_free_req_queue_wait(q);
qemu_co_queue_wait(&q->free_req_queue, &q->lock);
@@ -269,20 +278,41 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
return NULL;
}
}
- for (i = 0; i < NVME_QUEUE_SIZE; i++) {
- if (!q->reqs[i].busy) {
- q->reqs[i].busy = true;
- req = &q->reqs[i];
- break;
- }
- }
- /* We have checked inflight and need_kick while holding q->lock, so one
- * free req must be available. */
- assert(req);
+
+ req = &q->reqs[q->free_req_head];
+ q->free_req_head = req->free_req_next;
+ req->free_req_next = -1;
+
qemu_mutex_unlock(&q->lock);
return req;
}
+/* With q->lock */
+static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
+{
+ req->free_req_next = q->free_req_head;
+ q->free_req_head = req - q->reqs;
+}
+
+/* With q->lock */
+static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+{
+ if (!qemu_co_queue_empty(&q->free_req_queue)) {
+ replay_bh_schedule_oneshot_event(s->aio_context,
+ nvme_free_req_queue_cb, q);
+ }
+}
+
+/* Insert a request in the freelist and wake waiters */
+static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
+ NVMeRequest *req)
+{
+ qemu_mutex_lock(&q->lock);
+ nvme_put_free_req_locked(q, req);
+ nvme_wake_free_req_locked(s, q);
+ qemu_mutex_unlock(&q->lock);
+}
+
static inline int nvme_translate_error(const NvmeCqe *c)
{
uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
@@ -344,7 +374,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
req = *preq;
assert(req.cid == cid);
assert(req.cb);
- preq->busy = false;
+ nvme_put_free_req_locked(q, preq);
preq->cb = preq->opaque = NULL;
qemu_mutex_unlock(&q->lock);
req.cb(req.opaque, ret);
@@ -356,10 +386,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
/* Notify the device so it can post more completions. */
smp_mb_release();
*q->cq.doorbell = cpu_to_le32(q->cq.head);
- if (!qemu_co_queue_empty(&q->free_req_queue)) {
- replay_bh_schedule_oneshot_event(s->aio_context,
- nvme_free_req_queue_cb, q);
- }
+ nvme_wake_free_req_locked(s, q);
}
q->busy = false;
return progress;
@@ -1001,7 +1028,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
qemu_co_mutex_unlock(&s->dma_map_lock);
if (r) {
- req->busy = false;
+ nvme_put_free_req_and_wake(s, ioq, req);
return r;
}
nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
@@ -1218,7 +1245,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
qemu_co_mutex_unlock(&s->dma_map_lock);
if (ret) {
- req->busy = false;
+ nvme_put_free_req_and_wake(s, ioq, req);
goto out;
}
--
2.26.2
next prev parent reply other threads:[~2020-06-24 10:09 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-06-24 10:01 [PULL 00/12] Block patches Stefan Hajnoczi
2020-06-24 10:01 ` [PULL 01/12] minikconf: explicitly set encoding to UTF-8 Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 02/12] coroutine: support SafeStack in ucontext backend Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 03/12] coroutine: add check for SafeStack in sigaltstack Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 04/12] configure: add flags to support SafeStack Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 05/12] check-block: enable iotests with SafeStack Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 06/12] block/nvme: poll queues without q->lock Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 07/12] block/nvme: drop tautologous assertion Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 08/12] block/nvme: don't access CQE after moving cq.head Stefan Hajnoczi
2020-06-24 10:02 ` Stefan Hajnoczi [this message]
2020-06-24 10:02 ` [PULL 10/12] block/nvme: clarify that free_req_queue is protected by q->lock Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 11/12] block/nvme: keep BDRVNVMeState pointer in NVMeQueuePair Stefan Hajnoczi
2020-06-24 10:02 ` [PULL 12/12] block/nvme: support nested aio_poll() Stefan Hajnoczi
2020-06-25 13:31 ` [PULL 00/12] Block patches Peter Maydell
2020-06-26 10:25 ` Stefan Hajnoczi
2020-06-26 10:49 ` Peter Maydell
2020-06-26 13:01 ` Stefan Hajnoczi
2020-06-26 15:54 ` Peter Maydell
2020-07-07 15:28 ` Philippe Mathieu-Daudé
2020-07-07 22:05 ` Eduardo Habkost
2020-07-09 15:02 ` Kevin Wolf
2020-07-09 18:41 ` Eduardo Habkost
2020-07-10 8:36 ` Kevin Wolf
2020-07-16 12:40 ` Peter Maydell
2020-07-16 13:29 ` Kevin Wolf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200624100210.59975-10-stefanha@redhat.com \
--to=stefanha@redhat.com \
--cc=crosa@redhat.com \
--cc=ehabkost@redhat.com \
--cc=fam@euphon.net \
--cc=kwolf@redhat.com \
--cc=mreitz@redhat.com \
--cc=peter.maydell@linaro.org \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=slp@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.