From: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
To: ming.lei@redhat.com
Cc: axboe@kernel.dk, xiaoguang.wang@linux.alibaba.com,
	linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	joseph.qi@linux.alibaba.com,
	ZiyangZhang <ZiyangZhang@linux.alibaba.com>
Subject: [PATCH V3 5/7] ublk_drv: consider recovery feature in aborting mechanism
Date: Tue, 13 Sep 2022 12:17:05 +0800	[thread overview]
Message-ID: <20220913041707.197334-6-ZiyangZhang@linux.alibaba.com> (raw)
In-Reply-To: <20220913041707.197334-1-ZiyangZhang@linux.alibaba.com>

With the USER_RECOVERY feature enabled, monitor_work schedules
quiesce_work after finding a dying ubq_daemon. The quiesce_work's job
(condensed in the sketch after this list) is to:
(1) quiesce request queue.
(2) check if there is any INFLIGHT rq with UBLK_IO_FLAG_ACTIVE set.
    If so, we retry until all these rqs have been requeued by
    ublk_queue_rq() and task_work and become IDLE.
(3) requeue/abort inflight rqs issued to the crashed ubq_daemon before.
    If UBLK_F_USER_RECOVERY_REISSUE is set, the rq is requeued;
    otherwise it is aborted.
(4) complete all ioucmds by calling io_uring_cmd_done(). We are safe to
    do so because no ioucmd can be referenced now.
(5) set ub's state to UBLK_S_DEV_QUIESCED, which means we are ready for
    recovery. This state is exposed to userspace by GET_DEV_INFO.
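
A condensed view of this flow (not a compilable excerpt; the mutex,
the LIVE-state check and the dying-daemon check are in the real
ublk_quiesce_dev() in the diff below):

	static void ublk_quiesce_dev(struct ublk_device *ub)
	{
		int i;

		/* (1) stop dispatching new rqs */
		blk_mq_quiesce_queue(ub->ub_disk->queue);
		/* (2) wait until no inflight rq is mid-dispatch */
		ublk_wait_tagset_rqs_idle(ub);
		/* (3) + (4) requeue/abort old rqs, complete old ioucmds */
		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
			ublk_quiesce_queue(ub, ublk_get_queue(ub, i));
		/* (5) ready for recovery, visible via GET_DEV_INFO */
		ub->dev_info.state = UBLK_S_DEV_QUIESCED;
	}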

The driver can always handle STOP_DEV and clean up everything no matter
whether ub's state is LIVE or QUIESCED. After ub's state becomes
UBLK_S_DEV_QUIESCED, the user can recover with a new process by sending
START_USER_RECOVERY.
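
For reference, a ublk server might drive this from userspace roughly
as sketched below. This is a hypothetical sketch, not code from this
series: ublk_ctrl_cmd() stands in for whatever helper the server uses
to issue one control command (an IORING_OP_URING_CMD on
/dev/ublk-control), and UBLK_CMD_START_USER_RECOVERY is only added in
patch 6/7. The state values and struct come from <linux/ublk_cmd.h>
as extended by this series.

	#include <unistd.h>
	#include <linux/ublk_cmd.h>

	/* assumed helper: sends one ctrl cmd, returns the cqe result */
	int ublk_ctrl_cmd(unsigned int cmd_op, int dev_id, void *buf);

	static int wait_and_recover(int dev_id)
	{
		struct ublksrv_ctrl_dev_info info;

		/* poll GET_DEV_INFO until the driver reports QUIESCED */
		for (;;) {
			if (ublk_ctrl_cmd(UBLK_CMD_GET_DEV_INFO,
					  dev_id, &info) < 0)
				return -1;
			if (info.state == UBLK_S_DEV_QUIESCED)
				break;
			usleep(100 * 1000);
		}

		/* start a new daemon process here, then let it
		 * re-fetch ios (patch 6/7)
		 */
		return ublk_ctrl_cmd(UBLK_CMD_START_USER_RECOVERY,
				     dev_id, NULL);
	}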

Note: we do not change the default behavior with the recovery feature
disabled. monitor_work still schedules stop_work and aborts inflight
rqs. Finally the ublk_device is released.

Signed-off-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
---
 drivers/block/ublk_drv.c | 168 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 161 insertions(+), 7 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index b067f33a1913..4409a130d0b6 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -121,7 +121,7 @@ struct ublk_queue {
 
 	unsigned long io_addr;	/* mapped vm address */
 	unsigned int max_io_sz;
-	bool abort_work_pending;
+	bool force_abort;
 	unsigned short nr_io_ready;	/* how many ios setup */
 	struct ublk_device *dev;
 	struct ublk_io ios[0];
@@ -163,6 +163,7 @@ struct ublk_device {
 	 * monitor each queue's daemon periodically
 	 */
 	struct delayed_work	monitor_work;
+	struct work_struct	quiesce_work;
 	struct work_struct	stop_work;
 };
 
@@ -660,6 +661,11 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
 
 	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
+		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
+				__func__,
+				((struct ublk_queue *)req->mq_hctx->driver_data)->q_id,
+				req->tag,
+				io->flags);
 		io->flags |= UBLK_IO_FLAG_ABORTED;
 		blk_mq_end_request(req, BLK_STS_IOERR);
 	}
@@ -820,6 +826,21 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	res = ublk_setup_iod(ubq, rq);
 	if (unlikely(res != BLK_STS_OK))
 		return BLK_STS_IOERR;
+	/* With recovery feature enabled, force_abort is set in
+	 * ublk_stop_dev() before calling del_gendisk() if ub's state
+	 * is QUIESCED. We have to abort all requeued and new rqs here
+	 * to let del_gendisk() move on. Besides, we do not call
+	 * io_uring_cmd_complete_in_task() to avoid UAF on io_uring ctx.
+	 *
+	 * Note: force_abort is guaranteed to be seen because it is set
+	 * before the request queue is unquiesced.
+	 */
+	if (unlikely(ubq->force_abort)) {
+		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
+				__func__, ubq->q_id, rq->tag,
+				ubq->ios[rq->tag].flags);
+		return BLK_STS_IOERR;
+	}
 
 	blk_mq_start_request(bd->rq);
 
@@ -1003,6 +1024,101 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 	ublk_put_device(ub);
 }
 
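+/*
+ * blk_mq_tagset_busy_iter() callback: an inflight rq whose io slot
+ * still has UBLK_IO_FLAG_ACTIVE set is in the middle of being
+ * dispatched; with a dying ubq_daemon it will be requeued by
+ * ublk_queue_rq()/task_work and become idle.
+ */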
+static bool ublk_check_inflight_rq(struct request *rq, void *data)
+{
+	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+	struct ublk_io *io = &ubq->ios[rq->tag];
+	bool *busy = data;
+
+	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+		*busy = true;
+		return false;
+	}
+	return true;
+}
+
+static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
+{
+	bool busy = false;
+
+	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
+	while (true) {
+		blk_mq_tagset_busy_iter(&ub->tag_set,
+				ublk_check_inflight_rq, &busy);
+		if (busy)
+			msleep(UBLK_REQUEUE_DELAY_MS);
+		else
+			break;
+	}
+}
+
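+/*
+ * Called with the queue quiesced and the tagset idle: an io with
+ * ACTIVE cleared was already issued to the dying daemon, so its rq
+ * is requeued or failed; an io still ACTIVE only holds an old
+ * ioucmd, which is completed with UBLK_IO_RES_ABORT so that a new
+ * daemon can fetch the tag again later.
+ */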
+static void ublk_quiesce_queue(struct ublk_device *ub,
+		struct ublk_queue *ubq)
+{
+	int i;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+
+		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
+			struct request *rq = blk_mq_tag_to_rq(
+					ub->tag_set.tags[ubq->q_id], i);
+
+			WARN_ON_ONCE(!rq);
+			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
+					ublk_queue_can_use_recovery_reissue(ubq) ?
+					"requeue" : "abort",
+					ubq->q_id, i, io->flags);
+			if (ublk_queue_can_use_recovery_reissue(ubq))
+				blk_mq_requeue_request(rq, false);
+			else
+				__ublk_fail_req(io, rq);
+		} else {
+			pr_devel("%s: done old cmd: qid %d tag %d\n",
+					__func__, ubq->q_id, i);
+			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
+			io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+		}
+		ubq->nr_io_ready--;
+	}
+	WARN_ON_ONCE(ubq->nr_io_ready);
+}
+
+static void ublk_quiesce_dev(struct ublk_device *ub)
+{
+	int i;
+
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+		goto unlock;
+
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+		struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+		if (!ubq_daemon_is_dying(ubq))
+			goto unlock;
+	}
+	blk_mq_quiesce_queue(ub->ub_disk->queue);
+	ublk_wait_tagset_rqs_idle(ub);
+	pr_devel("%s: quiesce ub: dev_id %d\n",
+			__func__, ub->dev_info.dev_id);
+
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+		ublk_quiesce_queue(ub, ublk_get_queue(ub, i));
+
+	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
+ unlock:
+	mutex_unlock(&ub->mutex);
+}
+
+static void ublk_quiesce_work_fn(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, quiesce_work);
+
+	ublk_quiesce_dev(ub);
+}
+
 static void ublk_daemon_monitor_work(struct work_struct *work)
 {
 	struct ublk_device *ub =
@@ -1013,10 +1129,14 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
 
 		if (ubq_daemon_is_dying(ubq)) {
-			schedule_work(&ub->stop_work);
-
-			/* abort queue is for making forward progress */
-			ublk_abort_queue(ub, ubq);
+			if (ublk_queue_can_use_recovery(ubq)) {
+				schedule_work(&ub->quiesce_work);
+			} else {
+				schedule_work(&ub->stop_work);
+
+				/* abort queue is for making forward progress */
+				ublk_abort_queue(ub, ubq);
+			}
 		}
 	}
 
@@ -1080,12 +1200,43 @@ static void ublk_cancel_dev(struct ublk_device *ub)
 		ublk_cancel_queue(ublk_get_queue(ub, i));
 }
 
+static void ublk_unquiesce_dev(struct ublk_device *ub)
+{
+	int i;
+
+	pr_devel("%s: ub state %s\n", __func__,
+			ub->dev_info.state == UBLK_S_DEV_LIVE ?
+			"LIVE" : "QUIESCED");
+	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
+		/*
+		 * quiesce_work cannot be running. We let monitor_work,
+		 * ublk_queue_rq() and task_work abort rqs instead of
+		 * requeuing them with a dying ubq_daemon. Then
+		 * del_gendisk() can move on.
+		 */
+		ublk_disable_recovery(ub);
+	} else {
+		/* quiesce_work has run. We let requeued rqs be aborted
+		 * before running fallback_wq. "force_abort" must be seen
+		 * after the request queue is unquiesced. Then del_gendisk()
+		 * can move on.
+		 */
+		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+			ublk_get_queue(ub, i)->force_abort = true;
+
+		blk_mq_unquiesce_queue(ub->ub_disk->queue);
+		/* We may have requeued some rqs in ublk_quiesce_queue() */
+		blk_mq_kick_requeue_list(ub->ub_disk->queue);
+	}
+}
+
 static void ublk_stop_dev(struct ublk_device *ub)
 {
 	mutex_lock(&ub->mutex);
-	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
 		goto unlock;
-
+	if (ublk_can_use_recovery(ub))
+		ublk_unquiesce_dev(ub);
 	del_gendisk(ub->ub_disk);
 	ub->dev_info.state = UBLK_S_DEV_DEAD;
 	ub->dev_info.ublksrv_pid = -1;
@@ -1409,6 +1560,7 @@ static void ublk_remove(struct ublk_device *ub)
 {
 	ublk_stop_dev(ub);
 	cancel_work_sync(&ub->stop_work);
+	cancel_work_sync(&ub->quiesce_work);
 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
 	put_device(&ub->cdev_dev);
 }
@@ -1585,6 +1737,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 		goto out_unlock;
 	mutex_init(&ub->mutex);
 	spin_lock_init(&ub->mm_lock);
+	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
 	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
 	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
 
@@ -1705,6 +1858,7 @@ static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd)
 
 	ublk_stop_dev(ub);
 	cancel_work_sync(&ub->stop_work);
+	cancel_work_sync(&ub->quiesce_work);
 
 	ublk_put_device(ub);
 	return 0;
-- 
2.27.0

