All of lore.kernel.org
 help / color / mirror / Atom feed
From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Liu Bo <bo.liu@linux.alibaba.com>, Jens Axboe <axboe@kernel.dk>,
	Sasha Levin <sashal@kernel.org>,
	linux-block@vger.kernel.org
Subject: [PATCH AUTOSEL 4.19 64/64] blk-iolatency: fix IO hang due to negative inflight counter
Date: Thu, 28 Feb 2019 10:11:05 -0500	[thread overview]
Message-ID: <20190228151105.11277-64-sashal@kernel.org> (raw)
In-Reply-To: <20190228151105.11277-1-sashal@kernel.org>

From: Liu Bo <bo.liu@linux.alibaba.com>

[ Upstream commit 8c772a9bfc7c07c76f4a58b58910452fbb20843b ]

Our test reported the following stack, and the vmcore showed that the
->inflight counter is -1.

[ffffc9003fcc38d0] __schedule at ffffffff8173d95d
[ffffc9003fcc3958] schedule at ffffffff8173de26
[ffffc9003fcc3970] io_schedule at ffffffff810bb6b6
[ffffc9003fcc3988] blkcg_iolatency_throttle at ffffffff813911cb
[ffffc9003fcc3a20] rq_qos_throttle at ffffffff813847f3
[ffffc9003fcc3a48] blk_mq_make_request at ffffffff8137468a
[ffffc9003fcc3b08] generic_make_request at ffffffff81368b49
[ffffc9003fcc3b68] submit_bio at ffffffff81368d7d
[ffffc9003fcc3bb8] ext4_io_submit at ffffffffa031be00 [ext4]
[ffffc9003fcc3c00] ext4_writepages at ffffffffa03163de [ext4]
[ffffc9003fcc3d68] do_writepages at ffffffff811c49ae
[ffffc9003fcc3d78] __filemap_fdatawrite_range at ffffffff811b6188
[ffffc9003fcc3e30] filemap_write_and_wait_range at ffffffff811b6301
[ffffc9003fcc3e60] ext4_sync_file at ffffffffa030cee8 [ext4]
[ffffc9003fcc3ea8] vfs_fsync_range at ffffffff8128594b
[ffffc9003fcc3ee8] do_fsync at ffffffff81285abd
[ffffc9003fcc3f18] sys_fsync at ffffffff81285d50
[ffffc9003fcc3f28] do_syscall_64 at ffffffff81003c04
[ffffc9003fcc3f50] entry_SYSCALL_64_after_swapgs at ffffffff81742b8e

The ->inflight counter may be negative (-1) if

1) blk-iolatency was disabled when the IO was issued,

2) blk-iolatency was enabled before this IO reached its endio,

3) the ->inflight counter is decreased from 0 to -1 in endio()

In fact the hang can be easily reproduced by the below script,

H=/sys/fs/cgroup/unified/
P=/sys/fs/cgroup/unified/test

echo "+io" > $H/cgroup.subtree_control
mkdir -p $P

echo $$ > $P/cgroup.procs

xfs_io -f -d -c "pwrite 0 4k" /dev/sdg

echo "`cat /sys/block/sdg/dev` target=1000000" > $P/io.latency

xfs_io -f -d -c "pwrite 0 4k" /dev/sdg

This fixes the problem by freezing the queue so that while
enabling/disabling iolatency, there is no inflight rq running.

Note that quiesce_queue is not needed, as this is only updating iolatency
configuration, which the dispatching request_queue doesn't care about.

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 block/blk-iolatency.c | 52 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 19923f8a029dd..b154e057ca67c 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -72,6 +72,7 @@
 #include <linux/sched/loadavg.h>
 #include <linux/sched/signal.h>
 #include <trace/events/block.h>
+#include <linux/blk-mq.h>
 #include "blk-rq-qos.h"
 #include "blk-stat.h"
 
@@ -568,6 +569,9 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 		return;
 
 	enabled = blk_iolatency_enabled(iolat->blkiolat);
+	if (!enabled)
+		return;
+
 	while (blkg && blkg->parent) {
 		iolat = blkg_to_lat(blkg);
 		if (!iolat) {
@@ -577,7 +581,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 		rqw = &iolat->rq_wait;
 
 		atomic_dec(&rqw->inflight);
-		if (!enabled || iolat->min_lat_nsec == 0)
+		if (iolat->min_lat_nsec == 0)
 			goto next;
 		iolatency_record_time(iolat, &bio->bi_issue, now,
 				      issue_as_root);
@@ -721,10 +725,13 @@ int blk_iolatency_init(struct request_queue *q)
 	return 0;
 }
 
-static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+/*
+ * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise
+ * return 0.
+ */
+static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
 {
 	struct iolatency_grp *iolat = blkg_to_lat(blkg);
-	struct blk_iolatency *blkiolat = iolat->blkiolat;
 	u64 oldval = iolat->min_lat_nsec;
 
 	iolat->min_lat_nsec = val;
@@ -733,9 +740,10 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
 				    BLKIOLATENCY_MAX_WIN_SIZE);
 
 	if (!oldval && val)
-		atomic_inc(&blkiolat->enabled);
+		return 1;
 	if (oldval && !val)
-		atomic_dec(&blkiolat->enabled);
+		return -1;
+	return 0;
 }
 
 static void iolatency_clear_scaling(struct blkcg_gq *blkg)
@@ -768,6 +776,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	u64 lat_val = 0;
 	u64 oldval;
 	int ret;
+	int enable = 0;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
 	if (ret)
@@ -803,7 +812,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	blkg = ctx.blkg;
 	oldval = iolat->min_lat_nsec;
 
-	iolatency_set_min_lat_nsec(blkg, lat_val);
+	enable = iolatency_set_min_lat_nsec(blkg, lat_val);
+	if (enable) {
+		WARN_ON_ONCE(!blk_get_queue(blkg->q));
+		blkg_get(blkg);
+	}
+
 	if (oldval != iolat->min_lat_nsec) {
 		iolatency_clear_scaling(blkg);
 	}
@@ -811,6 +825,24 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	ret = 0;
 out:
 	blkg_conf_finish(&ctx);
+	if (ret == 0 && enable) {
+		struct iolatency_grp *tmp = blkg_to_lat(blkg);
+		struct blk_iolatency *blkiolat = tmp->blkiolat;
+
+		blk_mq_freeze_queue(blkg->q);
+
+		if (enable == 1)
+			atomic_inc(&blkiolat->enabled);
+		else if (enable == -1)
+			atomic_dec(&blkiolat->enabled);
+		else
+			WARN_ON_ONCE(1);
+
+		blk_mq_unfreeze_queue(blkg->q);
+
+		blkg_put(blkg);
+		blk_put_queue(blkg->q);
+	}
 	return ret ?: nbytes;
 }
 
@@ -910,8 +942,14 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd)
 {
 	struct iolatency_grp *iolat = pd_to_lat(pd);
 	struct blkcg_gq *blkg = lat_to_blkg(iolat);
+	struct blk_iolatency *blkiolat = iolat->blkiolat;
+	int ret;
 
-	iolatency_set_min_lat_nsec(blkg, 0);
+	ret = iolatency_set_min_lat_nsec(blkg, 0);
+	if (ret == 1)
+		atomic_inc(&blkiolat->enabled);
+	if (ret == -1)
+		atomic_dec(&blkiolat->enabled);
 	iolatency_clear_scaling(blkg);
 }
 
-- 
2.19.1


      parent reply	other threads:[~2019-02-28 15:22 UTC|newest]

Thread overview: 89+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-02-28 15:10 [PATCH AUTOSEL 4.19 01/64] ARM: OMAP: dts: N950/N9: fix onenand timings Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 02/64] ARM: dts: omap4-droid4: Fix typo in cpcap IRQ flags Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 03/64] ARM: dts: sun8i: h3: Add ethernet0 alias to Beelink X2 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 04/64] arm: dts: meson: Fix IRQ trigger type for macirq Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 05/64] ARM: dts: meson8b: odroidc1: mark the SD card detection GPIO active-low Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 06/64] ARM: dts: meson8m2: mxiii-plus: " Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 07/64] ARM: dts: imx6sx: correct backward compatible of gpt Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 08/64] arm64: dts: renesas: r8a7796: Enable DMA for SCIF2 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 09/64] arm64: dts: renesas: r8a77965: " Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 10/64] soc: fsl: qbman: avoid race in clearing QMan interrupt Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 11/64] pinctrl: mcp23s08: spi: Fix regmap allocation for mcp23s18 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 12/64] wlcore: sdio: Fixup power on/off sequence Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 13/64] bpftool: Fix prog dump by tag Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 14/64] bpftool: fix percpu maps updating Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 15/64] bpf: sock recvbuff must be limited by rmem_max in bpf_setsockopt() Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 16/64] ARM: pxa: ssp: unneeded to free devm_ allocated data Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 17/64] arm64: dts: add msm8996 compatible to gicv3 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 18/64] batman-adv: release station info tidstats Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 19/64] DTS: CI20: Fix bugs in ci20's device tree Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 20/64] usb: phy: fix link errors Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 21/64] irqchip/gic-v4: Fix occasional VLPI drop Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 22/64] irqchip/gic-v3-its: Gracefully fail on LPI exhaustion Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 23/64] irqchip/mmp: Only touch the PJ4 IRQ & FIQ bits on enable/disable Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 24/64] drm/amdgpu: Add missing power attribute to APU check Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 25/64] drm/radeon: check if device is root before getting pci speed caps Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 26/64] debugfs: return error values, not NULL Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 27/64] debugfs: debugfs_lookup() should return NULL if not found Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 28/64] drm/amdgpu: Transfer fences to dmabuf importer Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 29/64] net: stmmac: Fallback to Platform Data clock in Watchdog conversion Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 30/64] net: stmmac: Send TSO packets always from Queue 0 Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 31/64] net: stmmac: Disable EEE mode earlier in XMIT callback Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 32/64] irqchip/gic-v3-its: Fix ITT_entry_size accessor Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 33/64] relay: check return of create_buf_file() properly Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 34/64] blk-mq: protect debugfs_create_files() from failures Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 35/64] bpf, selftests: fix handling of sparse CPU allocations Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10   ` sashal
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 36/64] bpf: fix lockdep false positive in percpu_freelist Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 37/64] bpf: fix potential deadlock in bpf_prog_register Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 38/64] bpf: Fix syscall's stackmap lookup potential deadlock Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 39/64] drm/sun4i: tcon: Prepare and enable TCON channel 0 clock at init Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 40/64] dmaengine: at_xdmac: Fix wrongfull report of a channel as in use Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 41/64] vsock/virtio: fix kernel panic after device hot-unplug Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 42/64] vsock/virtio: reset connected sockets on device removal Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 43/64] dmaengine: dmatest: Abort test in case of mapping error Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 44/64] selftests: netfilter: fix config fragment CONFIG_NF_TABLES_INET Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10   ` sashal
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 45/64] selftests: netfilter: add simple masq/redirect test cases Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10   ` sashal
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 46/64] netfilter: nf_nat: skip nat clash resolution for same-origin entries Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 47/64] arm64: ptdump: Don't iterate kernel page tables using PTRS_PER_PXX Sasha Levin
2019-02-28 15:18   ` Will Deacon
2019-03-11 17:16     ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 48/64] s390/qeth: release cmd buffer in error paths Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 49/64] s390/qeth: fix use-after-free in error path Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 50/64] s390/qeth: cancel close_dev work before removing a card Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 51/64] perf symbols: Filter out hidden symbols from labels Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 52/64] perf trace: Support multiple "vfs_getname" probes Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 53/64] MIPS: Loongson: Introduce and use loongson_llsc_mb() Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 54/64] MIPS: Remove function size check in get_frame_info() Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 55/64] Revert "scsi: libfc: Add WARN_ON() when deleting rports" Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 56/64] i2c: omap: Use noirq system sleep pm ops to idle device for suspend Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 57/64] drm/amdgpu: use spin_lock_irqsave to protect vm_manager.pasid_idr Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:10 ` [PATCH AUTOSEL 4.19 58/64] nvme: lock NS list changes while handling command effects Sasha Levin
2019-02-28 15:10   ` Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 59/64] nvme-pci: fix rapid add remove sequence Sasha Levin
2019-02-28 15:11   ` Sasha Levin
2019-02-28 15:16   ` Keith Busch
2019-02-28 15:16     ` Keith Busch
2019-03-11 17:21     ` Sasha Levin
2019-03-11 17:21       ` Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 60/64] fs: ratelimit __find_get_block_slow() failure message Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 61/64] qed: Fix EQ full firmware assert Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 62/64] qed: Consider TX tcs while deriving the max num_queues for PF Sasha Levin
2019-02-28 15:11 ` [PATCH AUTOSEL 4.19 63/64] qede: Fix system crash on configuring channels Sasha Levin
2019-02-28 15:11 ` Sasha Levin [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190228151105.11277-64-sashal@kernel.org \
    --to=sashal@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=bo.liu@linux.alibaba.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.