* [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
@ 2017-11-04  1:55 Ming Lei
  2017-11-04 14:19 ` Jens Axboe
  2017-11-06 18:04   ` Bart Van Assche
  0 siblings, 2 replies; 48+ messages in thread
From: Ming Lei @ 2017-11-04  1:55 UTC (permalink / raw)
  To: Jens Axboe, linux-block, Christoph Hellwig
  Cc: Bart Van Assche, Laurence Oberman, linux-scsi, Omar Sandoval,
	John Garry, Ming Lei

It is very expensive to atomic_inc/atomic_dec the host wide counter of
host->busy_count, and it should have been avoided via blk-mq's mechanism
of getting driver tag, which uses the more efficient way of sbitmap queue.

Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget()
and don't run queue if the counter becomes zero, so IO hang may be caused
if all requests are completed just before the current SCSI device
is added to shost->starved_list.

Fixes: 0df21c86bdbf ("scsi: implement .get_budget and .put_budget for blk-mq")
Reported-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---

 drivers/scsi/scsi_lib.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a098af3070a1..7f218ef61900 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1944,11 +1944,7 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct scsi_device *sdev = q->queuedata;
-	struct Scsi_Host *shost = sdev->host;
 
-	atomic_dec(&shost->host_busy);
-	if (scsi_target(sdev)->can_queue > 0)
-		atomic_dec(&scsi_target(sdev)->target_busy);
 	atomic_dec(&sdev->device_busy);
 	put_device(&sdev->sdev_gendev);
 }
@@ -1957,7 +1953,6 @@ static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct scsi_device *sdev = q->queuedata;
-	struct Scsi_Host *shost = sdev->host;
 	blk_status_t ret;
 
 	ret = prep_to_mq(scsi_prep_state_check(sdev, NULL));
@@ -1968,18 +1963,9 @@ static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
 		goto out;
 	if (!scsi_dev_queue_ready(q, sdev))
 		goto out_put_device;
-	if (!scsi_target_queue_ready(shost, sdev))
-		goto out_dec_device_busy;
-	if (!scsi_host_queue_ready(q, shost, sdev))
-		goto out_dec_target_busy;
 
 	return BLK_STS_OK;
 
-out_dec_target_busy:
-	if (scsi_target(sdev)->can_queue > 0)
-		atomic_dec(&scsi_target(sdev)->target_busy);
-out_dec_device_busy:
-	atomic_dec(&sdev->device_busy);
 out_put_device:
 	put_device(&sdev->sdev_gendev);
 out:
@@ -1992,6 +1978,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct request_queue *q = req->q;
 	struct scsi_device *sdev = q->queuedata;
+	struct Scsi_Host *shost = sdev->host;
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
 	blk_status_t ret;
 	int reason;
@@ -2001,10 +1988,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 		goto out_put_budget;
 
 	ret = BLK_STS_RESOURCE;
+	if (!scsi_target_queue_ready(shost, sdev))
+		goto out_put_budget;
+	if (!scsi_host_queue_ready(q, shost, sdev))
+		goto out_dec_target_busy;
+
 	if (!(req->rq_flags & RQF_DONTPREP)) {
 		ret = prep_to_mq(scsi_mq_prep_fn(req));
 		if (ret != BLK_STS_OK)
-			goto out_put_budget;
+			goto out_dec_host_busy;
 		req->rq_flags |= RQF_DONTPREP;
 	} else {
 		blk_mq_start_request(req);
@@ -2022,11 +2014,16 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (reason) {
 		scsi_set_blocked(cmd, reason);
 		ret = BLK_STS_RESOURCE;
-		goto out_put_budget;
+		goto out_dec_host_busy;
 	}
 
 	return BLK_STS_OK;
 
+out_dec_host_busy:
+	atomic_dec(&shost->host_busy);
+out_dec_target_busy:
+	if (scsi_target(sdev)->can_queue > 0)
+		atomic_dec(&scsi_target(sdev)->target_busy);
 out_put_budget:
 	scsi_mq_put_budget(hctx);
 	switch (ret) {
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-04  1:55 [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget() Ming Lei
@ 2017-11-04 14:19 ` Jens Axboe
  2017-11-06 19:45     ` Bart Van Assche
  2017-11-06 18:04   ` Bart Van Assche
  1 sibling, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2017-11-04 14:19 UTC (permalink / raw)
  To: Ming Lei, linux-block, Christoph Hellwig
  Cc: Bart Van Assche, Laurence Oberman, linux-scsi, Omar Sandoval, John Garry

On 11/03/2017 07:55 PM, Ming Lei wrote:
> It is very expensive to atomic_inc/atomic_dec the host wide counter of
> host->busy_count, and it should have been avoided via blk-mq's mechanism
> of getting driver tag, which uses the more efficient way of sbitmap queue.
> 
> Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget()
> and don't run queue if the counter becomes zero, so IO hang may be caused
> if all requests are completed just before the current SCSI device
> is added to shost->starved_list.

This looks like an improvement. I have added it for 4.15.

Bart, does this fix your hang?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
@ 2017-11-06 18:04   ` Bart Van Assche
  0 siblings, 0 replies; 48+ messages in thread
From: Bart Van Assche @ 2017-11-06 18:04 UTC (permalink / raw)
  To: hch, linux-block, axboe, ming.lei
  Cc: linux-scsi, osandov, john.garry, loberman

On Sat, 2017-11-04 at 09:55 +0800, Ming Lei wrote:
> It is very expensive to atomic_inc/atomic_dec the host wide counter of
> host->busy_count, and it should have been avoided via blk-mq's mechanism
> of getting driver tag, which uses the more efficient way of sbitmap queue.

Did you perhaps mean the host->host_busy counter? Unrelated to this patch:
I think that commit 64d513ac31bd ("scsi: use host wide tags by default") made
that counter superfluous.

> Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget()
> and don't run queue if the counter becomes zero, so IO hang may be caused
> if all requests are completed just before the current SCSI device
> is added to shost->starved_list.

What is the relationship between the above description and the code changes
below? The above description does not explain whether the scsi_mq_get/put_budget()
changes below prevent that all outstanding SCSI requests complete before
another queue is added to the starved list.

Is this perhaps an attempt to move the code that can add a request queue to
"starved_list" from inside scsi_mq_get_budget() into scsi_queue_rq()? Does
this patch more than reducing the probability that the race is encountered
that a queue is added to "starved_list" after all requests have finished?

> Fixes: 0df21c86bdbf ("scsi: implement .get_budget and .put_budget for blk-mq")
> Reported-by: Bart Van Assche <bart.vanassche@wdc.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>

Since this is a SCSI patch the SCSI maintainer, Martin Petersen, should have
been Cc-ed for this patch. Additionally, I think that this patch should not
have been queued by Jens before Martin had approved this patch.

> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index a098af3070a1..7f218ef61900 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1944,11 +1944,7 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct request_queue *q = hctx->queue;
>  	struct scsi_device *sdev = q->queuedata;
> -	struct Scsi_Host *shost = sdev->host;
>  
> -	atomic_dec(&shost->host_busy);
> -	if (scsi_target(sdev)->can_queue > 0)
> -		atomic_dec(&scsi_target(sdev)->target_busy);
>  	atomic_dec(&sdev->device_busy);
>  	put_device(&sdev->sdev_gendev);
>  }

scsi_mq_get/put_budget() were introduced to improve sequential I/O
performance. Does removing the SCSI target busy check have a negative
performance on sequential I/O for e.g. the iSER initiator driver? The iSCSI
over TCP initiator driver is not appropriate for testing performance
regressions because it limits the iSCSI parameter MaxOutstandingR2T to one.

Bart.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
@ 2017-11-06 19:45     ` Bart Van Assche
  0 siblings, 0 replies; 48+ messages in thread
From: Bart Van Assche @ 2017-11-06 19:45 UTC (permalink / raw)
  To: hch, linux-block, axboe, ming.lei
  Cc: linux-scsi, osandov, john.garry, loberman

On Sat, 2017-11-04 at 08:19 -0600, Jens Axboe wrote:
> On 11/03/2017 07:55 PM, Ming Lei wrote:
> > It is very expensive to atomic_inc/atomic_dec the host wide counter of
> > host->busy_count, and it should have been avoided via blk-mq's mechanism
> > of getting driver tag, which uses the more efficient way of sbitmap queue.
> > 
> > Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget()
> > and don't run queue if the counter becomes zero, so IO hang may be caused
> > if all requests are completed just before the current SCSI device
> > is added to shost->starved_list.
> 
> This looks like an improvement. I have added it for 4.15.
> 
> Bart, does this fix your hang?

No, it doesn't. After I had reduced starget->can_queue in the SRP initiator I
ran into the following hang while running the srp-test software:

sysrq: SysRq : Show Blocked State
  task                        PC stack   pid father
systemd-udevd   D    0 19882    467 0x80000106
Call Trace:
 __schedule+0x2fa/0xbb0
 schedule+0x36/0x90
 io_schedule+0x16/0x40
 __lock_page+0x10a/0x140
 truncate_inode_pages_range+0x4ff/0x800
 truncate_inode_pages+0x15/0x20
 kill_bdev+0x35/0x40
 __blkdev_put+0x6d/0x1f0
 blkdev_put+0x4e/0x130
 blkdev_close+0x25/0x30
 __fput+0xed/0x1f0
 ____fput+0xe/0x10
 task_work_run+0x8b/0xc0
 do_exit+0x38d/0xc70
 do_group_exit+0x50/0xd0
 get_signal+0x2ad/0x8c0
 do_signal+0x28/0x680
 exit_to_usermode_loop+0x5a/0xa0
 do_syscall_64+0x12e/0x170
 entry_SYSCALL64_slow_path+0x25/0x25

The SRP initiator driver was modified as follows for this test:

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index a6664467651e..9d24a871cc2e 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c

@@ -2835,6 +2839,13 @@ static int srp_reset_host(struct scsi_cmnd *scmnd)
 	return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
 }
 
+static int srp_target_alloc(struct scsi_target *starget)
+{
+	starget->can_queue = 1;
+	return 0;
+}
+
 static int srp_slave_alloc(struct scsi_device *sdev)
 {
 	struct Scsi_Host *shost = sdev->host;
@@ -3039,6 +3050,7 @@ static struct scsi_host_template srp_template = {
 	.module				= THIS_MODULE,
 	.name				= "InfiniBand SRP initiator",
 	.proc_name			= DRV_NAME,
+	.target_alloc			= srp_target_alloc,
 	.slave_alloc			= srp_slave_alloc,
 	.slave_configure		= srp_slave_configure,
 	.info				= srp_target_info,

Bart.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-06 19:45     ` Bart Van Assche
  (?)
@ 2017-11-07  2:11     ` Ming Lei
  2017-11-07 16:20         ` Bart Van Assche
  -1 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-07  2:11 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: hch, linux-block, axboe, linux-scsi, osandov, john.garry, loberman

On Mon, Nov 06, 2017 at 07:45:23PM +0000, Bart Van Assche wrote:
> On Sat, 2017-11-04 at 08:19 -0600, Jens Axboe wrote:
> > On 11/03/2017 07:55 PM, Ming Lei wrote:
> > > It is very expensive to atomic_inc/atomic_dec the host wide counter of
> > > host->busy_count, and it should have been avoided via blk-mq's mechanism
> > > of getting driver tag, which uses the more efficient way of sbitmap queue.
> > > 
> > > Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget()
> > > and don't run queue if the counter becomes zero, so IO hang may be caused
> > > if all requests are completed just before the current SCSI device
> > > is added to shost->starved_list.
> > 
> > This looks like an improvement. I have added it for 4.15.
> > 
> > Bart, does this fix your hang?
> 
> No, it doesn't. After I had reduced starget->can_queue in the SRP initiator I
> ran into the following hang while running the srp-test software:
> 
> sysrq: SysRq : Show Blocked State
>   task                        PC stack   pid father
> systemd-udevd   D    0 19882    467 0x80000106
> Call Trace:
>  __schedule+0x2fa/0xbb0
>  schedule+0x36/0x90
>  io_schedule+0x16/0x40
>  __lock_page+0x10a/0x140
>  truncate_inode_pages_range+0x4ff/0x800
>  truncate_inode_pages+0x15/0x20
>  kill_bdev+0x35/0x40
>  __blkdev_put+0x6d/0x1f0
>  blkdev_put+0x4e/0x130
>  blkdev_close+0x25/0x30
>  __fput+0xed/0x1f0
>  ____fput+0xe/0x10
>  task_work_run+0x8b/0xc0
>  do_exit+0x38d/0xc70
>  do_group_exit+0x50/0xd0
>  get_signal+0x2ad/0x8c0
>  do_signal+0x28/0x680
>  exit_to_usermode_loop+0x5a/0xa0
>  do_syscall_64+0x12e/0x170
>  entry_SYSCALL64_slow_path+0x25/0x25

I can't reproduce your issue on IB/SRP any more against v4.14-rc4 with
the following patches applied, and I see no hang after running your 6
srp-test tests:

88022d7201e9 blk-mq: don't handle failure in .get_budget
826a70a08b12 SCSI: don't get target/host busy_count in scsi_mq_get_budget()
1f460b63d4b3 blk-mq: don't restart queue when .get_budget returns BLK_STS_RESOURCE
358a3a6bccb7 blk-mq: don't handle TAG_SHARED in restart
0df21c86bdbf scsi: implement .get_budget and .put_budget for blk-mq
aeec77629a4a scsi: allow passing in null rq to scsi_prep_state_check()
b347689ffbca blk-mq-sched: improve dispatching from sw queue
de1482974080 blk-mq: introduce .get_budget and .put_budget in blk_mq_ops
63ba8e31c3ac block: kyber: check if there are requests in ctx in kyber_has_work()
7930d0a00ff5 sbitmap: introduce __sbitmap_for_each_set()
caf8eb0d604a blk-mq-sched: move actual dispatching into one helper
5e3d02bbafad blk-mq-sched: dispatch from scheduler IFF progress is made in ->dispatch


If you can reproduce, please provide me at least the following log
first:

	find /sys/kernel/debug/block -name tags | xargs cat | grep busy

If any pending requests aren't completed, please provide the related
info from debugfs about where the request is.

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-06 18:04   ` Bart Van Assche
  (?)
@ 2017-11-07  2:19   ` Ming Lei
  2017-11-07  3:53       ` Martin K. Petersen
  -1 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-07  2:19 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: hch, linux-block, axboe, linux-scsi, osandov, john.garry, loberman

On Mon, Nov 06, 2017 at 06:04:42PM +0000, Bart Van Assche wrote:
> On Sat, 2017-11-04 at 09:55 +0800, Ming Lei wrote:
> > It is very expensive to atomic_inc/atomic_dec the host wide counter of
> > host->busy_count, and it should have been avoided via blk-mq's mechanism
> > of getting driver tag, which uses the more efficient way of sbitmap queue.
> 
> Did you perhaps mean the host->host_busy counter? Unrelated to this patch:
> I think that commit 64d513ac31bd ("scsi: use host wide tags by default") made
> that counter superfluous.

But this counter is still inc/dec'd in .get_budget(), so my patch moves
that until after the driver tag has been acquired, which is much more
efficient.
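
To illustrate the cost: below is a toy userspace sketch (not kernel
code; the per-thread counters only stand in for what the sbitmap-based
driver tags achieve, and all names and sizes are made up) contrasting
one shared atomic, bounced between CPUs, with counters that each live
on their own cache line:

/* gcc -O2 -pthread busy-counter-demo.c -o busy-counter-demo */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS 4
#define NOPS (4 * 1024 * 1024)

static atomic_long shared_busy;			/* models shost->host_busy */

struct slot {					/* models a per-thread counter */
	atomic_long cnt;
	char pad[64 - sizeof(atomic_long)];	/* avoid false sharing */
};
static struct slot slots[NTHREADS];

static void *worker(void *arg)
{
	atomic_long *c = arg ? arg : &shared_busy;

	for (long i = 0; i < NOPS; i++) {
		atomic_fetch_add(c, 1);		/* "get" */
		atomic_fetch_sub(c, 1);		/* "put" */
	}
	return NULL;
}

static double run(int percpu)
{
	pthread_t tid[NTHREADS];
	struct timespec t0, t1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker,
			       percpu ? (void *)&slots[i].cnt : NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int main(void)
{
	printf("one shared atomic : %.2fs\n", run(0));
	printf("per-thread atomics: %.2fs\n", run(1));
	return 0;
}
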

> 
> > Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget()
> > and don't run queue if the counter becomes zero, so IO hang may be caused
> > if all requests are completed just before the current SCSI device
> > is added to shost->starved_list.
> 
> What is the relationship between the above description and the code changes
> below? The above description does not explain whether the scsi_mq_get/put_budget()
> changes below prevent that all outstanding SCSI requests complete before
> another queue is added to the starved list.
> 
> Is this perhaps an attempt to move the code that can add a request queue to
> "starved_list" from inside scsi_mq_get_budget() into scsi_queue_rq()? Does
> this patch more than reducing the probability that the race is encountered
> that a queue is added to "starved_list" after all requests have finished?

This patch moves scsi_target_queue_ready() and scsi_host_queue_ready()
into scsi_queue_rq(), so if any one of them returns busy, we will check
if the queue is idle via the following:

	if (atomic_read(&sdev->device_busy) == 0 &&
	    !scsi_device_blocked(sdev))
		blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);

Then an sdev added to 'starved_list' will be visible in scsi_end_request().
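
For reference, the completion side looks roughly like this (a
simplified sketch of scsi_end_request() in drivers/scsi/scsi_lib.c,
quoted from memory rather than the exact code):

	/*
	 * Once the sdev is on shost->starved_list, any later
	 * completion takes the requeue path instead of only
	 * re-running its own queue, which is what makes the
	 * addition "visible":
	 */
	if (scsi_target(sdev)->single_lun ||
	    !list_empty(&sdev->host->starved_list))
		kblockd_schedule_work(&sdev->requeue_work);
	else
		blk_mq_run_hw_queues(q, true);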

I am pretty sure this patch fixes the issue in my test.

> 
> > Fixes: 0df21c86bdbf ("scsi: implement .get_budget and .put_budget for blk-mq")
> > Reported-by: Bart Van Assche <bart.vanassche@wdc.com>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> 
> Since this is a SCSI patch the SCSI maintainer, Martin Petersen, should have
> been Cc-ed for this patch. Additionally, I think that this patch should not
> have been queued by Jens before Martin had approved this patch.

This patch has been CCed to the SCSI list.

> 
> > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> > index a098af3070a1..7f218ef61900 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -1944,11 +1944,7 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
> >  {
> >  	struct request_queue *q = hctx->queue;
> >  	struct scsi_device *sdev = q->queuedata;
> > -	struct Scsi_Host *shost = sdev->host;
> >  
> > -	atomic_dec(&shost->host_busy);
> > -	if (scsi_target(sdev)->can_queue > 0)
> > -		atomic_dec(&scsi_target(sdev)->target_busy);
> >  	atomic_dec(&sdev->device_busy);
> >  	put_device(&sdev->sdev_gendev);
> >  }
> 
> scsi_mq_get/put_budget() were introduced to improve sequential I/O
> performance. Does removing the SCSI target busy check have a negative
> performance on sequential I/O for e.g. the iSER initiator driver? The iSCSI
> over TCP initiator driver is not appropriate for testing performance
> regressions because it limits the iSCSI parameter MaxOutstandingR2T to one.

At least in my test, it doesn't. If you have some report, please let me
know, and we still can improve the case.

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07  2:19   ` Ming Lei
@ 2017-11-07  3:53       ` Martin K. Petersen
  0 siblings, 0 replies; 48+ messages in thread
From: Martin K. Petersen @ 2017-11-07  3:53 UTC (permalink / raw)
  To: Ming Lei
  Cc: Bart Van Assche, hch, linux-block, axboe, linux-scsi, osandov,
	john.garry, loberman


Ming,

>> Since this is a SCSI patch the SCSI maintainer, Martin Petersen,
>> should have been Cc-ed for this patch. Additionally, I think that
>> this patch should not have been queued by Jens before Martin had
>> approved this patch.
>
> This patch has been CCed to the SCSI list.

I don't need a personal CC; linux-scsi is sufficient. However, as a
general rule, changes to any file in drivers/scsi need to go through the
SCSI tree.

We sometimes make exceptions to facilitate the merge process. In those
cases an Acked-by: will be provided to indicate that it is OK that the
patch goes through a different tree.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-06 19:45     ` Bart Van Assche
  (?)
  (?)
@ 2017-11-07 10:15     ` Ming Lei
  2017-11-07 16:17         ` Bart Van Assche
  -1 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-07 10:15 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: hch, linux-block, axboe, linux-scsi, osandov, john.garry, loberman

On Mon, Nov 06, 2017 at 07:45:23PM +0000, Bart Van Assche wrote:
> On Sat, 2017-11-04 at 08:19 -0600, Jens Axboe wrote:
> > On 11/03/2017 07:55 PM, Ming Lei wrote:
> > > It is very expensive to atomic_inc/atomic_dec the host wide counter of
> > > host->busy_count, and it should have been avoided via blk-mq's mechanism
> > > of getting driver tag, which uses the more efficient way of sbitmap queue.
> > > 
> > > Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget()
> > > and don't run queue if the counter becomes zero, so IO hang may be caused
> > > if all requests are completed just before the current SCSI device
> > > is added to shost->starved_list.
> > 
> > This looks like an improvement. I have added it for 4.15.
> > 
> > Bart, does this fix your hang?
> 
> No, it doesn't. After I had reduced starget->can_queue in the SRP initiator I
> ran into the following hang while running the srp-test software:
> 
> sysrq: SysRq : Show Blocked State
>   task                        PC stack   pid father
> systemd-udevd   D    0 19882    467 0x80000106
> Call Trace:
>  __schedule+0x2fa/0xbb0
>  schedule+0x36/0x90
>  io_schedule+0x16/0x40
>  __lock_page+0x10a/0x140
>  truncate_inode_pages_range+0x4ff/0x800
>  truncate_inode_pages+0x15/0x20
>  kill_bdev+0x35/0x40
>  __blkdev_put+0x6d/0x1f0
>  blkdev_put+0x4e/0x130
>  blkdev_close+0x25/0x30
>  __fput+0xed/0x1f0
>  ____fput+0xe/0x10
>  task_work_run+0x8b/0xc0
>  do_exit+0x38d/0xc70
>  do_group_exit+0x50/0xd0
>  get_signal+0x2ad/0x8c0
>  do_signal+0x28/0x680
>  exit_to_usermode_loop+0x5a/0xa0
>  do_syscall_64+0x12e/0x170
>  entry_SYSCALL64_slow_path+0x25/0x25
> 
> The SRP initiator driver was modified as follows for this test:
> 
> diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
> index a6664467651e..9d24a871cc2e 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.c
> +++ b/drivers/infiniband/ulp/srp/ib_srp.c
> 
> @@ -2835,6 +2839,13 @@ static int srp_reset_host(struct scsi_cmnd *scmnd)
>  	return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
>  }
>  
> +static int srp_target_alloc(struct scsi_target *starget)
> +{
> +	starget->can_queue = 1;
> +	return 0;
> +}
> +
>  static int srp_slave_alloc(struct scsi_device *sdev)
>  {
>  	struct Scsi_Host *shost = sdev->host;
> @@ -3039,6 +3050,7 @@ static struct scsi_host_template srp_template = {
>  	.module				= THIS_MODULE,
>  	.name				= "InfiniBand SRP initiator",
>  	.proc_name			= DRV_NAME,
> +	.target_alloc			= srp_target_alloc,
>  	.slave_alloc			= srp_slave_alloc,
>  	.slave_configure		= srp_slave_configure,
>  	.info				= srp_target_info,

Last time you didn't mention the patch that sets the target's
can_queue to 1, so I think you can't reproduce the issue on an
upstream kernel without that out-of-tree patch. Then it looks like
this is another issue, and we are actually making progress.

I just posted a one-line patch that should address the small queue
depth issue; please let us know if it fixes your issue:

	https://marc.info/?l=linux-block&m=151004881411480&w=2
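
The idea is essentially the following (sketch only; function names
are approximate, see the link for the actual patch):

	/* in blk_mq_dispatch_rq_list(), after the driver tag has
	 * already been allocated for rq: */
	if (!blk_mq_get_dispatch_budget(hctx)) {
		blk_mq_put_driver_tag(hctx, rq);	/* don't leak the tag */
		break;
	}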

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
@ 2017-11-07 16:17         ` Bart Van Assche
  0 siblings, 0 replies; 48+ messages in thread
From: Bart Van Assche @ 2017-11-07 16:17 UTC (permalink / raw)
  To: ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, axboe, loberman

On Tue, 2017-11-07 at 18:15 +0800, Ming Lei wrote:
> Last time you didn't mention the patch that sets the target's
> can_queue to 1, so I think you can't reproduce the issue on an
> upstream kernel without that out-of-tree patch. Then it looks like
> this is another issue, and we are actually making progress.

If I don't trust a patch I introduce additional tests. The fact that I
modified the SRP initiator before this hang occurred does not mean that the
approach of your patch is fine. What this means is that all your patch does
is to reduce the race window and that there is still a race window.

Bart.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
@ 2017-11-07 16:20         ` Bart Van Assche
  0 siblings, 0 replies; 48+ messages in thread
From: Bart Van Assche @ 2017-11-07 16:20 UTC (permalink / raw)
  To: ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, axboe, loberman

On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> If you can reproduce, please provide me at least the following log
> first:
> 
> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> 
> If any pending requests aren't completed, please provide the related
> info from debugfs about where the request is.

Every time I ran the above or a similar command its output was empty. I
assume that's because the hang usually occurs in a phase where these debugfs
attributes either have not yet been created or have already disappeared.

Bart.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 16:20         ` Bart Van Assche
  (?)
@ 2017-11-07 16:29         ` Jens Axboe
  2017-11-07 17:10           ` Jens Axboe
  2017-11-07 17:34             ` Bart Van Assche
  -1 siblings, 2 replies; 48+ messages in thread
From: Jens Axboe @ 2017-11-07 16:29 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On 11/07/2017 09:20 AM, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>> If you can reproduce, please provide me at least the following log
>> first:
>>
>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>
>> If any pending requests aren't completed, please provide the related
>> info from debugfs about where the request is.
> 
> Every time I ran the above or a similar command its output was empty. I
> assume that's because the hang usually occurs in a phase where these debugfs
> attributes either have not yet been created or have already disappeared.

Bart, do you still see a hang with the patch that fixes the tag leak when
we fail to get a dispatch budget?

https://marc.info/?l=linux-block&m=151004881411480&w=2

I've run a lot of stability testing here, and I haven't run into any
issues. This is with shared tags as well. So if you still see the failure
case with the current tree AND the above patch, then I'll try and get
a test case setup that hits it too so we can get to the bottom of this.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 16:29         ` Jens Axboe
@ 2017-11-07 17:10           ` Jens Axboe
  2017-11-07 17:36             ` Jens Axboe
  2017-11-07 17:34             ` Bart Van Assche
  1 sibling, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2017-11-07 17:10 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On 11/07/2017 09:29 AM, Jens Axboe wrote:
> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>> If you can reproduce, please provide me at least the following log
>>> first:
>>>
>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>
>>> If any pending requests aren't completed, please provide the related
>>> info from debugfs about where the request is.
>>
>> Every time I ran the above or a similar command its output was empty. I
>> assume that's because the hang usually occurs in a phase where these debugfs
>> attributes either have not yet been created or have already disappeared.
> 
> Bart, do you still see a hang with the patch that fixes the tag leak when
> we fail to get a dispatch budget?
> 
> https://marc.info/?l=linux-block&m=151004881411480&w=2
> 
> I've run a lot of stability testing here, and I haven't run into any
> issues. This is with shared tags as well. So if you still see the failure
> case with the current tree AND the above patch, then I'll try and get
> a test case setup that hits it too so we can get to the bottom of this.

Ming, I managed to reproduce the hang using null_blk. Note this is
WITHOUT the patch mentioned above, running with that now.

# modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1

and using this fio file:

[global]
bs=4k
rw=randread
norandommap
direct=1
ioengine=libaio
iodepth=4

[nullb0]
filename=/dev/nullb0
[nullb1]
filename=/dev/nullb1
[nullb2]
filename=/dev/nullb2
[nullb3]
filename=/dev/nullb3

it seemed to keep running, but it hung when exiting. The troublesome
device was nullb1:

[  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
[  492.520782]       Not tainted 4.14.0-rc7+ #499
[  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  492.535904] fio             D13208  3263   3211 0x00000000
[  492.542535] Call Trace:
[  492.545764]  __schedule+0x279/0x720
[  492.550151]  schedule+0x33/0x90
[  492.554145]  io_schedule+0x16/0x40
[  492.558435]  blk_mq_get_tag+0x148/0x250
[  492.563211]  ? finish_wait+0x90/0x90
[  492.567693]  blk_mq_get_request+0xf0/0x3e0
[  492.572760]  blk_mq_make_request+0xe2/0x690
[  492.577913]  generic_make_request+0xfc/0x2f0
[  492.583177]  submit_bio+0x64/0x120
[  492.587475]  ? set_page_dirty_lock+0x4b/0x60
[  492.592736]  ? submit_bio+0x64/0x120
[  492.597309]  ? bio_set_pages_dirty+0x55/0x60
[  492.602570]  blkdev_direct_IO+0x388/0x3c0
[  492.607546]  ? free_ioctx_users+0xe0/0xe0
[  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
[  492.618353]  ? _raw_spin_unlock+0xe/0x20
[  492.623227]  generic_file_read_iter+0xb3/0xa00
[  492.628682]  ? generic_file_read_iter+0xb3/0xa00
[  492.634334]  ? security_file_permission+0x9b/0xc0
[  492.640114]  blkdev_read_iter+0x35/0x40
[  492.644877]  aio_read+0xc5/0x120
[  492.648973]  ? aio_read_events+0x24c/0x340
[  492.654124]  ? __might_sleep+0x4a/0x80
[  492.658800]  do_io_submit+0x47c/0x5e0
[  492.663373]  ? do_io_submit+0x47c/0x5e0
[  492.668234]  SyS_io_submit+0x10/0x20
[  492.672715]  ? SyS_io_submit+0x10/0x20
[  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
[  492.683039] RIP: 0033:0x7f83d1871717
[  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
[  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
[  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
[  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
[  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
[  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368

and if we look at the debug entries, it's waiting on a scheduler tag:

sched_tags:nr_tags=2
sched_tags:nr_reserved_tags=0
sched_tags:active_queues=0
sched_tags:bitmap_tags:
sched_tags:depth=2
sched_tags:busy=2
sched_tags:bits_per_word=64
sched_tags:map_nr=1
sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
sched_tags:wake_batch=1
sched_tags:wake_index=4
sched_tags:ws={
sched_tags:	{.wait_cnt=-102, .wait=inactive},
sched_tags:	{.wait_cnt=-126, .wait=inactive},
sched_tags:	{.wait_cnt=-72, .wait=inactive},
sched_tags:	{.wait_cnt=-96, .wait=inactive},
sched_tags:	{.wait_cnt=-134, .wait=inactive},
sched_tags:	{.wait_cnt=-116, .wait=inactive},
sched_tags:	{.wait_cnt=-90, .wait=inactive},
sched_tags:	{.wait_cnt=-115, .wait=active},
sched_tags:}
sched_tags:round_robin=0
sched_tags_bitmap:00000000: 03

with SCHED_RESTART being set:

state:SCHED_RESTART

and with the driver tags being idle:

tags:nr_tags=1
tags:nr_reserved_tags=0
tags:active_queues=0
tags:bitmap_tags:
tags:depth=1
tags:busy=0
tags:bits_per_word=64
tags:map_nr=1
tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
tags:wake_batch=1
tags:wake_index=3
tags:ws={
tags:	{.wait_cnt=-48, .wait=inactive},
tags:	{.wait_cnt=-39, .wait=inactive},
tags:	{.wait_cnt=-50, .wait=inactive},
tags:	{.wait_cnt=-47, .wait=inactive},
tags:	{.wait_cnt=-25, .wait=inactive},
tags:	{.wait_cnt=-24, .wait=inactive},
tags:	{.wait_cnt=-47, .wait=inactive},
tags:	{.wait_cnt=-47, .wait=inactive},
tags:}

Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
@ 2017-11-07 17:34             ` Bart Van Assche
  0 siblings, 0 replies; 48+ messages in thread
From: Bart Van Assche @ 2017-11-07 17:34 UTC (permalink / raw)
  To: axboe, ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On Tue, 2017-11-07 at 09:29 -0700, Jens Axboe wrote:
> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
> > On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> > > If you can reproduce, please provide me at least the following log
> > > first:
> > > 
> > > 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> > > 
> > > If any pending requests aren't completed, please provide the related
> > > info from debugfs about where the request is.
> > 
> > Every time I ran the above or a similar command its output was empty. I
> > assume that's because the hang usually occurs in a phase where these debugfs
> > attributes either have not yet been created or have already disappeared.
> 
> Bart, do you still see a hang with the patch that fixes the tag leak when
> we fail to get a dispatch budget?
> 
> https://marc.info/?l=linux-block&m=151004881411480&w=2
> 
> I've run a lot of stability testing here, and I haven't run into any
> issues. This is with shared tags as well. So if you still see the failure
> case with the current tree AND the above patch, then I'll try and get
> a test case setup that hits it too so we can get to the bottom of this.

It took a little longer than expected but I just ran into the following
lockup with your for-next branch of this morning (commit e8fa44bb8af9) and
Ming's patch "blk-mq: put driver tag if dispatch budget can't be got"
applied on top of it:

[ 2575.324678] sysrq: SysRq : Show Blocked State
[ 2575.332336]   task                        PC stack   pid father
[ 2575.345239] systemd-udevd   D    0 47577    518 0x00000106
[ 2575.353821] Call Trace:
[ 2575.358805]  __schedule+0x28b/0x890
[ 2575.364906]  schedule+0x36/0x80
[ 2575.370436]  io_schedule+0x16/0x40
[ 2575.376287]  __lock_page+0xfc/0x140
[ 2575.382061]  ? page_cache_tree_insert+0xc0/0xc0
[ 2575.388943]  truncate_inode_pages_range+0x5e8/0x830
[ 2575.396083]  truncate_inode_pages+0x15/0x20
[ 2575.402398]  kill_bdev+0x2f/0x40
[ 2575.407538]  __blkdev_put+0x74/0x1f0
[ 2575.413010]  ? kmem_cache_free+0x197/0x1c0
[ 2575.418986]  blkdev_put+0x4c/0xd0
[ 2575.424040]  blkdev_close+0x34/0x70
[ 2575.429216]  __fput+0xe7/0x220
[ 2575.433863]  ____fput+0xe/0x10
[ 2575.438490]  task_work_run+0x76/0x90
[ 2575.443619]  do_exit+0x2e0/0xaf0
[ 2575.448311]  do_group_exit+0x43/0xb0
[ 2575.453386]  get_signal+0x299/0x5e0
[ 2575.458303]  do_signal+0x37/0x740
[ 2575.462976]  ? blkdev_read_iter+0x35/0x40
[ 2575.468425]  ? new_sync_read+0xde/0x130
[ 2575.473620]  ? vfs_read+0x115/0x130
[ 2575.478388]  exit_to_usermode_loop+0x80/0xd0
[ 2575.484002]  do_syscall_64+0xb3/0xc0
[ 2575.488813]  entry_SYSCALL64_slow_path+0x25/0x25
[ 2575.494759] RIP: 0033:0x7efd829cbd11
[ 2575.499506] RSP: 002b:00007ffff984f978 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[ 2575.508741] RAX: 0000000000022000 RBX: 000055f19f902ca0 RCX: 00007efd829cbd11
[ 2575.517455] RDX: 0000000000040000 RSI: 000055f19f902cc8 RDI: 0000000000000007
[ 2575.526163] RBP: 000055f19f7fb9d0 R08: 0000000000000000 R09: 000055f19f902ca0
[ 2575.534860] R10: 000055f19f902cb8 R11: 0000000000000246 R12: 0000000000000000
[ 2575.544250] R13: 0000000000040000 R14: 000055f19f7fba20 R15: 0000000000040000

Bart.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 17:10           ` Jens Axboe
@ 2017-11-07 17:36             ` Jens Axboe
  2017-11-07 22:06               ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2017-11-07 17:36 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On 11/07/2017 10:10 AM, Jens Axboe wrote:
> On 11/07/2017 09:29 AM, Jens Axboe wrote:
>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>> If you can reproduce, please provide me at least the following log
>>>> first:
>>>>
>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>
>>>> If any pending requests aren't completed, please provide the related
>>>> info from debugfs about where the request is.
>>>
>>> Every time I ran the above or a similar command its output was empty. I
>>> assume that's because the hang usually occurs in a phase where these debugfs
>>> attributes either have not yet been created or have already disappeared.
>>
>> Bart, do you still see a hang with the patch that fixes the tag leak when
>> we fail to get a dispatch budget?
>>
>> https://marc.info/?l=linux-block&m=151004881411480&w=2
>>
>> I've run a lot of stability testing here, and I haven't run into any
>> issues. This is with shared tags as well. So if you still see the failure
>> case with the current tree AND the above patch, then I'll try and get
>> a test case setup that hits it too so we can get to the bottom of this.
> 
> Ming, I managed to reproduce the hang using null_blk. Note this is
> WITHOUT the patch mentioned above, running with that now.
> 
> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
> 
> and using this fio file:
> 
> [global]
> bs=4k
> rw=randread
> norandommap
> direct=1
> ioengine=libaio
> iodepth=4
> 
> [nullb0]
> filename=/dev/nullb0
> [nullb1]
> filename=/dev/nullb1
> [nullb2]
> filename=/dev/nullb2
> [nullb3]
> filename=/dev/nullb3
> 
> it seemed to keep running, but it hung when exiting. The troublesome
> device was nullb1:
> 
> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
> [  492.520782]       Not tainted 4.14.0-rc7+ #499
> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  492.535904] fio             D13208  3263   3211 0x00000000
> [  492.542535] Call Trace:
> [  492.545764]  __schedule+0x279/0x720
> [  492.550151]  schedule+0x33/0x90
> [  492.554145]  io_schedule+0x16/0x40
> [  492.558435]  blk_mq_get_tag+0x148/0x250
> [  492.563211]  ? finish_wait+0x90/0x90
> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
> [  492.572760]  blk_mq_make_request+0xe2/0x690
> [  492.577913]  generic_make_request+0xfc/0x2f0
> [  492.583177]  submit_bio+0x64/0x120
> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
> [  492.592736]  ? submit_bio+0x64/0x120
> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
> [  492.623227]  generic_file_read_iter+0xb3/0xa00
> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
> [  492.634334]  ? security_file_permission+0x9b/0xc0
> [  492.640114]  blkdev_read_iter+0x35/0x40
> [  492.644877]  aio_read+0xc5/0x120
> [  492.648973]  ? aio_read_events+0x24c/0x340
> [  492.654124]  ? __might_sleep+0x4a/0x80
> [  492.658800]  do_io_submit+0x47c/0x5e0
> [  492.663373]  ? do_io_submit+0x47c/0x5e0
> [  492.668234]  SyS_io_submit+0x10/0x20
> [  492.672715]  ? SyS_io_submit+0x10/0x20
> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
> [  492.683039] RIP: 0033:0x7f83d1871717
> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
> 
> and if we look at the debug entries, it's waiting on a scheduler tag:
> 
> sched_tags:nr_tags=2
> sched_tags:nr_reserved_tags=0
> sched_tags:active_queues=0
> sched_tags:bitmap_tags:
> sched_tags:depth=2
> sched_tags:busy=2
> sched_tags:bits_per_word=64
> sched_tags:map_nr=1
> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
> sched_tags:wake_batch=1
> sched_tags:wake_index=4
> sched_tags:ws={
> sched_tags:	{.wait_cnt=-102, .wait=inactive},
> sched_tags:	{.wait_cnt=-126, .wait=inactive},
> sched_tags:	{.wait_cnt=-72, .wait=inactive},
> sched_tags:	{.wait_cnt=-96, .wait=inactive},
> sched_tags:	{.wait_cnt=-134, .wait=inactive},
> sched_tags:	{.wait_cnt=-116, .wait=inactive},
> sched_tags:	{.wait_cnt=-90, .wait=inactive},
> sched_tags:	{.wait_cnt=-115, .wait=active},
> sched_tags:}
> sched_tags:round_robin=0
> sched_tags_bitmap:00000000: 03
> 
> with SCHED_RESTART being set:
> 
> state:SCHED_RESTART
> 
> and with the driver tags being idle:
> 
> tags:nr_tags=1
> tags:nr_reserved_tags=0
> tags:active_queues=0
> tags:bitmap_tags:
> tags:depth=1
> tags:busy=0
> tags:bits_per_word=64
> tags:map_nr=1
> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
> tags:wake_batch=1
> tags:wake_index=3
> tags:ws={
> tags:	{.wait_cnt=-48, .wait=inactive},
> tags:	{.wait_cnt=-39, .wait=inactive},
> tags:	{.wait_cnt=-50, .wait=inactive},
> tags:	{.wait_cnt=-47, .wait=inactive},
> tags:	{.wait_cnt=-25, .wait=inactive},
> tags:	{.wait_cnt=-24, .wait=inactive},
> tags:	{.wait_cnt=-47, .wait=inactive},
> tags:	{.wait_cnt=-47, .wait=inactive},
> tags:}

Unsurprisingly (since this is sched_tags starvation) this still happens
with the patch. Same trace as above. Note that dispatch has two requests
sitting ready:

ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}

while we're blocked trying to get a new sched tag off the regular
blk_mq_make_request() path. It looks like a missing restart of the hctx.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 17:36             ` Jens Axboe
@ 2017-11-07 22:06               ` Jens Axboe
  2017-11-07 22:34                   ` Bart Van Assche
  2017-11-08  1:03                 ` Ming Lei
  0 siblings, 2 replies; 48+ messages in thread
From: Jens Axboe @ 2017-11-07 22:06 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On 11/07/2017 10:36 AM, Jens Axboe wrote:
> On 11/07/2017 10:10 AM, Jens Axboe wrote:
>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>> If you can reproduce, please provide me at least the following log
>>>>> first:
>>>>>
>>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>
>>>>> If any pending requests aren't completed, please provide the related
>>>>> info in dbgfs about where the request is.
>>>>
>>>> Every time I ran the above or a similar command its output was empty. I
>>>> assume that's because the hang usually occurs in a phase where these debugfs
>>>> attributes either have not yet been created or have already disappeared.
>>>
>>> Bart, do you still see a hang with the patch that fixes the tag leak when
>>> we fail to get a dispatch budget?
>>>
>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
>>>
>>> I've run a lot of stability testing here, and I haven't run into any
>>> issues. This is with shared tags as well. So if you still see the failure
>>> case with the current tree AND the above patch, then I'll try and get
>>> a test case setup that hits it too so we can get to the bottom of this.
>>
>> Ming, I managed to reproduce the hang using null_blk. Note this is
>> WITHOUT the patch mentioned above, running with that now.
>>
>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
>>
>> and using this fio file:
>>
>> [global]
>> bs=4k
>> rw=randread
>> norandommap
>> direct=1
>> ioengine=libaio
>> iodepth=4
>>
>> [nullb0]
>> filename=/dev/nullb0
>> [nullb1]
>> filename=/dev/nullb1
>> [nullb2]
>> filename=/dev/nullb2
>> [nullb3]
>> filename=/dev/nullb3
>>
>> it seemed to keep running, but it hung when exiting. The troublesome
>> device was nullb1:
>>
>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
>> [  492.520782]       Not tainted 4.14.0-rc7+ #499
>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  492.535904] fio             D13208  3263   3211 0x00000000
>> [  492.542535] Call Trace:
>> [  492.545764]  __schedule+0x279/0x720
>> [  492.550151]  schedule+0x33/0x90
>> [  492.554145]  io_schedule+0x16/0x40
>> [  492.558435]  blk_mq_get_tag+0x148/0x250
>> [  492.563211]  ? finish_wait+0x90/0x90
>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
>> [  492.572760]  blk_mq_make_request+0xe2/0x690
>> [  492.577913]  generic_make_request+0xfc/0x2f0
>> [  492.583177]  submit_bio+0x64/0x120
>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
>> [  492.592736]  ? submit_bio+0x64/0x120
>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
>> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
>> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
>> [  492.623227]  generic_file_read_iter+0xb3/0xa00
>> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
>> [  492.634334]  ? security_file_permission+0x9b/0xc0
>> [  492.640114]  blkdev_read_iter+0x35/0x40
>> [  492.644877]  aio_read+0xc5/0x120
>> [  492.648973]  ? aio_read_events+0x24c/0x340
>> [  492.654124]  ? __might_sleep+0x4a/0x80
>> [  492.658800]  do_io_submit+0x47c/0x5e0
>> [  492.663373]  ? do_io_submit+0x47c/0x5e0
>> [  492.668234]  SyS_io_submit+0x10/0x20
>> [  492.672715]  ? SyS_io_submit+0x10/0x20
>> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
>> [  492.683039] RIP: 0033:0x7f83d1871717
>> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
>> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
>> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
>> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
>> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
>> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
>>
>> and if we look at the debug entries, it's waiting on a scheduler tag:
>>
>> sched_tags:nr_tags=2
>> sched_tags:nr_reserved_tags=0
>> sched_tags:active_queues=0
>> sched_tags:bitmap_tags:
>> sched_tags:depth=2
>> sched_tags:busy=2
>> sched_tags:bits_per_word=64
>> sched_tags:map_nr=1
>> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
>> sched_tags:wake_batch=1
>> sched_tags:wake_index=4
>> sched_tags:ws={
>> sched_tags:	{.wait_cnt=-102, .wait=inactive},
>> sched_tags:	{.wait_cnt=-126, .wait=inactive},
>> sched_tags:	{.wait_cnt=-72, .wait=inactive},
>> sched_tags:	{.wait_cnt=-96, .wait=inactive},
>> sched_tags:	{.wait_cnt=-134, .wait=inactive},
>> sched_tags:	{.wait_cnt=-116, .wait=inactive},
>> sched_tags:	{.wait_cnt=-90, .wait=inactive},
>> sched_tags:	{.wait_cnt=-115, .wait=active},
>> sched_tags:}
>> sched_tags:round_robin=0
>> sched_tags_bitmap:00000000: 03
>>
>> with SCHED_RESTART being set:
>>
>> state:SCHED_RESTART
>>
>> and with the driver tags being idle:
>>
>> tags:nr_tags=1
>> tags:nr_reserved_tags=0
>> tags:active_queues=0
>> tags:bitmap_tags:
>> tags:depth=1
>> tags:busy=0
>> tags:bits_per_word=64
>> tags:map_nr=1
>> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
>> tags:wake_batch=1
>> tags:wake_index=3
>> tags:ws={
>> tags:	{.wait_cnt=-48, .wait=inactive},
>> tags:	{.wait_cnt=-39, .wait=inactive},
>> tags:	{.wait_cnt=-50, .wait=inactive},
>> tags:	{.wait_cnt=-47, .wait=inactive},
>> tags:	{.wait_cnt=-25, .wait=inactive},
>> tags:	{.wait_cnt=-24, .wait=inactive},
>> tags:	{.wait_cnt=-47, .wait=inactive},
>> tags:	{.wait_cnt=-47, .wait=inactive},
>> tags:}
> 
> Unsurprisingly (since this is sched_tags starvation) this still happens
> with the patch. Same trace as above. Note that dispatch has two requests
> sitting ready:
> 
> ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
> ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
> 
> while we're blocked trying to get a new sched tag off the regular
> blk_mq_make_request() path. It looks like a missing restart of the hctx.

Just to keep everyone in the loop, this bug is not new to
for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
probably different to what Bart is hitting, but it's a bug none the
less...

To me, this looks like a race between marking the queue as needing a
restart, and the driver tag being released and re-running the queue. If
I do:

# echo run > /sys/kernel/debug/block/nullb1/state

then it starts just fine. So there must be a race between marking the
need for a restart (when getting the driver tag fails), and when one of
the shared tag queues finishes a request and releases the driver tag.
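
A minimal userspace sketch of that window (the names below are
hypothetical stand-ins for the blk-mq flags, not the real kernel code)
replays the bad interleaving deterministically:

/* race_sketch.c - replay of the suspected lost wakeup: CPUX fails to
 * get the driver tag and marks SCHED_RESTART only after CPUY has put
 * the tag back and already tested the flag. */
#include <stdbool.h>
#include <stdio.h>

static bool tag_free;		/* held by CPUY's in-flight request */
static bool sched_restart;	/* the hctx SCHED_RESTART bit */

static bool cpux_get_driver_tag(void)
{
	if (tag_free) {
		tag_free = false;
		return true;
	}
	return false;
}

static void cpuy_complete_request(void)
{
	tag_free = true;			/* put the driver tag back */
	if (sched_restart) {
		sched_restart = false;
		printf("CPUY: restart seen, rerunning queue\n");
	} else {
		printf("CPUY: no restart flag, nothing to do\n");
	}
}

int main(void)
{
	printf("CPUX: get_driver_tag -> %s\n",
	       cpux_get_driver_tag() ? "ok" : "failed");
	cpuy_complete_request();	/* frees the tag, flag not set yet */
	sched_restart = true;		/* CPUX marks restart too late */
	printf("stalled: tag_free=%d sched_restart=%d, nobody reruns the queue\n",
	       tag_free, sched_restart);
	return 0;
}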

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
@ 2017-11-07 22:34                   ` Bart Van Assche
  0 siblings, 0 replies; 48+ messages in thread
From: Bart Van Assche @ 2017-11-07 22:34 UTC (permalink / raw)
  To: axboe, ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On Tue, 2017-11-07 at 15:06 -0700, Jens Axboe wrote:
> Just to keep everyone in the loop, this bug is not new to
> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
> probably different to what Bart is hitting, but it's a bug none the
> less...

Hello Jens,

There are several reasons why I think that patch "blk-mq: don't handle
TAG_SHARED in restart" really should be reverted:
* That patch is based on the assumption that only the SCSI driver uses shared
  tags. That assumption is not correct. null_blk and nvme also use shared tags.
* As my tests have shown, the algorithm for restarting queues based on the
  SCSI starved list is flawed. So using that mechanism instead of the blk-mq
  shared queue restarting algorithm is wrong.
* We are close to the merge window. It is too late for trying to fix the
  "blk-mq: don't handle TAG_SHARED in restart" patch.

My proposal is to make sure that what will be sent to Linus during the v4.15
merge window works reliably. That means using the v4.13/v4.14 algorithm for
queue restarting which is an algorithm that is trusted by the community. If
Roman Penyaev's patch could get applied that would be even better.

Bart.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 22:34                   ` Bart Van Assche
  (?)
@ 2017-11-07 22:39                   ` Jens Axboe
  -1 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2017-11-07 22:39 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On 11/07/2017 03:34 PM, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 15:06 -0700, Jens Axboe wrote:
>> Just to keep everyone in the loop, this bug is not new to
>> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
>> probably different to what Bart is hitting, but it's a bug none the
>> less...
> 
> Hello Jens,
> 
> There are several reasons why I think that patch "blk-mq: don't handle
> TAG_SHARED in restart" really should be reverted:
> * That patch is based on the assumption that only the SCSI driver uses shared
>   tags. That assumption is not correct. null_blk and nvme also use shared tags.
> * As my tests have shown, the algorithm for restarting queues based on the
>   SCSI starved list is flawed. So using that mechanism instead of the blk-mq
>   shared queue restarting algorithm is wrong.
> * We are close to the merge window. It is too late for trying to fix the
>   "blk-mq: don't handle TAG_SHARED in restart" patch.
> 
> My proposal is to make sure that what will be sent to Linus during the v4.15
> merge window works reliably. That means using the v4.13/v4.14 algorithm for
> queue restarting which is an algorithm that is trusted by the community. If
> Roman Penyaev's patch could get applied that would be even better.

I'm fine with reverting a single patch; that's still a far cry from the
giant list. I'd rather get a fix in though, if at all possible. The code
it removed wasn't exactly the fastest or prettiest solution. But the
most important part is that we have something that works. If you have a
test case that is runnable AND reproduces the problem, I'd love to have
it. I've seen comments to that effect spread over several messages;
it would be nice to have it summarized and readily available for all who
want to work on it.

The issue above is NOT a new bug; I ran into it by accident trying to
reproduce your case. Debugging that one right now, hopefully we'll have
some closure on that tomorrow and we can move forward on the shared tag
restart/loop. It smells like a TAG_WAITING race, or a restart race.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 16:20         ` Bart Van Assche
  (?)
  (?)
@ 2017-11-08  0:39         ` Ming Lei
  2017-11-08  2:55           ` Jens Axboe
  -1 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-08  0:39 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, axboe, loberman

On Tue, Nov 07, 2017 at 04:20:08PM +0000, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> > If you can reproduce, please provide me at least the following log
> > first:
> > 
> > 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> > 
> > If any pending requests aren't completed, please provide the related
> > info in dbgfs about where the request is.
> 
> Every time I ran the above or a similar command its output was empty. I
> assume that's because the hang usually occurs in a phase where these debugfs
> attributes either have not yet been created or have already disappeared.

Could you dump all tags? Then you can see whether this attribute has disappeared.

If that output is empty, it often means there are no pending requests
left uncompleted. So if that is true, your hang is _not_ related to RESTART.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 22:34                   ` Bart Van Assche
  (?)
  (?)
@ 2017-11-08  0:50                   ` Ming Lei
  -1 siblings, 0 replies; 48+ messages in thread
From: Ming Lei @ 2017-11-08  0:50 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: axboe, linux-scsi, hch, linux-block, osandov, john.garry, loberman

On Tue, Nov 07, 2017 at 10:34:35PM +0000, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 15:06 -0700, Jens Axboe wrote:
> > Just to keep everyone in the loop, this bug is not new to
> > for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
> > probably different to what Bart is hitting, but it's a bug none the
> > less...
> 
> Hello Jens,
> 
> There are several reasons why I think that patch "blk-mq: don't handle
> TAG_SHARED in restart" really should be reverted:
> * That patch is based on the assumption that only the SCSI driver uses shared
>   tags. That assumption is not correct. null_blk and nvme also use shared tags.

No, both null_blk and nvme should be handled by BLK_MQ_S_TAG_WAITING; there is
no need to waste CPU checking all shared tags.

> * As my tests have shown, the algorithm for restarting queues based on the

Your test doesn't show it is related to RESTART, since there are no
pending requests in the output of 'tags'.

>   SCSI starved list is flawed. So using that mechanism instead of the blk-mq
>   shared queue restarting algorithm is wrong.

The starved-list algorithm has been used by SCSI for decades; I don't
think it is as flawed as you suggest.

> * We are close to the merge window. It is too late for trying to fix the
>   "blk-mq: don't handle TAG_SHARED in restart" patch.

If you can tell us how to reproduce it, there is enough time to fix it
before the v4.15 release.

> 
> My proposal is to make sure that what will be sent to Linus during the v4.15
> merge window works reliably. That means using the v4.13/v4.14 algorithm for
> queue restarting which is an algorithm that is trusted by the community. If
> Roman Penyaev's patch could get applied that would be even better.

Frankly speaking, blk-mq's restart algorithm won't be used by SCSI at
all, because scsi_end_request() restarts the queue before the restart for
TAG_SHARED runs.

For NVMe and null_blk it is basically the same, since we cover that via
BLK_MQ_S_TAG_WAITING.

So NAK to your proposal.

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 17:34             ` Bart Van Assche
  (?)
@ 2017-11-08  0:53             ` Ming Lei
  2017-11-08  2:06               ` Ming Lei
  -1 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-08  0:53 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: axboe, linux-scsi, hch, linux-block, osandov, john.garry, loberman

On Tue, Nov 07, 2017 at 05:34:38PM +0000, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 09:29 -0700, Jens Axboe wrote:
> > On 11/07/2017 09:20 AM, Bart Van Assche wrote:
> > > On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> > > > If you can reproduce, please provide me at least the following log
> > > > first:
> > > > 
> > > > 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> > > > 
> > > > If any pending requests aren't completed, please provide the related
> > > > info in dbgfs about where the request is.
> > > 
> > > Every time I ran the above or a similar command its output was empty. I
> > > assume that's because the hang usually occurs in a phase where these debugfs
> > > attributes either have not yet been created or have already disappeared.
> > 
> > Bart, do you still see a hang with the patch that fixes the tag leak when
> > we fail to get a dispatch budget?
> > 
> > https://marc.info/?l=linux-block&m=151004881411480&w=2
> > 
> > I've run a lot of stability testing here, and I haven't run into any
> > issues. This is with shared tags as well. So if you still see the failure
> > case with the current tree AND the above patch, then I'll try and get
> > a test case setup that hits it too so we can get to the bottom of this.
> 
> It took a little longer than expected but I just ran into the following
> lockup with your for-next branch of this morning (commit e8fa44bb8af9) and
> Ming's patch "blk-mq: put driver tag if dispatch budget can't be got"
> applied on top of it:
> 
> [ 2575.324678] sysrq: SysRq : Show Blocked State
> [ 2575.332336]   task                        PC stack   pid father
> [ 2575.345239] systemd-udevd   D    0 47577    518 0x00000106
> [ 2575.353821] Call Trace:
> [ 2575.358805]  __schedule+0x28b/0x890
> [ 2575.364906]  schedule+0x36/0x80
> [ 2575.370436]  io_schedule+0x16/0x40
> [ 2575.376287]  __lock_page+0xfc/0x140
> [ 2575.382061]  ? page_cache_tree_insert+0xc0/0xc0
> [ 2575.388943]  truncate_inode_pages_range+0x5e8/0x830
> [ 2575.396083]  truncate_inode_pages+0x15/0x20
> [ 2575.402398]  kill_bdev+0x2f/0x40
> [ 2575.407538]  __blkdev_put+0x74/0x1f0
> [ 2575.413010]  ? kmem_cache_free+0x197/0x1c0
> [ 2575.418986]  blkdev_put+0x4c/0xd0
> [ 2575.424040]  blkdev_close+0x34/0x70
> [ 2575.429216]  __fput+0xe7/0x220
> [ 2575.433863]  ____fput+0xe/0x10
> [ 2575.438490]  task_work_run+0x76/0x90
> [ 2575.443619]  do_exit+0x2e0/0xaf0
> [ 2575.448311]  do_group_exit+0x43/0xb0
> [ 2575.453386]  get_signal+0x299/0x5e0
> [ 2575.458303]  do_signal+0x37/0x740
> [ 2575.462976]  ? blkdev_read_iter+0x35/0x40
> [ 2575.468425]  ? new_sync_read+0xde/0x130
> [ 2575.473620]  ? vfs_read+0x115/0x130
> [ 2575.478388]  exit_to_usermode_loop+0x80/0xd0
> [ 2575.484002]  do_syscall_64+0xb3/0xc0
> [ 2575.488813]  entry_SYSCALL64_slow_path+0x25/0x25
> [ 2575.494759] RIP: 0033:0x7efd829cbd11
> [ 2575.499506] RSP: 002b:00007ffff984f978 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> [ 2575.508741] RAX: 0000000000022000 RBX: 000055f19f902ca0 RCX: 00007efd829cbd11
> [ 2575.517455] RDX: 0000000000040000 RSI: 000055f19f902cc8 RDI: 0000000000000007
> [ 2575.526163] RBP: 000055f19f7fb9d0 R08: 0000000000000000 R09: 000055f19f902ca0
> [ 2575.534860] R10: 000055f19f902cb8 R11: 0000000000000246 R12: 0000000000000000
> [ 2575.544250] R13: 0000000000040000 R14: 000055f19f7fba20 R15: 0000000000040000

Again, please show us the output of 'tags' to see if there are pending
requests not completed.

Please run this test on Linus' tree (v4.14-rc7) to see if the same issue
can be reproduced.

Also, if possible, please tell us how to reproduce it.

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 22:06               ` Jens Axboe
  2017-11-07 22:34                   ` Bart Van Assche
@ 2017-11-08  1:03                 ` Ming Lei
  2017-11-08  3:01                   ` Jens Axboe
  1 sibling, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-08  1:03 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman

On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
> On 11/07/2017 10:36 AM, Jens Axboe wrote:
> > On 11/07/2017 10:10 AM, Jens Axboe wrote:
> >> On 11/07/2017 09:29 AM, Jens Axboe wrote:
> >>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
> >>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> >>>>> If you can reproduce, please provide me at least the following log
> >>>>> first:
> >>>>>
> >>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> >>>>>
> >>>>> If any pending requests aren't completed, please provide the related
> >>>>> info in dbgfs about where the request is.
> >>>>
> >>>> Every time I ran the above or a similar command its output was empty. I
> >>>> assume that's because the hang usually occurs in a phase where these debugfs
> >>>> attributes either have not yet been created or have already disappeared.
> >>>
> >>> Bart, do you still see a hang with the patch that fixes the tag leak when
> >>> we fail to get a dispatch budget?
> >>>
> >>> https://marc.info/?l=linux-block&m=151004881411480&w=2
> >>>
> >>> I've run a lot of stability testing here, and I haven't run into any
> >>> issues. This is with shared tags as well. So if you still see the failure
> >>> case with the current tree AND the above patch, then I'll try and get
> >>> a test case setup that hits it too so we can get to the bottom of this.
> >>
> >> Ming, I managed to reproduce the hang using null_blk. Note this is
> >> WITHOUT the patch mentioned above, running with that now.
> >>
> >> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
> >>
> >> and using this fio file:
> >>
> >> [global]
> >> bs=4k
> >> rw=randread
> >> norandommap
> >> direct=1
> >> ioengine=libaio
> >> iodepth=4
> >>
> >> [nullb0]
> >> filename=/dev/nullb0
> >> [nullb1]
> >> filename=/dev/nullb1
> >> [nullb2]
> >> filename=/dev/nullb2
> >> [nullb3]
> >> filename=/dev/nullb3
> >>
> >> it seemed to keep running, but it hung when exiting. The troublesome
> >> device was nullb1:
> >>
> >> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
> >> [  492.520782]       Not tainted 4.14.0-rc7+ #499
> >> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >> [  492.535904] fio             D13208  3263   3211 0x00000000
> >> [  492.542535] Call Trace:
> >> [  492.545764]  __schedule+0x279/0x720
> >> [  492.550151]  schedule+0x33/0x90
> >> [  492.554145]  io_schedule+0x16/0x40
> >> [  492.558435]  blk_mq_get_tag+0x148/0x250
> >> [  492.563211]  ? finish_wait+0x90/0x90
> >> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
> >> [  492.572760]  blk_mq_make_request+0xe2/0x690
> >> [  492.577913]  generic_make_request+0xfc/0x2f0
> >> [  492.583177]  submit_bio+0x64/0x120
> >> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
> >> [  492.592736]  ? submit_bio+0x64/0x120
> >> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
> >> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
> >> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
> >> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
> >> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
> >> [  492.623227]  generic_file_read_iter+0xb3/0xa00
> >> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
> >> [  492.634334]  ? security_file_permission+0x9b/0xc0
> >> [  492.640114]  blkdev_read_iter+0x35/0x40
> >> [  492.644877]  aio_read+0xc5/0x120
> >> [  492.648973]  ? aio_read_events+0x24c/0x340
> >> [  492.654124]  ? __might_sleep+0x4a/0x80
> >> [  492.658800]  do_io_submit+0x47c/0x5e0
> >> [  492.663373]  ? do_io_submit+0x47c/0x5e0
> >> [  492.668234]  SyS_io_submit+0x10/0x20
> >> [  492.672715]  ? SyS_io_submit+0x10/0x20
> >> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
> >> [  492.683039] RIP: 0033:0x7f83d1871717
> >> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
> >> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
> >> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
> >> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
> >> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
> >> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
> >>
> >> and if we look at the debug entries, it's waiting on a scheduler tag:
> >>
> >> sched_tags:nr_tags=2
> >> sched_tags:nr_reserved_tags=0
> >> sched_tags:active_queues=0
> >> sched_tags:bitmap_tags:
> >> sched_tags:depth=2
> >> sched_tags:busy=2
> >> sched_tags:bits_per_word=64
> >> sched_tags:map_nr=1
> >> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
> >> sched_tags:wake_batch=1
> >> sched_tags:wake_index=4
> >> sched_tags:ws={
> >> sched_tags:	{.wait_cnt=-102, .wait=inactive},
> >> sched_tags:	{.wait_cnt=-126, .wait=inactive},
> >> sched_tags:	{.wait_cnt=-72, .wait=inactive},
> >> sched_tags:	{.wait_cnt=-96, .wait=inactive},
> >> sched_tags:	{.wait_cnt=-134, .wait=inactive},
> >> sched_tags:	{.wait_cnt=-116, .wait=inactive},
> >> sched_tags:	{.wait_cnt=-90, .wait=inactive},
> >> sched_tags:	{.wait_cnt=-115, .wait=active},
> >> sched_tags:}
> >> sched_tags:round_robin=0
> >> sched_tags_bitmap:00000000: 03
> >>
> >> with SCHED_RESTART being set:
> >>
> >> state:SCHED_RESTART
> >>
> >> and with the driver tags being idle:
> >>
> >> tags:nr_tags=1
> >> tags:nr_reserved_tags=0
> >> tags:active_queues=0
> >> tags:bitmap_tags:
> >> tags:depth=1
> >> tags:busy=0
> >> tags:bits_per_word=64
> >> tags:map_nr=1
> >> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
> >> tags:wake_batch=1
> >> tags:wake_index=3
> >> tags:ws={
> >> tags:	{.wait_cnt=-48, .wait=inactive},
> >> tags:	{.wait_cnt=-39, .wait=inactive},
> >> tags:	{.wait_cnt=-50, .wait=inactive},
> >> tags:	{.wait_cnt=-47, .wait=inactive},
> >> tags:	{.wait_cnt=-25, .wait=inactive},
> >> tags:	{.wait_cnt=-24, .wait=inactive},
> >> tags:	{.wait_cnt=-47, .wait=inactive},
> >> tags:	{.wait_cnt=-47, .wait=inactive},
> >> tags:}
> > 
> > Unsurprisingly (since this is sched_tags starvation) this still happens
> > with the patch. Same trace as above. Note that dispatch has two requests
> > sitting ready:
> > 
> > ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
> > ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
> > 
> > while we're blocked trying to get a new sched tag off the regular
> > blk_mq_make_request() path. It looks like a missing restart of the hctx.
> 
> Just to keep everyone in the loop, this bug is not new to
>> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
> probably different to what Bart is hitting, but it's a bug none the
> less...

Thanks for running this test and sharing the result with us.

> 
> To me, this looks like a race between marking the queue as needing a
> restart, and the driver tag being released and re-running the queue. If
> I do:
> 
> # echo run > /sys/kernel/debug/block/nullb1/state
> 
> then it starts just fine. So there must be a race between marking the
> need for a restart (when getting the driver tag fails), and when one of
> the shared tag queues finishes a request and releases the driver tag.

Just one hint: blk-mq's RESTART won't work if no requests are pending,
but there seems to be no efficient/reliable way to handle that.

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  0:53             ` Ming Lei
@ 2017-11-08  2:06               ` Ming Lei
  0 siblings, 0 replies; 48+ messages in thread
From: Ming Lei @ 2017-11-08  2:06 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: axboe, linux-scsi, hch, linux-block, osandov, john.garry, loberman

On Wed, Nov 08, 2017 at 08:53:38AM +0800, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 05:34:38PM +0000, Bart Van Assche wrote:
> > On Tue, 2017-11-07 at 09:29 -0700, Jens Axboe wrote:
> > > On 11/07/2017 09:20 AM, Bart Van Assche wrote:
> > > > On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> > > > > If you can reproduce, please provide me at least the following log
> > > > > first:
> > > > > 
> > > > > 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> > > > > 
> > > > > If any pending requests arn't completed, please provide the related
> > > > > info in dbgfs about where is the request.
> > > > 
> > > > Every time I ran the above or a similar command its output was empty. I
> > > > assume that's because the hang usually occurs in a phase where these debugfs
> > > > attributes either have not yet been created or have already disappeared.
> > > 
> > > Bart, do you still see a hang with the patch that fixes the tag leak when
> > > we fail to get a dispatch budget?
> > > 
> > > https://marc.info/?l=linux-block&m=151004881411480&w=2
> > > 
> > > I've run a lot of stability testing here, and I haven't run into any
> > > issues. This is with shared tags as well. So if you still see the failure
> > > case with the current tree AND the above patch, then I'll try and get
> > > a test case setup that hits it too so we can get to the bottom of this.
> > 
> > It took a little longer than expected but I just ran into the following
> > lockup with your for-next branch of this morning (commit e8fa44bb8af9) and
> > Ming's patch "blk-mq: put driver tag if dispatch budget can't be got"
> > applied on top of it:
> > 
> > [ 2575.324678] sysrq: SysRq : Show Blocked State
> > [ 2575.332336]   task                        PC stack   pid father
> > [ 2575.345239] systemd-udevd   D    0 47577    518 0x00000106
> > [ 2575.353821] Call Trace:
> > [ 2575.358805]  __schedule+0x28b/0x890
> > [ 2575.364906]  schedule+0x36/0x80
> > [ 2575.370436]  io_schedule+0x16/0x40
> > [ 2575.376287]  __lock_page+0xfc/0x140
> > [ 2575.382061]  ? page_cache_tree_insert+0xc0/0xc0
> > [ 2575.388943]  truncate_inode_pages_range+0x5e8/0x830
> > [ 2575.396083]  truncate_inode_pages+0x15/0x20
> > [ 2575.402398]  kill_bdev+0x2f/0x40
> > [ 2575.407538]  __blkdev_put+0x74/0x1f0
> > [ 2575.413010]  ? kmem_cache_free+0x197/0x1c0
> > [ 2575.418986]  blkdev_put+0x4c/0xd0
> > [ 2575.424040]  blkdev_close+0x34/0x70
> > [ 2575.429216]  __fput+0xe7/0x220
> > [ 2575.433863]  ____fput+0xe/0x10
> > [ 2575.438490]  task_work_run+0x76/0x90
> > [ 2575.443619]  do_exit+0x2e0/0xaf0
> > [ 2575.448311]  do_group_exit+0x43/0xb0
> > [ 2575.453386]  get_signal+0x299/0x5e0
> > [ 2575.458303]  do_signal+0x37/0x740
> > [ 2575.462976]  ? blkdev_read_iter+0x35/0x40
> > [ 2575.468425]  ? new_sync_read+0xde/0x130
> > [ 2575.473620]  ? vfs_read+0x115/0x130
> > [ 2575.478388]  exit_to_usermode_loop+0x80/0xd0
> > [ 2575.484002]  do_syscall_64+0xb3/0xc0
> > [ 2575.488813]  entry_SYSCALL64_slow_path+0x25/0x25
> > [ 2575.494759] RIP: 0033:0x7efd829cbd11
> > [ 2575.499506] RSP: 002b:00007ffff984f978 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> > [ 2575.508741] RAX: 0000000000022000 RBX: 000055f19f902ca0 RCX: 00007efd829cbd11
> > [ 2575.517455] RDX: 0000000000040000 RSI: 000055f19f902cc8 RDI: 0000000000000007
> > [ 2575.526163] RBP: 000055f19f7fb9d0 R08: 0000000000000000 R09: 000055f19f902ca0
> > [ 2575.534860] R10: 000055f19f902cb8 R11: 0000000000000246 R12: 0000000000000000
> > [ 2575.544250] R13: 0000000000040000 R14: 000055f19f7fba20 R15: 0000000000040000
> 
> Again, please show us the output of 'tags' to see if there are pending
> requests not completed.
> 
> Please run this test on Linus' tree (v4.14-rc7) to see if the same issue
> can be reproduced.
> 
> Also, if possible, please tell us how to reproduce it.

BTW, please apply the following patch before further testing:

	https://marc.info/?l=linux-block&m=150988386406050&w=2

Since you don't see any busy tags in 'tags', the queue may have been frozen,
and an in-progress dispatch after the queue is DEAD might corrupt memory.

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  0:39         ` Ming Lei
@ 2017-11-08  2:55           ` Jens Axboe
  2017-11-08  2:58             ` Ming Lei
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2017-11-08  2:55 UTC (permalink / raw)
  To: Ming Lei, Bart Van Assche
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On 11/07/2017 05:39 PM, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 04:20:08PM +0000, Bart Van Assche wrote:
>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>> If you can reproduce, please provide me at least the following log
>>> first:
>>>
>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>
>>> If any pending requests aren't completed, please provide the related
>>> info in dbgfs about where the request is.
>>
>> Every time I ran the above or a similar command its output was empty. I
>> assume that's because the hang usually occurs in a phase where these debugfs
>> attributes either have not yet been created or have already disappeared.
> 
> Could you dump all tags? Then you can see whether this attribute has disappeared.
> 
> If that output is empty, it often means there are no pending requests
> left uncompleted. So if that is true, your hang is _not_ related to RESTART.

You need to check sched_tags as well. It could still be a restart race
or problem, if tags is empty but sched_tags has busy bits.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  2:55           ` Jens Axboe
@ 2017-11-08  2:58             ` Ming Lei
  2017-11-08  3:06               ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-08  2:58 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman

On Tue, Nov 07, 2017 at 07:55:32PM -0700, Jens Axboe wrote:
> On 11/07/2017 05:39 PM, Ming Lei wrote:
> > On Tue, Nov 07, 2017 at 04:20:08PM +0000, Bart Van Assche wrote:
> >> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> >>> If you can reproduce, please provide me at least the following log
> >>> first:
> >>>
> >>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> >>>
> >>> If any pending requests aren't completed, please provide the related
> >>> info in dbgfs about where the request is.
> >>
> >> Every time I ran the above or a similar command its output was empty. I
> >> assume that's because the hang usually occurs in a phase where these debugfs
> >> attributes either have not yet been created or have already disappeared.
> > 
> > Could you dump all tags? Then you can see whether this attribute has disappeared.
> > 
> > If that output is empty, it often means there are no pending requests
> > left uncompleted. So if that is true, your hang is _not_ related to RESTART.
> 
> You need to check sched_tags as well. It could still be a restart race
> or problem, if tags is empty but sched_tags has busy bits.

Yeah, I didn't mention it because SRP is MQ hardware and the default
scheduler is none, but if Bart changed that, sched_tags would need to be
checked first.

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  1:03                 ` Ming Lei
@ 2017-11-08  3:01                   ` Jens Axboe
  2017-11-08  3:12                     ` Ming Lei
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2017-11-08  3:01 UTC (permalink / raw)
  To: Ming Lei
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman

On 11/07/2017 06:03 PM, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
>> On 11/07/2017 10:36 AM, Jens Axboe wrote:
>>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
>>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
>>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>>>> If you can reproduce, please provide me at least the following log
>>>>>>> first:
>>>>>>>
>>>>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>>>
>>>>>>> If any pending requests aren't completed, please provide the related
>>>>>>> info in dbgfs about where the request is.
>>>>>>
>>>>>> Every time I ran the above or a similar command its output was empty. I
>>>>>> assume that's because the hang usually occurs in a phase where these debugfs
>>>>>> attributes either have not yet been created or have already disappeared.
>>>>>
>>>>> Bart, do you still see a hang with the patch that fixes the tag leak when
>>>>> we fail to get a dispatch budget?
>>>>>
>>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
>>>>>
>>>>> I've run a lot of stability testing here, and I haven't run into any
>>>>> issues. This is with shared tags as well. So if you still see the failure
>>>>> case with the current tree AND the above patch, then I'll try and get
>>>>> a test case setup that hits it too so we can get to the bottom of this.
>>>>
>>>> Ming, I managed to reproduce the hang using null_blk. Note this is
>>>> WITHOUT the patch mentioned above, running with that now.
>>>>
>>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
>>>>
>>>> and using this fio file:
>>>>
>>>> [global]
>>>> bs=4k
>>>> rw=randread
>>>> norandommap
>>>> direct=1
>>>> ioengine=libaio
>>>> iodepth=4
>>>>
>>>> [nullb0]
>>>> filename=/dev/nullb0
>>>> [nullb1]
>>>> filename=/dev/nullb1
>>>> [nullb2]
>>>> filename=/dev/nullb2
>>>> [nullb3]
>>>> filename=/dev/nullb3
>>>>
>>>> it seemed to keep running, but it hung when exiting. The troublesome
>>>> device was nullb1:
>>>>
>>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
>>>> [  492.520782]       Not tainted 4.14.0-rc7+ #499
>>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>> [  492.535904] fio             D13208  3263   3211 0x00000000
>>>> [  492.542535] Call Trace:
>>>> [  492.545764]  __schedule+0x279/0x720
>>>> [  492.550151]  schedule+0x33/0x90
>>>> [  492.554145]  io_schedule+0x16/0x40
>>>> [  492.558435]  blk_mq_get_tag+0x148/0x250
>>>> [  492.563211]  ? finish_wait+0x90/0x90
>>>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
>>>> [  492.572760]  blk_mq_make_request+0xe2/0x690
>>>> [  492.577913]  generic_make_request+0xfc/0x2f0
>>>> [  492.583177]  submit_bio+0x64/0x120
>>>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
>>>> [  492.592736]  ? submit_bio+0x64/0x120
>>>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
>>>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
>>>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
>>>> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
>>>> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
>>>> [  492.623227]  generic_file_read_iter+0xb3/0xa00
>>>> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
>>>> [  492.634334]  ? security_file_permission+0x9b/0xc0
>>>> [  492.640114]  blkdev_read_iter+0x35/0x40
>>>> [  492.644877]  aio_read+0xc5/0x120
>>>> [  492.648973]  ? aio_read_events+0x24c/0x340
>>>> [  492.654124]  ? __might_sleep+0x4a/0x80
>>>> [  492.658800]  do_io_submit+0x47c/0x5e0
>>>> [  492.663373]  ? do_io_submit+0x47c/0x5e0
>>>> [  492.668234]  SyS_io_submit+0x10/0x20
>>>> [  492.672715]  ? SyS_io_submit+0x10/0x20
>>>> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
>>>> [  492.683039] RIP: 0033:0x7f83d1871717
>>>> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
>>>> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
>>>> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
>>>> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
>>>> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
>>>> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
>>>>
>>>> and if we look at the debug entries, it's waiting on a scheduler tag:
>>>>
>>>> sched_tags:nr_tags=2
>>>> sched_tags:nr_reserved_tags=0
>>>> sched_tags:active_queues=0
>>>> sched_tags:bitmap_tags:
>>>> sched_tags:depth=2
>>>> sched_tags:busy=2
>>>> sched_tags:bits_per_word=64
>>>> sched_tags:map_nr=1
>>>> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
>>>> sched_tags:wake_batch=1
>>>> sched_tags:wake_index=4
>>>> sched_tags:ws={
>>>> sched_tags:	{.wait_cnt=-102, .wait=inactive},
>>>> sched_tags:	{.wait_cnt=-126, .wait=inactive},
>>>> sched_tags:	{.wait_cnt=-72, .wait=inactive},
>>>> sched_tags:	{.wait_cnt=-96, .wait=inactive},
>>>> sched_tags:	{.wait_cnt=-134, .wait=inactive},
>>>> sched_tags:	{.wait_cnt=-116, .wait=inactive},
>>>> sched_tags:	{.wait_cnt=-90, .wait=inactive},
>>>> sched_tags:	{.wait_cnt=-115, .wait=active},
>>>> sched_tags:}
>>>> sched_tags:round_robin=0
>>>> sched_tags_bitmap:00000000: 03
>>>>
>>>> with SCHED_RESTART being set:
>>>>
>>>> state:SCHED_RESTART
>>>>
>>>> and with the driver tags being idle:
>>>>
>>>> tags:nr_tags=1
>>>> tags:nr_reserved_tags=0
>>>> tags:active_queues=0
>>>> tags:bitmap_tags:
>>>> tags:depth=1
>>>> tags:busy=0
>>>> tags:bits_per_word=64
>>>> tags:map_nr=1
>>>> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
>>>> tags:wake_batch=1
>>>> tags:wake_index=3
>>>> tags:ws={
>>>> tags:	{.wait_cnt=-48, .wait=inactive},
>>>> tags:	{.wait_cnt=-39, .wait=inactive},
>>>> tags:	{.wait_cnt=-50, .wait=inactive},
>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>> tags:	{.wait_cnt=-25, .wait=inactive},
>>>> tags:	{.wait_cnt=-24, .wait=inactive},
>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>> tags:}
>>>
>>> Unsurprisingly (since this is sched_tags starvation) this still happens
>>> with the patch. Same trace as above. Note that dispatch has two requests
>>> sitting ready:
>>>
>>> ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
>>> ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
>>>
>>> while we're blocked trying to get a new sched tag off the regular
>>> blk_mq_make_request() path. It looks like a missing restart of the hctx.
>>
>> Just to keep everyone in the loop, this bug is not new to
>> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
>> probably different to what Bart is hitting, but it's a bug none the
>> less...
> 
> Thanks for running this test and sharing the result with us.
> 
>>
>> To me, this looks like a race between marking the queue as needing a
>> restart, and the driver tag being released and re-running the queue. If
>> I do:
>>
>> # echo run > /sys/kernel/debug/block/nullb1/state
>>
>> then it starts just fine. So there must be a race between marking the
>> need for a restart (when getting the driver tag fails), and when one of
>> the shared tag queues finishes a request and releases the driver tag.
> 
> Just one hint: blk-mq's RESTART won't work if no requests are pending,
> but there seems to be no efficient/reliable way to handle that.

Requests are pending, see above output. My assumption/theory was that
we were racing on setting restart. Simplified version:

CPUX					CPUY
get_driver_tag, fails
					complete request, put tag
					check restart, not set
mark needing restart

and stall unless new IO is started on the device, since we now have two
requests waiting to be dispatched, and a driver tag available for one of
them, but nobody left to run the queue.
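
If that is what happens, the usual cure is to look for the tag again
after publishing the restart flag, so the dispatch side closes the
window itself. A rough userspace sketch of that pattern (hypothetical
names, not the actual blk-mq fix, and the real thing would also need
the right memory barriers between the two sides):

/* fix_sketch.c - the "mark, then re-check" pattern for the window above */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool tag_free = true;
static atomic_bool sched_restart = false;

static bool try_get_driver_tag(void)
{
	bool expected = true;
	return atomic_compare_exchange_strong(&tag_free, &expected, false);
}

static bool dispatch(void)
{
	if (try_get_driver_tag())
		return true;
	/* publish "rerun me when a tag frees up"... */
	atomic_store(&sched_restart, true);
	/* ...then look again: a completion may have put the tag back and
	 * tested the flag in the window before our store above */
	if (try_get_driver_tag()) {
		atomic_store(&sched_restart, false);
		return true;	/* raced, but we recovered the tag */
	}
	return false;		/* a later completion will see the flag */
}

int main(void)
{
	printf("first dispatch: %s\n", dispatch() ? "ok" : "blocked");
	printf("second dispatch: %s\n", dispatch() ? "ok" : "blocked");
	return 0;
}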

I wasted most of today bisecting this, only to discover it is present in
earlier kernels too. So I have nothing backing up the above, except the
fact that:

1) We have requests ready to issue in 'dispatch'
2) We have a driver tag available
3) hctx is marked restart, but it looks like that happened later since no
   driver tags are pending

I'll try and debug this tomorrow.

As I said, this isn't necessarily the same problem that Bart is seeing,
since he doesn't trigger it on 4.14-rc or earlier, only in
for-4.15/block. Or it could be the same, just a wider window now.
Another data point is that I can't trigger it with scsi_debug, but it
triggers pretty easily with null_blk.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  2:58             ` Ming Lei
@ 2017-11-08  3:06               ` Jens Axboe
  2017-11-08 16:41                   ` Bart Van Assche
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2017-11-08  3:06 UTC (permalink / raw)
  To: Ming Lei
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman

On 11/07/2017 07:58 PM, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 07:55:32PM -0700, Jens Axboe wrote:
>> On 11/07/2017 05:39 PM, Ming Lei wrote:
>>> On Tue, Nov 07, 2017 at 04:20:08PM +0000, Bart Van Assche wrote:
>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>> If you can reproduce, please provide me at least the following log
>>>>> first:
>>>>>
>>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>
>>>>> If any pending requests aren't completed, please provide the related
>>>>> info in dbgfs about where the request is.
>>>>
>>>> Every time I ran the above or a similar command its output was empty. I
>>>> assume that's because the hang usually occurs in a phase where these debugfs
>>>> attributes either have not yet been created or have already disappeared.
>>>
>>> Could you dump all tags? Then you can see whether this attribute has disappeared.
>>>
>>> If that output is empty, it often means there are no pending requests
>>> left uncompleted. So if that is true, your hang is _not_ related to RESTART.
>>
>> You need to check sched_tags as well. It could still be a restart race
>> or problem, if tags is empty but sched_tags has busy bits.
> 
> Yeah, I didn't mention it because SRP is MQ hardware and the default
> scheduler is none, but if Bart changed that, sched_tags would need to be
> checked first.

At this point, I have no idea what Bart's setup looks like. Bart, it
would be REALLY helpful if you could tell us how you are reproducing
your hang. I don't know why this has to be dragged out.

Ming/Bart - there seems to be an increasing amount of tension between
you two, for reasons that are unknown to me. I suggest you put that
aside in the pursuit of fixing the current issue, and then we can
discuss how to best resolve these going forward. But right now the top
priority is getting to the bottom of this. There's a chance that the
issue I can reproduce is the same that Bart is seeing, in which case we
might be fixing both in one fell swoop. But if that isn't the case, then
we have some work to do this week.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  3:01                   ` Jens Axboe
@ 2017-11-08  3:12                     ` Ming Lei
  2017-11-08  3:17                         ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-08  3:12 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman

On Tue, Nov 07, 2017 at 08:01:48PM -0700, Jens Axboe wrote:
> On 11/07/2017 06:03 PM, Ming Lei wrote:
> > On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
> >> On 11/07/2017 10:36 AM, Jens Axboe wrote:
> >>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
> >>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
> >>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
> >>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> >>>>>>> If you can reproduce, please provide me at least the following log
> >>>>>>> first:
> >>>>>>>
> >>>>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> >>>>>>>
> >>>>>>> If any pending requests aren't completed, please provide the related
> >>>>>>> info in dbgfs about where the request is.
> >>>>>>
> >>>>>> Every time I ran the above or a similar command its output was empty. I
> >>>>>> assume that's because the hang usually occurs in a phase where these debugfs
> >>>>>> attributes either have not yet been created or have already disappeared.
> >>>>>
> >>>>> Bart, do you still see a hang with the patch that fixes the tag leak when
> >>>>> we fail to get a dispatch budget?
> >>>>>
> >>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
> >>>>>
> >>>>> I've run a lot of stability testing here, and I haven't run into any
> >>>>> issues. This is with shared tags as well. So if you still see the failure
> >>>>> case with the current tree AND the above patch, then I'll try and get
> >>>>> a test case setup that hits it too so we can get to the bottom of this.
> >>>>
> >>>> Ming, I managed to reproduce the hang using null_blk. Note this is
> >>>> WITHOUT the patch mentioned above, running with that now.
> >>>>
> >>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
> >>>>
> >>>> and using this fio file:
> >>>>
> >>>> [global]
> >>>> bs=4k
> >>>> rw=randread
> >>>> norandommap
> >>>> direct=1
> >>>> ioengine=libaio
> >>>> iodepth=4
> >>>>
> >>>> [nullb0]
> >>>> filename=/dev/nullb0
> >>>> [nullb1]
> >>>> filename=/dev/nullb1
> >>>> [nullb2]
> >>>> filename=/dev/nullb2
> >>>> [nullb3]
> >>>> filename=/dev/nullb3
> >>>>
> >>>> it seemed to keep running, but it hung when exiting. The troublesome
> >>>> device was nullb1:
> >>>>
> >>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
> >>>> [  492.520782]       Not tainted 4.14.0-rc7+ #499
> >>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >>>> [  492.535904] fio             D13208  3263   3211 0x00000000
> >>>> [  492.542535] Call Trace:
> >>>> [  492.545764]  __schedule+0x279/0x720
> >>>> [  492.550151]  schedule+0x33/0x90
> >>>> [  492.554145]  io_schedule+0x16/0x40
> >>>> [  492.558435]  blk_mq_get_tag+0x148/0x250
> >>>> [  492.563211]  ? finish_wait+0x90/0x90
> >>>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
> >>>> [  492.572760]  blk_mq_make_request+0xe2/0x690
> >>>> [  492.577913]  generic_make_request+0xfc/0x2f0
> >>>> [  492.583177]  submit_bio+0x64/0x120
> >>>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
> >>>> [  492.592736]  ? submit_bio+0x64/0x120
> >>>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
> >>>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
> >>>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
> >>>> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
> >>>> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
> >>>> [  492.623227]  generic_file_read_iter+0xb3/0xa00
> >>>> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
> >>>> [  492.634334]  ? security_file_permission+0x9b/0xc0
> >>>> [  492.640114]  blkdev_read_iter+0x35/0x40
> >>>> [  492.644877]  aio_read+0xc5/0x120
> >>>> [  492.648973]  ? aio_read_events+0x24c/0x340
> >>>> [  492.654124]  ? __might_sleep+0x4a/0x80
> >>>> [  492.658800]  do_io_submit+0x47c/0x5e0
> >>>> [  492.663373]  ? do_io_submit+0x47c/0x5e0
> >>>> [  492.668234]  SyS_io_submit+0x10/0x20
> >>>> [  492.672715]  ? SyS_io_submit+0x10/0x20
> >>>> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
> >>>> [  492.683039] RIP: 0033:0x7f83d1871717
> >>>> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
> >>>> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
> >>>> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
> >>>> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
> >>>> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
> >>>> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
> >>>>
> >>>> and if we look at the debug entries, it's waiting on a scheduler tag:
> >>>>
> >>>> sched_tags:nr_tags=2
> >>>> sched_tags:nr_reserved_tags=0
> >>>> sched_tags:active_queues=0
> >>>> sched_tags:bitmap_tags:
> >>>> sched_tags:depth=2
> >>>> sched_tags:busy=2
> >>>> sched_tags:bits_per_word=64
> >>>> sched_tags:map_nr=1
> >>>> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
> >>>> sched_tags:wake_batch=1
> >>>> sched_tags:wake_index=4
> >>>> sched_tags:ws={
> >>>> sched_tags:	{.wait_cnt=-102, .wait=inactive},
> >>>> sched_tags:	{.wait_cnt=-126, .wait=inactive},
> >>>> sched_tags:	{.wait_cnt=-72, .wait=inactive},
> >>>> sched_tags:	{.wait_cnt=-96, .wait=inactive},
> >>>> sched_tags:	{.wait_cnt=-134, .wait=inactive},
> >>>> sched_tags:	{.wait_cnt=-116, .wait=inactive},
> >>>> sched_tags:	{.wait_cnt=-90, .wait=inactive},
> >>>> sched_tags:	{.wait_cnt=-115, .wait=active},
> >>>> sched_tags:}
> >>>> sched_tags:round_robin=0
> >>>> sched_tags_bitmap:00000000: 03
> >>>>
> >>>> with SCHED_RESTART being set:
> >>>>
> >>>> state:SCHED_RESTART
> >>>>
> >>>> and with the driver tags being idle:
> >>>>
> >>>> tags:nr_tags=1
> >>>> tags:nr_reserved_tags=0
> >>>> tags:active_queues=0
> >>>> tags:bitmap_tags:
> >>>> tags:depth=1
> >>>> tags:busy=0
> >>>> tags:bits_per_word=64
> >>>> tags:map_nr=1
> >>>> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
> >>>> tags:wake_batch=1
> >>>> tags:wake_index=3
> >>>> tags:ws={
> >>>> tags:	{.wait_cnt=-48, .wait=inactive},
> >>>> tags:	{.wait_cnt=-39, .wait=inactive},
> >>>> tags:	{.wait_cnt=-50, .wait=inactive},
> >>>> tags:	{.wait_cnt=-47, .wait=inactive},
> >>>> tags:	{.wait_cnt=-25, .wait=inactive},
> >>>> tags:	{.wait_cnt=-24, .wait=inactive},
> >>>> tags:	{.wait_cnt=-47, .wait=inactive},
> >>>> tags:	{.wait_cnt=-47, .wait=inactive},
> >>>> tags:}
> >>>
> >>> Unsurprisingly (since this is sched_tags starvation) this still happens
> >>> with the patch. Same trace as above. Note that dispatch has two requests
> >>> sitting ready:
> >>>
> >>> ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
> >>> ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
> >>>
> >>> while we're blocked trying to get a new sched tag off the regular
> >>> blk_mq_make_request() path. It looks like a missing restart of the hctx.
> >>
> >> Just to keep everyone in the loop, this bug is not new to
> >> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
> >> probably different from what Bart is hitting, but it's a bug
> >> nonetheless...
> > 
> > Thanks for running this test and sharing the result with us.
> > 
> >>
> >> To me, this looks like a race between marking the queue as needing a
> >> restart, and the driver tag being released and re-running the queue. If
> >> I do:
> >>
> >> # echo run > /sys/kernel/debug/block/nullb1/state
> >>
> >> then it starts just fine. So there must be a race between marking the
> >> need for a restart (when getting the driver tag fails), and when one of
> >> the shared tag queues finishes a request and releases the driver tag.
> > 
> > Just one hint: blk-mq's RESTART won't work if no requests are pending,
> > but there looks to be no efficient/reliable way to handle that.
> 
> Requests are pending, see above output. My assumption/theory was that
> we were racing on setting restart. Simplified version:
> 
> CPUX					CPUY
> get_driver_tag, fails
> 					complete request, put tag
> 					check restart, not set
> mark needing restart

This case should be covered by TAG_WAITING.

If get_driver_tag fails on CPUX, this hctx will be added to the tags'
wait queue, then the 'complete request' on CPUY will trigger the queue
to run via blk_mq_dispatch_wake().

It looks like get_driver_tag() is run again after the hctx is added to
the tags' wait queue, so TAG_WAITING should work fine for this case in
theory.
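
Roughly, the ordering that is supposed to make this safe looks like the
following (a simplified sketch, not the exact upstream code; the real
helpers take more arguments):

	/* dispatch side (CPUX), in blk_mq_dispatch_rq_list() */
	if (!blk_mq_get_driver_tag(rq)) {
		/*
		 * Add this hctx to the tag set's wait queue and set
		 * TAG_WAITING, so that a later tag release wakes us.
		 */
		blk_mq_dispatch_wait_add(hctx);

		/*
		 * Retry after queueing the wait entry: if a tag was
		 * freed in between, we make progress here instead of
		 * waiting for a wakeup that has already happened.
		 */
		if (!blk_mq_get_driver_tag(rq))
			break;	/* rely on blk_mq_dispatch_wake() */
	}

	/*
	 * completion side (CPUY): freeing the driver tag wakes the
	 * wait queue, and blk_mq_dispatch_wake() re-runs the hw queue.
	 */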

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-07 16:17         ` Bart Van Assche
  (?)
@ 2017-11-08  3:12         ` Jens Axboe
  -1 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2017-11-08  3:12 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei
  Cc: linux-scsi, hch, linux-block, osandov, john.garry, loberman

On 11/07/2017 09:17 AM, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 18:15 +0800, Ming Lei wrote:
> >> Last time, you didn't mention the target patch that sets its
> >> can_queue to 1, so I think you can't reproduce the issue on an
> >> upstream kernel without an out-of-tree patch. Then it looks like it
> >> is another issue, and we are actually making progress.
> 
> If I don't trust a patch, I introduce additional tests. The fact that I
> modified the SRP initiator before this hang occurred does not mean that the
> approach of your patch is fine. What it means is that all your patch does
> is narrow the race window, and that a race window still remains.

I agree, reducing the depth to test that specific case is perfectly
valid; it doesn't make the test invalid. It's entirely possible that
other use cases out there have exactly that configuration. My null_blk
test case is basically the same. But it would be nice if all of this
were out in the open, so everybody is clear on exactly what is being
run/tested.

However, where my trace is hung off getting a scheduler tag, yours seems
to be waiting on IO completion. So not the same fingerprint, which is
worrisome.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  3:12                     ` Ming Lei
@ 2017-11-08  3:17                         ` Jens Axboe
  0 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2017-11-08  3:17 UTC (permalink / raw)
  To: Ming Lei
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman, Omar Sandoval

On 11/07/2017 08:12 PM, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 08:01:48PM -0700, Jens Axboe wrote:
>> On 11/07/2017 06:03 PM, Ming Lei wrote:
>>> On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
>>>> On 11/07/2017 10:36 AM, Jens Axboe wrote:
>>>>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
>>>>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
>>>>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>>>>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>>>>>> If you can reproduce, please provide me at least the following log
>>>>>>>>> first:
>>>>>>>>>
>>>>>>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>>>>>
>>>>>>>>> If any pending requests aren't completed, please provide the related
>>>>>>>>> info in debugfs about where the request is.
>>>>>>>>
>>>>>>>> Every time I ran the above or a similar command its output was empty. I
>>>>>>>> assume that's because the hang usually occurs in a phase where these debugfs
>>>>>>>> attributes either have not yet been created or have already disappeared.
>>>>>>>
>>>>>>> Bart, do you still see a hang with the patch that fixes the tag leak when
>>>>>>> we fail to get a dispatch budget?
>>>>>>>
>>>>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
>>>>>>>
>>>>>>> I've run a lot of stability testing here, and I haven't run into any
>>>>>>> issues. This is with shared tags as well. So if you still see the failure
>>>>>>> case with the current tree AND the above patch, then I'll try and get
>>>>>>> a test case setup that hits it too so we can get to the bottom of this.
>>>>>>
>>>>>> Ming, I managed to reproduce the hang using null_blk. Note this is
>>>>>> WITHOUT the patch mentioned above, running with that now.
>>>>>>
>>>>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
>>>>>>
>>>>>> and using this fio file:
>>>>>>
>>>>>> [global]
>>>>>> bs=4k
>>>>>> rw=randread
>>>>>> norandommap
>>>>>> direct=1
>>>>>> ioengine=libaio
>>>>>> iodepth=4
>>>>>>
>>>>>> [nullb0]
>>>>>> filename=/dev/nullb0
>>>>>> [nullb1]
>>>>>> filename=/dev/nullb1
>>>>>> [nullb2]
>>>>>> filename=/dev/nullb2
>>>>>> [nullb3]
>>>>>> filename=/dev/nullb3
>>>>>>
>>>>>> it seemed to keep running, but it hung when exiting. The troublesome
>>>>>> device was nullb1:
>>>>>>
>>>>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
>>>>>> [  492.520782]       Not tainted 4.14.0-rc7+ #499
>>>>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>>>> [  492.535904] fio             D13208  3263   3211 0x00000000
>>>>>> [  492.542535] Call Trace:
>>>>>> [  492.545764]  __schedule+0x279/0x720
>>>>>> [  492.550151]  schedule+0x33/0x90
>>>>>> [  492.554145]  io_schedule+0x16/0x40
>>>>>> [  492.558435]  blk_mq_get_tag+0x148/0x250
>>>>>> [  492.563211]  ? finish_wait+0x90/0x90
>>>>>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
>>>>>> [  492.572760]  blk_mq_make_request+0xe2/0x690
>>>>>> [  492.577913]  generic_make_request+0xfc/0x2f0
>>>>>> [  492.583177]  submit_bio+0x64/0x120
>>>>>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
>>>>>> [  492.592736]  ? submit_bio+0x64/0x120
>>>>>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
>>>>>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
>>>>>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
>>>>>> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
>>>>>> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
>>>>>> [  492.623227]  generic_file_read_iter+0xb3/0xa00
>>>>>> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
>>>>>> [  492.634334]  ? security_file_permission+0x9b/0xc0
>>>>>> [  492.640114]  blkdev_read_iter+0x35/0x40
>>>>>> [  492.644877]  aio_read+0xc5/0x120
>>>>>> [  492.648973]  ? aio_read_events+0x24c/0x340
>>>>>> [  492.654124]  ? __might_sleep+0x4a/0x80
>>>>>> [  492.658800]  do_io_submit+0x47c/0x5e0
>>>>>> [  492.663373]  ? do_io_submit+0x47c/0x5e0
>>>>>> [  492.668234]  SyS_io_submit+0x10/0x20
>>>>>> [  492.672715]  ? SyS_io_submit+0x10/0x20
>>>>>> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
>>>>>> [  492.683039] RIP: 0033:0x7f83d1871717
>>>>>> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
>>>>>> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
>>>>>> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
>>>>>> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
>>>>>> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
>>>>>> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
>>>>>>
>>>>>> and if we look at the debug entries, it's waiting on a scheduler tag:
>>>>>>
>>>>>> sched_tags:nr_tags=2
>>>>>> sched_tags:nr_reserved_tags=0
>>>>>> sched_tags:active_queues=0
>>>>>> sched_tags:bitmap_tags:
>>>>>> sched_tags:depth=2
>>>>>> sched_tags:busy=2
>>>>>> sched_tags:bits_per_word=64
>>>>>> sched_tags:map_nr=1
>>>>>> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
>>>>>> sched_tags:wake_batch=1
>>>>>> sched_tags:wake_index=4
>>>>>> sched_tags:ws={
>>>>>> sched_tags:	{.wait_cnt=-102, .wait=inactive},
>>>>>> sched_tags:	{.wait_cnt=-126, .wait=inactive},
>>>>>> sched_tags:	{.wait_cnt=-72, .wait=inactive},
>>>>>> sched_tags:	{.wait_cnt=-96, .wait=inactive},
>>>>>> sched_tags:	{.wait_cnt=-134, .wait=inactive},
>>>>>> sched_tags:	{.wait_cnt=-116, .wait=inactive},
>>>>>> sched_tags:	{.wait_cnt=-90, .wait=inactive},
>>>>>> sched_tags:	{.wait_cnt=-115, .wait=active},
>>>>>> sched_tags:}
>>>>>> sched_tags:round_robin=0
>>>>>> sched_tags_bitmap:00000000: 03
>>>>>>
>>>>>> with SCHED_RESTART being set:
>>>>>>
>>>>>> state:SCHED_RESTART
>>>>>>
>>>>>> and with the driver tags being idle:
>>>>>>
>>>>>> tags:nr_tags=1
>>>>>> tags:nr_reserved_tags=0
>>>>>> tags:active_queues=0
>>>>>> tags:bitmap_tags:
>>>>>> tags:depth=1
>>>>>> tags:busy=0
>>>>>> tags:bits_per_word=64
>>>>>> tags:map_nr=1
>>>>>> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
>>>>>> tags:wake_batch=1
>>>>>> tags:wake_index=3
>>>>>> tags:ws={
>>>>>> tags:	{.wait_cnt=-48, .wait=inactive},
>>>>>> tags:	{.wait_cnt=-39, .wait=inactive},
>>>>>> tags:	{.wait_cnt=-50, .wait=inactive},
>>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>>>> tags:	{.wait_cnt=-25, .wait=inactive},
>>>>>> tags:	{.wait_cnt=-24, .wait=inactive},
>>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>>>> tags:}
>>>>>
>>>>> Unsurprisingly (since this is sched_tags starvation) this still happens
>>>>> with the patch. Same trace as above. Note that dispatch has two requests
>>>>> sitting ready:
>>>>>
>>>>> ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
>>>>> ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
>>>>>
>>>>> while we're blocked trying to get a new sched tag off the regular
>>>>> blk_mq_make_request() path. It looks like a missing restart of the hctx.
>>>>
>>>> Just to keep everyone in the loop, this bug is not new to
>>>> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
>>>> probably different from what Bart is hitting, but it's a bug
>>>> nonetheless...
>>>
>>> Thanks for running this test and sharing the result with us.
>>>
>>>>
>>>> To me, this looks like a race between marking the queue as needing a
>>>> restart, and the driver tag being released and re-running the queue. If
>>>> I do:
>>>>
>>>> # echo run > /sys/kernel/debug/block/nullb1/state
>>>>
>>>> then it starts just fine. So there must be a race between marking the
>>>> need for a restart (when getting the driver tag fails), and when one of
>>>> the shared tag queues finishes a request and releases the driver tag.
>>>
>>> Just one hint: blk-mq's RESTART won't work if no requests are pending,
>>> but there looks to be no efficient/reliable way to handle that.
>>
>> Requests are pending, see above output. My assumption/theory was that
>> we were racing on setting restart. Simplified version:
>>
>> CPUX					CPUY
>> get_driver_tag, fails
>> 					complete request, put tag
>> 					check restart, not set
>> mark needing restart
> 
> This case should be covered by TAG_WAITING.

Right. The keyword here being 'should'.

> If get_driver_tag fails on CPUX, this hctx will be added to the tags'
> wait queue, then the 'complete request' on CPUY will trigger the queue
> to run via blk_mq_dispatch_wake().

I know how it's _supposed_ to work. I'm saying that the debug info
indicates that we have a problem in there, in exactly the area that
TAG_WAITING should be covering for us. TAG_WAITING may be fine and the
issue could be somewhere else; it just looks identical for now.

> It looks like get_driver_tag() is run again after the hctx is added to
> the tags' wait queue, so TAG_WAITING should work fine for this case in
> theory.

Yep, in theory it should be fine. I'm going to dive in tomorrow and
figure out where it falls apart.
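
For reference, the per-hctx state quoted above can be read directly from
debugfs; on this setup that should be something like the following,
assuming hctx0 is the only hardware queue, as with submit_queues=1:

	# cat /sys/kernel/debug/block/nullb1/hctx0/state
	# cat /sys/kernel/debug/block/nullb1/hctx0/sched_tags
	# cat /sys/kernel/debug/block/nullb1/hctx0/tags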

CC'ing Omar - one thing we talked about today was the issue being
somewhat similar to what kyber recently fixed in this commit:

commit 8cf466602028196b939255f1eb4e9817efd1db6d
Author: Omar Sandoval <osandov@fb.com>
Date:   Wed Oct 11 10:39:15 2017 -0700

    kyber: fix hang on domain token wait queue

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  3:17                         ` Jens Axboe
  (?)
@ 2017-11-08  6:20                         ` Ming Lei
  2017-11-08 15:59                           ` Ming Lei
  -1 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-08  6:20 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman

On Tue, Nov 07, 2017 at 08:17:59PM -0700, Jens Axboe wrote:
> On 11/07/2017 08:12 PM, Ming Lei wrote:
> > On Tue, Nov 07, 2017 at 08:01:48PM -0700, Jens Axboe wrote:
> >> On 11/07/2017 06:03 PM, Ming Lei wrote:
> >>> On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
> >>>> On 11/07/2017 10:36 AM, Jens Axboe wrote:
> >>>>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
> >>>>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
> >>>>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
> >>>>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> >>>>>>>>> If you can reproduce, please provide me at least the following log
> >>>>>>>>> first:
> >>>>>>>>>
> >>>>>>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> >>>>>>>>>
> >>>>>>>>> If any pending requests aren't completed, please provide the related
> >>>>>>>>> info in debugfs about where the request is.
> >>>>>>>>
> >>>>>>>> Every time I ran the above or a similar command its output was empty. I
> >>>>>>>> assume that's because the hang usually occurs in a phase where these debugfs
> >>>>>>>> attributes either have not yet been created or have already disappeared.
> >>>>>>>
> >>>>>>> Bart, do you still see a hang with the patch that fixes the tag leak when
> >>>>>>> we fail to get a dispatch budget?
> >>>>>>>
> >>>>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
> >>>>>>>
> >>>>>>> I've run a lot of stability testing here, and I haven't run into any
> >>>>>>> issues. This is with shared tags as well. So if you still see the failure
> >>>>>>> case with the current tree AND the above patch, then I'll try and get
> >>>>>>> a test case setup that hits it too so we can get to the bottom of this.
> >>>>>>
> >>>>>> Ming, I managed to reproduce the hang using null_blk. Note this is
> >>>>>> WITHOUT the patch mentioned above, running with that now.
> >>>>>>
> >>>>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
> >>>>>>
> >>>>>> and using this fio file:
> >>>>>>
> >>>>>> [global]
> >>>>>> bs=4k
> >>>>>> rw=randread
> >>>>>> norandommap
> >>>>>> direct=1
> >>>>>> ioengine=libaio
> >>>>>> iodepth=4
> >>>>>>
> >>>>>> [nullb0]
> >>>>>> filename=/dev/nullb0
> >>>>>> [nullb1]
> >>>>>> filename=/dev/nullb1
> >>>>>> [nullb2]
> >>>>>> filename=/dev/nullb2
> >>>>>> [nullb3]
> >>>>>> filename=/dev/nullb3
> >>>>>>
> >>>>>> it seemed to keep running, but it hung when exiting. The troublesome
> >>>>>> device was nullb1:
> >>>>>>
> >>>>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
> >>>>>> [  492.520782]       Not tainted 4.14.0-rc7+ #499
> >>>>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >>>>>> [  492.535904] fio             D13208  3263   3211 0x00000000
> >>>>>> [  492.542535] Call Trace:
> >>>>>> [  492.545764]  __schedule+0x279/0x720
> >>>>>> [  492.550151]  schedule+0x33/0x90
> >>>>>> [  492.554145]  io_schedule+0x16/0x40
> >>>>>> [  492.558435]  blk_mq_get_tag+0x148/0x250
> >>>>>> [  492.563211]  ? finish_wait+0x90/0x90
> >>>>>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
> >>>>>> [  492.572760]  blk_mq_make_request+0xe2/0x690
> >>>>>> [  492.577913]  generic_make_request+0xfc/0x2f0
> >>>>>> [  492.583177]  submit_bio+0x64/0x120
> >>>>>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
> >>>>>> [  492.592736]  ? submit_bio+0x64/0x120
> >>>>>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
> >>>>>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
> >>>>>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
> >>>>>> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
> >>>>>> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
> >>>>>> [  492.623227]  generic_file_read_iter+0xb3/0xa00
> >>>>>> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
> >>>>>> [  492.634334]  ? security_file_permission+0x9b/0xc0
> >>>>>> [  492.640114]  blkdev_read_iter+0x35/0x40
> >>>>>> [  492.644877]  aio_read+0xc5/0x120
> >>>>>> [  492.648973]  ? aio_read_events+0x24c/0x340
> >>>>>> [  492.654124]  ? __might_sleep+0x4a/0x80
> >>>>>> [  492.658800]  do_io_submit+0x47c/0x5e0
> >>>>>> [  492.663373]  ? do_io_submit+0x47c/0x5e0
> >>>>>> [  492.668234]  SyS_io_submit+0x10/0x20
> >>>>>> [  492.672715]  ? SyS_io_submit+0x10/0x20
> >>>>>> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
> >>>>>> [  492.683039] RIP: 0033:0x7f83d1871717
> >>>>>> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
> >>>>>> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
> >>>>>> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
> >>>>>> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
> >>>>>> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
> >>>>>> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
> >>>>>>
> >>>>>> and if we look at the debug entries, it's waiting on a scheduler tag:
> >>>>>>
> >>>>>> sched_tags:nr_tags=2
> >>>>>> sched_tags:nr_reserved_tags=0
> >>>>>> sched_tags:active_queues=0
> >>>>>> sched_tags:bitmap_tags:
> >>>>>> sched_tags:depth=2
> >>>>>> sched_tags:busy=2
> >>>>>> sched_tags:bits_per_word=64
> >>>>>> sched_tags:map_nr=1
> >>>>>> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
> >>>>>> sched_tags:wake_batch=1
> >>>>>> sched_tags:wake_index=4
> >>>>>> sched_tags:ws={
> >>>>>> sched_tags:	{.wait_cnt=-102, .wait=inactive},
> >>>>>> sched_tags:	{.wait_cnt=-126, .wait=inactive},
> >>>>>> sched_tags:	{.wait_cnt=-72, .wait=inactive},
> >>>>>> sched_tags:	{.wait_cnt=-96, .wait=inactive},
> >>>>>> sched_tags:	{.wait_cnt=-134, .wait=inactive},
> >>>>>> sched_tags:	{.wait_cnt=-116, .wait=inactive},
> >>>>>> sched_tags:	{.wait_cnt=-90, .wait=inactive},
> >>>>>> sched_tags:	{.wait_cnt=-115, .wait=active},
> >>>>>> sched_tags:}
> >>>>>> sched_tags:round_robin=0
> >>>>>> sched_tags_bitmap:00000000: 03
> >>>>>>
> >>>>>> with SCHED_RESTART being set:
> >>>>>>
> >>>>>> state:SCHED_RESTART
> >>>>>>
> >>>>>> and with the driver tags being idle:
> >>>>>>
> >>>>>> tags:nr_tags=1
> >>>>>> tags:nr_reserved_tags=0
> >>>>>> tags:active_queues=0
> >>>>>> tags:bitmap_tags:
> >>>>>> tags:depth=1
> >>>>>> tags:busy=0
> >>>>>> tags:bits_per_word=64
> >>>>>> tags:map_nr=1
> >>>>>> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
> >>>>>> tags:wake_batch=1
> >>>>>> tags:wake_index=3
> >>>>>> tags:ws={
> >>>>>> tags:	{.wait_cnt=-48, .wait=inactive},
> >>>>>> tags:	{.wait_cnt=-39, .wait=inactive},
> >>>>>> tags:	{.wait_cnt=-50, .wait=inactive},
> >>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
> >>>>>> tags:	{.wait_cnt=-25, .wait=inactive},
> >>>>>> tags:	{.wait_cnt=-24, .wait=inactive},
> >>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
> >>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
> >>>>>> tags:}
> >>>>>
> >>>>> Unsurprisingly (since this is sched_tags starvation) this still happens
> >>>>> with the patch. Same trace as above. Note that dispatch has two requests
> >>>>> sitting ready:
> >>>>>
> >>>>> ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
> >>>>> ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
> >>>>>
> >>>>> while we're blocked trying to get a new sched tag off the regular
> >>>>> blk_mq_make_request() path. It looks like a missing restart of the hctx.
> >>>>
> >>>> Just to keep everyone in the loop, this bug is not new to
> >>>> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
> >>>> probably different from what Bart is hitting, but it's a bug
> >>>> nonetheless...
> >>>
> >>> Thanks for running this test and sharing the result with us.
> >>>
> >>>>
> >>>> To me, this looks like a race between marking the queue as needing a
> >>>> restart, and the driver tag being released and re-running the queue. If
> >>>> I do:
> >>>>
> >>>> # echo run > /sys/kernel/debug/block/nullb1/state
> >>>>
> >>>> then it starts just fine. So there must be a race between marking the
> >>>> need for a restart (when getting the driver tag fails), and when one of
> >>>> the shared tag queues finishes a request and releases the driver tag.
> >>>
> >>> Just one hint: blk-mq's RESTART won't work if no requests are pending,
> >>> but there looks to be no efficient/reliable way to handle that.
> >>
> >> Requests are pending, see above output. My assumption/theory was that
> >> we were racing on setting restart. Simplified version:
> >>
> >> CPUX					CPUY
> >> get_driver_tag, fails
> >> 					complete request, put tag
> >> 					check restart, not set
> >> mark needing restart
> > 
> > This case should be covered by TAG_WAITING.
> 
> Right. The keyword here being 'should'.
> 
> > If get_driver_tag fails on CPUX, this hctx will be added to the tags'
> > wait queue, then the 'complete request' on CPUY will trigger the queue
> > to run via blk_mq_dispatch_wake().
> 
> I know how it's _supposed_ to work. I'm saying that the debug info
> indicates that we have a problem in there, in exactly the area that
> TAG_WAITING should be covering for us. TAG_WAITING may be fine and the
> issue could be somewhere else; it just looks identical for now.

One possible reason:

	- the request to be dispatched may not yet have been added to
	hctx->dispatch when blk_mq_dispatch_wake() is triggered.
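
In other words, a hypothetical interleaving like the following (shown in
the same style as the earlier CPUX/CPUY diagram, for illustration only)
would lose the wakeup:

	CPUX					CPUY
	get_driver_tag, fails
	add hctx to tags' wait queue
	get_driver_tag again, fails
					complete request, put tag
					blk_mq_dispatch_wake(): run queue,
					but hctx->dispatch is still empty
	add rq to hctx->dispatch
	(no further wakeup, request stuck)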

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08  6:20                         ` Ming Lei
@ 2017-11-08 15:59                           ` Ming Lei
  2017-11-08 18:19                             ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Ming Lei @ 2017-11-08 15:59 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman

On Wed, Nov 08, 2017 at 02:20:41PM +0800, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 08:17:59PM -0700, Jens Axboe wrote:
> > On 11/07/2017 08:12 PM, Ming Lei wrote:
> > > On Tue, Nov 07, 2017 at 08:01:48PM -0700, Jens Axboe wrote:
> > >> On 11/07/2017 06:03 PM, Ming Lei wrote:
> > >>> On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
> > >>>> On 11/07/2017 10:36 AM, Jens Axboe wrote:
> > >>>>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
> > >>>>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
> > >>>>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
> > >>>>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
> > >>>>>>>>> If you can reproduce, please provide me at least the following log
> > >>>>>>>>> first:
> > >>>>>>>>>
> > >>>>>>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
> > >>>>>>>>>
> > >>>>>>>>> If any pending requests aren't completed, please provide the related
> > >>>>>>>>> info in debugfs about where the request is.
> > >>>>>>>>
> > >>>>>>>> Every time I ran the above or a similar command its output was empty. I
> > >>>>>>>> assume that's because the hang usually occurs in a phase where these debugfs
> > >>>>>>>> attributes either have not yet been created or have already disappeared.
> > >>>>>>>
> > >>>>>>> Bart, do you still see a hang with the patch that fixes the tag leak when
> > >>>>>>> we fail to get a dispatch budget?
> > >>>>>>>
> > >>>>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
> > >>>>>>>
> > >>>>>>> I've run a lot of stability testing here, and I haven't run into any
> > >>>>>>> issues. This is with shared tags as well. So if you still see the failure
> > >>>>>>> case with the current tree AND the above patch, then I'll try and get
> > >>>>>>> a test case setup that hits it too so we can get to the bottom of this.
> > >>>>>>
> > >>>>>> Ming, I managed to reproduce the hang using null_blk. Note this is
> > >>>>>> WITHOUT the patch mentioned above, running with that now.
> > >>>>>>
> > >>>>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
> > >>>>>>
> > >>>>>> and using this fio file:
> > >>>>>>
> > >>>>>> [global]
> > >>>>>> bs=4k
> > >>>>>> rw=randread
> > >>>>>> norandommap
> > >>>>>> direct=1
> > >>>>>> ioengine=libaio
> > >>>>>> iodepth=4
> > >>>>>>
> > >>>>>> [nullb0]
> > >>>>>> filename=/dev/nullb0
> > >>>>>> [nullb1]
> > >>>>>> filename=/dev/nullb1
> > >>>>>> [nullb2]
> > >>>>>> filename=/dev/nullb2
> > >>>>>> [nullb3]
> > >>>>>> filename=/dev/nullb3
> > >>>>>>
> > >>>>>> it seemed to keep running, but it hung when exiting. The troublesome
> > >>>>>> device was nullb1:
> > >>>>>>
> > >>>>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
> > >>>>>> [  492.520782]       Not tainted 4.14.0-rc7+ #499
> > >>>>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > >>>>>> [  492.535904] fio             D13208  3263   3211 0x00000000
> > >>>>>> [  492.542535] Call Trace:
> > >>>>>> [  492.545764]  __schedule+0x279/0x720
> > >>>>>> [  492.550151]  schedule+0x33/0x90
> > >>>>>> [  492.554145]  io_schedule+0x16/0x40
> > >>>>>> [  492.558435]  blk_mq_get_tag+0x148/0x250
> > >>>>>> [  492.563211]  ? finish_wait+0x90/0x90
> > >>>>>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
> > >>>>>> [  492.572760]  blk_mq_make_request+0xe2/0x690
> > >>>>>> [  492.577913]  generic_make_request+0xfc/0x2f0
> > >>>>>> [  492.583177]  submit_bio+0x64/0x120
> > >>>>>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
> > >>>>>> [  492.592736]  ? submit_bio+0x64/0x120
> > >>>>>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
> > >>>>>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
> > >>>>>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
> > >>>>>> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
> > >>>>>> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
> > >>>>>> [  492.623227]  generic_file_read_iter+0xb3/0xa00
> > >>>>>> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
> > >>>>>> [  492.634334]  ? security_file_permission+0x9b/0xc0
> > >>>>>> [  492.640114]  blkdev_read_iter+0x35/0x40
> > >>>>>> [  492.644877]  aio_read+0xc5/0x120
> > >>>>>> [  492.648973]  ? aio_read_events+0x24c/0x340
> > >>>>>> [  492.654124]  ? __might_sleep+0x4a/0x80
> > >>>>>> [  492.658800]  do_io_submit+0x47c/0x5e0
> > >>>>>> [  492.663373]  ? do_io_submit+0x47c/0x5e0
> > >>>>>> [  492.668234]  SyS_io_submit+0x10/0x20
> > >>>>>> [  492.672715]  ? SyS_io_submit+0x10/0x20
> > >>>>>> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
> > >>>>>> [  492.683039] RIP: 0033:0x7f83d1871717
> > >>>>>> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
> > >>>>>> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
> > >>>>>> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
> > >>>>>> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
> > >>>>>> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
> > >>>>>> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
> > >>>>>>
> > >>>>>> and if we look at the debug entries, it's waiting on a scheduler tag:
> > >>>>>>
> > >>>>>> sched_tags:nr_tags=2
> > >>>>>> sched_tags:nr_reserved_tags=0
> > >>>>>> sched_tags:active_queues=0
> > >>>>>> sched_tags:bitmap_tags:
> > >>>>>> sched_tags:depth=2
> > >>>>>> sched_tags:busy=2
> > >>>>>> sched_tags:bits_per_word=64
> > >>>>>> sched_tags:map_nr=1
> > >>>>>> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
> > >>>>>> sched_tags:wake_batch=1
> > >>>>>> sched_tags:wake_index=4
> > >>>>>> sched_tags:ws={
> > >>>>>> sched_tags:	{.wait_cnt=-102, .wait=inactive},
> > >>>>>> sched_tags:	{.wait_cnt=-126, .wait=inactive},
> > >>>>>> sched_tags:	{.wait_cnt=-72, .wait=inactive},
> > >>>>>> sched_tags:	{.wait_cnt=-96, .wait=inactive},
> > >>>>>> sched_tags:	{.wait_cnt=-134, .wait=inactive},
> > >>>>>> sched_tags:	{.wait_cnt=-116, .wait=inactive},
> > >>>>>> sched_tags:	{.wait_cnt=-90, .wait=inactive},
> > >>>>>> sched_tags:	{.wait_cnt=-115, .wait=active},
> > >>>>>> sched_tags:}
> > >>>>>> sched_tags:round_robin=0
> > >>>>>> sched_tags_bitmap:00000000: 03
> > >>>>>>
> > >>>>>> with SCHED_RESTART being set:
> > >>>>>>
> > >>>>>> state:SCHED_RESTART
> > >>>>>>
> > >>>>>> and with the driver tags being idle:
> > >>>>>>
> > >>>>>> tags:nr_tags=1
> > >>>>>> tags:nr_reserved_tags=0
> > >>>>>> tags:active_queues=0
> > >>>>>> tags:bitmap_tags:
> > >>>>>> tags:depth=1
> > >>>>>> tags:busy=0
> > >>>>>> tags:bits_per_word=64
> > >>>>>> tags:map_nr=1
> > >>>>>> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
> > >>>>>> tags:wake_batch=1
> > >>>>>> tags:wake_index=3
> > >>>>>> tags:ws={
> > >>>>>> tags:	{.wait_cnt=-48, .wait=inactive},
> > >>>>>> tags:	{.wait_cnt=-39, .wait=inactive},
> > >>>>>> tags:	{.wait_cnt=-50, .wait=inactive},
> > >>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
> > >>>>>> tags:	{.wait_cnt=-25, .wait=inactive},
> > >>>>>> tags:	{.wait_cnt=-24, .wait=inactive},
> > >>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
> > >>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
> > >>>>>> tags:}
> > >>>>>
> > >>>>> Unsurprisingly (since this is sched_tags starvation) this still happens
> > >>>>> with the patch. Same trace as above. Note that dispatch has two requests
> > >>>>> sitting ready:
> > >>>>>
> > >>>>> ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
> > >>>>> ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
> > >>>>>
> > >>>>> while we're blocked trying to get a new sched tag off the regular
> > >>>>> blk_mq_make_request() path. It looks like a missing restart of the hctx.
> > >>>>
> > >>>> Just to keep everyone in the loop, this bug is not new to
> > >>>> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
> > >>>> probably different from what Bart is hitting, but it's a bug
> > >>>> nonetheless...
> > >>>
> > >>> Thanks for running this test and sharing the result with us.
> > >>>
> > >>>>
> > >>>> To me, this looks like a race between marking the queue as needing a
> > >>>> restart, and the driver tag being released and re-running the queue. If
> > >>>> I do:
> > >>>>
> > >>>> # echo run > /sys/kernel/debug/block/nullb1/state
> > >>>>
> > >>>> then it starts just fine. So there must be a race between marking the
> > >>>> need for a restart (when getting the driver tag fails), and when one of
> > >>>> the shared tag queues finishes a request and releases the driver tag.
> > >>>
> > >>> Just one hint: blk-mq's RESTART won't work if no requests are pending,
> > >>> but there looks to be no efficient/reliable way to handle that.
> > >>
> > >> Requests are pending, see above output. My assumption/theory was that
> > >> we were racing on setting restart. Simplified version:
> > >>
> > >> CPUX					CPUY
> > >> get_driver_tag, fails
> > >> 					complete request, put tag
> > >> 					check restart, not set
> > >> mark needing restart
> > > 
> > > This case should be covered by TAG_WAITING.
> > 
> > Right. The keyword here being 'should'.
> > 
> > > If get_driver_tag fails on CPUX, this hctx will be added to the tags'
> > > wait queue, then the 'complete request' on CPUY will trigger the queue
> > > to run via blk_mq_dispatch_wake().
> > 
> > I know how it's _supposed_ to work. I'm saying that the debug info
> > indicates that we have a problem in there, in exactly the area that
> > TAG_WAITING should be covering for us. TAG_WAITING may be fine and the
> > issue could be somewhere else; it just looks identical for now.
> 
> One possible reason:
> 
> 	- the request to be dispatched may not yet have been added to
> 	hctx->dispatch when blk_mq_dispatch_wake() is triggered.

If we call blk_mq_dispatch_wait_add() after the request has been added to
hctx->dispatch, TAG_WAITING becomes basically similar to SCHED_RESTART.

I no longer object to reverting 358a3a6bccb7 ("blk-mq: don't handle
TAG_SHARED in restart"), because SCHED_RESTART works fine from this
viewpoint even though it isn't efficient. We can improve it in the future.

Also, we might need to unify TAG_WAITING and SCHED_RESTART into one
generic restart mechanism and make it efficient. IMO, neither of the two
is efficient enough.
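
A minimal sketch of the reordering described above (assuming the dispatch
path can be restructured this way; this is not the actual revert or any
posted patch):

	/*
	 * Make the request visible to the wakeup handler first, so that
	 * blk_mq_dispatch_wake() -> run queue actually finds work to do.
	 */
	spin_lock(&hctx->lock);
	list_add_tail(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);

	/*
	 * Only then register for the tag-free wakeup, and re-run the
	 * queue once in case a tag was freed before the wait entry was
	 * queued.
	 */
	blk_mq_dispatch_wait_add(hctx);
	blk_mq_run_hw_queue(hctx, true);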

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
@ 2017-11-08 16:41                   ` Bart Van Assche
  0 siblings, 0 replies; 48+ messages in thread
From: Bart Van Assche @ 2017-11-08 16:41 UTC (permalink / raw)
  To: axboe, ming.lei
  Cc: linux-block, hch, martin.petersen, linux-scsi, john.garry,
	osandov, jejb, loberman

On Tue, 2017-11-07 at 20:06 -0700, Jens Axboe wrote:
> At this point, I have no idea what Bart's setup looks like. Bart, it
> would be REALLY helpful if you could tell us how you are reproducing
> your hang. I don't know why this has to be dragged out.

Hello Jens,

It is a disappointment to me that you have allowed Ming to evaluate other
approaches than reverting "blk-mq: don't handle TAG_SHARED in restart". That
patch namely replaces an algorithm that is trusted by the community with an
algorithm of which even Ming acknowledged that it is racy. A quote from [1]:
"IO hang may be caused if all requests are completed just before the current
SCSI device is added to shost->starved_list". I don't know of any way to fix
that race other than serializing request submission and completion by adding
locking around these actions, which is something we don't want. Hence my
request to revert that patch.

Regarding the test I run, here is a summary of what I mentioned in previous
e-mails:
* I modified the SRP initiator such that the SCSI target queue depth is
  reduced to one by setting starget->can_queue to 1 from inside
  scsi_host_template.target_alloc (a sketch of this follows below).
* With that modified SRP initiator I run the srp-test software as follows
  until something breaks:
  while ./run_tests -f xfs -d -e deadline -r 60; do :; done
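
A minimal sketch of that initiator modification, assuming a target_alloc
callback along these lines (it is not the actual SRP patch):

	#include <scsi/scsi_device.h>
	#include <scsi/scsi_host.h>

	static int srp_target_alloc(struct scsi_target *starget)
	{
		/* throttle the whole target to one in-flight command */
		starget->can_queue = 1;
		return 0;
	}

	static struct scsi_host_template srp_template = {
		/* other fields omitted */
		.target_alloc	= srp_target_alloc,
	};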

Today a system with at least one InfiniBand HCA is required to run that test.
When I have the time I will post the SRP initiator and target patches on the
linux-rdma mailing list that make it possible to run that test against the
SoftRoCE driver (drivers/infiniband/sw/rxe). The only hardware required to
use that driver is an Ethernet adapter.

Bart.

[1] [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
(https://www.mail-archive.com/linux-block@vger.kernel.org/msg15263.html).

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08 16:41                   ` Bart Van Assche
  (?)
@ 2017-11-08 17:57                   ` Jens Axboe
  2017-11-08 18:22                       ` Laurence Oberman
  2017-11-09  4:02                     ` Ming Lei
  -1 siblings, 2 replies; 48+ messages in thread
From: Jens Axboe @ 2017-11-08 17:57 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei
  Cc: linux-block, hch, martin.petersen, linux-scsi, john.garry,
	osandov, jejb, loberman

On 11/08/2017 09:41 AM, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 20:06 -0700, Jens Axboe wrote:
>> At this point, I have no idea what Bart's setup looks like. Bart, it
>> would be REALLY helpful if you could tell us how you are reproducing
>> your hang. I don't know why this has to be dragged out.
> 
> Hello Jens,
> 
> It is a disappointment to me that you have allowed Ming to evaluate other
> approaches than reverting "blk-mq: don't handle TAG_SHARED in restart". That
> patch namely replaces an algorithm that is trusted by the community with an
> algorithm of which even Ming acknowledged that it is racy. A quote from [1]:
> "IO hang may be caused if all requests are completed just before the current
> SCSI device is added to shost->starved_list". I don't know of any way to fix
> that race other than serializing request submission and completion by adding
> locking around these actions, which is something we don't want. Hence my
> request to revert that patch.

I was reluctant to revert it, in case we could work out a better way of
doing it. As I mentioned in the other replies, it's not exactly the
prettiest or most efficient. However, since we currently don't have
a good solution for the issue, I'm fine with reverting that patch.

> Regarding the test I run, here is a summary of what I mentioned in previous
> e-mails:
> * I modified the SRP initiator such that the SCSI target queue depth is
>   reduced to one by setting starget->can_queue to 1 from inside
>   scsi_host_template.target_alloc.
> * With that modified SRP initiator I run the srp-test software as follows
>   until something breaks:
>   while ./run_tests -f xfs -d -e deadline -r 60; do :; done

What kernel options are needed? Where do I download everything I need?

In other words, would it be possible to do a fuller guide for getting
this set up and running?

I'll run my simple test case as well, since it's currently breaking
basically everywhere.

> Today a system with at least one InfiniBand HCA is required to run that test.
> When I have the time I will post the SRP initiator and target patches on the
> linux-rdma mailing list that make it possible to run that test against the
> SoftRoCE driver (drivers/infiniband/sw/rxe). The only hardware required to
> use that driver is an Ethernet adapter.

OK, I guess I can't run it then... I'll have to rely on your testing.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08 15:59                           ` Ming Lei
@ 2017-11-08 18:19                             ` Jens Axboe
  0 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2017-11-08 18:19 UTC (permalink / raw)
  To: Ming Lei
  Cc: Bart Van Assche, linux-scsi, hch, linux-block, osandov,
	john.garry, loberman

On 11/08/2017 08:59 AM, Ming Lei wrote:
> On Wed, Nov 08, 2017 at 02:20:41PM +0800, Ming Lei wrote:
>> On Tue, Nov 07, 2017 at 08:17:59PM -0700, Jens Axboe wrote:
>>> On 11/07/2017 08:12 PM, Ming Lei wrote:
>>>> On Tue, Nov 07, 2017 at 08:01:48PM -0700, Jens Axboe wrote:
>>>>> On 11/07/2017 06:03 PM, Ming Lei wrote:
>>>>>> On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
>>>>>>> On 11/07/2017 10:36 AM, Jens Axboe wrote:
>>>>>>>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
>>>>>>>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
>>>>>>>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>>>>>>>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>>>>>>>>> If you can reproduce it, please provide me at least the following
>>>>>>>>>>>> log first:
>>>>>>>>>>>>
>>>>>>>>>>>> 	find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>>>>>>>>
>>>>>>>>>>>> If any pending requests aren't completed, please provide the
>>>>>>>>>>>> related info from debugfs about where the requests are.
>>>>>>>>>>>
>>>>>>>>>>> Every time I ran the above or a similar command its output was empty. I
>>>>>>>>>>> assume that's because the hang usually occurs in a phase where these debugfs
>>>>>>>>>>> attributes either have not yet been created or have already disappeared.
>>>>>>>>>>
>>>>>>>>>> Bart, do you still see a hang with the patch that fixes the tag leak when
>>>>>>>>>> we fail to get a dispatch budget?
>>>>>>>>>>
>>>>>>>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
>>>>>>>>>>
>>>>>>>>>> I've run a lot of stability testing here, and I haven't run into any
>>>>>>>>>> issues. This is with shared tags as well. So if you still see the failure
>>>>>>>>>> case with the current tree AND the above patch, then I'll try and get
>>>>>>>>>> a test case setup that hits it too so we can get to the bottom of this.
>>>>>>>>>
>>>>>>>>> Ming, I managed to reproduce the hang using null_blk. Note this is
>>>>>>>>> WITHOUT the patch mentioned above, running with that now.
>>>>>>>>>
>>>>>>>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
>>>>>>>>>
>>>>>>>>> and using this fio file:
>>>>>>>>>
>>>>>>>>> [global]
>>>>>>>>> bs=4k
>>>>>>>>> rw=randread
>>>>>>>>> norandommap
>>>>>>>>> direct=1
>>>>>>>>> ioengine=libaio
>>>>>>>>> iodepth=4
>>>>>>>>>
>>>>>>>>> [nullb0]
>>>>>>>>> filename=/dev/nullb0
>>>>>>>>> [nullb1]
>>>>>>>>> filename=/dev/nullb1
>>>>>>>>> [nullb2]
>>>>>>>>> filename=/dev/nullb2
>>>>>>>>> [nullb3]
>>>>>>>>> filename=/dev/nullb3
>>>>>>>>>
>>>>>>>>> it seemed to keep running, but it hung when exiting. The troublesome
>>>>>>>>> device was nullb1:
>>>>>>>>>
>>>>>>>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
>>>>>>>>> [  492.520782]       Not tainted 4.14.0-rc7+ #499
>>>>>>>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>>>>>>> [  492.535904] fio             D13208  3263   3211 0x00000000
>>>>>>>>> [  492.542535] Call Trace:
>>>>>>>>> [  492.545764]  __schedule+0x279/0x720
>>>>>>>>> [  492.550151]  schedule+0x33/0x90
>>>>>>>>> [  492.554145]  io_schedule+0x16/0x40
>>>>>>>>> [  492.558435]  blk_mq_get_tag+0x148/0x250
>>>>>>>>> [  492.563211]  ? finish_wait+0x90/0x90
>>>>>>>>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
>>>>>>>>> [  492.572760]  blk_mq_make_request+0xe2/0x690
>>>>>>>>> [  492.577913]  generic_make_request+0xfc/0x2f0
>>>>>>>>> [  492.583177]  submit_bio+0x64/0x120
>>>>>>>>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
>>>>>>>>> [  492.592736]  ? submit_bio+0x64/0x120
>>>>>>>>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
>>>>>>>>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
>>>>>>>>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
>>>>>>>>> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
>>>>>>>>> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
>>>>>>>>> [  492.623227]  generic_file_read_iter+0xb3/0xa00
>>>>>>>>> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
>>>>>>>>> [  492.634334]  ? security_file_permission+0x9b/0xc0
>>>>>>>>> [  492.640114]  blkdev_read_iter+0x35/0x40
>>>>>>>>> [  492.644877]  aio_read+0xc5/0x120
>>>>>>>>> [  492.648973]  ? aio_read_events+0x24c/0x340
>>>>>>>>> [  492.654124]  ? __might_sleep+0x4a/0x80
>>>>>>>>> [  492.658800]  do_io_submit+0x47c/0x5e0
>>>>>>>>> [  492.663373]  ? do_io_submit+0x47c/0x5e0
>>>>>>>>> [  492.668234]  SyS_io_submit+0x10/0x20
>>>>>>>>> [  492.672715]  ? SyS_io_submit+0x10/0x20
>>>>>>>>> [  492.677394]  entry_SYSCALL_64_fastpath+0x13/0x94
>>>>>>>>> [  492.683039] RIP: 0033:0x7f83d1871717
>>>>>>>>> [  492.687521] RSP: 002b:00007ffd38fe5a88 EFLAGS: 00000202 ORIG_RAX: 00000000000000d1
>>>>>>>>> [  492.696969] RAX: ffffffffffffffda RBX: 00007f83b6972b50 RCX: 00007f83d1871717
>>>>>>>>> [  492.705423] RDX: 0000000001f41418 RSI: 0000000000000001 RDI: 00007f83e4d36000
>>>>>>>>> [  492.713889] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000001f3f2e0
>>>>>>>>> [  492.722352] R10: 0000000000001000 R11: 0000000000000202 R12: 00007ffd38fe5be0
>>>>>>>>> [  492.730827] R13: 00007f83b6972b01 R14: 00007f83b69824b8 R15: 00007f83b6982368
>>>>>>>>>
>>>>>>>>> and if we look at the debug entries, it's waiting on a scheduler tag:
>>>>>>>>>
>>>>>>>>> sched_tags:nr_tags=2
>>>>>>>>> sched_tags:nr_reserved_tags=0
>>>>>>>>> sched_tags:active_queues=0
>>>>>>>>> sched_tags:bitmap_tags:
>>>>>>>>> sched_tags:depth=2
>>>>>>>>> sched_tags:busy=2
>>>>>>>>> sched_tags:bits_per_word=64
>>>>>>>>> sched_tags:map_nr=1
>>>>>>>>> sched_tags:alloc_hint={0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0}
>>>>>>>>> sched_tags:wake_batch=1
>>>>>>>>> sched_tags:wake_index=4
>>>>>>>>> sched_tags:ws={
>>>>>>>>> sched_tags:	{.wait_cnt=-102, .wait=inactive},
>>>>>>>>> sched_tags:	{.wait_cnt=-126, .wait=inactive},
>>>>>>>>> sched_tags:	{.wait_cnt=-72, .wait=inactive},
>>>>>>>>> sched_tags:	{.wait_cnt=-96, .wait=inactive},
>>>>>>>>> sched_tags:	{.wait_cnt=-134, .wait=inactive},
>>>>>>>>> sched_tags:	{.wait_cnt=-116, .wait=inactive},
>>>>>>>>> sched_tags:	{.wait_cnt=-90, .wait=inactive},
>>>>>>>>> sched_tags:	{.wait_cnt=-115, .wait=active},
>>>>>>>>> sched_tags:}
>>>>>>>>> sched_tags:round_robin=0
>>>>>>>>> sched_tags_bitmap:00000000: 03
>>>>>>>>>
>>>>>>>>> with SCHED_RESTART being set:
>>>>>>>>>
>>>>>>>>> state:SCHED_RESTART
>>>>>>>>>
>>>>>>>>> and with the driver tags being idle:
>>>>>>>>>
>>>>>>>>> tags:nr_tags=1
>>>>>>>>> tags:nr_reserved_tags=0
>>>>>>>>> tags:active_queues=0
>>>>>>>>> tags:bitmap_tags:
>>>>>>>>> tags:depth=1
>>>>>>>>> tags:busy=0
>>>>>>>>> tags:bits_per_word=64
>>>>>>>>> tags:map_nr=1
>>>>>>>>> tags:alloc_hint={0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
>>>>>>>>> tags:wake_batch=1
>>>>>>>>> tags:wake_index=3
>>>>>>>>> tags:ws={
>>>>>>>>> tags:	{.wait_cnt=-48, .wait=inactive},
>>>>>>>>> tags:	{.wait_cnt=-39, .wait=inactive},
>>>>>>>>> tags:	{.wait_cnt=-50, .wait=inactive},
>>>>>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>>>>>>> tags:	{.wait_cnt=-25, .wait=inactive},
>>>>>>>>> tags:	{.wait_cnt=-24, .wait=inactive},
>>>>>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>>>>>>> tags:	{.wait_cnt=-47, .wait=inactive},
>>>>>>>>> tags:}
>>>>>>>>
>>>>>>>> Unsurprisingly (since this is sched_tags starvation) this still happens
>>>>>>>> with the patch. Same trace as above. Note that dispatch has two requests
>>>>>>>> sitting ready:
>>>>>>>>
>>>>>>>> ffff881ff5ab1000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
>>>>>>>> ffff881ff5ab1240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
>>>>>>>>
>>>>>>>> while we're blocked trying to get a new sched tag off the regular
>>>>>>>> blk_mq_make_request() path. It looks like a missing restart of the hctx.
>>>>>>>
>>>>>>> Just to keep everyone in the loop, this bug is not new to
>>>>>>> for-4.15/block, nor is it new to the current 4.14-rc or 4.13. So it's
>>>>>>> probably different from what Bart is hitting, but it's a bug
>>>>>>> nonetheless...
>>>>>>
>>>>>> Thanks for running this test and sharing the result with us.
>>>>>>
>>>>>>>
>>>>>>> To me, this looks like a race between marking the queue as needing a
>>>>>>> restart, and the driver tag being released and re-running the queue. If
>>>>>>> I do:
>>>>>>>
>>>>>>> # echo run > /sys/kernel/debug/block/nullb1/state
>>>>>>>
>>>>>>> then it starts just fine. So there must be a race between marking the
>>>>>>> need for a restart (when getting the driver tag fails), and when one of
>>>>>>> the shared tag queues finishes a request and releases the driver tag.
>>>>>>
>>>>>> Just one hint: blk-mq's RESTART won't work if no requests are pending,
>>>>>> but there seems to be no efficient/reliable way to handle that.
>>>>>
>>>>> Requests are pending, see above output. My assumption/theory was that
>>>>> we were racing on setting restart. Simplified version:
>>>>>
>>>>> CPUX					CPUY
>>>>> get_driver_tag, fails
>>>>> 					complete request, put tag
>>>>> 					check restart, not set
>>>>> mark needing restart
>>>>
>>>> This case should be covered by TAG_WAITING.
>>>
>>> Right. The keyword here being 'should'.
>>>
>>>> If get_driver_tag fails on CPUX, this hctx will be added to the tags'
>>>> wait queue, and then the 'complete request' on CPUY will trigger a
>>>> queue run via blk_mq_dispatch_wake().
>>>
>>> I know how it's _supposed_ to work. I'm saying that the debug info
>>> indicates that we have a problem in there, which matches what the
>>> TAG_WAITING should be covering for us. The TAG_WAITING may be fine and
>>> the issue could be somewhere else, it just looks identical for now.
>>
>> One possible reason:
>>
>> 	- the request to be dispatched may not yet have been added to
>> 	  hctx->dispatch when blk_mq_dispatch_wake() is triggered.
> 
> If we call blk_mq_dispatch_wait_add() after the request is added to
> hctx->dispatch, TAG_WAITING becomes basically similar to SCHED_RESTART.
> 
> Now I no longer object to reverting 358a3a6bccb7 ("blk-mq: don't handle
> TAG_SHARED in restart"), because SCHED_RESTART works fine from this
> viewpoint even though it isn't efficient. We can improve it in the future.
> 
> Also we might need to unify TAG_WAITING and SCHED_RESTART into one generic
> restart mechanism, and make it efficient. IMO, neither of the two is
> efficient enough.

This patch works for me; I don't see any more stalls with it applied.
On another positive note, for my 4-device test case, it also bumps the
per-device performance from (a run where it actually finished):

nullb0: (groupid=0, jobs=1): err= 0: pid=3540: Wed Nov  8 10:57:43 2017
   read: IOPS=73.6k, BW=288MiB/s (302MB/s)(84.3GiB/300001msec)
    slat (nsec): min=1214, max=911980, avg=12070.86, stdev=15586.24
    clat (nsec): min=1637, max=1011.2k, avg=41375.87, stdev=28103.51
     lat (usec): min=7, max=1045, avg=53.56, stdev=32.30
    clat percentiles (usec):
     |  1.00th=[   11],  5.00th=[   14], 10.00th=[   17], 20.00th=[   21],
     | 30.00th=[   26], 40.00th=[   30], 50.00th=[   35], 60.00th=[   41],
     | 70.00th=[   47], 80.00th=[   56], 90.00th=[   73], 95.00th=[   95],
     | 99.00th=[  149], 99.50th=[  161], 99.90th=[  190], 99.95th=[  204],
     | 99.99th=[  258]
   bw (  KiB/s): min=223672, max=368880, per=24.76%, avg=294699.27, stdev=45893.24, samples=599
   iops        : min=55918, max=92220, avg=73674.76, stdev=11473.32, samples=599
  lat (usec)   : 2=0.01%, 10=0.89%, 20=17.13%, 50=56.13%, 100=21.42%
  lat (usec)   : 250=4.42%, 500=0.01%, 750=0.01%, 1000=0.01%
  lat (msec)   : 2=0.01%
  cpu          : usr=7.84%, sys=52.94%, ctx=9763310, majf=0, minf=14

to

nullb3: (groupid=0, jobs=1): err= 0: pid=3128: Wed Nov  8 11:08:18 2017
   read: IOPS=215k, BW=841MiB/s (882MB/s)(246GiB/300001msec)
    slat (nsec): min=1335, max=739184, avg=3920.20, stdev=2561.74
    clat (nsec): min=1039, max=763200, avg=14229.43, stdev=5382.08
     lat (usec): min=3, max=772, avg=18.20, stdev= 6.34
    clat percentiles (nsec):
     |  1.00th=[ 7712],  5.00th=[ 9408], 10.00th=[10048], 20.00th=[10816],
     | 30.00th=[12096], 40.00th=[12864], 50.00th=[13504], 60.00th=[14400],
     | 70.00th=[15552], 80.00th=[16768], 90.00th=[18816], 95.00th=[21632],
     | 99.00th=[29312], 99.50th=[31872], 99.90th=[37120], 99.95th=[41216],
     | 99.99th=[63232]
   bw (  KiB/s): min=658704, max=1012880, per=26.09%, avg=861735.73, stdev=63136.40, samples=599
   iops        : min=164676, max=253220, avg=215433.91, stdev=15784.07, samples=599
  lat (usec)   : 2=0.01%, 4=0.01%, 10=9.52%, 20=83.55%, 50=6.91%
  lat (usec)   : 100=0.02%, 250=0.01%, 750=0.01%, 1000=0.01%
  cpu          : usr=13.23%, sys=56.78%, ctx=33334423, majf=0, minf=14

which is a massive win on all metrics.

Patch on top of current for-4.15/block


diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7f4a1ba532af..bb7f08415203 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
-	HCTX_STATE_NAME(TAG_WAITING),
 	HCTX_STATE_NAME(START_ON_RUN),
 };
 #undef HCTX_STATE_NAME
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3d759bb8a5bb..adb6c8bceb7c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1005,42 +1005,53 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int fla
 
 	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
 
-	list_del(&wait->entry);
-	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+	list_del_init(&wait->entry);
 	blk_mq_run_hw_queue(hctx, true);
 	return 1;
 }
 
-static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
+				     struct request *rq)
 {
+	wait_queue_entry_t *wait = &(*hctx)->dispatch_wait;
 	struct sbq_wait_state *ws;
 
-	/*
-	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
-	 * The thread which wins the race to grab this bit adds the hardware
-	 * queue to the wait queue.
-	 */
-	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
-	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+	if (!list_empty_careful(&wait->entry))
 		return false;
 
-	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
-	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+	spin_lock(&(*hctx)->lock);
+	if (!list_empty(&wait->entry)) {
+		spin_unlock(&(*hctx)->lock);
+		return false;
+	}
+
+	init_waitqueue_func_entry(wait, blk_mq_dispatch_wake);
+	ws = bt_wait_ptr(&(*hctx)->tags->bitmap_tags, *hctx);
+	add_wait_queue(&ws->wait, wait);
+	spin_unlock(&(*hctx)->lock);
 
 	/*
-	 * As soon as this returns, it's no longer safe to fiddle with
-	 * hctx->dispatch_wait, since a completion can wake up the wait queue
-	 * and unlock the bit.
+	 * It's possible that a tag was freed in the window between the
+	 * allocation failure and adding the hardware queue to the wait
+	 * queue. If we can get a tag now, remove ourselves from the
+	 * wait queue to ensure someone else gets the wakeup.
 	 */
-	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
-	return true;
+	if (blk_mq_get_driver_tag(rq, hctx, false)) {
+		spin_lock_irq(&ws->wait.lock);
+		list_del_init(&wait->entry);
+		spin_unlock_irq(&ws->wait.lock);
+		return true;
+	}
+
+	return false;
 }
 
 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
-		bool got_budget)
+			     bool got_budget)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq, *nxt;
+	bool failed_tag = false;
 	int errors, queued;
 
 	if (list_empty(list))
@@ -1062,20 +1073,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			 * The initial allocation attempt failed, so we need to
 			 * rerun the hardware queue when a tag is freed.
 			 */
-			if (!blk_mq_dispatch_wait_add(hctx)) {
-				if (got_budget)
-					blk_mq_put_dispatch_budget(hctx);
-				break;
-			}
-
-			/*
-			 * It's possible that a tag was freed in the window
-			 * between the allocation failure and adding the
-			 * hardware queue to the wait queue.
-			 */
-			if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+			if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
 				if (got_budget)
 					blk_mq_put_dispatch_budget(hctx);
+				failed_tag = true;
 				break;
 			}
 		}
@@ -1140,10 +1141,11 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		 * it is no longer set that means that it was cleared by another
 		 * thread and hence that a queue rerun is needed.
 		 *
-		 * If TAG_WAITING is set that means that an I/O scheduler has
-		 * been configured and another thread is waiting for a driver
-		 * tag. To guarantee fairness, do not rerun this hardware queue
-		 * but let the other thread grab the driver tag.
+		 * If 'failed_tag' is set, that means that we failed getting
+		 * a driver tag with an I/O scheduler attached. Ensure that
+		 * we run the queue AFTER adding our entries back to the list,
+		 * so we don't stall if the queue run and wakeup happened
+		 * before we got here.
 		 *
 		 * If no I/O scheduler has been configured it is possible that
 		 * the hardware queue got stopped and restarted before requests
@@ -1155,8 +1157,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
 		 *   and dm-rq.
 		 */
-		if (!blk_mq_sched_needs_restart(hctx) &&
-		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
+		if (!blk_mq_sched_needs_restart(hctx) || failed_tag)
 			blk_mq_run_hw_queue(hctx, true);
 	}
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 674641527da7..2f0036b977a9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -181,8 +181,7 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
-	BLK_MQ_S_TAG_WAITING	= 3,
-	BLK_MQ_S_START_ON_RUN	= 4,
+	BLK_MQ_S_START_ON_RUN	= 3,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 

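The key ordering in blk_mq_dispatch_wait_add() above is to add the hctx
to the tag wait queue first and only then retry the tag, so a tag freed
in between is either seen by the retry or wakes the queue via
blk_mq_dispatch_wake(). A self-contained toy model of that lost-wakeup
pattern in plain C11 atomics (hypothetical names, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool tag_free = true;   /* the single driver tag */
static atomic_bool waiting  = false;  /* stand-in for dispatch_wait */

static bool try_get_tag(void)
{
	bool expected = true;
	return atomic_compare_exchange_strong(&tag_free, &expected, false);
}

/* Submitter: subscribe to the wakeup BEFORE re-checking the tag; a free
 * that lands in the window is then seen by the re-check or wakes us. */
static bool get_tag_or_wait(void)
{
	if (try_get_tag())
		return true;
	atomic_store(&waiting, true);
	if (try_get_tag()) {
		atomic_store(&waiting, false);
		return true;
	}
	return false;	/* the completion side will rerun the queue */
}

/* Completer: free the tag, then honor any registered waiter. */
static void put_tag(void)
{
	atomic_store(&tag_free, true);
	if (atomic_exchange(&waiting, false)) {
		/* blk_mq_run_hw_queue() would be called here */
	}
}
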
-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08 17:57                   ` Jens Axboe
@ 2017-11-08 18:22                       ` Laurence Oberman
  2017-11-09  4:02                     ` Ming Lei
  1 sibling, 0 replies; 48+ messages in thread
From: Laurence Oberman @ 2017-11-08 18:22 UTC (permalink / raw)
  To: Jens Axboe, Bart Van Assche, ming.lei
  Cc: linux-block, hch, martin.petersen, linux-scsi, john.garry, osandov, jejb

On Wed, 2017-11-08 at 10:57 -0700, Jens Axboe wrote:
> On 11/08/2017 09:41 AM, Bart Van Assche wrote:
> > On Tue, 2017-11-07 at 20:06 -0700, Jens Axboe wrote:
> > > At this point, I have no idea what Bart's setup looks like. Bart, it
> > > would be REALLY helpful if you could tell us how you are reproducing
> > > your hang. I don't know why this has to be dragged out.
> > 
> > Hello Jens,
> > 
> > It is a disappointment to me that you have allowed Ming to evaluate
> > approaches other than reverting "blk-mq: don't handle TAG_SHARED in
> > restart". Specifically, that patch replaces an algorithm that is
> > trusted by the community with an algorithm that even Ming acknowledged
> > is racy. A quote from [1]: "IO hang may be caused if all requests are
> > completed just before the current SCSI device is added to
> > shost->starved_list". I don't know of any way to fix that race other
> > than serializing request submission and completion by adding locking
> > around these actions, which is something we don't want. Hence my
> > request to revert that patch.
> 
> I was reluctant to revert it, in case we could work out a better way of
> doing it. As I mentioned in the other replies, it's not exactly the
> prettiest or most efficient. However, since we currently don't have
> a good solution for the issue, I'm fine with reverting that patch.
> 
> > Regarding the test I run, here is a summary of what I mentioned in
> > previous e-mails:
> > * I modified the SRP initiator such that the SCSI target queue depth
> >   is reduced to one by setting starget->can_queue to 1 from inside
> >   scsi_host_template.target_alloc.
> > * With that modified SRP initiator I run the srp-test software as
> >   follows until something breaks:
> >   while ./run_tests -f xfs -d -e deadline -r 60; do :; done
> 
> What kernel options are needed? Where do I download everything I need?
> 
> In other words, would it be possible to do a fuller guide for getting
> this setup and running?
> 
> I'll run my simple test case as well, since it's currently breaking
> basically everywhere.
> 
> > Today a system with at least one InfiniBand HCA is required to run
> > that test. When I have the time I will post the SRP initiator and
> > target patches on the linux-rdma mailing list that make it possible to
> > run that test against the SoftRoCE driver (drivers/infiniband/sw/rxe).
> > The only hardware required to use that driver is an Ethernet adapter.
> 
> OK, I guess I can't run it then... I'll have to rely on your testing.

Hello,

I agree with Bart in this case; we should revert this.
My test-bed is tied up and I have not been able to give it back to Ming
so he could follow up on Bart's last update.

Right now it's safer to revert.

Thanks
Laurence
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08 18:22                       ` Laurence Oberman
@ 2017-11-08 18:28                       ` Jens Axboe
  0 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2017-11-08 18:28 UTC (permalink / raw)
  To: Laurence Oberman, Bart Van Assche, ming.lei
  Cc: linux-block, hch, martin.petersen, linux-scsi, john.garry, osandov, jejb

On 11/08/2017 11:22 AM, Laurence Oberman wrote:
> On Wed, 2017-11-08 at 10:57 -0700, Jens Axboe wrote:
>> On 11/08/2017 09:41 AM, Bart Van Assche wrote:
>>> On Tue, 2017-11-07 at 20:06 -0700, Jens Axboe wrote:
>>>> At this point, I have no idea what Bart's setup looks like. Bart, it
>>>> would be REALLY helpful if you could tell us how you are reproducing
>>>> your hang. I don't know why this has to be dragged out.
>>>
>>> Hello Jens,
>>>
>>> It is a disappointment to me that you have allowed Ming to evaluate
>>> approaches other than reverting "blk-mq: don't handle TAG_SHARED in
>>> restart". Specifically, that patch replaces an algorithm that is
>>> trusted by the community with an algorithm that even Ming acknowledged
>>> is racy. A quote from [1]: "IO hang may be caused if all requests are
>>> completed just before the current SCSI device is added to
>>> shost->starved_list". I don't know of any way to fix that race other
>>> than serializing request submission and completion by adding locking
>>> around these actions, which is something we don't want. Hence my
>>> request to revert that patch.
>>
>> I was reluctant to revert it, in case we could work out a better way of
>> doing it. As I mentioned in the other replies, it's not exactly the
>> prettiest or most efficient. However, since we currently don't have
>> a good solution for the issue, I'm fine with reverting that patch.
>>
>>> Regarding the test I run, here is a summary of what I mentioned in
>>> previous e-mails:
>>> * I modified the SRP initiator such that the SCSI target queue depth
>>>   is reduced to one by setting starget->can_queue to 1 from inside
>>>   scsi_host_template.target_alloc.
>>> * With that modified SRP initiator I run the srp-test software as
>>>   follows until something breaks:
>>>   while ./run_tests -f xfs -d -e deadline -r 60; do :; done
>>
>> What kernel options are needed? Where do I download everything I need?
>>
>> In other words, would it be possible to do a fuller guide for getting
>> this setup and running?
>>
>> I'll run my simple test case as well, since it's currently breaking
>> basically everywhere.
>>
>>> Today a system with at least one InfiniBand HCA is required to run
>>> that test. When I have the time I will post the SRP initiator and
>>> target patches on the linux-rdma mailing list that make it possible to
>>> run that test against the SoftRoCE driver (drivers/infiniband/sw/rxe).
>>> The only hardware required to use that driver is an Ethernet adapter.
>>
>> OK, I guess I can't run it then... I'll have to rely on your testing.
> 
> Hello,
> 
> I agree with Bart in this case; we should revert this.
> My test-bed is tied up and I have not been able to give it back to Ming
> so he could follow up on Bart's last update.
> 
> Right now it's safer to revert.

I had already reverted it when sending out that email, so we should be
all set (hopefully).

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08 16:41                   ` Bart Van Assche
@ 2017-11-09  2:05                   ` Ming Lei
  1 sibling, 0 replies; 48+ messages in thread
From: Ming Lei @ 2017-11-09  2:05 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: axboe, linux-block, hch, martin.petersen, linux-scsi, john.garry,
	osandov, jejb, loberman

On Wed, Nov 08, 2017 at 04:41:35PM +0000, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 20:06 -0700, Jens Axboe wrote:
> > At this point, I have no idea what Bart's setup looks like. Bart, it
> > would be REALLY helpful if you could tell us how you are reproducing
> > your hang. I don't know why this has to be dragged out.
> 
> Hello Jens,
> 
> It is a disappointment to me that you have allowed Ming to evaluate
> approaches other than reverting "blk-mq: don't handle TAG_SHARED in restart".

I have mentioned in another email to Jens that I agree to reverting that
patch because of the TAG_WAITING issue in Jens's test case.

> Specifically, that patch replaces an algorithm that is trusted by the
> community with an algorithm that even Ming acknowledged is racy. A quote
> from [1]: "IO hang may be caused if all requests are completed just before
> the current SCSI device is added to shost->starved_list". I don't know of
> any way to fix that race other than serializing request submission and
> completion by adding locking around these actions, which is something we
> don't want. Hence my request to revert that patch.

That can't be the reason for this revert.

This issue [1] is fixed by '[PATCH] SCSI: don't get target/host busy_count
in scsi_mq_get_budget()', which follows this idea:

	- we add the sdev to shost->starved_list in scsi_target_queue_ready(),
	  and the return value is set to BLK_STS_RESOURCE

	- atomic_read(&sdev->device_busy) is checked to see if there is a
	  pending request; the queue is run if it is zero, otherwise we
	  depend on scsi_end_request() from a pending request to restart
	  the queue

	- you may point out that sdev->device_busy can become 0 just after
	  the check, but then the completion still sees the sdev on
	  shost->starved_list and does the restart, so there is no IO hang
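
As a rough sketch (based on the patch referenced above; an illustration,
not necessarily the literal final code), the BLK_STS_RESOURCE branch at
the end of scsi_queue_rq() then reads:

	case BLK_STS_RESOURCE:
		/*
		 * If no other request is in flight on this device, the
		 * completion path cannot restart the queue for us, so
		 * kick it after a short delay; otherwise we rely on
		 * scsi_end_request() of a pending request.
		 */
		if (atomic_read(&sdev->device_busy) == 0 &&
		    !scsi_device_blocked(sdev))
			blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
		break;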

If you think anything above is wrong, please comment on it directly.
Without this patch, and without any out-of-tree patches, an IO hang can
be triggered in test 01 of srp-test. After this patch is applied on
v4.14-rc4, no IO hang can be observed any more.

> 
> Regarding the test I run, here is a summary of what I mentioned in previous
> e-mails:
> * I modified the SRP initiator such that the SCSI target queue depth is
>   reduced to one by setting starget->can_queue to 1 from inside
>   scsi_host_template.target_alloc.
> * With that modified SRP initiator I run the srp-test software as follows
>   until something breaks:
>   while ./run_tests -f xfs -d -e deadline -r 60; do :; done
> 
> Today a system with at least one InfiniBand HCA is required to run that test.
> When I have the time I will post the SRP initiator and target patches on the
> linux-rdma mailing list that make it possible to run that test against the
> SoftRoCE driver (drivers/infiniband/sw/rxe). The only hardware required to
> use that driver is an Ethernet adapter.

The thing is that we still don't know the root cause of your issue, and
keeping the restart for TAG_SHARED can be thought of as a workaround.
Maybe it is the same issue Jens hit, maybe something else; we don't know,
especially since no logs, such as sched_tags or tags, have been provided.

It is easy to see a >20% IOPS drop with the restart for TAG_SHARED in an
8-LUN scsi_debug test.

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()
  2017-11-08 17:57                   ` Jens Axboe
  2017-11-08 18:22                       ` Laurence Oberman
@ 2017-11-09  4:02                     ` Ming Lei
  1 sibling, 0 replies; 48+ messages in thread
From: Ming Lei @ 2017-11-09  4:02 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Bart Van Assche, linux-block, hch, martin.petersen, linux-scsi,
	john.garry, osandov, jejb, loberman

On Wed, Nov 08, 2017 at 10:57:23AM -0700, Jens Axboe wrote:
> On 11/08/2017 09:41 AM, Bart Van Assche wrote:
> > On Tue, 2017-11-07 at 20:06 -0700, Jens Axboe wrote:
> >> At this point, I have no idea what Bart's setup looks like. Bart, it
> >> would be REALLY helpful if you could tell us how you are reproducing
> >> your hang. I don't know why this has to be dragged out.
> > 
> > Hello Jens,
> > 
> > It is a disappointment to me that you have allowed Ming to evaluate
> > approaches other than reverting "blk-mq: don't handle TAG_SHARED in
> > restart". Specifically, that patch replaces an algorithm that is trusted
> > by the community with an algorithm that even Ming acknowledged is racy.
> > A quote from [1]: "IO hang may be caused if all requests are completed
> > just before the current SCSI device is added to shost->starved_list".
> > I don't know of any way to fix that race other than serializing request
> > submission and completion by adding locking around these actions, which
> > is something we don't want. Hence my request to revert that patch.
> 
> I was reluctant to revert it, in case we could work out a better way of
> doing it. As I mentioned in the other replies, it's not exactly the
> prettiest or most efficient. However, since we currently don't have
> a good solution for the issue, I'm fine with reverting that patch.
> 
> > Regarding the test I run, here is a summary of what I mentioned in previous
> > e-mails:
> > * I modified the SRP initiator such that the SCSI target queue depth is
> >   reduced to one by setting starget->can_queue to 1 from inside
> >   scsi_host_template.target_alloc.
> > * With that modified SRP initiator I run the srp-test software as follows
> >   until something breaks:
> >   while ./run_tests -f xfs -d -e deadline -r 60; do :; done
> 
> What kernel options are needed? Where do I download everything I need?
> 
> In other words, would it be possible to do a fuller guide for getting
> this setup and running?
> 
> I'll run my simple test case as well, since it's currently breaking
> basically everywhere.
> 
> > Today a system with at least one InfiniBand HCA is required to run that test.
> > When I have the time I will post the SRP initiator and target patches on the
> > linux-rdma mailing list that make it possible to run that test against the
> > SoftRoCE driver (drivers/infiniband/sw/rxe). The only hardware required to
> > use that driver is an Ethernet adapter.
> 
> OK, I guess I can't run it then... I'll have to rely on your testing.

Even if we don't run it ourselves, just posting the log from 'tags' or
'sched_tags' when this IO hang happens should tell us more.

Unfortunately, we still haven't seen such logs up to now. :-(

-- 
Ming

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2017-11-09  4:02 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-04  1:55 [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget() Ming Lei
2017-11-04 14:19 ` Jens Axboe
2017-11-06 19:45   ` Bart Van Assche
2017-11-07  2:11     ` Ming Lei
2017-11-07 16:20       ` Bart Van Assche
2017-11-07 16:29         ` Jens Axboe
2017-11-07 17:10           ` Jens Axboe
2017-11-07 17:36             ` Jens Axboe
2017-11-07 22:06               ` Jens Axboe
2017-11-07 22:34                 ` Bart Van Assche
2017-11-07 22:39                   ` Jens Axboe
2017-11-08  0:50                   ` Ming Lei
2017-11-08  1:03                 ` Ming Lei
2017-11-08  3:01                   ` Jens Axboe
2017-11-08  3:12                     ` Ming Lei
2017-11-08  3:17                       ` Jens Axboe
2017-11-08  6:20                         ` Ming Lei
2017-11-08 15:59                           ` Ming Lei
2017-11-08 18:19                             ` Jens Axboe
2017-11-07 17:34           ` Bart Van Assche
2017-11-08  0:53             ` Ming Lei
2017-11-08  2:06               ` Ming Lei
2017-11-08  0:39         ` Ming Lei
2017-11-08  2:55           ` Jens Axboe
2017-11-08  2:58             ` Ming Lei
2017-11-08  3:06               ` Jens Axboe
2017-11-08 16:41                 ` Bart Van Assche
2017-11-08 17:57                   ` Jens Axboe
2017-11-08 18:22                     ` Laurence Oberman
2017-11-08 18:28                       ` Jens Axboe
2017-11-09  4:02                     ` Ming Lei
2017-11-09  2:05                   ` Ming Lei
2017-11-07 10:15     ` Ming Lei
2017-11-07 16:17       ` Bart Van Assche
2017-11-08  3:12         ` Jens Axboe
2017-11-06 18:04 ` Bart Van Assche
2017-11-07  2:19   ` Ming Lei
2017-11-07  3:53     ` Martin K. Petersen
