From: maxg@mellanox.com (Max Gurtovoy)
Subject: [PATCH] nvme: unquiesce the queue before cleaning it up
Date: Sun, 29 Apr 2018 18:47:22 +0300
Message-ID: <5c2b476f-19ae-613c-aa0c-9001a706f54c@mellanox.com>
In-Reply-To: <9b6c9563-957e-0819-ab2a-c89dda39a8d6@oracle.com>



On 4/28/2018 6:38 PM, jianchao.wang wrote:
> Hi Max
> 
> On 04/27/2018 04:51 PM, jianchao.wang wrote:
>> Hi Max
>>
>> On 04/26/2018 06:23 PM, Max Gurtovoy wrote:
>>> Hi Jianchao,
>>> I actually tried this scenario with real HW and was able to reproduce the hang.
>>> Unfortunately, after applying your patch I got a NULL deref:
>>> BUG: unable to handle kernel NULL pointer dereference at 0000000000000014
>>> WARNING: CPU: 5 PID: 0 at block/blk-mq.c:526 __blk_mq_complete_request+0x154/0x160
>>>
>>
>> Could you please share the whole backtrace here?
> 
> I have reproduced the WARNING in __blk_mq_complete_request.
> 
> [107612.367131] RIP: 0010:__blk_mq_complete_request+0x2aa/0x2e0
> [107612.367133] RSP: 0018:ffff88036e627b68 EFLAGS: 00010202
> [107612.367137] RAX: 0000000000000002 RBX: ffff88034708a440 RCX: ffffffff9452df6c
> [107612.367140] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff88034708a540
> [107612.367142] RBP: 1ffff1006dcc4f73 R08: ffffed007901472f R09: ffffed007901472f
> [107612.367144] R10: 0000000000000001 R11: ffffed007901472e R12: ffffe8ffffdc8940
> [107612.367146] R13: ffff88034708a540 R14: ffff8803c51c33c0 R15: ffff88032d091ea0
> [107612.367149] FS:  0000000000000000(0000) GS:ffff8803c8080000(0000) knlGS:0000000000000000
> [107612.367151] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [107612.367153] CR2: 00000000018ee000 CR3: 00000003c1e13001 CR4: 00000000003606e0
> [107612.367155] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [107612.367157] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [107612.367159] Call Trace:
> [107612.367166]  blk_mq_complete_request+0x10a/0x140
> [107612.367170]  ? hctx_lock+0xe0/0xe0
> [107612.367176]  ? __wait_rcu_gp+0x1b6/0x210
> [107612.367184]  ? nvme_start_ctrl+0x190/0x190
> [107612.367188]  bt_tags_iter+0x87/0xa0
> [107612.367194]  blk_mq_tagset_busy_iter+0x1da/0x3e0
> [107612.367201]  ? nvme_start_ctrl+0x190/0x190
> [107612.367206]  ? __blk_mq_get_tag+0x140/0x140
> [107612.367210]  ? nvme_start_ctrl+0x190/0x190
> [107612.367221]  nvme_rdma_shutdown_ctrl+0x50/0xe0 [nvme_rdma]
> [107612.367228]  nvme_rdma_reset_ctrl_work+0x2c/0xd0 [nvme_rdma]
> [107612.367233]  process_one_work+0x3ca/0xaa0
> [107612.367243]  ? pwq_dec_nr_in_flight+0x110/0x110
> [107612.367246]  ? worker_thread+0x19d/0x6c0
> [107612.367257]  worker_thread+0x4e2/0x6c0
> [107612.367283]  kthread+0x18d/0x1e0
> [107612.367286]  ? rescuer_thread+0x740/0x740
> [107612.367289]  ? kthread_delayed_work_timer_fn+0xe0/0xe0
> [107612.367293]  ? kthread_delayed_work_timer_fn+0xe0/0xe0
> [107612.367296]  ret_from_fork+0x24/0x30
> 
> In my test there is _no_ timeout, just a controller reset.
> 
> The state of the request is COMPLETE.
> 
> This is a race between the reset work and the normal RDMA completion path.
> nvme_rdma_reset_ctrl_work
>    -> nvme_rdma_shutdown_ctrl
>      -> nvme_cancel_request
>        -> blk_mq_complete_request
> We don't stop the RDMA completion path from being triggered before we invoke nvme_cancel_request.
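
For reference, nvme_cancel_request() completes the request unconditionally, so
nothing in this path excludes a concurrent completion from the transport. This
is a sketch of the 4.17-era drivers/nvme/host/core.c code (details may differ
in your tree):

	static void nvme_cancel_request(struct request *req, void *data,
			bool reserved)
	{
		dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
					"Cancelling I/O %d", req->tag);

		/* mark the request aborted and complete it immediately */
		nvme_req(req)->status = NVME_SC_ABORT_REQ;
		blk_mq_complete_request(req);
	}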

We fixed that in the following way (the patch is on hold since the block 
layer fix is not yet pushed):

  drivers/nvme/host/rdma.c | 8 ++++++--
  1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index c1abfc8..56fccac 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -734,7 +734,6 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
  static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
  		bool remove)
  {
-	nvme_rdma_stop_queue(&ctrl->queues[0]);
  	if (remove) {
  		blk_cleanup_queue(ctrl->ctrl.admin_q);
  		nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
@@ -817,7 +816,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
  static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
  		bool remove)
  {
-	nvme_rdma_stop_io_queues(ctrl);
  	if (remove) {
  		blk_cleanup_queue(ctrl->ctrl.connect_q);
  		nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
@@ -947,6 +945,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
  	return;

  destroy_admin:
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
  	nvme_rdma_destroy_admin_queue(ctrl, false);
  requeue:
  	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
@@ -963,12 +962,14 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)

  	if (ctrl->ctrl.queue_count > 1) {
  		nvme_stop_queues(&ctrl->ctrl);
+		nvme_rdma_stop_io_queues(ctrl);
  		blk_mq_tagset_busy_iter(&ctrl->tag_set,
  					nvme_cancel_request, &ctrl->ctrl);
  		nvme_rdma_destroy_io_queues(ctrl, false);
  	}

  	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
  	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
  				nvme_cancel_request, &ctrl->ctrl);
  	nvme_rdma_destroy_admin_queue(ctrl, false);
@@ -1725,6 +1726,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
  {
  	if (ctrl->ctrl.queue_count > 1) {
  		nvme_stop_queues(&ctrl->ctrl);
+		nvme_rdma_stop_io_queues(ctrl);
  		blk_mq_tagset_busy_iter(&ctrl->tag_set,
  					nvme_cancel_request, &ctrl->ctrl);
  		nvme_rdma_destroy_io_queues(ctrl, shutdown);
@@ -1736,6 +1738,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
  		nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);

  	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
  	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
  				nvme_cancel_request, &ctrl->ctrl);
  	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
@@ -2001,6 +2004,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
  	return &ctrl->ctrl;

  out_remove_admin_queue:
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
  	nvme_rdma_destroy_admin_queue(ctrl, true);
  out_kfree_queues:
  	kfree(ctrl->queues);
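
The idea is to pull the queue-stopping out of the destroy helpers and do it
explicitly before each blk_mq_tagset_busy_iter(..., nvme_cancel_request, ...),
so the transport can no longer post completions once we start cancelling
requests.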



> 
> I think it is related to a change introduced by the new blk-mq timeout implementation.
> Before it, blk_mq_complete_request used blk_mark_rq_complete to sync with the timeout path.
> As a side effect, this also made blk_mq_complete_request able to sync with itself,
> so the blk_mq_complete_request calls from the reset work and the RDMA completion path could dance with each other safely.
> 
> However, after the new mechanism was introduced, blk_mq_complete_request can no longer sync with itself,
> so we indeed risk completing a request twice when we reset the controller forcibly in the RDMA driver.
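
For reference, the pre-4.17 completion path serialized completions through
REQ_ATOM_COMPLETE, roughly like this (a sketch of the old block/blk-mq.c, not
the current code):

	/* test_and_set_bit(): only the first completer wins, others are no-ops */
	static inline int blk_mark_rq_complete(struct request *rq)
	{
		return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
	}

	void blk_mq_complete_request(struct request *rq)
	{
		if (unlikely(blk_should_fake_timeout(rq->q)))
			return;
		if (!blk_mark_rq_complete(rq))
			__blk_mq_complete_request(rq);
	}

With that atomic mark gone, a second blk_mq_complete_request() on the same
request is no longer harmless, which matches the WARNING above.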
> 
> Thanks
> Jianchao
> 
> 
>>
>> Thanks
>> Jianchao
>>
>>> I guess it's the same issue that IsraelR/Bart/Ming are trying to fix in the block layer regarding request completion.
>>> I'll apply IsraelR's proposed nvme-rdma fix, which is currently on hold, and see what happens.
>>> Nonetheless, I don't like the situation that the reset and delete flows can run concurrently.
>>>
>>> -Max.
>>>
>>> On 4/26/2018 11:27 AM, jianchao.wang wrote:
>>>> Hi Max
>>>>
>>>> I did a similar test on nvme-rdma; the underlying fabric is Soft-RoCE.
>>>> An IO loop, a reset controller loop, and a delete/create controller loop ran in parallel,
>>>> and I found the IO hang below:
>>>>
>>>> [  230.884590] WARNING: CPU: 0 PID: 150 at /home/will/u04/source_code/linux-stable/drivers/nvme/host/rdma.c:1755 nvme_rdma_reset_ctrl_work+0x3d/0xb0 [nvme_rdma]
>>>> [  230.884689] CPU: 0 PID: 150 Comm: kworker/u16:3 Not tainted 4.17.0-rc1+ #16
>>>> [  230.884690] Hardware name: LENOVO 10MLS0E339/3106, BIOS M1AKT22A 06/27/2017
>>>> [  230.884693] Workqueue: nvme-reset-wq nvme_rdma_reset_ctrl_work [nvme_rdma]
>>>> [  230.884696] RIP: 0010:nvme_rdma_reset_ctrl_work+0x3d/0xb0 [nvme_rdma]
>>>> [  230.884697] RSP: 0018:ffffa9e30255be40 EFLAGS: 00010246
>>>> [  230.884699] RAX: 0000000000000000 RBX: ffff9633b64c2000 RCX: 0000000000000000
>>>> [  230.884700] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffff9ac05516
>>>> [  230.884701] RBP: ffff9633b64c23b8 R08: 0000000000000001 R09: 0000000000000000
>>>> [  230.884702] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9633b64c2970
>>>> [  230.884703] R13: 0000000000000000 R14: 0ffff96340ffcc80 R15: ffff9634102af9c0
>>>> [  230.884705] FS:  0000000000000000(0000) GS:ffff963422c00000(0000) knlGS:0000000000000000
>>>> [  230.884706] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> [  230.884707] CR2: 0000000001ce9ca4 CR3: 000000038040f005 CR4: 00000000003606f0
>>>> [  230.884708] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>> [  230.884709] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>> [  230.884710] Call Trace:
>>>> [  230.884717]  process_one_work+0x1c0/0x680
>>>> [  230.884722]  worker_thread+0x22b/0x430
>>>> [  230.884728]  kthread+0x100/0x140
>>>> [  230.884730]  ? rescuer_thread+0x370/0x370
>>>> [  230.884731]  ? kthread_delayed_work_timer_fn+0x80/0x80
>>>> [  230.884736]  ret_from_fork+0x24/0x30
>>>> [  230.884744] Code: ff 48 8d ab b8 03 00 00 48 89 ef e8 1e 25 d7 d9 31 f6 48 89 df e8 44 f9 ff ff be 04 00 00 00 48 89 ef e8 b7 13 d7 d9 84 c0 75 07 <0f> 0b 5b 5d 41 5c c3 31 f6 48 89 df e8 b2 fa ff ff 85 c0 75 39
>>>> [  230.884799] irq event stamp: 88150
>>>> [  230.884802] hardirqs last  enabled at (88149): [<ffffffff9ac05529>] _raw_spin_unlock_irqrestore+0x59/0x70
>>>> [  230.884803] hardirqs last disabled at (88150): [<ffffffff9ae0116c>] error_entry+0x6c/0xc0
>>>> [  230.884805] softirqs last  enabled at (87998): [<ffffffff9aa603a0>] sk_common_release+0x60/0xd0
>>>> [  230.884807] softirqs last disabled at (87996): [<ffffffff9aa60375>] sk_common_release+0x35/0xd0
>>>> [  230.884808] ---[ end trace c03fac253b80d77d ]---
>>>> [  277.472094] INFO: task kworker/u16:0:6 blocked for more than 30 seconds.
>>>> [  277.473482]       Tainted: G        W         4.17.0-rc1+ #16
>>>> [  277.474825] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>> [  277.476293] kworker/u16:0   D    0     6      2 0x80000000
>>>> [  277.476317] Workqueue: nvme-delete-wq nvme_delete_ctrl_work
>>>> [  277.476335] Call Trace:
>>>> [  277.476365]  ? __schedule+0x3de/0xac0
>>>> [  277.476393]  schedule+0x3c/0x90
>>>> [  277.476405]  blk_mq_freeze_queue_wait+0x44/0x90
>>>> [  277.476419]  ? wait_woken+0x90/0x90
>>>> [  277.476437]  blk_cleanup_queue+0xe1/0x280
>>>> [  277.476453]  nvme_ns_remove+0x1c8/0x260
>>>> [  277.476475]  nvme_remove_namespaces+0x7f/0xa0
>>>> [  277.476495]  nvme_delete_ctrl_work+0x4b/0x80
>>>> [  277.476508]  process_one_work+0x1c0/0x680
>>>> [  277.476535]  worker_thread+0x22b/0x430
>>>> [  277.476562]  kthread+0x100/0x140
>>>> [  277.476572]  ? rescuer_thread+0x370/0x370
>>>> [  277.476579]  ? kthread_delayed_work_timer_fn+0x80/0x80
>>>> [  277.476598]  ret_from_fork+0x24/0x30
>>>>
>>>> 00000000ea33345b {.op=READ, .cmd_flags=23, .rq_flags=MQ_INFLIGHT|IO_STAT, .state=idle, .tag=19, .internal_tag=-1}
>>>> 0000000071218c5a {.op=READ, .cmd_flags=23, .rq_flags=MQ_INFLIGHT|IO_STAT, .state=idle, .tag=45, .internal_tag=-1}
>>>> root at will-ThinkCentre-M910s:/sys/kernel/debug/block/nvme1c1n1# cat state
>>>> DYING|BYPASS|NOMERGES|SAME_COMP|NONROT|IO_STAT|DISCARD|NOXMERGES|INIT_DONE|NO_SG_MERGE|POLL|WC|FUA|STATS|QUIESCED
>>>>
>>>> This is due to the following race:
>>>>
>>>> nvme_delete_ctrl                     nvme_rdma_reset_ctrl_work
>>>>   -> change state to DELETING
>>>>   -> queue delete_work
>>>>                                        -> nvme_rdma_shutdown_ctrl
>>>>                                          -> nvme_stop_queues
>>>>                                            -> blk_mq_quiesce_queue
>>>>                                        -> change to LIVE state fails
>>>>                                        -> return
>>>> nvme_delete_ctrl_work
>>>>   -> nvme_remove_namespaces
>>>>     -> nvme_ns_remove
>>>>       -> blk_cleanup_queue
>>>>         -> blk_freeze_queue
>>>>
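
The freeze side of the hang is visible directly in block/blk-mq.c:
blk_cleanup_queue() freezes the queue and waits for q_usage_counter to drain,
but a quiesced queue never dispatches the requests still parked on the ctx
rq_lists, so the counter never reaches zero. A sketch of the 4.17-era code:

	void blk_mq_freeze_queue_wait(struct request_queue *q)
	{
		/* sleeps until every outstanding request has completed */
		wait_event(q->mq_freeze_wq,
			   percpu_ref_is_zero(&q->q_usage_counter));
	}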
>>>> This patch could also fix this issue.
>>>>
>>>> Thanks
>>>> Jianchao
>>>>
>>>> On 04/22/2018 11:00 PM, jianchao.wang wrote:
>>>>> Hi Max
>>>>>
>>>>> That's really appreciated!
>>>>> Here is my test script.
>>>>>
>>>>> loop_reset_controller.sh
>>>>> #!/bin/bash
>>>>> while true
>>>>> do
>>>>>     echo 1 > /sys/block/nvme0n1/device/reset_controller
>>>>>     sleep 1
>>>>> done
>>>>>
>>>>> loop_unbind_driver.sh
>>>>> #!/bin/bash
>>>>> while true
>>>>> do
>>>>>     echo "0000:02:00.0" > /sys/bus/pci/drivers/nvme/unbind
>>>>>     sleep 2
>>>>>     echo "0000:02:00.0" > /sys/bus/pci/drivers/nvme/bind
>>>>>     sleep 2
>>>>> done
>>>>>
>>>>> loop_io.sh
>>>>> #!/bin/bash
>>>>>
>>>>> file="/dev/nvme0n1"
>>>>> echo $file
>>>>> while true;
>>>>> do
>>>>>     if [ -e $file ]; then
>>>>>         fio fio_job_rand_read.ini
>>>>>     else
>>>>>         echo "Not found"
>>>>>         sleep 1
>>>>>     fi
>>>>> done
>>>>>
>>>>> The fio job is as below:
>>>>> size=512m
>>>>> rw=randread
>>>>> bs=4k
>>>>> ioengine=libaio
>>>>> iodepth=64
>>>>> direct=1
>>>>> numjobs=16
>>>>> filename=/dev/nvme0n1
>>>>> group_reporting
>>>>>
>>>>> I started them in sequence: loop_io.sh, loop_reset_controller.sh, loop_unbind_driver.sh.
>>>>> If lucky, I will get an IO hang within 3 minutes. ;)
>>>>> Such as:
>>>>>
>>>>> [  142.858074] nvme nvme0: pci function 0000:02:00.0
>>>>> [  144.972256] nvme nvme0: failed to mark controller state 1
>>>>> [  144.972289] nvme nvme0: Removing after probe failure status: 0
>>>>> [  185.312344] INFO: task bash:1673 blocked for more than 30 seconds.
>>>>> [  185.312889]       Not tainted 4.17.0-rc1+ #6
>>>>> [  185.312950] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>>> [  185.313049] bash            D    0  1673   1629 0x00000080
>>>>> [  185.313061] Call Trace:
>>>>> [  185.313083]  ? __schedule+0x3de/0xac0
>>>>> [  185.313103]  schedule+0x3c/0x90
>>>>> [  185.313111]  blk_mq_freeze_queue_wait+0x44/0x90
>>>>> [  185.313123]  ? wait_woken+0x90/0x90
>>>>> [  185.313133]  blk_cleanup_queue+0xe1/0x280
>>>>> [  185.313145]  nvme_ns_remove+0x1c8/0x260
>>>>> [  185.313159]  nvme_remove_namespaces+0x7f/0xa0
>>>>> [  185.313170]  nvme_remove+0x6c/0x130
>>>>> [  185.313181]  pci_device_remove+0x36/0xb0
>>>>> [  185.313193]  device_release_driver_internal+0x160/0x230
>>>>> [  185.313205]  unbind_store+0xfe/0x150
>>>>> [  185.313219]  kernfs_fop_write+0x114/0x190
>>>>> [  185.313234]  __vfs_write+0x23/0x150
>>>>> [  185.313246]  ? rcu_read_lock_sched_held+0x3f/0x70
>>>>> [  185.313252]  ? preempt_count_sub+0x92/0xd0
>>>>> [  185.313259]  ? __sb_start_write+0xf8/0x200
>>>>> [  185.313271]  vfs_write+0xc5/0x1c0
>>>>> [  185.313284]  ksys_write+0x45/0xa0
>>>>> [  185.313298]  do_syscall_64+0x5a/0x1a0
>>>>> [  185.313308]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>>>>
>>>>> And get following information in block debugfs:
>>>>> root at will-ThinkCentre-M910s:/sys/kernel/debug/block/nvme0n1# cat hctx6/cpu6/rq_list
>>>>> 000000001192d19b {.op=READ, .cmd_flags=, .rq_flags=IO_STAT, .state=idle, .tag=69, .internal_tag=-1}
>>>>> 00000000c33c8a5b {.op=READ, .cmd_flags=, .rq_flags=IO_STAT, .state=idle, .tag=78, .internal_tag=-1}
>>>>> root at will-ThinkCentre-M910s:/sys/kernel/debug/block/nvme0n1# cat state
>>>>> DYING|BYPASS|NOMERGES|SAME_COMP|NONROT|IO_STAT|DISCARD|NOXMERGES|INIT_DONE|NO_SG_MERGE|POLL|WC|FUA|STATS|QUIESCED
>>>>>
>>>>> We can see there were requests on the ctx rq_list while the request_queue was QUIESCED.
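
That matches the dispatch side: while QUEUE_FLAG_QUIESCED is set, running the
hardware queues is refused outright, so idle requests on the ctx lists can
never make progress. Roughly the 4.17 block/blk-mq.c logic (a sketch, details
may differ):

	bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
	{
		int srcu_idx;
		bool need_run;

		hctx_lock(hctx, &srcu_idx);
		/* a quiesced queue reports "nothing to run" even with pending work */
		need_run = !blk_queue_quiesced(hctx->queue) &&
			blk_mq_hctx_has_pending(hctx);
		hctx_unlock(hctx, srcu_idx);

		if (need_run) {
			__blk_mq_delay_run_hw_queue(hctx, async, 0);
			return true;
		}
		return false;
	}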
>>>>>
>>>>> Thanks again !!
>>>>> Jianchao
>>>>>
>>>>> On 04/22/2018 10:48 PM, Max Gurtovoy wrote:
>>>>>>
>>>>>>
>>>>>> On 4/22/2018 5:25 PM, jianchao.wang wrote:
>>>>>>> Hi Max
>>>>>>>
>>>>>>> No, I only tested it on a PCIe one.
>>>>>>> Sorry that I didn't state that.
>>>>>>
>>>>>> Please send your exact test steps and we'll run it using the RDMA transport.
>>>>>> I also want to run a mini regression on this one since it may affect other flows.
>>>>>>
>>>>>>>
>>>>>>> Thanks
>>>>>>> Jianchao
>>>>>>>
>>>>>>> On 04/22/2018 10:18 PM, Max Gurtovoy wrote:
>>>>>>>> Hi Jianchao,
>>>>>>>> Since this patch is in the core, have you tested it using some fabrics drivers too? RDMA/FC?
>>>>>>>>
>>>>>>>> thanks,
>>>>>>>> Max.
>>>>>>>>
>>>>>>>> On 4/22/2018 4:32 PM, jianchao.wang wrote:
>>>>>>>>> Hi Keith
>>>>>>>>>
>>>>>>>>> Would you please take a look at this patch?
>>>>>>>>>
>>>>>>>>> This issue can be reproduced easily with a driver bind/unbind loop,
>>>>>>>>> a reset loop, and an IO loop running at the same time.
>>>>>>>>>
>>>>>>>>> Thanks
>>>>>>>>> Jianchao
>>>>>>>>>
>>>>>>>>> On 04/19/2018 04:29 PM, Jianchao Wang wrote:
>>>>>>>>>> There is a race between nvme_remove and nvme_reset_work that can
>>>>>>>>>> lead to an IO hang.
>>>>>>>>>>
>>>>>>>>>> nvme_remove                        nvme_reset_work
>>>>>>>>>> -> change state to DELETING
>>>>>>>>>>                                    -> fail to change state to LIVE
>>>>>>>>>>                                    -> nvme_remove_dead_ctrl
>>>>>>>>>>                                      -> nvme_dev_disable
>>>>>>>>>>                                        -> quiesce request_queue
>>>>>>>>>>                                      -> queue remove_work
>>>>>>>>>> -> cancel_work_sync reset_work
>>>>>>>>>> -> nvme_remove_namespaces
>>>>>>>>>>   -> splice ctrl->namespaces
>>>>>>>>>>                                    nvme_remove_dead_ctrl_work
>>>>>>>>>>                                    -> nvme_kill_queues
>>>>>>>>>>   -> nvme_ns_remove                  (does nothing)
>>>>>>>>>>     -> blk_cleanup_queue
>>>>>>>>>>       -> blk_freeze_queue
>>>>>>>>>> Finally, the request_queue is in quiesced state when we wait for
>>>>>>>>>> the freeze, so we get an IO hang here.
>>>>>>>>>>
>>>>>>>>>> To fix it, unquiesce the request_queue directly before nvme_ns_remove.
>>>>>>>>>> We have already spliced ctrl->namespaces, so nobody can access the
>>>>>>>>>> namespaces and quiesce the queue any more.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Jianchao Wang <jianchao.w.wang at oracle.com>
>>>>>>>>>> ---
>>>>>>>>>>  drivers/nvme/host/core.c | 9 ++++++++-
>>>>>>>>>>  1 file changed, 8 insertions(+), 1 deletion(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>>>>>>>> index 9df4f71..0e95082 100644
>>>>>>>>>> --- a/drivers/nvme/host/core.c
>>>>>>>>>> +++ b/drivers/nvme/host/core.c
>>>>>>>>>> @@ -3249,8 +3249,15 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
>>>>>>>>>>  	list_splice_init(&ctrl->namespaces, &ns_list);
>>>>>>>>>>  	up_write(&ctrl->namespaces_rwsem);
>>>>>>>>>>  
>>>>>>>>>> -	list_for_each_entry_safe(ns, next, &ns_list, list)
>>>>>>>>>> +	/*
>>>>>>>>>> +	 * After splicing the namespaces list from ctrl->namespaces,
>>>>>>>>>> +	 * nobody can get them anymore, so let's unquiesce the
>>>>>>>>>> +	 * request_queue forcibly to avoid an io hang.
>>>>>>>>>> +	 */
>>>>>>>>>> +	list_for_each_entry_safe(ns, next, &ns_list, list) {
>>>>>>>>>> +		blk_mq_unquiesce_queue(ns->queue);
>>>>>>>>>>  		nvme_ns_remove(ns);
>>>>>>>>>> +	}
>>>>>>>>>>  }
>>>>>>>>>>  EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
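
The fix works because blk_mq_unquiesce_queue() not only clears the flag but
also re-runs the hardware queues, flushing out whatever was inserted while the
queue was quiesced (a sketch of the 4.17-era block/blk-mq.c):

	void blk_mq_unquiesce_queue(struct request_queue *q)
	{
		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);

		/* dispatch requests which were inserted during quiescing */
		blk_mq_run_hw_queues(q, true);
	}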
>>>>>>>>>
>>>>>>>>
>>>>>
>>>
>>
