From: shahar.salzman@gmail.com (shahar.salzman)
Subject: NVMeoF: multipath stuck after bringing one ethernet port down
Date: Thu, 25 May 2017 16:08:55 +0300
Message-ID: <09009407-5da3-b4d9-924b-920c40b96df1@gmail.com>
In-Reply-To: <cc73ebe9-ab1c-18cf-5c5c-240358d82f3c@kaminario.com>

This is the patch I applied (Sagi's patch plus my additions) to the 
mlnx-nvme-rdma source RPM:

--- mlnx-nvme-rdma-4.0/source/rdma.c.orig       2017-01-29 22:26:53.000000000 +0200
+++ mlnx-nvme-rdma-4.0/source/rdma.c    2017-05-25 11:12:00.703099000 +0300
@@ -715,35 +715,32 @@
         if (ret)
                 goto requeue;

-       blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
-
         ret = nvmf_connect_admin_queue(&ctrl->ctrl);
         if (ret)
-               goto stop_admin_q;
+               goto requeue;

         set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);

         ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
         if (ret)
-               goto stop_admin_q;
+               goto requeue;

         nvme_start_keep_alive(&ctrl->ctrl);

         if (ctrl->queue_count > 1) {
                 ret = nvme_rdma_init_io_queues(ctrl);
                 if (ret)
-                       goto stop_admin_q;
+                       goto requeue;

                 ret = nvme_rdma_connect_io_queues(ctrl);
                 if (ret)
-                       goto stop_admin_q;
+                       goto requeue;
         }

         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
         WARN_ON_ONCE(!changed);

         if (ctrl->queue_count > 1) {
-               nvme_start_queues(&ctrl->ctrl);
                 nvme_queue_scan(&ctrl->ctrl);
                 nvme_queue_async_events(&ctrl->ctrl);
         }
@@ -752,8 +749,6 @@

         return;

-stop_admin_q:
-       blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
  requeue:
         /* Make sure we are not resetting/deleting */
         if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
@@ -788,6 +783,13 @@
         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                 nvme_cancel_request, &ctrl->ctrl);

+       /*
+        * queues are not live anymore, so restart them to fast-fail
+        * new IO
+        */
+       blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
+       nvme_start_queues(&ctrl->ctrl);
+
         dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
                 ctrl->reconnect_delay);

@@ -1426,7 +1428,7 @@
         WARN_ON_ONCE(rq->tag < 0);

         if (!nvme_rdma_queue_is_ready(queue, rq))
-               return BLK_MQ_RQ_QUEUE_BUSY;
+               return BLK_MQ_RQ_QUEUE_ERROR;

         dev = queue->device->dev;
         ib_dma_sync_single_for_cpu(dev, sqe->dma,
@@ -2031,6 +2033,8 @@
  {
         int ret;

+       pr_info("Loading nvme-rdma with Sagi G. patch to support "
+               "multipath\n");
         nvme_rdma_wq = create_workqueue("nvme_rdma_wq");
         if (!nvme_rdma_wq)
                 return -ENOMEM;
--- mlnx-nvme-rdma-4.0/source/core.c.orig       2017-05-25 14:50:22.941022000 +0300
+++ mlnx-nvme-rdma-4.0/source/core.c    2017-05-25 14:50:30.563588000 +0300
@@ -488,7 +488,6 @@
         if (error) {
                 dev_err(ctrl->device,
                         "failed nvme_keep_alive_end_io error=%d\n", error);
-               return;
         }

         schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);


On 05/25/2017 03:27 PM, shahar.salzman wrote:
> Returning QUEUE_ERROR also solved the "nvme list" issue I reported, 
> but the paths were not coming back after I reconnected the ports.
>
> I also changed the keep-alive work so that it does not bail out when 
> it gets an error, and with that the paths came back as well.
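
For reference, after dropping that early return the keep-alive completion 
ends up looking roughly like this (a minimal sketch; the surrounding code 
follows my recollection of the 4.9-era nvme core and may differ in detail):

static void nvme_keep_alive_end_io(struct request *rq, int error)
{
        struct nvme_ctrl *ctrl = rq->end_io_data;

        blk_mq_free_request(rq);

        if (error)
                dev_err(ctrl->device,
                        "failed nvme_keep_alive_end_io error=%d\n", error);

        /*
         * re-arm unconditionally: an error no longer stops the keep-alive
         * work, so the controller keeps probing the path during recovery
         */
        schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
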
>
>
> On 05/25/2017 02:06 PM, shahar.salzman wrote:
>> OK, so the indefinite retries are due to 
>> nvme/host/rdma.c:nvme_rdma_queue_rq returning BLK_MQ_RQ_QUEUE_BUSY 
>> when the queue is not ready. Wouldn't it be better to return 
>> BLK_MQ_RQ_QUEUE_ERROR so that the application can handle it? Or, 
>> alternatively, return an error after several retries (per queue or 
>> something)?
>>
>> I tested a patch (on top of Sagi's) converting the return value to 
>> QUEUE_ERROR, and both dd and multipath -ll now return instantly.
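
Concretely, the check in question at the top of nvme_rdma_queue_rq() looks 
roughly like this after the conversion (a sketch of the 4.9-era code path, 
with the rest of the function elided, not the exact driver source):

static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
                const struct blk_mq_queue_data *bd)
{
        struct nvme_rdma_queue *queue = hctx->driver_data;
        struct request *rq = bd->rq;

        /* the queue lost its LIVE flag during error recovery */
        if (!nvme_rdma_queue_is_ready(queue, rq))
                return BLK_MQ_RQ_QUEUE_ERROR;   /* was BLK_MQ_RQ_QUEUE_BUSY */

        /* ... map the command, post the RDMA send, etc. ... */
        return BLK_MQ_RQ_QUEUE_OK;
}

With BUSY, blk-mq keeps requeueing the request indefinitely; with ERROR the 
request completes with an I/O error and the caller (dd, multipathd) gets to 
decide what to do.
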
>>
>>
>> On 05/23/2017 08:31 PM, shahar.salzman wrote:
>>> I used the patch with my Mellanox OFED (mlnx-nvme-rdma); it 
>>> partially helps, but the overall behaviour is still not good...
>>>
>>> If I run a dd (direct_IO) or multipath -ll while the path is 
>>> disconnected, I get indefinite retries with nearly zero time between 
>>> them until the path is reconnected; the dd then completes 
>>> successfully and the paths come back to life, returning OK status to 
>>> the process. I would expect the IO to fail after a certain timeout, 
>>> and the retries to be paced somewhat.
>>>
>>> If I run an nvme list (blk_execute_rq), the process stays in D 
>>> state (I still see the retries), the block layer (at least for this 
>>> device) seems to be stuck, and I have to power cycle the server to 
>>> get it working again...
>>>
>>> Just a reminder: I am using Mellanox OFED 4.0 and kernel 4.9.6. The 
>>> plan is to upgrade to the latest stable kernel, but I am not sure 
>>> when we will get to it.
>>>
>>>
>>> On 05/18/2017 08:52 AM, shahar.salzman wrote:
>>>> I have seen this too, but was investigating it internally since I 
>>>> am running 4.9.6 and not upstream. I will be able to check this 
>>>> patch on Sunday/Monday.
>>>>
>>>> On my setup the IOs never complete even after the path is 
>>>> reconnected; the process remains stuck in D state and the path is 
>>>> unusable until I power cycle the server.
>>>>
>>>> Some more information, hope it helps:
>>>>
>>>> Dumping the stuck process stack (multipath -ll):
>>>>
>>>> [root@kblock01-knode02 ~]# cat /proc/9715/stack
>>>> [<ffffffff94354727>] blk_execute_rq+0x97/0x110
>>>> [<ffffffffc0a72f8a>] __nvme_submit_user_cmd+0xca/0x300 [nvme_core]
>>>> [<ffffffffc0a73369>] nvme_submit_user_cmd+0x29/0x30 [nvme_core]
>>>> [<ffffffffc0a734bd>] nvme_user_cmd+0x14d/0x180 [nvme_core]
>>>> [<ffffffffc0a7376c>] nvme_ioctl+0x7c/0xa0 [nvme_core]
>>>> [<ffffffff9435d548>] __blkdev_driver_ioctl+0x28/0x30
>>>> [<ffffffff9435dce1>] blkdev_ioctl+0x131/0x8f0
>>>> [<ffffffff9428993c>] block_ioctl+0x3c/0x40
>>>> [<ffffffff94260ae8>] vfs_ioctl+0x18/0x30
>>>> [<ffffffff94261201>] do_vfs_ioctl+0x161/0x600
>>>> [<ffffffff94261732>] SyS_ioctl+0x92/0xa0
>>>> [<ffffffff9479cc77>] entry_SYSCALL_64_fastpath+0x1a/0xa9
>>>> [<ffffffffffffffff>] 0xffffffffffffffff
>>>>
>>>> Looking at the disassembly:
>>>>
>>>> 0000000000000170 <blk_execute_rq>:
>>>>
>>>> ...
>>>>
>>>>  1fa:   e8 00 00 00 00          callq  1ff <blk_execute_rq+0x8f>
>>>>         /* Prevent hang_check timer from firing at us during very 
>>>> long I/O */
>>>>         hang_check = sysctl_hung_task_timeout_secs;
>>>>         if (hang_check)
>>>>                 while (!wait_for_completion_io_timeout(&wait, 
>>>> hang_check * (HZ/2)));
>>>>         else
>>>>                 wait_for_completion_io(&wait);
>>>>  1ff:   4c 89 e7                mov    %r12,%rdi
>>>>  202:   e8 00 00 00 00          callq  207 <blk_execute_rq+0x97>
>>>>
>>>>         if (rq->errors)
>>>>  207:   83 bb 04 01 00 00 01    cmpl   $0x1,0x104(%rbx)
>>>>
>>>> I ran btrace, and it seems that the IO running before the path 
>>>> disconnect is properly cleaned up, and only the IO issued after it 
>>>> gets stuck (in the btrace example below I run both multipath -ll 
>>>> and a dd):
>>>>
>>>> 259,2   15    27548     4.790727567  8527  I  RS 764837376 + 8 [fio]
>>>> 259,2   15    27549     4.790728103  8527  D  RS 764837376 + 8 [fio]
>>>> 259,2   15    27550     4.791244566  8527  A   R 738007008 + 8 <- (253,0) 738007008
>>>> 259,2   15    27551     4.791245136  8527  I  RS 738007008 + 8 [fio]
>>>> 259,2   15    27552     4.791245803  8527  D  RS 738007008 + 8 [fio]
>>>> <<<<< From this stage there are no more fio Queue requests, only 
>>>> completions
>>>> 259,2    8     9118     4.758116423  8062  C  RS 2294539440 + 8 [0]
>>>> 259,2    8     9119     4.758559461  8062  C  RS 160307464 + 8 [0]
>>>> 259,2    8     9120     4.759009990     0  C  RS 2330623512 + 8 [0]
>>>> 259,2    8     9121     4.759457400     0  C  RS 2657710096 + 8 [0]
>>>> 259,2    8     9122     4.759928113     0  C  RS 2988256176 + 8 [0]
>>>> 259,2    8     9123     4.760388009     0  C  RS 789190936 + 8 [0]
>>>> 259,2    8     9124     4.760835889     0  C  RS 2915035352 + 8 [0]
>>>> 259,2    8     9125     4.761304282     0  C  RS 2458313624 + 8 [0]
>>>> 259,2    8     9126     4.761779694     0  C  RS 2280411456 + 8 [0]
>>>> 259,2    8     9127     4.762286741     0  C  RS 2082207376 + 8 [0]
>>>> 259,2    8     9128     4.762783722     0  C  RS 2874601688 + 8 [0]
>>>> ....
>>>> 259,2    8     9173     4.785798485     0  C  RS 828737568 + 8 [0]
>>>> 259,2    8     9174     4.786301391     0  C  RS 229168888 + 8 [0]
>>>> 259,2    8     9175     4.786769105     0  C  RS 893604096 + 8 [0]
>>>> 259,2    8     9176     4.787270594     0  C  RS 2308778728 + 8 [0]
>>>> 259,2    8     9177     4.787797448     0  C  RS 2190029912 + 8 [0]
>>>> 259,2    8     9178     4.788298956     0  C  RS 2652012944 + 8 [0]
>>>> 259,2    8     9179     4.788803759     0  C  RS 2205242008 + 8 [0]
>>>> 259,2    8     9180     4.789309423     0  C  RS 1948528416 + 8 [0]
>>>> 259,2    8     9181     4.789831240     0  C  RS 2003421288 + 8 [0]
>>>> 259,2    8     9182     4.790343528     0  C  RS 2464964728 + 8 [0]
>>>> 259,2    8     9183     4.790854684     0  C  RS 764837376 + 8 [0]
>>>> <<<<<< This is probably where the port had been disconnected
>>>> 259,2   21        7     9.413088410  8574  Q   R 0 + 8 [multipathd]
>>>> 259,2   21        8     9.413089387  8574  G   R 0 + 8 [multipathd]
>>>> 259,2   21        9     9.413095030  8574  U   N [multipathd] 1
>>>> 259,2   21       10     9.413095367  8574  I  RS 0 + 8 [multipathd]
>>>> 259,2   21       11     9.413096534  8574  D  RS 0 + 8 [multipathd]
>>>> 259,2   10        1    14.381330190  2016  C  RS 738007008 + 8 [7]
>>>> 259,2   21       12    14.381394890     0  R  RS 0 + 8 [7]
>>>> 259,2    8     9184    80.500537429  8657  Q   R 0 + 8 [dd]
>>>> 259,2    8     9185    80.500540122  8657  G   R 0 + 8 [dd]
>>>> 259,2    8     9186    80.500545289  8657  U   N [dd] 1
>>>> 259,2    8     9187    80.500545792  8657  I  RS 0 + 8 [dd]
>>>> <<<<<< At this stage, I reconnected the path:
>>>> 259,2    4        1  8381.791090134 11127  Q   R 0 + 8 [dd]
>>>> 259,2    4        2  8381.791093611 11127  G   R 0 + 8 [dd]
>>>> 259,2    4        3  8381.791098288 11127  U   N [dd] 1
>>>> 259,2    4        4  8381.791098791 11127  I  RS 0 + 8 [dd]
>>>>
>>>>
>>>> As I wrote above, I will check the patch on Sunday/Monday.
>>>>
>>>> On 05/17/2017 08:28 PM, Sagi Grimberg wrote:
>>>>> Hi Alex,
>>>>>
>>>>>> I am trying to test failure scenarios of NVMeoF + multipath. I bring
>>>>>> one of the ports down and expect to see failed paths in "multipath
>>>>>> -ll". Instead I see that "multipath -ll" gets stuck.
>>>>>>
>>>>>> To reproduce:
>>>>>> 1. Connect to an NVMeoF device through 2 ports.
>>>>>> 2. Bind them with multipath.
>>>>>> 3. Bring one port down (ifconfig eth3 down).
>>>>>> 4. Execute "multipath -ll"; it will get stuck.
>>>>>> From strace I see that multipath is stuck in io_destroy() while
>>>>>> releasing resources. As I understand it, io_destroy() is stuck because
>>>>>> io_cancel() failed, and io_cancel() failed because of the port that
>>>>>> was disabled in step 3.
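
(To see the same hang outside of multipathd: if the configured path checker 
is directio, it boils down to roughly the libaio snippet below. The device 
path is only a placeholder. With one port down, the read never completes, 
io_cancel() fails, and the process ends up blocked in 
io_getevents()/io_destroy(), just like the stuck multipath -ll.)

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        /* placeholder device path; pass a multipath member device */
        const char *dev = argc > 1 ? argv[1] : "/dev/nvme0n1";
        io_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
        void *buf;
        int fd;

        if (posix_memalign(&buf, 4096, 4096))
                return 1;
        fd = open(dev, O_RDONLY | O_DIRECT);
        if (fd < 0 || io_setup(1, &ctx))
                return 1;

        io_prep_pread(&cb, fd, buf, 4096, 0);
        if (io_submit(ctx, 1, cbs) != 1)
                return 1;

        /* checker-style timeout; on a dead path this times out ... */
        if (io_getevents(ctx, 1, 1, &ev, &ts) < 1) {
                io_cancel(ctx, &cb, &ev);  /* ... the cancel fails ... */
                io_destroy(ctx);           /* ... and this is where it hangs */
        } else {
                io_destroy(ctx);
        }
        close(fd);
        free(buf);
        return 0;
}

(Build with "gcc -o aio_probe aio_probe.c -laio" and run it against one of 
the member devices while the port is down.)
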
>>>>>
>>>>> Hmm, it looks like we do take care of fast-failing pending IO, but
>>>>> once we schedule the periodic reconnects the request queues are
>>>>> already stopped, so new incoming requests may block until we
>>>>> successfully reconnect.
>>>>>
>>>>> I don't have too much time for it at the moment, but here is an 
>>>>> untested
>>>>> patch for you to try out:
>>>>>
>>>>> -- 
>>>>> [PATCH] nvme-rdma: restart queues at error recovery to fast fail
>>>>>  incoming io
>>>>>
>>>>> When we encounter a transport/controller error, error recovery
>>>>> kicks in and performs:
>>>>> 1. stops the io/admin queues
>>>>> 2. moves the transport queues out of LIVE state
>>>>> 3. fast fails pending io
>>>>> 4. schedules periodic reconnects.
>>>>>
>>>>> But we also need to fast fail incoming IO that enters after we have
>>>>> already scheduled the reconnect. Given that our queue is not LIVE
>>>>> anymore, simply restart the request queues so that new IO fails in
>>>>> .queue_rq.
>>>>>
>>>>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>>>>> ---
>>>>>  drivers/nvme/host/rdma.c | 20 +++++++++++---------
>>>>>  1 file changed, 11 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
>>>>> index dd1c6deef82f..a0aa2bfb91ee 100644
>>>>> --- a/drivers/nvme/host/rdma.c
>>>>> +++ b/drivers/nvme/host/rdma.c
>>>>> @@ -753,28 +753,26 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
>>>>>         if (ret)
>>>>>                 goto requeue;
>>>>>
>>>>> -       blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
>>>>> -
>>>>>         ret = nvmf_connect_admin_queue(&ctrl->ctrl);
>>>>>         if (ret)
>>>>> -               goto stop_admin_q;
>>>>> +               goto requeue;
>>>>>
>>>>>         set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
>>>>>
>>>>>         ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
>>>>>         if (ret)
>>>>> -               goto stop_admin_q;
>>>>> +               goto requeue;
>>>>>
>>>>>         nvme_start_keep_alive(&ctrl->ctrl);
>>>>>
>>>>>         if (ctrl->queue_count > 1) {
>>>>>                 ret = nvme_rdma_init_io_queues(ctrl);
>>>>>                 if (ret)
>>>>> -                       goto stop_admin_q;
>>>>> +                       goto requeue;
>>>>>
>>>>>                 ret = nvme_rdma_connect_io_queues(ctrl);
>>>>>                 if (ret)
>>>>> -                       goto stop_admin_q;
>>>>> +                       goto requeue;
>>>>>         }
>>>>>
>>>>>         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
>>>>> @@ -782,7 +780,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
>>>>>         ctrl->ctrl.opts->nr_reconnects = 0;
>>>>>
>>>>>         if (ctrl->queue_count > 1) {
>>>>> -               nvme_start_queues(&ctrl->ctrl);
>>>>>                 nvme_queue_scan(&ctrl->ctrl);
>>>>>                 nvme_queue_async_events(&ctrl->ctrl);
>>>>>         }
>>>>> @@ -791,8 +788,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
>>>>>
>>>>>         return;
>>>>>
>>>>> -stop_admin_q:
>>>>> -       blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
>>>>>  requeue:
>>>>>         dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
>>>>>                 ctrl->ctrl.opts->nr_reconnects);
>>>>> @@ -823,6 +818,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
>>>>>         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
>>>>>                                 nvme_cancel_request, &ctrl->ctrl);
>>>>>
>>>>> +       /*
>>>>> +        * queues are not live anymore, so restart them to fast-fail
>>>>> +        * new IO
>>>>> +        */
>>>>> +       blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
>>>>> +       nvme_start_queues(&ctrl->ctrl);
>>>>> +
>>>>>         nvme_rdma_reconnect_or_remove(ctrl);
>>>>>  }
>>>>>
>>>>> -- 
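
Putting the two hunks together, the tail of nvme_rdma_error_recovery_work() 
ends up roughly like this after the patch (a sketch assembled from the hunks 
above plus my recollection of the 4.9-era driver; the elided parts and exact 
field names may differ):

static void nvme_rdma_error_recovery_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                        struct nvme_rdma_ctrl, err_work);

        /* ... stop keep-alive, stop the queues, tear down the RDMA
         * resources, clear the NVME_RDMA_Q_LIVE flags ... */

        /* fast-fail everything that was already in flight */
        if (ctrl->queue_count > 1)
                blk_mq_tagset_busy_iter(&ctrl->tag_set,
                                        nvme_cancel_request, &ctrl->ctrl);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_cancel_request, &ctrl->ctrl);

        /*
         * queues are not live anymore, so restart them; new IO now
         * reaches .queue_rq instead of blocking on a stopped queue
         */
        blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
        nvme_start_queues(&ctrl->ctrl);

        nvme_rdma_reconnect_or_remove(ctrl);
}
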
>>>>>
>>>>
>>>
>>
>

Thread overview: 18+ messages
2017-05-17 16:08 NVMeoF: multipath stuck after bringing one ethernet port down Alex Turin
2017-05-17 17:28 ` Sagi Grimberg
2017-05-18  5:52   ` shahar.salzman
2017-05-23 17:31     ` shahar.salzman
2017-05-25 11:06       ` shahar.salzman
2017-05-25 12:27         ` shahar.salzman
2017-05-25 13:08           ` shahar.salzman [this message]
2017-05-30 12:14           ` Sagi Grimberg
2017-05-30 12:11         ` Sagi Grimberg
2017-05-30 12:05       ` Sagi Grimberg
2017-05-30 13:37         ` Max Gurtovoy
2017-05-30 14:17           ` Sagi Grimberg
2017-06-05  7:11             ` shahar.salzman
2017-06-05  8:14               ` Sagi Grimberg
2017-06-05  8:40             ` Christoph Hellwig
2017-06-05  8:53               ` Sagi Grimberg
2017-06-05 15:07                 ` Christoph Hellwig
2017-06-05 17:23                   ` Sagi Grimberg
