Re: [PATCH] nvme-multipath: do not reset controller on unknown status

From: Sagi Grimberg <sagi@grimberg.me>
To: Christoph Hellwig <hch@lst.de>, Hannes Reinecke <hare@suse.de>
Cc: Keith Busch <keith.busch@intel.com>,
	linux-nvme@lists.infradead.org,
	John Managhini <john.meneghini@netapp.com>
Subject: Re: [PATCH] nvme-multipath: do not reset controller on unknown status
Date: Wed, 12 Feb 2020 11:33:20 -0800	[thread overview]
Message-ID: <6d4d18e3-c3a1-7d93-5abf-1ae46e18ca8c@grimberg.me> (raw)
In-Reply-To: <20200212175317.GA5813@lst.de>

On 2/12/20 9:53 AM, Christoph Hellwig wrote:
> On Wed, Feb 12, 2020 at 02:41:40PM +0100, Hannes Reinecke wrote:
>> We're seeing occasional controller resets during straight I/O,
>> but only when multipath is active.
>> The problem here is the nvme-multipath will reset the controller
>> on every unknown status, which really is an odd behaviour, seeing
>> that the host already received a perfectly good status; it's just
>> that it's not smart enough to understand it.
>> And resetting wouldn't help at all; the error status will continue
>> to be received.
>> So we should rather pass up any unknown error to the generic
>> routines and let them deal with this situation.
> 
> What unknown status do you see?

My question exactly, I don't understand what is the good status that
the host is not smart enough to understand?

I have a vague recollection that John sent some questions in that area
in the past...

> 
>>
>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>> Cc: John Managhini <john.meneghini@netapp.com>
>> ---
>>   drivers/nvme/host/core.c      |  4 ++--
>>   drivers/nvme/host/multipath.c | 18 ++++++++++--------
>>   drivers/nvme/host/nvme.h      |  2 +-
>>   3 files changed, 13 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 5dc32b72e7fa..edb081781ae7 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -293,8 +293,8 @@ void nvme_complete_rq(struct request *req)
>>   	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
>>   		if ((req->cmd_flags & REQ_NVME_MPATH) &&
>>   		    blk_path_error(status)) {
>> -			nvme_failover_req(req);
>> -			return;
>> +			if (nvme_failover_req(req))
>> +				return;
> 
> This reads weird, why not:
> 
> 		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> 		    blk_path_error(status) && nvme_failover_req(req))
> 			return;
> 
> ?
> 
> But see below, with your updated checks (assuming this makes sense
> as we need more explanation) we don't even need the blk_path_error
> call.

I think it reads better that we call nvme_failover_req only for actual
path errors.

> 
>> -void nvme_failover_req(struct request *req)
>> +bool nvme_failover_req(struct request *req)
>>   {
>>   	struct nvme_ns *ns = req->q->queuedata;
>>   	u16 status = nvme_req(req)->status;
>>   	unsigned long flags;
>> +	bool handled = false;
>>   
>>   	spin_lock_irqsave(&ns->head->requeue_lock, flags);
>>   	blk_steal_bios(&ns->head->requeue_list, req);
>>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>> -	blk_mq_end_request(req, 0);
> 
> You can't just steal the bios without completing the request.
> 
>>   
>>   	switch (status & 0x7ff) {
>>   	case NVME_SC_ANA_TRANSITION:
>> @@ -88,11 +88,13 @@ void nvme_failover_req(struct request *req)
>>   		 * mark the the path as pending and kick of a re-read of the ANA
>>   		 * log page ASAP.
>>   		 */
>> +		blk_mq_end_request(req, 0);
>>   		nvme_mpath_clear_current_path(ns);
>>   		if (ns->ctrl->ana_log_buf) {
>>   			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
>>   			queue_work(nvme_wq, &ns->ctrl->ana_work);
>>   		}
>> +		handled = true;
>>   		break;
>>   	case NVME_SC_HOST_PATH_ERROR:
>>   	case NVME_SC_HOST_ABORTED_CMD:
>> @@ -100,18 +102,18 @@ void nvme_failover_req(struct request *req)
>>   		 * Temporary transport disruption in talking to the controller.
>>   		 * Try to send on a new path.
>>   		 */
>> +		blk_mq_end_request(req, 0);
>>   		nvme_mpath_clear_current_path(ns);
>> +		handled = true;
>>   		break;
>>   	default:
>> -		/*
>> -		 * Reset the controller for any non-ANA error as we don't know
>> -		 * what caused the error.
>> -		 */
>> -		nvme_reset_ctrl(ns->ctrl);
>> +		/* Delegate to common error handling */
>>   		break;

What will happen in the common case? right now it will just retry
it on the same path, is that the desired behavior?

I guess we need to understand the phenomenon better.

Right now the code says, we don't know what went wrong here, so we
are going to reset the controller because it acts weird, which can
make some sense, given that the host doesn't understand what is going
on...

>>   	}
>>   
>> -	kblockd_schedule_work(&ns->head->requeue_work);
>> +	if (handled)
>> +		kblockd_schedule_work(&ns->head->requeue_work);
>> +	return handled;
> 
> I think you can defer the blk_mq_end_request to here as well.  Then
> just return false from the default case and you don't need the handled
> variable.
> 
> So in the end this should be something like this:
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 5dc32b72e7fa..68443564b930 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -291,11 +291,8 @@ void nvme_complete_rq(struct request *req)
>   		nvme_req(req)->ctrl->comp_seen = true;
>   
>   	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> -		    blk_path_error(status)) {
> -			nvme_failover_req(req);
> +		if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
>   			return;
> -		}
>   
>   		if (!blk_queue_dying(req->q)) {
>   			nvme_retry_req(req);
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 797c18337d96..7b62b14985c2 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -64,18 +64,12 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   	}
>   }
>   
> -void nvme_failover_req(struct request *req)
> +bool nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> -	u16 status = nvme_req(req)->status;
>   	unsigned long flags;
>   
> -	spin_lock_irqsave(&ns->head->requeue_lock, flags);
> -	blk_steal_bios(&ns->head->requeue_list, req);
> -	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> -	blk_mq_end_request(req, 0);
> -
> -	switch (status & 0x7ff) {
> +	switch (nvme_req(req)->status & 0x7ff) {
>   	case NVME_SC_ANA_TRANSITION:
>   	case NVME_SC_ANA_INACCESSIBLE:
>   	case NVME_SC_ANA_PERSISTENT_LOSS:
> @@ -103,15 +97,16 @@ void nvme_failover_req(struct request *req)
>   		nvme_mpath_clear_current_path(ns);
>   		break;
>   	default:
> -		/*
> -		 * Reset the controller for any non-ANA error as we don't know
> -		 * what caused the error.
> -		 */
> -		nvme_reset_ctrl(ns->ctrl);
> -		break;
> +		return false;
>   	}
>   
> +	spin_lock_irqsave(&ns->head->requeue_lock, flags);
> +	blk_steal_bios(&ns->head->requeue_list, req);
> +	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> +	blk_mq_end_request(req, 0);
> +
>   	kblockd_schedule_work(&ns->head->requeue_work);
> +	return true;
>   }
>   
>   void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 1024fec7914c..d800b9a51c2c 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -550,7 +550,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
>   void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
>   void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   			struct nvme_ctrl *ctrl, int *flags);
> -void nvme_failover_req(struct request *req);
> +bool nvme_failover_req(struct request *req);
>   void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
>   int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
>   void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
> @@ -599,8 +599,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
>   }
>   
> -static inline void nvme_failover_req(struct request *req)
> +static inline bool nvme_failover_req(struct request *req)
>   {
> +	return false;
>   }
>   static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
>   {

I'm assuming the below doesn't belong..

> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
> index 1b8014ab0b25..85b7e080c20b 100644
> --- a/drivers/usb/dwc3/gadget.c
> +++ b/drivers/usb/dwc3/gadget.c
> @@ -1071,7 +1071,7 @@ static void dwc3_prepare_one_trb_sg(struct dwc3_ep *dep,
>   		unsigned int rem = length % maxp;
>   		unsigned chain = true;
>   
> -		if (sg_is_last(s))
> +		if (req->num_queued_sgs + 1 == req->request.num_mapped_sgs)
>   			chain = false;
>   
>   		if (rem && usb_endpoint_dir_out(dep->endpoint.desc) && !chain) {
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 9cfc668f91f4..047c9864c3c7 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -152,6 +152,35 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
>   	},
>   };
>   
> +static void btrfs_scratch_superblocks(struct block_device *bdev,
> +		const char *device_path)
> +{
> +	struct buffer_head *bh;
> +	struct btrfs_super_block *disk_super;
> +	int copy_num;
> +
> +	if (!bdev)
> +		return;
> +
> +	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
> +		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
> +			continue;
> +
> +		disk_super = (struct btrfs_super_block *)bh->b_data;
> +
> +		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
> +		set_buffer_dirty(bh);
> +		sync_dirty_buffer(bh);
> +		brelse(bh);
> +	}
> +
> +	/* Notify udev that device has changed */
> +	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
> +
> +	/* Update ctime/mtime for device path for libblkid */
> +	update_dev_time(device_path);
> +}
> +
>   const char *btrfs_bg_type_to_raid_name(u64 flags)
>   {
>   	const int index = btrfs_bg_flags_to_raid_index(flags);
> @@ -7317,36 +7346,6 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
>   	return 0;
>   }
>   
> -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
> -{
> -	struct buffer_head *bh;
> -	struct btrfs_super_block *disk_super;
> -	int copy_num;
> -
> -	if (!bdev)
> -		return;
> -
> -	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
> -		copy_num++) {
> -
> -		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
> -			continue;
> -
> -		disk_super = (struct btrfs_super_block *)bh->b_data;
> -
> -		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
> -		set_buffer_dirty(bh);
> -		sync_dirty_buffer(bh);
> -		brelse(bh);
> -	}
> -
> -	/* Notify udev that device has changed */
> -	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
> -
> -	/* Update ctime/mtime for device path for libblkid */
> -	update_dev_time(device_path);
> -}
> -
>   /*
>    * Update the size and bytes used for each device where it changed.  This is
>    * delayed since we would otherwise get errors while writing out the
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 409f4816fb89..9394cfde7ace 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -473,7 +473,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
>   void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
>   void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
>   void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
> -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
>   int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
>   			   u64 logical, u64 len);
>   unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
> 

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme