From: Martin Wilck <mwilck@suse.com>
To: Don Brace <don.brace@microchip.com>,
Kevin.Barnett@microchip.com, scott.teel@microchip.com,
Justin.Lindley@microchip.com, scott.benesh@microchip.com,
gerry.morong@microchip.com, mahesh.rajashekhara@microchip.com,
hch@infradead.org, jejb@linux.vnet.ibm.com,
joseph.szczypek@hpe.com, POSWALD@suse.com
Cc: linux-scsi@vger.kernel.org
Subject: Re: [PATCH V3 21/25] smartpqi: add additional logging for LUN resets
Date: Fri, 08 Jan 2021 01:27:23 +0100 [thread overview]
Message-ID: <2b62a04b71dfe762b7a04906b3962e690587ed78.camel@suse.com> (raw)
In-Reply-To: <160763258244.26927.17723549050349595895.stgit@brunhilda>
On Thu, 2020-12-10 at 14:36 -0600, Don Brace wrote:
> From: Kevin Barnett <kevin.barnett@microchip.com>
>
> * Add additional logging to help in debugging issues
> with LUN resets.
>
> Reviewed-by: Mahesh Rajashekhara <mahesh.rajashekhara@microchip.com>
> Reviewed-by: Scott Benesh <scott.benesh@microchip.com>
> Reviewed-by: Scott Teel <scott.teel@microchip.com>
> Signed-off-by: Kevin Barnett <kevin.barnett@microchip.com>
> Signed-off-by: Don Brace <don.brace@microchip.com>
The patch description is incomplete: the patch also changes
some timings. A few remarks inline below.
Cheers,
Martin
> ---
> drivers/scsi/smartpqi/smartpqi_init.c | 125
> +++++++++++++++++++++++----------
> 1 file changed, 89 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/scsi/smartpqi/smartpqi_init.c
> b/drivers/scsi/smartpqi/smartpqi_init.c
> index 6b624413c8e6..1c51a59f1da6 100644
> --- a/drivers/scsi/smartpqi/smartpqi_init.c
> +++ b/drivers/scsi/smartpqi/smartpqi_init.c
> @@ -84,7 +84,7 @@ static void pqi_ofa_setup_host_buffer(struct
> pqi_ctrl_info *ctrl_info);
> static void pqi_ofa_free_host_buffer(struct pqi_ctrl_info
> *ctrl_info);
> static int pqi_ofa_host_memory_update(struct pqi_ctrl_info
> *ctrl_info);
> static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info
> *ctrl_info,
> - struct pqi_scsi_dev *device, unsigned long timeout_secs);
> + struct pqi_scsi_dev *device, unsigned long timeout_msecs);
>
> /* for flags argument to pqi_submit_raid_request_synchronous() */
> #define PQI_SYNC_FLAGS_INTERRUPTABLE 0x1
> @@ -335,11 +335,34 @@ static void pqi_wait_if_ctrl_blocked(struct
> pqi_ctrl_info *ctrl_info)
> atomic_dec(&ctrl_info->num_blocked_threads);
> }
>
> +#define PQI_QUIESE_WARNING_TIMEOUT_SECS 10
Did you mean QUIESCE?
> +
> static inline void pqi_ctrl_wait_until_quiesced(struct pqi_ctrl_info
> *ctrl_info)
> {
> + unsigned long start_jiffies;
> + unsigned long warning_timeout;
> + bool displayed_warning;
> +
> + displayed_warning = false;
> + start_jiffies = jiffies;
> + warning_timeout = (PQI_QUIESE_WARNING_TIMEOUT_SECS * PQI_HZ)
> + start_jiffies;
> +
> while (atomic_read(&ctrl_info->num_busy_threads) >
> - atomic_read(&ctrl_info->num_blocked_threads))
> + atomic_read(&ctrl_info->num_blocked_threads)) {
> + if (time_after(jiffies, warning_timeout)) {
> + dev_warn(&ctrl_info->pci_dev->dev,
> + "waiting %u seconds for driver
> activity to quiesce\n",
> + jiffies_to_msecs(jiffies -
> start_jiffies) / 1000);
> + displayed_warning = true;
> + warning_timeout =
> (PQI_QUIESE_WARNING_TIMEOUT_SECS * PQI_HZ) + jiffies;
> + }
> usleep_range(1000, 2000);
> + }
> +
> + if (displayed_warning)
> + dev_warn(&ctrl_info->pci_dev->dev,
> + "driver activity quiesced after waiting for
> %u seconds\n",
> + jiffies_to_msecs(jiffies - start_jiffies) /
> 1000);
> }
>
> static inline bool pqi_device_offline(struct pqi_scsi_dev *device)
> @@ -1670,7 +1693,7 @@ static int pqi_add_device(struct pqi_ctrl_info
> *ctrl_info,
> return rc;
> }
>
> -#define PQI_PENDING_IO_TIMEOUT_SECS 20
> +#define PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS (20 * 1000)
>
> static inline void pqi_remove_device(struct pqi_ctrl_info
> *ctrl_info, struct pqi_scsi_dev *device)
> {
> @@ -1678,7 +1701,8 @@ static inline void pqi_remove_device(struct
> pqi_ctrl_info *ctrl_info, struct pqi
>
> pqi_device_remove_start(device);
>
> - rc = pqi_device_wait_for_pending_io(ctrl_info, device,
> PQI_PENDING_IO_TIMEOUT_SECS);
> + rc = pqi_device_wait_for_pending_io(ctrl_info, device,
> + PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS);
> if (rc)
> dev_err(&ctrl_info->pci_dev->dev,
> "scsi %d:%d:%d:%d removing device with %d
> outstanding command(s)\n",
> @@ -3070,7 +3094,7 @@ static void pqi_process_io_error(unsigned int
> iu_type,
> }
> }
>
> -static int pqi_interpret_task_management_response(
> +static int pqi_interpret_task_management_response(struct
> pqi_ctrl_info *ctrl_info,
> struct pqi_task_management_response *response)
> {
> int rc;
> @@ -3088,6 +3112,10 @@ static int
> pqi_interpret_task_management_response(
> break;
> }
>
> + if (rc)
> + dev_err(&ctrl_info->pci_dev->dev,
> + "Task Management Function error: %d (response
> code: %u)\n", rc, response->response_code);
> +
> return rc;
> }
>
> @@ -3156,9 +3184,8 @@ static int pqi_process_io_intr(struct
> pqi_ctrl_info *ctrl_info, struct pqi_queue
> &((struct pqi_vendor_general_response
> *)response)->status);
> break;
> case PQI_RESPONSE_IU_TASK_MANAGEMENT:
> - io_request->status =
> -
> pqi_interpret_task_management_response(
> - (void *)response);
> + io_request->status =
> pqi_interpret_task_management_response(ctrl_info,
> + (void *)response);
> break;
> case PQI_RESPONSE_IU_AIO_PATH_DISABLED:
> pqi_aio_path_disabled(io_request);
> @@ -5862,24 +5889,37 @@ static void
> pqi_fail_io_queued_for_device(struct pqi_ctrl_info *ctrl_info,
> }
> }
>
> +#define PQI_PENDING_IO_WARNING_TIMEOUT_SECS 10
> +
> static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info
> *ctrl_info,
> - struct pqi_scsi_dev *device, unsigned long timeout_secs)
> + struct pqi_scsi_dev *device, unsigned long timeout_msecs)
> {
> - unsigned long timeout;
> + int cmds_outstanding;
> + unsigned long start_jiffies;
> + unsigned long warning_timeout;
> + unsigned long msecs_waiting;
>
> + start_jiffies = jiffies;
> + warning_timeout = (PQI_PENDING_IO_WARNING_TIMEOUT_SECS *
> PQI_HZ) + start_jiffies;
>
> - timeout = (timeout_secs * PQI_HZ) + jiffies;
> -
> - while (atomic_read(&device->scsi_cmds_outstanding)) {
> + while ((cmds_outstanding = atomic_read(&device-
> >scsi_cmds_outstanding)) > 0) {
> pqi_check_ctrl_health(ctrl_info);
> if (pqi_ctrl_offline(ctrl_info))
> return -ENXIO;
> - if (timeout_secs != NO_TIMEOUT) {
> - if (time_after(jiffies, timeout)) {
> - dev_err(&ctrl_info->pci_dev->dev,
> - "timed out waiting for
> pending I/O\n");
> - return -ETIMEDOUT;
> - }
> + msecs_waiting = jiffies_to_msecs(jiffies -
> start_jiffies);
> + if (msecs_waiting > timeout_msecs) {
> + dev_err(&ctrl_info->pci_dev->dev,
> + "scsi %d:%d:%d:%d: timed out after
> %lu seconds waiting for %d outstanding command(s)\n",
> + ctrl_info->scsi_host->host_no,
> device->bus, device->target,
> + device->lun, msecs_waiting / 1000,
> cmds_outstanding);
> + return -ETIMEDOUT;
> + }
> + if (time_after(jiffies, warning_timeout)) {
> + dev_warn(&ctrl_info->pci_dev->dev,
> + "scsi %d:%d:%d:%d: waiting %lu
> seconds for %d outstanding command(s)\n",
> + ctrl_info->scsi_host->host_no,
> device->bus, device->target,
> + device->lun, msecs_waiting / 1000,
> cmds_outstanding);
> + warning_timeout =
> (PQI_PENDING_IO_WARNING_TIMEOUT_SECS * PQI_HZ) + jiffies;
> }
> usleep_range(1000, 2000);
> }
> @@ -5895,13 +5935,15 @@ static void pqi_lun_reset_complete(struct
> pqi_io_request *io_request,
> complete(waiting);
> }
>
> -#define PQI_LUN_RESET_TIMEOUT_SECS 30
> #define PQI_LUN_RESET_POLL_COMPLETION_SECS 10
>
> static int pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info
> *ctrl_info,
> struct pqi_scsi_dev *device, struct completion *wait)
> {
> int rc;
> + unsigned int wait_secs;
> +
> + wait_secs = 0;
>
> while (1) {
> if (wait_for_completion_io_timeout(wait,
> @@ -5915,13 +5957,21 @@ static int
> pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info *ctrl_info,
> rc = -ENXIO;
> break;
> }
> +
> + wait_secs += PQI_LUN_RESET_POLL_COMPLETION_SECS;
> +
> + dev_warn(&ctrl_info->pci_dev->dev,
> + "scsi %d:%d:%d:%d: waiting %u seconds for LUN
> reset to complete\n",
> + ctrl_info->scsi_host->host_no, device->bus,
> device->target, device->lun,
> + wait_secs);
> }
>
> return rc;
> }
>
> -static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info,
> - struct pqi_scsi_dev *device)
> +#define PQI_LUN_RESET_FIRMWARE_TIMEOUT_SECS 30
> +
> +static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct
> pqi_scsi_dev *device)
> {
> int rc;
> struct pqi_io_request *io_request;
> @@ -5943,8 +5993,7 @@ static int pqi_lun_reset(struct pqi_ctrl_info
> *ctrl_info,
> sizeof(request->lun_number));
> request->task_management_function =
> SOP_TASK_MANAGEMENT_LUN_RESET;
> if (ctrl_info->tmf_iu_timeout_supported)
> - put_unaligned_le16(PQI_LUN_RESET_TIMEOUT_SECS,
> - &request->timeout);
> + put_unaligned_le16(PQI_LUN_RESET_FIRMWARE_TIMEOUT_SEC
> S, &request->timeout);
>
> pqi_start_io(ctrl_info, &ctrl_info-
> >queue_groups[PQI_DEFAULT_QUEUE_GROUP], RAID_PATH,
> io_request);
> @@ -5958,29 +6007,33 @@ static int pqi_lun_reset(struct pqi_ctrl_info
> *ctrl_info,
> return rc;
> }
>
> -#define PQI_LUN_RESET_RETRIES 3
> -#define PQI_LUN_RESET_RETRY_INTERVAL_MSECS 10000
> -#define PQI_LUN_RESET_PENDING_IO_TIMEOUT_SECS 120
> +#define PQI_LUN_RESET_RETRIES 3
> +#define PQI_LUN_RESET_RETRY_INTERVAL_MSECS (10 * 1000)
> +#define PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS (10 * 60 *
> 1000)
10 minutes? Isn't that a bit much?
> +#define PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS (2 * 60 *
> 1000)
Why is the wait shorter after a failed reset?
>
> -static int pqi_lun_reset_with_retries(struct pqi_ctrl_info
> *ctrl_info,
> - struct pqi_scsi_dev *device)
> +static int pqi_lun_reset_with_retries(struct pqi_ctrl_info
> *ctrl_info, struct pqi_scsi_dev *device)
> {
> - int rc;
> + int reset_rc;
> + int wait_rc;
> unsigned int retries;
> - unsigned long timeout_secs;
> + unsigned long timeout_msecs;
>
> for (retries = 0;;) {
> - rc = pqi_lun_reset(ctrl_info, device);
> - if (rc == 0 || ++retries > PQI_LUN_RESET_RETRIES)
> + reset_rc = pqi_lun_reset(ctrl_info, device);
> + if (reset_rc == 0 || ++retries >
> PQI_LUN_RESET_RETRIES)
> break;
> msleep(PQI_LUN_RESET_RETRY_INTERVAL_MSECS);
> }
>
> - timeout_secs = rc ? PQI_LUN_RESET_PENDING_IO_TIMEOUT_SECS :
> NO_TIMEOUT;
> + timeout_msecs = reset_rc ?
> PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS :
> + PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS;
>
> - rc |= pqi_device_wait_for_pending_io(ctrl_info, device,
> timeout_secs);
> + wait_rc = pqi_device_wait_for_pending_io(ctrl_info, device,
> timeout_msecs);
> + if (wait_rc && reset_rc == 0)
> + reset_rc = wait_rc;
>
> - return rc == 0 ? SUCCESS : FAILED;
> + return reset_rc == 0 ? SUCCESS : FAILED;
> }
>
> static int pqi_device_reset(struct pqi_ctrl_info *ctrl_info,
>
next prev parent reply other threads:[~2021-01-08 0:28 UTC|newest]
Thread overview: 91+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-12-10 20:34 [PATCH V3 00/25] smartpqi updates Don Brace
2020-12-10 20:34 ` [PATCH V3 01/25] smartpqi: add support for product id Don Brace
2021-01-07 16:43 ` Martin Wilck
2020-12-10 20:34 ` [PATCH V3 02/25] smartpqi: refactor aio submission code Don Brace
2021-01-07 16:43 ` Martin Wilck
2020-12-10 20:34 ` [PATCH V3 03/25] smartpqi: refactor build sg list code Don Brace
2021-01-07 16:43 ` Martin Wilck
2020-12-10 20:34 ` [PATCH V3 04/25] smartpqi: add support for raid5 and raid6 writes Don Brace
2021-01-07 16:44 ` Martin Wilck
2021-01-08 22:56 ` Don.Brace
2021-01-13 10:26 ` Martin Wilck
2020-12-10 20:34 ` [PATCH V3 05/25] smartpqi: add support for raid1 writes Don Brace
2021-01-07 16:44 ` Martin Wilck
2021-01-09 16:56 ` Don.Brace
2020-12-10 20:34 ` [PATCH V3 06/25] smartpqi: add support for BMIC sense feature cmd and feature bits Don Brace
2021-01-07 16:44 ` Martin Wilck
2021-01-11 17:22 ` Don.Brace
2021-01-22 16:45 ` Don.Brace
2021-01-22 19:04 ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 07/25] smartpqi: update AIO Sub Page 0x02 support Don Brace
2021-01-07 16:44 ` Martin Wilck
2021-01-11 20:53 ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 08/25] smartpqi: add support for long firmware version Don Brace
2021-01-07 16:45 ` Martin Wilck
2021-01-11 22:25 ` Don.Brace
2021-01-22 20:01 ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 09/25] smartpqi: align code with oob driver Don Brace
2021-01-08 0:13 ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 10/25] smartpqi: add stream detection Don Brace
2021-01-08 0:14 ` Martin Wilck
2021-01-15 21:58 ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 11/25] smartpqi: add host level stream detection enable Don Brace
2021-01-08 0:13 ` Martin Wilck
2021-01-12 20:28 ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 12/25] smartpqi: enable support for NVMe encryption Don Brace
2021-01-08 0:14 ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 13/25] smartpqi: disable write_same for nvme hba disks Don Brace
2021-01-08 0:13 ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 14/25] smartpqi: fix driver synchronization issues Don Brace
2021-01-07 23:32 ` Martin Wilck
2021-01-08 4:13 ` Martin K. Petersen
2021-01-15 21:13 ` Don.Brace
2021-01-27 23:01 ` Don.Brace
[not found] ` <c1e6b199f5ccda5ccec5223dfcbd1fba22171c86.camel@suse.com>
2021-02-01 22:47 ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 15/25] smartpqi: fix host qdepth limit Don Brace
2020-12-14 17:54 ` Paul Menzel
2020-12-15 20:23 ` Don.Brace
2021-01-07 23:43 ` Martin Wilck
2021-01-15 21:17 ` Don.Brace
2021-01-19 10:33 ` John Garry
2021-01-19 14:12 ` Martin Wilck
2021-01-19 17:43 ` Paul Menzel
2021-01-20 16:42 ` Donald Buczek
2021-01-20 17:03 ` Don.Brace
2021-01-20 18:35 ` Martin Wilck
2021-02-10 15:27 ` Don.Brace
2021-02-10 15:42 ` John Garry
2021-02-10 16:29 ` Don.Brace
2021-03-29 21:15 ` Paul Menzel
2021-03-29 21:16 ` Paul Menzel
2021-03-30 14:37 ` Donald Buczek
2020-12-10 20:35 ` [PATCH V3 16/25] smartpqi: convert snprintf to scnprintf Don Brace
2021-01-07 23:51 ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 17/25] smartpqi: change timing of release of QRM memory during OFA Don Brace
2021-01-08 0:14 ` Martin Wilck
2021-01-27 17:46 ` Don.Brace
2020-12-10 20:36 ` [PATCH V3 18/25] smartpqi: return busy indication for IOCTLs when ofa is active Don Brace
2020-12-10 20:36 ` [PATCH V3 19/25] smartpqi: add phy id support for the physical drives Don Brace
2021-01-08 0:03 ` Martin Wilck
2020-12-10 20:36 ` [PATCH V3 20/25] smartpqi: update sas initiator_port_protocols and target_port_protocols Don Brace
2021-01-08 0:12 ` Martin Wilck
2020-12-10 20:36 ` [PATCH V3 21/25] smartpqi: add additional logging for LUN resets Don Brace
2021-01-08 0:27 ` Martin Wilck [this message]
2021-01-25 17:09 ` Don.Brace
2020-12-10 20:36 ` [PATCH V3 22/25] smartpqi: update enclosure identifier in sysf Don Brace
2021-01-08 0:30 ` Martin Wilck
2021-01-25 17:13 ` Don.Brace
2021-01-25 19:44 ` Martin Wilck
2021-01-25 20:36 ` Don.Brace
2020-12-10 20:36 ` [PATCH V3 23/25] smartpqi: correct system hangs when resuming from hibernation Don Brace
2021-01-08 0:34 ` Martin Wilck
2021-01-27 17:39 ` Don.Brace
2021-01-27 17:45 ` Martin Wilck
2020-12-10 20:36 ` [PATCH V3 24/25] smartpqi: add new pci ids Don Brace
2021-01-08 0:35 ` Martin Wilck
2020-12-10 20:36 ` [PATCH V3 25/25] smartpqi: update version to 2.1.6-005 Don Brace
2020-12-21 14:31 ` [PATCH V3 00/25] smartpqi updates Donald Buczek
[not found] ` <SN6PR11MB2848D8C9DF9856A2B7AA69ACE1C00@SN6PR11MB2848.namprd11.prod.outlook.com>
2020-12-22 13:13 ` Donald Buczek
2020-12-28 15:57 ` Don.Brace
2020-12-28 19:25 ` Don.Brace
2020-12-28 22:36 ` Donald Buczek
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=2b62a04b71dfe762b7a04906b3962e690587ed78.camel@suse.com \
--to=mwilck@suse.com \
--cc=Justin.Lindley@microchip.com \
--cc=Kevin.Barnett@microchip.com \
--cc=POSWALD@suse.com \
--cc=don.brace@microchip.com \
--cc=gerry.morong@microchip.com \
--cc=hch@infradead.org \
--cc=jejb@linux.vnet.ibm.com \
--cc=joseph.szczypek@hpe.com \
--cc=linux-scsi@vger.kernel.org \
--cc=mahesh.rajashekhara@microchip.com \
--cc=scott.benesh@microchip.com \
--cc=scott.teel@microchip.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).