All of lore.kernel.org
 help / color / mirror / Atom feed
From: Martin Wilck <mwilck@suse.com>
To: Don Brace <don.brace@microchip.com>,
	Kevin.Barnett@microchip.com, scott.teel@microchip.com,
	Justin.Lindley@microchip.com, scott.benesh@microchip.com,
	gerry.morong@microchip.com, mahesh.rajashekhara@microchip.com,
	hch@infradead.org, jejb@linux.vnet.ibm.com,
	joseph.szczypek@hpe.com, POSWALD@suse.com
Cc: linux-scsi@vger.kernel.org
Subject: Re: [PATCH V3 21/25] smartpqi: add additional logging for LUN resets
Date: Fri, 08 Jan 2021 01:27:23 +0100	[thread overview]
Message-ID: <2b62a04b71dfe762b7a04906b3962e690587ed78.camel@suse.com> (raw)
In-Reply-To: <160763258244.26927.17723549050349595895.stgit@brunhilda>

On Thu, 2020-12-10 at 14:36 -0600, Don Brace wrote:
> From: Kevin Barnett <kevin.barnett@microchip.com>
> 
> * Add additional logging to help in debugging issues
>   with LUN resets.
> 
> Reviewed-by: Mahesh Rajashekhara <mahesh.rajashekhara@microchip.com>
> Reviewed-by: Scott Benesh <scott.benesh@microchip.com>
> Reviewed-by: Scott Teel <scott.teel@microchip.com>
> Signed-off-by: Kevin Barnett <kevin.barnett@microchip.com>
> Signed-off-by: Don Brace <don.brace@microchip.com>

The patch description is not complete, as the patch also changes
some timings. Two remarks below.

Cheers,
Martin

> ---
>  drivers/scsi/smartpqi/smartpqi_init.c |  125
> +++++++++++++++++++++++----------
>  1 file changed, 89 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/scsi/smartpqi/smartpqi_init.c
> b/drivers/scsi/smartpqi/smartpqi_init.c
> index 6b624413c8e6..1c51a59f1da6 100644
> --- a/drivers/scsi/smartpqi/smartpqi_init.c
> +++ b/drivers/scsi/smartpqi/smartpqi_init.c
> @@ -84,7 +84,7 @@ static void pqi_ofa_setup_host_buffer(struct
> pqi_ctrl_info *ctrl_info);
>  static void pqi_ofa_free_host_buffer(struct pqi_ctrl_info
> *ctrl_info);
>  static int pqi_ofa_host_memory_update(struct pqi_ctrl_info
> *ctrl_info);
>  static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info
> *ctrl_info,
> -       struct pqi_scsi_dev *device, unsigned long timeout_secs);
> +       struct pqi_scsi_dev *device, unsigned long timeout_msecs);
>  
>  /* for flags argument to pqi_submit_raid_request_synchronous() */
>  #define PQI_SYNC_FLAGS_INTERRUPTABLE   0x1
> @@ -335,11 +335,34 @@ static void pqi_wait_if_ctrl_blocked(struct
> pqi_ctrl_info *ctrl_info)
>         atomic_dec(&ctrl_info->num_blocked_threads);
>  }
>  
> +#define PQI_QUIESE_WARNING_TIMEOUT_SECS                10

Did you mean QUIESCE ?

> +
>  static inline void pqi_ctrl_wait_until_quiesced(struct pqi_ctrl_info
> *ctrl_info)
>  {
> +       unsigned long start_jiffies;
> +       unsigned long warning_timeout;
> +       bool displayed_warning;
> +
> +       displayed_warning = false;
> +       start_jiffies = jiffies;
> +       warning_timeout = (PQI_QUIESE_WARNING_TIMEOUT_SECS * PQI_HZ)
> + start_jiffies;
> +
>         while (atomic_read(&ctrl_info->num_busy_threads) >
> -               atomic_read(&ctrl_info->num_blocked_threads))
> +               atomic_read(&ctrl_info->num_blocked_threads)) {
> +               if (time_after(jiffies, warning_timeout)) {
> +                       dev_warn(&ctrl_info->pci_dev->dev,
> +                               "waiting %u seconds for driver
> activity to quiesce\n",
> +                               jiffies_to_msecs(jiffies -
> start_jiffies) / 1000);
> +                       displayed_warning = true;
> +                       warning_timeout =
> (PQI_QUIESE_WARNING_TIMEOUT_SECS * PQI_HZ) + jiffies;
> +               }
>                 usleep_range(1000, 2000);
> +       }
> +
> +       if (displayed_warning)
> +               dev_warn(&ctrl_info->pci_dev->dev,
> +                       "driver activity quiesced after waiting for
> %u seconds\n",
> +                       jiffies_to_msecs(jiffies - start_jiffies) /
> 1000);
>  }
>  
>  static inline bool pqi_device_offline(struct pqi_scsi_dev *device)
> @@ -1670,7 +1693,7 @@ static int pqi_add_device(struct pqi_ctrl_info
> *ctrl_info,
>         return rc;
>  }
>  
> -#define PQI_PENDING_IO_TIMEOUT_SECS    20
> +#define PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS     (20 * 1000)
>  
>  static inline void pqi_remove_device(struct pqi_ctrl_info
> *ctrl_info, struct pqi_scsi_dev *device)
>  {
> @@ -1678,7 +1701,8 @@ static inline void pqi_remove_device(struct
> pqi_ctrl_info *ctrl_info, struct pqi
>  
>         pqi_device_remove_start(device);
>  
> -       rc = pqi_device_wait_for_pending_io(ctrl_info, device,
> PQI_PENDING_IO_TIMEOUT_SECS);
> +       rc = pqi_device_wait_for_pending_io(ctrl_info, device,
> +               PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS);
>         if (rc)
>                 dev_err(&ctrl_info->pci_dev->dev,
>                         "scsi %d:%d:%d:%d removing device with %d
> outstanding command(s)\n",
> @@ -3070,7 +3094,7 @@ static void pqi_process_io_error(unsigned int
> iu_type,
>         }
>  }
>  
> -static int pqi_interpret_task_management_response(
> +static int pqi_interpret_task_management_response(struct
> pqi_ctrl_info *ctrl_info,
>         struct pqi_task_management_response *response)
>  {
>         int rc;
> @@ -3088,6 +3112,10 @@ static int
> pqi_interpret_task_management_response(
>                 break;
>         }
>  
> +       if (rc)
> +               dev_err(&ctrl_info->pci_dev->dev,
> +                       "Task Management Function error: %d (response
> code: %u)\n", rc, response->response_code);
> +
>         return rc;
>  }
>  
> @@ -3156,9 +3184,8 @@ static int pqi_process_io_intr(struct
> pqi_ctrl_info *ctrl_info, struct pqi_queue
>                                 &((struct pqi_vendor_general_response
> *)response)->status);
>                         break;
>                 case PQI_RESPONSE_IU_TASK_MANAGEMENT:
> -                       io_request->status =
> -
>                                pqi_interpret_task_management_response(
> -                                       (void *)response);
> +                       io_request->status =
> pqi_interpret_task_management_response(ctrl_info,
> +                               (void *)response);
>                         break;
>                 case PQI_RESPONSE_IU_AIO_PATH_DISABLED:
>                         pqi_aio_path_disabled(io_request);
> @@ -5862,24 +5889,37 @@ static void
> pqi_fail_io_queued_for_device(struct pqi_ctrl_info *ctrl_info,
>         }
>  }
>  
> +#define PQI_PENDING_IO_WARNING_TIMEOUT_SECS    10
> +
>  static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info
> *ctrl_info,
> -       struct pqi_scsi_dev *device, unsigned long timeout_secs)
> +       struct pqi_scsi_dev *device, unsigned long timeout_msecs)
>  {
> -       unsigned long timeout;
> +       int cmds_outstanding;
> +       unsigned long start_jiffies;
> +       unsigned long warning_timeout;
> +       unsigned long msecs_waiting;
>  
> +       start_jiffies = jiffies;
> +       warning_timeout = (PQI_PENDING_IO_WARNING_TIMEOUT_SECS *
> PQI_HZ) + start_jiffies;
>  
> -       timeout = (timeout_secs * PQI_HZ) + jiffies;
> -
> -       while (atomic_read(&device->scsi_cmds_outstanding)) {
> +       while ((cmds_outstanding = atomic_read(&device-
> >scsi_cmds_outstanding)) > 0) {
>                 pqi_check_ctrl_health(ctrl_info);
>                 if (pqi_ctrl_offline(ctrl_info))
>                         return -ENXIO;
> -               if (timeout_secs != NO_TIMEOUT) {
> -                       if (time_after(jiffies, timeout)) {
> -                               dev_err(&ctrl_info->pci_dev->dev,
> -                                       "timed out waiting for
> pending I/O\n");
> -                               return -ETIMEDOUT;
> -                       }
> +               msecs_waiting = jiffies_to_msecs(jiffies -
> start_jiffies);
> +               if (msecs_waiting > timeout_msecs) {
> +                       dev_err(&ctrl_info->pci_dev->dev,
> +                               "scsi %d:%d:%d:%d: timed out after
> %lu seconds waiting for %d outstanding command(s)\n",
> +                               ctrl_info->scsi_host->host_no,
> device->bus, device->target,
> +                               device->lun, msecs_waiting / 1000,
> cmds_outstanding);
> +                       return -ETIMEDOUT;
> +               }
> +               if (time_after(jiffies, warning_timeout)) {
> +                       dev_warn(&ctrl_info->pci_dev->dev,
> +                               "scsi %d:%d:%d:%d: waiting %lu
> seconds for %d outstanding command(s)\n",
> +                               ctrl_info->scsi_host->host_no,
> device->bus, device->target,
> +                               device->lun, msecs_waiting / 1000,
> cmds_outstanding);
> +                       warning_timeout =
> (PQI_PENDING_IO_WARNING_TIMEOUT_SECS * PQI_HZ) + jiffies;
>                 }
>                 usleep_range(1000, 2000);
>         }
> @@ -5895,13 +5935,15 @@ static void pqi_lun_reset_complete(struct
> pqi_io_request *io_request,
>         complete(waiting);
>  }
>  
> -#define PQI_LUN_RESET_TIMEOUT_SECS             30
>  #define PQI_LUN_RESET_POLL_COMPLETION_SECS     10
>  
>  static int pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info
> *ctrl_info,
>         struct pqi_scsi_dev *device, struct completion *wait)
>  {
>         int rc;
> +       unsigned int wait_secs;
> +
> +       wait_secs = 0;
>  
>         while (1) {
>                 if (wait_for_completion_io_timeout(wait,
> @@ -5915,13 +5957,21 @@ static int
> pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info *ctrl_info,
>                         rc = -ENXIO;
>                         break;
>                 }
> +
> +               wait_secs += PQI_LUN_RESET_POLL_COMPLETION_SECS;
> +
> +               dev_warn(&ctrl_info->pci_dev->dev,
> +                       "scsi %d:%d:%d:%d: waiting %u seconds for LUN
> reset to complete\n",
> +                       ctrl_info->scsi_host->host_no, device->bus,
> device->target, device->lun,
> +                       wait_secs);
>         }
>  
>         return rc;
>  }
>  
> -static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info,
> -       struct pqi_scsi_dev *device)
> +#define PQI_LUN_RESET_FIRMWARE_TIMEOUT_SECS    30
> +
> +static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct
> pqi_scsi_dev *device)
>  {
>         int rc;
>         struct pqi_io_request *io_request;
> @@ -5943,8 +5993,7 @@ static int pqi_lun_reset(struct pqi_ctrl_info
> *ctrl_info,
>                 sizeof(request->lun_number));
>         request->task_management_function =
> SOP_TASK_MANAGEMENT_LUN_RESET;
>         if (ctrl_info->tmf_iu_timeout_supported)
> -               put_unaligned_le16(PQI_LUN_RESET_TIMEOUT_SECS,
> -                                       &request->timeout);
> +               put_unaligned_le16(PQI_LUN_RESET_FIRMWARE_TIMEOUT_SEC
> S, &request->timeout);
>  
>         pqi_start_io(ctrl_info, &ctrl_info-
> >queue_groups[PQI_DEFAULT_QUEUE_GROUP], RAID_PATH,
>                 io_request);
> @@ -5958,29 +6007,33 @@ static int pqi_lun_reset(struct pqi_ctrl_info
> *ctrl_info,
>         return rc;
>  }
>  
> -#define PQI_LUN_RESET_RETRIES                  3
> -#define PQI_LUN_RESET_RETRY_INTERVAL_MSECS     10000
> -#define PQI_LUN_RESET_PENDING_IO_TIMEOUT_SECS  120
> +#define PQI_LUN_RESET_RETRIES                          3
> +#define PQI_LUN_RESET_RETRY_INTERVAL_MSECS             (10 * 1000)
> +#define PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS         (10 * 60 *
> 1000)

10 minutes? Isn't that a bit much?

> +#define PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS  (2 * 60 *
> 1000)

Why wait less long after a failure?



>  
> -static int pqi_lun_reset_with_retries(struct pqi_ctrl_info
> *ctrl_info,
> -       struct pqi_scsi_dev *device)
> +static int pqi_lun_reset_with_retries(struct pqi_ctrl_info
> *ctrl_info, struct pqi_scsi_dev *device)
>  {
> -       int rc;
> +       int reset_rc;
> +       int wait_rc;
>         unsigned int retries;
> -       unsigned long timeout_secs;
> +       unsigned long timeout_msecs;
>  
>         for (retries = 0;;) {
> -               rc = pqi_lun_reset(ctrl_info, device);
> -               if (rc == 0 || ++retries > PQI_LUN_RESET_RETRIES)
> +               reset_rc = pqi_lun_reset(ctrl_info, device);
> +               if (reset_rc == 0 || ++retries >
> PQI_LUN_RESET_RETRIES)
>                         break;
>                 msleep(PQI_LUN_RESET_RETRY_INTERVAL_MSECS);
>         }
>  
> -       timeout_secs = rc ? PQI_LUN_RESET_PENDING_IO_TIMEOUT_SECS :
> NO_TIMEOUT;
> +       timeout_msecs = reset_rc ?
> PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS :
> +               PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS;
>  
> -       rc |= pqi_device_wait_for_pending_io(ctrl_info, device,
> timeout_secs);
> +       wait_rc = pqi_device_wait_for_pending_io(ctrl_info, device,
> timeout_msecs);
> +       if (wait_rc && reset_rc == 0)
> +               reset_rc = wait_rc;
>  
> -       return rc == 0 ? SUCCESS : FAILED;
> +       return reset_rc == 0 ? SUCCESS : FAILED;
>  }
>  
>  static int pqi_device_reset(struct pqi_ctrl_info *ctrl_info,
> 



  reply	other threads:[~2021-01-08  0:28 UTC|newest]

Thread overview: 91+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-10 20:34 [PATCH V3 00/25] smartpqi updates Don Brace
2020-12-10 20:34 ` [PATCH V3 01/25] smartpqi: add support for product id Don Brace
2021-01-07 16:43   ` Martin Wilck
2020-12-10 20:34 ` [PATCH V3 02/25] smartpqi: refactor aio submission code Don Brace
2021-01-07 16:43   ` Martin Wilck
2020-12-10 20:34 ` [PATCH V3 03/25] smartpqi: refactor build sg list code Don Brace
2021-01-07 16:43   ` Martin Wilck
2020-12-10 20:34 ` [PATCH V3 04/25] smartpqi: add support for raid5 and raid6 writes Don Brace
2021-01-07 16:44   ` Martin Wilck
2021-01-08 22:56     ` Don.Brace
2021-01-13 10:26       ` Martin Wilck
2020-12-10 20:34 ` [PATCH V3 05/25] smartpqi: add support for raid1 writes Don Brace
2021-01-07 16:44   ` Martin Wilck
2021-01-09 16:56     ` Don.Brace
2020-12-10 20:34 ` [PATCH V3 06/25] smartpqi: add support for BMIC sense feature cmd and feature bits Don Brace
2021-01-07 16:44   ` Martin Wilck
2021-01-11 17:22     ` Don.Brace
2021-01-22 16:45     ` Don.Brace
2021-01-22 19:04       ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 07/25] smartpqi: update AIO Sub Page 0x02 support Don Brace
2021-01-07 16:44   ` Martin Wilck
2021-01-11 20:53     ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 08/25] smartpqi: add support for long firmware version Don Brace
2021-01-07 16:45   ` Martin Wilck
2021-01-11 22:25     ` Don.Brace
2021-01-22 20:01     ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 09/25] smartpqi: align code with oob driver Don Brace
2021-01-08  0:13   ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 10/25] smartpqi: add stream detection Don Brace
2021-01-08  0:14   ` Martin Wilck
2021-01-15 21:58     ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 11/25] smartpqi: add host level stream detection enable Don Brace
2021-01-08  0:13   ` Martin Wilck
2021-01-12 20:28     ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 12/25] smartpqi: enable support for NVMe encryption Don Brace
2021-01-08  0:14   ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 13/25] smartpqi: disable write_same for nvme hba disks Don Brace
2021-01-08  0:13   ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 14/25] smartpqi: fix driver synchronization issues Don Brace
2021-01-07 23:32   ` Martin Wilck
2021-01-08  4:13     ` Martin K. Petersen
2021-01-15 21:13     ` Don.Brace
2021-01-27 23:01     ` Don.Brace
     [not found]       ` <c1e6b199f5ccda5ccec5223dfcbd1fba22171c86.camel@suse.com>
2021-02-01 22:47         ` Don.Brace
2020-12-10 20:35 ` [PATCH V3 15/25] smartpqi: fix host qdepth limit Don Brace
2020-12-14 17:54   ` Paul Menzel
2020-12-15 20:23     ` Don.Brace
2021-01-07 23:43       ` Martin Wilck
2021-01-15 21:17         ` Don.Brace
2021-01-19 10:33           ` John Garry
2021-01-19 14:12             ` Martin Wilck
2021-01-19 17:43               ` Paul Menzel
2021-01-20 16:42               ` Donald Buczek
2021-01-20 17:03                 ` Don.Brace
2021-01-20 18:35                 ` Martin Wilck
2021-02-10 15:27             ` Don.Brace
2021-02-10 15:42               ` John Garry
2021-02-10 16:29                 ` Don.Brace
2021-03-29 21:15                   ` Paul Menzel
2021-03-29 21:16                     ` Paul Menzel
2021-03-30 14:37                       ` Donald Buczek
2020-12-10 20:35 ` [PATCH V3 16/25] smartpqi: convert snprintf to scnprintf Don Brace
2021-01-07 23:51   ` Martin Wilck
2020-12-10 20:35 ` [PATCH V3 17/25] smartpqi: change timing of release of QRM memory during OFA Don Brace
2021-01-08  0:14   ` Martin Wilck
2021-01-27 17:46     ` Don.Brace
2020-12-10 20:36 ` [PATCH V3 18/25] smartpqi: return busy indication for IOCTLs when ofa is active Don Brace
2020-12-10 20:36 ` [PATCH V3 19/25] smartpqi: add phy id support for the physical drives Don Brace
2021-01-08  0:03   ` Martin Wilck
2020-12-10 20:36 ` [PATCH V3 20/25] smartpqi: update sas initiator_port_protocols and target_port_protocols Don Brace
2021-01-08  0:12   ` Martin Wilck
2020-12-10 20:36 ` [PATCH V3 21/25] smartpqi: add additional logging for LUN resets Don Brace
2021-01-08  0:27   ` Martin Wilck [this message]
2021-01-25 17:09     ` Don.Brace
2020-12-10 20:36 ` [PATCH V3 22/25] smartpqi: update enclosure identifier in sysf Don Brace
2021-01-08  0:30   ` Martin Wilck
2021-01-25 17:13     ` Don.Brace
2021-01-25 19:44       ` Martin Wilck
2021-01-25 20:36         ` Don.Brace
2020-12-10 20:36 ` [PATCH V3 23/25] smartpqi: correct system hangs when resuming from hibernation Don Brace
2021-01-08  0:34   ` Martin Wilck
2021-01-27 17:39     ` Don.Brace
2021-01-27 17:45       ` Martin Wilck
2020-12-10 20:36 ` [PATCH V3 24/25] smartpqi: add new pci ids Don Brace
2021-01-08  0:35   ` Martin Wilck
2020-12-10 20:36 ` [PATCH V3 25/25] smartpqi: update version to 2.1.6-005 Don Brace
2020-12-21 14:31 ` [PATCH V3 00/25] smartpqi updates Donald Buczek
     [not found]   ` <SN6PR11MB2848D8C9DF9856A2B7AA69ACE1C00@SN6PR11MB2848.namprd11.prod.outlook.com>
2020-12-22 13:13     ` Donald Buczek
2020-12-28 15:57       ` Don.Brace
2020-12-28 19:25         ` Don.Brace
2020-12-28 22:36           ` Donald Buczek

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2b62a04b71dfe762b7a04906b3962e690587ed78.camel@suse.com \
    --to=mwilck@suse.com \
    --cc=Justin.Lindley@microchip.com \
    --cc=Kevin.Barnett@microchip.com \
    --cc=POSWALD@suse.com \
    --cc=don.brace@microchip.com \
    --cc=gerry.morong@microchip.com \
    --cc=hch@infradead.org \
    --cc=jejb@linux.vnet.ibm.com \
    --cc=joseph.szczypek@hpe.com \
    --cc=linux-scsi@vger.kernel.org \
    --cc=mahesh.rajashekhara@microchip.com \
    --cc=scott.benesh@microchip.com \
    --cc=scott.teel@microchip.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.