All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/25] zfcp: fix missing SCSI trace for result of eh_host_reset_handler
       [not found] <20180517171507.18237-1-maier@linux.ibm.com>
@ 2018-05-17 17:14 ` Steffen Maier
  2018-05-17 17:14 ` [PATCH 02/25] zfcp: fix missing SCSI trace for retry of abort / scsi_eh TMF Steffen Maier
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 8+ messages in thread
From: Steffen Maier @ 2018-05-17 17:14 UTC (permalink / raw)
  To: James E . J . Bottomley, Martin K . Petersen
  Cc: linux-scsi, linux-s390, Benjamin Block, Martin Schwidefsky,
	Heiko Carstens, Steffen Maier, stable

For problem determination we need to see whether and why we were
successful or not. This allows deduction of scsi_eh escalation.

Example trace record formatted with zfcpdbf from s390-tools:

Timestamp      : ...
Area           : SCSI
Subarea        : 00
Level          : 1
Exception      : -
CPU ID         : ..
Caller         : 0x...
Record ID      : 1
Tag            : schrh_r        SCSI host reset handler result
Request ID     : 0x0000000000000000                     none (invalid)
SCSI ID        : 0xffffffff                             none (invalid)
SCSI LUN       : 0xffffffff                             none (invalid)
SCSI LUN high  : 0xffffffff                             none (invalid)
SCSI result    : 0x00002002     field re-used for midlayer value: SUCCESS
                                or in other cases: 0x2009 == FAST_IO_FAIL
SCSI retries   : 0xff                                   none (invalid)
SCSI allowed   : 0xff                                   none (invalid)
SCSI scribble  : 0xffffffffffffffff                     none (invalid)
SCSI opcode    : ffffffff ffffffff ffffffff ffffffff    none (invalid)
FCP rsp inf cod: 0xff                                   none (invalid)
FCP rsp IU     : 00000000 00000000 00000000 00000000    none (invalid)
                 00000000 00000000

v2.6.35 commit a1dbfddd02d2 ("[SCSI] zfcp: Pass return code from
fc_block_scsi_eh to scsi eh") introduced the first return with something
other than the previously hardcoded single SUCCESS return path.

Signed-off-by: Steffen Maier <maier@linux.ibm.com>
Fixes: a1dbfddd02d2 ("[SCSI] zfcp: Pass return code from fc_block_scsi_eh to scsi eh")
Cc: <stable@vger.kernel.org> #2.6.38+
Reviewed-by: Jens Remus <jremus@linux.ibm.com>
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
---
 drivers/s390/scsi/zfcp_dbf.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 drivers/s390/scsi/zfcp_ext.h  |  2 ++
 drivers/s390/scsi/zfcp_scsi.c | 11 ++++++-----
 3 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c
index a8b831000b2d..1e5ea5e4992b 100644
--- a/drivers/s390/scsi/zfcp_dbf.c
+++ b/drivers/s390/scsi/zfcp_dbf.c
@@ -643,6 +643,46 @@ void zfcp_dbf_scsi(char *tag, int level, struct scsi_cmnd *sc,
 	spin_unlock_irqrestore(&dbf->scsi_lock, flags);
 }
 
+/**
+ * zfcp_dbf_scsi_eh() - Trace event for special cases of scsi_eh callbacks.
+ * @tag: Identifier for event.
+ * @adapter: Pointer to zfcp adapter as context for this event.
+ * @scsi_id: SCSI ID/target to indicate scope of task management function (TMF).
+ * @ret: Return value of calling function.
+ *
+ * This SCSI trace variant does not depend on any of:
+ * scsi_cmnd, zfcp_fsf_req, scsi_device.
+ */
+void zfcp_dbf_scsi_eh(char *tag, struct zfcp_adapter *adapter,
+		      unsigned int scsi_id, int ret)
+{
+	struct zfcp_dbf *dbf = adapter->dbf;
+	struct zfcp_dbf_scsi *rec = &dbf->scsi_buf;
+	unsigned long flags;
+	static int const level = 1;
+
+	if (unlikely(!debug_level_enabled(adapter->dbf->scsi, level)))
+		return;
+
+	spin_lock_irqsave(&dbf->scsi_lock, flags);
+	memset(rec, 0, sizeof(*rec));
+
+	memcpy(rec->tag, tag, ZFCP_DBF_TAG_LEN);
+	rec->id = ZFCP_DBF_SCSI_CMND;
+	rec->scsi_result = ret; /* re-use field, int is 4 bytes and fits */
+	rec->scsi_retries = ~0;
+	rec->scsi_allowed = ~0;
+	rec->fcp_rsp_info = ~0;
+	rec->scsi_id = scsi_id;
+	rec->scsi_lun = (u32)ZFCP_DBF_INVALID_LUN;
+	rec->scsi_lun_64_hi = (u32)(ZFCP_DBF_INVALID_LUN >> 32);
+	rec->host_scribble = ~0;
+	memset(rec->scsi_opcode, 0xff, ZFCP_DBF_SCSI_OPCODE);
+
+	debug_event(dbf->scsi, level, rec, sizeof(*rec));
+	spin_unlock_irqrestore(&dbf->scsi_lock, flags);
+}
+
 static debug_info_t *zfcp_dbf_reg(const char *name, int size, int rec_size)
 {
 	struct debug_info *d;
diff --git a/drivers/s390/scsi/zfcp_ext.h b/drivers/s390/scsi/zfcp_ext.h
index bf8ea4df2bb8..e55f42ce1168 100644
--- a/drivers/s390/scsi/zfcp_ext.h
+++ b/drivers/s390/scsi/zfcp_ext.h
@@ -49,6 +49,8 @@ extern void zfcp_dbf_san_res(char *, struct zfcp_fsf_req *);
 extern void zfcp_dbf_san_in_els(char *, struct zfcp_fsf_req *);
 extern void zfcp_dbf_scsi(char *, int, struct scsi_cmnd *,
 			  struct zfcp_fsf_req *);
+extern void zfcp_dbf_scsi_eh(char *tag, struct zfcp_adapter *adapter,
+			     unsigned int scsi_id, int ret);
 
 /* zfcp_erp.c */
 extern void zfcp_erp_set_adapter_status(struct zfcp_adapter *, u32);
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index 4d2ba5682493..a62357f5e8b4 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -323,15 +323,16 @@ static int zfcp_scsi_eh_host_reset_handler(struct scsi_cmnd *scpnt)
 {
 	struct zfcp_scsi_dev *zfcp_sdev = sdev_to_zfcp(scpnt->device);
 	struct zfcp_adapter *adapter = zfcp_sdev->port->adapter;
-	int ret;
+	int ret = SUCCESS, fc_ret;
 
 	zfcp_erp_adapter_reopen(adapter, 0, "schrh_1");
 	zfcp_erp_wait(adapter);
-	ret = fc_block_scsi_eh(scpnt);
-	if (ret)
-		return ret;
+	fc_ret = fc_block_scsi_eh(scpnt);
+	if (fc_ret)
+		ret = fc_ret;
 
-	return SUCCESS;
+	zfcp_dbf_scsi_eh("schrh_r", adapter, ~0, ret);
+	return ret;
 }
 
 struct scsi_transport_template *zfcp_scsi_transport_template;
-- 
2.16.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 02/25] zfcp: fix missing SCSI trace for retry of abort / scsi_eh TMF
       [not found] <20180517171507.18237-1-maier@linux.ibm.com>
  2018-05-17 17:14 ` [PATCH 01/25] zfcp: fix missing SCSI trace for result of eh_host_reset_handler Steffen Maier
@ 2018-05-17 17:14 ` Steffen Maier
  2018-05-17 17:14 ` [PATCH 03/25] zfcp: fix misleading REC trigger trace where erp_action setup failed Steffen Maier
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 8+ messages in thread
From: Steffen Maier @ 2018-05-17 17:14 UTC (permalink / raw)
  To: James E . J . Bottomley, Martin K . Petersen
  Cc: linux-scsi, linux-s390, Benjamin Block, Martin Schwidefsky,
	Heiko Carstens, Steffen Maier, stable

We already have a SCSI trace for the end of abort and scsi_eh TMF. Due to
zfcp_erp_wait() and fc_block_scsi_eh() time can pass between the
start of our eh callback and an actual send/recv of an abort / TMF request.
In order to see the temporal sequence including any abort / TMF send
retries, add a trace before the above two blocking functions.
This supports problem determination with scsi_eh and parallel zfcp ERP.

No need to explicitly trace the beginning of our eh callback, since we
typically can send an abort / TMF and see its HBA response (in the worst
case, it's a pseudo response on dismiss all of adapter recovery, e.g. due
to an FSF request timeout [fsrth_1] of the abort / TMF). If we cannot send,
we now get a trace record for the first "abrt_wt" or "[lt]r_wait" which
denotes almost the beginning of the callback.

No need to explicitly trace the wakeup after the above two blocking
functions because the next retry loop causes another trace in any case
and that is sufficient.

Example trace records formatted with zfcpdbf from s390-tools:

Timestamp      : ...
Area           : SCSI
Subarea        : 00
Level          : 1
Exception      : -
CPU ID         : ..
Caller         : 0x...
Record ID      : 1
Tag            : abrt_wt        abort, before zfcp_erp_wait()
Request ID     : 0x0000000000000000                     none (invalid)
SCSI ID        : 0x<scsi_id>
SCSI LUN       : 0x<scsi_lun>
SCSI LUN high  : 0x<scsi_lun_high>
SCSI result    : 0x<scsi_result_of_cmd_to_be_aborted>
SCSI retries   : 0x<retries_of_cmd_to_be_aborted>
SCSI allowed   : 0x<allowed_retries_of_cmd_to_be_aborted>
SCSI scribble  : 0x<req_id_of_cmd_to_be_aborted>
SCSI opcode    : <CDB_of_cmd_to_be_aborted>
FCP rsp inf cod: 0x..                                   none (invalid)
FCP rsp IU     : ...                                    none (invalid)

Timestamp      : ...
Area           : SCSI
Subarea        : 00
Level          : 1
Exception      : -
CPU ID         : ..
Caller         : 0x...
Record ID      : 1
Tag            : lr_wait        LUN reset, before zfcp_erp_wait()
Request ID     : 0x0000000000000000                     none (invalid)
SCSI ID        : 0x<scsi_id>
SCSI LUN       : 0x<scsi_lun>
SCSI LUN high  : 0x<scsi_lun_high>
SCSI result    : 0x...                                  unrelated
SCSI retries   : 0x..                                   unrelated
SCSI allowed   : 0x..                                   unrelated
SCSI scribble  : 0x...                                  unrelated
SCSI opcode    : ...                                    unrelated
FCP rsp inf cod: 0x..                                   none (invalid)
FCP rsp IU     : ...                                    none (invalid)

Signed-off-by: Steffen Maier <maier@linux.ibm.com>
Fixes: 63caf367e1c9 ("[SCSI] zfcp: Improve reliability of SCSI eh handlers in zfcp")
Fixes: af4de36d911a ("[SCSI] zfcp: Block scsi_eh thread for rport state BLOCKED")
Cc: <stable@vger.kernel.org> #2.6.38+
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
---
 drivers/s390/scsi/zfcp_scsi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index a62357f5e8b4..4fdb1665b0e6 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -181,6 +181,7 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt)
 		if (abrt_req)
 			break;
 
+		zfcp_dbf_scsi_abort("abrt_wt", scpnt, NULL);
 		zfcp_erp_wait(adapter);
 		ret = fc_block_scsi_eh(scpnt);
 		if (ret) {
@@ -277,6 +278,7 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags)
 		if (fsf_req)
 			break;
 
+		zfcp_dbf_scsi_devreset("wait", scpnt, tm_flags, NULL);
 		zfcp_erp_wait(adapter);
 		ret = fc_block_scsi_eh(scpnt);
 		if (ret) {
-- 
2.16.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 03/25] zfcp: fix misleading REC trigger trace where erp_action setup failed
       [not found] <20180517171507.18237-1-maier@linux.ibm.com>
  2018-05-17 17:14 ` [PATCH 01/25] zfcp: fix missing SCSI trace for result of eh_host_reset_handler Steffen Maier
  2018-05-17 17:14 ` [PATCH 02/25] zfcp: fix missing SCSI trace for retry of abort / scsi_eh TMF Steffen Maier
@ 2018-05-17 17:14 ` Steffen Maier
  2018-05-17 17:14 ` [PATCH 04/25] zfcp: fix missing REC trigger trace on terminate_rport_io early return Steffen Maier
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 8+ messages in thread
From: Steffen Maier @ 2018-05-17 17:14 UTC (permalink / raw)
  To: James E . J . Bottomley, Martin K . Petersen
  Cc: linux-scsi, linux-s390, Benjamin Block, Martin Schwidefsky,
	Heiko Carstens, Steffen Maier, stable

If a SCSI device is deleted during scsi_eh host reset, we cannot get a
reference to the SCSI device anymore since scsi_device_get returns !=0
by design. Assuming the recovery of adapter and port(s) was successful,
zfcp_erp_strategy_followup_success() attempts to trigger a LUN reset for
the half-gone SCSI device. Unfortunately, it causes the following confusing
trace record which states that zfcp will do a LUN recovery as "ERP need" is
ZFCP_ERP_ACTION_REOPEN_LUN == 1 and equals "ERP want".

Old example trace record formatted with zfcpdbf from s390-tools:

Tag:           : ersfs_3 ERP, trigger, unit reopen, port reopen succeeded
LUN            : 0x<FCP_LUN>
WWPN           : 0x<WWPN>
D_ID           : 0x<N_Port-ID>
Adapter status : 0x5400050b
Port status    : 0x54000001
LUN status     : 0x40000000     ZFCP_STATUS_COMMON_RUNNING
                                but not ZFCP_STATUS_COMMON_UNBLOCKED as it
                                was closed on close part of adapter reopen
ERP want       : 0x01
ERP need       : 0x01           misleading

However, zfcp_erp_setup_act() returns NULL as it cannot get the reference.
Hence, zfcp_erp_action_enqueue() takes an early goto out and _NO_ recovery
actually happens.

We always do want the recovery trigger trace record even if no erp_action
could be enqueued as in this case. For other cases where we did not enqueue
an erp_action, 'need' has always been zero to indicate this. In order to
indicate above goto out, introduce an eyecatcher "flag" to mark the
"ERP need" as 'not needed' but still keep the information which erp_action
type, that zfcp_erp_required_act() had decided upon, is needed.
0xc_ is chosen to be visibly different from 0x0_ in "ERP want".

New example trace record formatted with zfcpdbf from s390-tools:

Tag:           : ersfs_3 ERP, trigger, unit reopen, port reopen succeeded
LUN            : 0x<FCP_LUN>
WWPN           : 0x<WWPN>
D_ID           : 0x<N_Port-ID>
Adapter status : 0x5400050b
Port status    : 0x54000001
LUN status     : 0x40000000
ERP want       : 0x01
ERP need       : 0xc1           would need LUN ERP, but no action set up
                   ^

Before v2.6.38 commit ae0904f60fab ("[SCSI] zfcp: Redesign of the debug
tracing for recovery actions.") we could detect this case because the
"erp_action" field in the trace was NULL. The rework removed erp_action
as argument and field from the trace.

This patch here is for tracing. A fix to allow LUN recovery in the case at
hand is a topic for a separate patch.

See also commit fdbd1c5e27da ("[SCSI] zfcp: Allow running unit/LUN shutdown
without acquiring reference") for a similar case and background info.

Signed-off-by: Steffen Maier <maier@linux.ibm.com>
Fixes: ae0904f60fab ("[SCSI] zfcp: Redesign of the debug tracing for recovery actions.")
Cc: <stable@vger.kernel.org> #2.6.38+
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
---
 drivers/s390/scsi/zfcp_erp.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 1d91a32db08e..d9cd25b56cfa 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -35,11 +35,23 @@ enum zfcp_erp_steps {
 	ZFCP_ERP_STEP_LUN_OPENING	= 0x2000,
 };
 
+/**
+ * enum zfcp_erp_act_type - Type of ERP action object.
+ * @ZFCP_ERP_ACTION_REOPEN_LUN: LUN recovery.
+ * @ZFCP_ERP_ACTION_REOPEN_PORT: Port recovery.
+ * @ZFCP_ERP_ACTION_REOPEN_PORT_FORCED: Forced port recovery.
+ * @ZFCP_ERP_ACTION_REOPEN_ADAPTER: Adapter recovery.
+ * @ZFCP_ERP_ACTION_NONE: Eyecatcher pseudo flag to bitwise or-combine with
+ *			  either of the other enum values.
+ *			  Used to indicate that an ERP action could not be
+ *			  set up despite a detected need for some recovery.
+ */
 enum zfcp_erp_act_type {
 	ZFCP_ERP_ACTION_REOPEN_LUN         = 1,
 	ZFCP_ERP_ACTION_REOPEN_PORT	   = 2,
 	ZFCP_ERP_ACTION_REOPEN_PORT_FORCED = 3,
 	ZFCP_ERP_ACTION_REOPEN_ADAPTER     = 4,
+	ZFCP_ERP_ACTION_NONE		   = 0xc0,
 };
 
 enum zfcp_erp_act_state {
@@ -257,8 +269,10 @@ static int zfcp_erp_action_enqueue(int want, struct zfcp_adapter *adapter,
 		goto out;
 
 	act = zfcp_erp_setup_act(need, act_status, adapter, port, sdev);
-	if (!act)
+	if (!act) {
+		need |= ZFCP_ERP_ACTION_NONE; /* marker for trace */
 		goto out;
+	}
 	atomic_or(ZFCP_STATUS_ADAPTER_ERP_PENDING, &adapter->status);
 	++adapter->erp_total_count;
 	list_add_tail(&act->list, &adapter->erp_ready_head);
-- 
2.16.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 04/25] zfcp: fix missing REC trigger trace on terminate_rport_io early return
       [not found] <20180517171507.18237-1-maier@linux.ibm.com>
                   ` (2 preceding siblings ...)
  2018-05-17 17:14 ` [PATCH 03/25] zfcp: fix misleading REC trigger trace where erp_action setup failed Steffen Maier
@ 2018-05-17 17:14 ` Steffen Maier
  2018-05-17 17:14 ` [PATCH 05/25] zfcp: fix missing REC trigger trace on terminate_rport_io for ERP_FAILED Steffen Maier
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 8+ messages in thread
From: Steffen Maier @ 2018-05-17 17:14 UTC (permalink / raw)
  To: James E . J . Bottomley, Martin K . Petersen
  Cc: linux-scsi, linux-s390, Benjamin Block, Martin Schwidefsky,
	Heiko Carstens, Steffen Maier, stable

get_device() and its internally used kobject_get() only return NULL
if they get passed NULL as argument. zfcp_get_port_by_wwpn() loops over
adapter->port_list so the iteration variable port is always non-NULL.
Struct device is embedded in struct zfcp_port so &port->dev is always
non-NULL. This is the argument to get_device().
However, if we get an fc_rport in terminate_rport_io() for which we cannot
find a match within zfcp_get_port_by_wwpn(), the latter can return NULL.
v2.6.30 commit 70932935b61e ("[SCSI] zfcp: Fix oops when port disappears")
introduced an early return without adding a trace record for this case.
Even if we don't need recovery in this case, for debugging we should still
see that our callback was invoked originally by scsi_transport_fc.

Example trace record formatted with zfcpdbf from s390-tools:

Timestamp      : ...
Area           : REC
Subarea        : 00
Level          : 1
Exception      : -
CPU ID         : ..
Caller         : 0x...
Record ID      : 1
Tag            : sctrpin        SCSI terminate rport I/O, no zfcp port
LUN            : 0xffffffffffffffff                     none (invalid)
WWPN           : 0x<wwpn>               WWPN
D_ID           : 0x<n_port_id>          N_Port-ID
Adapter status : 0x...
Port status    : 0xffffffff             unknown (-1)
LUN status     : 0x00000000                             none (invalid)
Ready count    : 0x...
Running count  : 0x...
ERP want       : 0x03                   ZFCP_ERP_ACTION_REOPEN_PORT_FORCED
ERP need       : 0xc0                   ZFCP_ERP_ACTION_NONE

Signed-off-by: Steffen Maier <maier@linux.ibm.com>
Fixes: 70932935b61e ("[SCSI] zfcp: Fix oops when port disappears")
Cc: <stable@vger.kernel.org> #2.6.38+
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
---
 drivers/s390/scsi/zfcp_erp.c  | 20 ++++++++++++++++++++
 drivers/s390/scsi/zfcp_ext.h  |  3 +++
 drivers/s390/scsi/zfcp_scsi.c |  5 +++++
 3 files changed, 28 insertions(+)

diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index d9cd25b56cfa..3489b1bc9121 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -283,6 +283,26 @@ static int zfcp_erp_action_enqueue(int want, struct zfcp_adapter *adapter,
 	return retval;
 }
 
+void zfcp_erp_port_forced_no_port_dbf(char *id, struct zfcp_adapter *adapter,
+				      u64 port_name, u32 port_id)
+{
+	unsigned long flags;
+	static /* don't waste stack */ struct zfcp_port tmpport;
+
+	write_lock_irqsave(&adapter->erp_lock, flags);
+	/* Stand-in zfcp port with fields just good enough for
+	 * zfcp_dbf_rec_trig() and zfcp_dbf_set_common().
+	 * Under lock because tmpport is static.
+	 */
+	atomic_set(&tmpport.status, -1); /* unknown */
+	tmpport.wwpn = port_name;
+	tmpport.d_id = port_id;
+	zfcp_dbf_rec_trig(id, adapter, &tmpport, NULL,
+			  ZFCP_ERP_ACTION_REOPEN_PORT_FORCED,
+			  ZFCP_ERP_ACTION_NONE);
+	write_unlock_irqrestore(&adapter->erp_lock, flags);
+}
+
 static int _zfcp_erp_adapter_reopen(struct zfcp_adapter *adapter,
 				    int clear_mask, char *id)
 {
diff --git a/drivers/s390/scsi/zfcp_ext.h b/drivers/s390/scsi/zfcp_ext.h
index e55f42ce1168..3299bd345076 100644
--- a/drivers/s390/scsi/zfcp_ext.h
+++ b/drivers/s390/scsi/zfcp_ext.h
@@ -55,6 +55,9 @@ extern void zfcp_dbf_scsi_eh(char *tag, struct zfcp_adapter *adapter,
 /* zfcp_erp.c */
 extern void zfcp_erp_set_adapter_status(struct zfcp_adapter *, u32);
 extern void zfcp_erp_clear_adapter_status(struct zfcp_adapter *, u32);
+extern void zfcp_erp_port_forced_no_port_dbf(char *id,
+					     struct zfcp_adapter *adapter,
+					     u64 port_name, u32 port_id);
 extern void zfcp_erp_adapter_reopen(struct zfcp_adapter *, int, char *);
 extern void zfcp_erp_adapter_shutdown(struct zfcp_adapter *, int, char *);
 extern void zfcp_erp_set_port_status(struct zfcp_port *, u32);
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index 4fdb1665b0e6..478e7ef9ea2f 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -605,6 +605,11 @@ static void zfcp_scsi_terminate_rport_io(struct fc_rport *rport)
 	if (port) {
 		zfcp_erp_port_forced_reopen(port, 0, "sctrpi1");
 		put_device(&port->dev);
+	} else {
+		zfcp_erp_port_forced_no_port_dbf(
+			"sctrpin", adapter,
+			rport->port_name /* zfcp_scsi_rport_register */,
+			rport->port_id /* zfcp_scsi_rport_register */);
 	}
 }
 
-- 
2.16.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 05/25] zfcp: fix missing REC trigger trace on terminate_rport_io for ERP_FAILED
       [not found] <20180517171507.18237-1-maier@linux.ibm.com>
                   ` (3 preceding siblings ...)
  2018-05-17 17:14 ` [PATCH 04/25] zfcp: fix missing REC trigger trace on terminate_rport_io early return Steffen Maier
@ 2018-05-17 17:14 ` Steffen Maier
  2018-05-17 17:14 ` [PATCH 06/25] zfcp: fix missing REC trigger trace for all objects in ERP_FAILED Steffen Maier
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 8+ messages in thread
From: Steffen Maier @ 2018-05-17 17:14 UTC (permalink / raw)
  To: James E . J . Bottomley, Martin K . Petersen
  Cc: linux-scsi, linux-s390, Benjamin Block, Martin Schwidefsky,
	Heiko Carstens, Steffen Maier, stable

For problem determination we always want to see when we were invoked
on the terminate_rport_io callback whether we perform something or not.

Temporal event sequence of interest with a long fast_io_fail_tmo of 27 sec:

loose remote port

t   workqueue
[s] zfcp_q_<dev>       IRQ                 zfcperp<dev>
=== ================== =================== ============================

  0                    recv RSCN
                       q p.test_link_work
    block rport
     start fast_io_fail_tmo
    send ADISC ELS
  4                    recv ADISC fail
                       block zfcp_port
                                           port forced reopen
                                           send open port
 12                    recv open port fail
                                           q p.gid_pn_work
                                           zfcp_erp_wakeup
                                           (zfcp_erp_wait would return)
    GID_PN fail

Before this point, we got a SCSI trace with tag "sctrpi1" on fast_io_fail,
e.g. with the typical 5 sec setting.

    port.status |= ERP_FAILED

If fast_io_fail_tmo triggers after this point, we missed a SCSI trace.

    workqueue
    fc_dl_<host>
    ==================
 27 fc_timeout_fail_rport_io
    fc_terminate_rport_io
    zfcp_scsi_terminate_rport_io
    zfcp_erp_port_forced_reopen
    _zfcp_erp_port_forced_reopen
     if (port.status & ERP_FAILED)
      return;

Therefore, write a trace before above early return.

Example trace record formatted with zfcpdbf from s390-tools:

Timestamp      : ...
Area           : REC
Subarea        : 00
Level          : 1
Exception      : -
CPU ID         : ..
Caller         : 0x...
Record ID      : 1                      ZFCP_DBF_REC_TRIG
Tag            : sctrpi1                SCSI terminate rport I/O
LUN            : 0xffffffffffffffff                     none (invalid)
WWPN           : 0x<wwpn>
D_ID           : 0x<n_port_id>
Adapter status : 0x...
Port status    : 0x...
LUN status     : 0x00000000                             none (invalid)
Ready count    : 0x...
Running count  : 0x...
ERP want       : 0x03                   ZFCP_ERP_ACTION_REOPEN_PORT_FORCED
ERP need       : 0xe0                   ZFCP_ERP_ACTION_FAILED

Signed-off-by: Steffen Maier <maier@linux.ibm.com>
Cc: <stable@vger.kernel.org> #2.6.38+
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
---
 drivers/s390/scsi/zfcp_erp.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 3489b1bc9121..5c368cdfc455 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -42,9 +42,13 @@ enum zfcp_erp_steps {
  * @ZFCP_ERP_ACTION_REOPEN_PORT_FORCED: Forced port recovery.
  * @ZFCP_ERP_ACTION_REOPEN_ADAPTER: Adapter recovery.
  * @ZFCP_ERP_ACTION_NONE: Eyecatcher pseudo flag to bitwise or-combine with
- *			  either of the other enum values.
+ *			  either of the first four enum values.
  *			  Used to indicate that an ERP action could not be
  *			  set up despite a detected need for some recovery.
+ * @ZFCP_ERP_ACTION_FAILED: Eyecatcher pseudo flag to bitwise or-combine with
+ *			    either of the first four enum values.
+ *			    Used to indicate that ERP not needed because
+ *			    the object has ZFCP_STATUS_COMMON_ERP_FAILED.
  */
 enum zfcp_erp_act_type {
 	ZFCP_ERP_ACTION_REOPEN_LUN         = 1,
@@ -52,6 +56,7 @@ enum zfcp_erp_act_type {
 	ZFCP_ERP_ACTION_REOPEN_PORT_FORCED = 3,
 	ZFCP_ERP_ACTION_REOPEN_ADAPTER     = 4,
 	ZFCP_ERP_ACTION_NONE		   = 0xc0,
+	ZFCP_ERP_ACTION_FAILED		   = 0xe0,
 };
 
 enum zfcp_erp_act_state {
@@ -379,8 +384,12 @@ static void _zfcp_erp_port_forced_reopen(struct zfcp_port *port, int clear,
 	zfcp_erp_port_block(port, clear);
 	zfcp_scsi_schedule_rport_block(port);
 
-	if (atomic_read(&port->status) & ZFCP_STATUS_COMMON_ERP_FAILED)
+	if (atomic_read(&port->status) & ZFCP_STATUS_COMMON_ERP_FAILED) {
+		zfcp_dbf_rec_trig(id, port->adapter, port, NULL,
+				  ZFCP_ERP_ACTION_REOPEN_PORT_FORCED,
+				  ZFCP_ERP_ACTION_FAILED);
 		return;
+	}
 
 	zfcp_erp_action_enqueue(ZFCP_ERP_ACTION_REOPEN_PORT_FORCED,
 				port->adapter, port, NULL, id, 0);
-- 
2.16.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 06/25] zfcp: fix missing REC trigger trace for all objects in ERP_FAILED
       [not found] <20180517171507.18237-1-maier@linux.ibm.com>
                   ` (4 preceding siblings ...)
  2018-05-17 17:14 ` [PATCH 05/25] zfcp: fix missing REC trigger trace on terminate_rport_io for ERP_FAILED Steffen Maier
@ 2018-05-17 17:14 ` Steffen Maier
  2018-05-17 17:14 ` [PATCH 07/25] zfcp: fix missing REC trigger trace on enqueue without ERP thread Steffen Maier
  2018-05-18 15:24 ` [PATCH 00/25] zfcp: updates for v4.18 Martin K. Petersen
  7 siblings, 0 replies; 8+ messages in thread
From: Steffen Maier @ 2018-05-17 17:14 UTC (permalink / raw)
  To: James E . J . Bottomley, Martin K . Petersen
  Cc: linux-scsi, linux-s390, Benjamin Block, Martin Schwidefsky,
	Heiko Carstens, Steffen Maier, stable

That other commit introduced an inconsistency because it would trace
on ERP_FAILED for all callers of port forced reopen triggers
(not just terminate_rport_io), but it would not trace on ERP_FAILED
for all callers of other ERP triggers such as adapter, port regular, LUN.

Therefore, generalize that other commit. zfcp_erp_action_enqueue()
already had two early outs which re-used the one zfcp_dbf_rec_trig() call.
All ERP trigger functions finally run through zfcp_erp_action_enqueue().
So move the special handling for ZFCP_STATUS_COMMON_ERP_FAILED into
zfcp_erp_action_enqueue() and add another early out with new trace marker
for pseudo ERP need in this case. This removes all early returns from
all ERP trigger functions so we always end up at zfcp_dbf_rec_trig().

Example trace record formatted with zfcpdbf from s390-tools:

Timestamp      : ...
Area           : REC
Subarea        : 00
Level          : 1
Exception      : -
CPU ID         : ..
Caller         : 0x...
Record ID      : 1                      ZFCP_DBF_REC_TRIG
Tag            : .......
LUN            : 0x...
WWPN           : 0x...
D_ID           : 0x...
Adapter status : 0x...
Port status    : 0x...
LUN status     : 0x...
Ready count    : 0x...
Running count  : 0x...
ERP want       : 0x0.                   ZFCP_ERP_ACTION_REOPEN_...
ERP need       : 0xe0                   ZFCP_ERP_ACTION_FAILED

Signed-off-by: Steffen Maier <maier@linux.ibm.com>
Cc: <stable@vger.kernel.org> #2.6.38+
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
---
 drivers/s390/scsi/zfcp_erp.c | 79 ++++++++++++++++++++++++++++----------------
 1 file changed, 51 insertions(+), 28 deletions(-)

diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 5c368cdfc455..20fe59300d0e 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -143,6 +143,49 @@ static void zfcp_erp_action_dismiss_adapter(struct zfcp_adapter *adapter)
 	}
 }
 
+static int zfcp_erp_handle_failed(int want, struct zfcp_adapter *adapter,
+				  struct zfcp_port *port,
+				  struct scsi_device *sdev)
+{
+	int need = want;
+	struct zfcp_scsi_dev *zsdev;
+
+	switch (want) {
+	case ZFCP_ERP_ACTION_REOPEN_LUN:
+		zsdev = sdev_to_zfcp(sdev);
+		if (atomic_read(&zsdev->status) & ZFCP_STATUS_COMMON_ERP_FAILED)
+			need = 0;
+		break;
+	case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED:
+		if (atomic_read(&port->status) & ZFCP_STATUS_COMMON_ERP_FAILED)
+			need = 0;
+		break;
+	case ZFCP_ERP_ACTION_REOPEN_PORT:
+		if (atomic_read(&port->status) &
+		    ZFCP_STATUS_COMMON_ERP_FAILED) {
+			need = 0;
+			/* ensure propagation of failed status to new devices */
+			zfcp_erp_set_port_status(
+				port, ZFCP_STATUS_COMMON_ERP_FAILED);
+		}
+		break;
+	case ZFCP_ERP_ACTION_REOPEN_ADAPTER:
+		if (atomic_read(&adapter->status) &
+		    ZFCP_STATUS_COMMON_ERP_FAILED) {
+			need = 0;
+			/* ensure propagation of failed status to new devices */
+			zfcp_erp_set_adapter_status(
+				adapter, ZFCP_STATUS_COMMON_ERP_FAILED);
+		}
+		break;
+	default:
+		need = 0;
+		break;
+	}
+
+	return need;
+}
+
 static int zfcp_erp_required_act(int want, struct zfcp_adapter *adapter,
 				 struct zfcp_port *port,
 				 struct scsi_device *sdev)
@@ -266,6 +309,12 @@ static int zfcp_erp_action_enqueue(int want, struct zfcp_adapter *adapter,
 	int retval = 1, need;
 	struct zfcp_erp_action *act;
 
+	need = zfcp_erp_handle_failed(want, adapter, port, sdev);
+	if (!need) {
+		need = ZFCP_ERP_ACTION_FAILED; /* marker for trace */
+		goto out;
+	}
+
 	if (!adapter->erp_thread)
 		return -EIO;
 
@@ -314,12 +363,6 @@ static int _zfcp_erp_adapter_reopen(struct zfcp_adapter *adapter,
 	zfcp_erp_adapter_block(adapter, clear_mask);
 	zfcp_scsi_schedule_rports_block(adapter);
 
-	/* ensure propagation of failed status to new devices */
-	if (atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_ERP_FAILED) {
-		zfcp_erp_set_adapter_status(adapter,
-					    ZFCP_STATUS_COMMON_ERP_FAILED);
-		return -EIO;
-	}
 	return zfcp_erp_action_enqueue(ZFCP_ERP_ACTION_REOPEN_ADAPTER,
 				       adapter, NULL, NULL, id, 0);
 }
@@ -338,12 +381,8 @@ void zfcp_erp_adapter_reopen(struct zfcp_adapter *adapter, int clear, char *id)
 	zfcp_scsi_schedule_rports_block(adapter);
 
 	write_lock_irqsave(&adapter->erp_lock, flags);
-	if (atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_ERP_FAILED)
-		zfcp_erp_set_adapter_status(adapter,
-					    ZFCP_STATUS_COMMON_ERP_FAILED);
-	else
-		zfcp_erp_action_enqueue(ZFCP_ERP_ACTION_REOPEN_ADAPTER, adapter,
-					NULL, NULL, id, 0);
+	zfcp_erp_action_enqueue(ZFCP_ERP_ACTION_REOPEN_ADAPTER, adapter,
+				NULL, NULL, id, 0);
 	write_unlock_irqrestore(&adapter->erp_lock, flags);
 }
 
@@ -384,13 +423,6 @@ static void _zfcp_erp_port_forced_reopen(struct zfcp_port *port, int clear,
 	zfcp_erp_port_block(port, clear);
 	zfcp_scsi_schedule_rport_block(port);
 
-	if (atomic_read(&port->status) & ZFCP_STATUS_COMMON_ERP_FAILED) {
-		zfcp_dbf_rec_trig(id, port->adapter, port, NULL,
-				  ZFCP_ERP_ACTION_REOPEN_PORT_FORCED,
-				  ZFCP_ERP_ACTION_FAILED);
-		return;
-	}
-
 	zfcp_erp_action_enqueue(ZFCP_ERP_ACTION_REOPEN_PORT_FORCED,
 				port->adapter, port, NULL, id, 0);
 }
@@ -416,12 +448,6 @@ static int _zfcp_erp_port_reopen(struct zfcp_port *port, int clear, char *id)
 	zfcp_erp_port_block(port, clear);
 	zfcp_scsi_schedule_rport_block(port);
 
-	if (atomic_read(&port->status) & ZFCP_STATUS_COMMON_ERP_FAILED) {
-		/* ensure propagation of failed status to new devices */
-		zfcp_erp_set_port_status(port, ZFCP_STATUS_COMMON_ERP_FAILED);
-		return -EIO;
-	}
-
 	return zfcp_erp_action_enqueue(ZFCP_ERP_ACTION_REOPEN_PORT,
 				       port->adapter, port, NULL, id, 0);
 }
@@ -461,9 +487,6 @@ static void _zfcp_erp_lun_reopen(struct scsi_device *sdev, int clear, char *id,
 
 	zfcp_erp_lun_block(sdev, clear);
 
-	if (atomic_read(&zfcp_sdev->status) & ZFCP_STATUS_COMMON_ERP_FAILED)
-		return;
-
 	zfcp_erp_action_enqueue(ZFCP_ERP_ACTION_REOPEN_LUN, adapter,
 				zfcp_sdev->port, sdev, id, act_status);
 }
-- 
2.16.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 07/25] zfcp: fix missing REC trigger trace on enqueue without ERP thread
       [not found] <20180517171507.18237-1-maier@linux.ibm.com>
                   ` (5 preceding siblings ...)
  2018-05-17 17:14 ` [PATCH 06/25] zfcp: fix missing REC trigger trace for all objects in ERP_FAILED Steffen Maier
@ 2018-05-17 17:14 ` Steffen Maier
  2018-05-18 15:24 ` [PATCH 00/25] zfcp: updates for v4.18 Martin K. Petersen
  7 siblings, 0 replies; 8+ messages in thread
From: Steffen Maier @ 2018-05-17 17:14 UTC (permalink / raw)
  To: James E . J . Bottomley, Martin K . Petersen
  Cc: linux-scsi, linux-s390, Benjamin Block, Martin Schwidefsky,
	Heiko Carstens, Steffen Maier, stable

Example trace record formatted with zfcpdbf from s390-tools:

Timestamp      : ...
Area           : REC
Subarea        : 00
Level          : 1
Exception      : -
CPU ID         : ..
Caller         : 0x...
Record ID      : 1                      ZFCP_DBF_REC_TRIG
Tag            : .......
LUN            : 0x...
WWPN           : 0x...
D_ID           : 0x...
Adapter status : 0x...
Port status    : 0x...
LUN status     : 0x...
Ready count    : 0x...
Running count  : 0x...
ERP want       : 0x0.                   ZFCP_ERP_ACTION_REOPEN_...
ERP need       : 0xc0                   ZFCP_ERP_ACTION_NONE

Signed-off-by: Steffen Maier <maier@linux.ibm.com>
Cc: <stable@vger.kernel.org> #2.6.38+
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
---
 drivers/s390/scsi/zfcp_erp.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 20fe59300d0e..69dfb328dba4 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -315,8 +315,11 @@ static int zfcp_erp_action_enqueue(int want, struct zfcp_adapter *adapter,
 		goto out;
 	}
 
-	if (!adapter->erp_thread)
-		return -EIO;
+	if (!adapter->erp_thread) {
+		need = ZFCP_ERP_ACTION_NONE; /* marker for trace */
+		retval = -EIO;
+		goto out;
+	}
 
 	need = zfcp_erp_required_act(want, adapter, port, sdev);
 	if (!need)
-- 
2.16.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 00/25] zfcp: updates for v4.18
       [not found] <20180517171507.18237-1-maier@linux.ibm.com>
                   ` (6 preceding siblings ...)
  2018-05-17 17:14 ` [PATCH 07/25] zfcp: fix missing REC trigger trace on enqueue without ERP thread Steffen Maier
@ 2018-05-18 15:24 ` Martin K. Petersen
  7 siblings, 0 replies; 8+ messages in thread
From: Martin K. Petersen @ 2018-05-18 15:24 UTC (permalink / raw)
  To: linux-s390, linux-scsi


Steffen,

> this is the zfcp patch set for the v4.18 merge window.  The patches
> apply to Martin's 4.18/scsi-queue.

Applied to 4.18/scsi-queue. Thank you!

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-05-18 15:24 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20180517171507.18237-1-maier@linux.ibm.com>
2018-05-17 17:14 ` [PATCH 01/25] zfcp: fix missing SCSI trace for result of eh_host_reset_handler Steffen Maier
2018-05-17 17:14 ` [PATCH 02/25] zfcp: fix missing SCSI trace for retry of abort / scsi_eh TMF Steffen Maier
2018-05-17 17:14 ` [PATCH 03/25] zfcp: fix misleading REC trigger trace where erp_action setup failed Steffen Maier
2018-05-17 17:14 ` [PATCH 04/25] zfcp: fix missing REC trigger trace on terminate_rport_io early return Steffen Maier
2018-05-17 17:14 ` [PATCH 05/25] zfcp: fix missing REC trigger trace on terminate_rport_io for ERP_FAILED Steffen Maier
2018-05-17 17:14 ` [PATCH 06/25] zfcp: fix missing REC trigger trace for all objects in ERP_FAILED Steffen Maier
2018-05-17 17:14 ` [PATCH 07/25] zfcp: fix missing REC trigger trace on enqueue without ERP thread Steffen Maier
2018-05-18 15:24 ` [PATCH 00/25] zfcp: updates for v4.18 Martin K. Petersen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.