linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 00/13] target task management fixes
@ 2023-01-12  3:08 Mike Christie
  2023-01-12  3:08 ` [PATCH v2 01/13] scsi: target: Move sess cmd counter to new struct Mike Christie
                   ` (13 more replies)
  0 siblings, 14 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel

The following patches apply over Martin's 6.2 and 6.3 branches or
Linus's tree. They fix a couple regressions in iscsit that occur when
there are multiple sessions accessing the same se_device and TMRs are
executing and a connection is closed. And they fix some bugs in isert
for the single session case when there are TMRs executing and the
connection is closed.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 01/13] scsi: target: Move sess cmd counter to new struct
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 02/13] scsi: target: Move cmd counter allocation Mike Christie
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

iSCSI needs to wait on outstanding commands like how srp and the FC/fcoe
drivers do. It can't use target_stop_session because for MCS support we
can't stop the entire session during recovery because if other connections
are ok then we want to be able to continue to execute IO on them.

This patch moves the per session cmd counters to a new struct, so iSCSI
can allocate it per connection. The xcopy code can also just not allocate
it in the future since it doesn't need to track commands.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/target_core_tpg.c         |   2 +-
 drivers/target/target_core_transport.c   | 135 ++++++++++++++++-------
 include/target/iscsi/iscsi_target_core.h |   1 +
 include/target/target_core_base.h        |  13 ++-
 4 files changed, 107 insertions(+), 44 deletions(-)

diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index 736847c933e5..8ebccdbd94f0 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -328,7 +328,7 @@ static void target_shutdown_sessions(struct se_node_acl *acl)
 restart:
 	spin_lock_irqsave(&acl->nacl_sess_lock, flags);
 	list_for_each_entry(sess, &acl->acl_sess_list, sess_acl_list) {
-		if (atomic_read(&sess->stopped))
+		if (sess->cmd_cnt && atomic_read(&sess->cmd_cnt->stopped))
 			continue;
 
 		list_del_init(&sess->sess_acl_list);
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 5926316252eb..3d6034f00dcd 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -220,11 +220,49 @@ void transport_subsystem_check_init(void)
 	sub_api_initialized = 1;
 }
 
-static void target_release_sess_cmd_refcnt(struct percpu_ref *ref)
+static void target_release_cmd_refcnt(struct percpu_ref *ref)
 {
-	struct se_session *sess = container_of(ref, typeof(*sess), cmd_count);
+	struct target_cmd_counter *cmd_cnt  = container_of(ref,
+							   typeof(*cmd_cnt),
+							   refcnt);
+	wake_up(&cmd_cnt->refcnt_wq);
+}
+
+static struct target_cmd_counter *target_alloc_cmd_counter(void)
+{
+	struct target_cmd_counter *cmd_cnt;
+	int rc;
+
+	cmd_cnt = kzalloc(sizeof(*cmd_cnt), GFP_KERNEL);
+	if (!cmd_cnt)
+		return NULL;
 
-	wake_up(&sess->cmd_count_wq);
+	init_completion(&cmd_cnt->stop_done);
+	init_waitqueue_head(&cmd_cnt->refcnt_wq);
+	atomic_set(&cmd_cnt->stopped, 0);
+
+	rc = percpu_ref_init(&cmd_cnt->refcnt, target_release_cmd_refcnt, 0,
+			     GFP_KERNEL);
+	if (rc)
+		goto free_cmd_cnt;
+
+	return cmd_cnt;
+
+free_cmd_cnt:
+	kfree(cmd_cnt);
+	return NULL;
+}
+
+static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
+{
+	/*
+	 * Drivers like loop do not call target_stop_session during session
+	 * shutdown so we have to drop the ref taken at init time here.
+	 */
+	if (!atomic_read(&cmd_cnt->stopped))
+		percpu_ref_put(&cmd_cnt->refcnt);
+
+	percpu_ref_exit(&cmd_cnt->refcnt);
 }
 
 /**
@@ -238,25 +276,17 @@ int transport_init_session(struct se_session *se_sess)
 	INIT_LIST_HEAD(&se_sess->sess_list);
 	INIT_LIST_HEAD(&se_sess->sess_acl_list);
 	spin_lock_init(&se_sess->sess_cmd_lock);
-	init_waitqueue_head(&se_sess->cmd_count_wq);
-	init_completion(&se_sess->stop_done);
-	atomic_set(&se_sess->stopped, 0);
-	return percpu_ref_init(&se_sess->cmd_count,
-			       target_release_sess_cmd_refcnt, 0, GFP_KERNEL);
+	se_sess->cmd_cnt = target_alloc_cmd_counter();
+	if (!se_sess->cmd_cnt)
+		return -ENOMEM;
+
+	return  0;
 }
 EXPORT_SYMBOL(transport_init_session);
 
 void transport_uninit_session(struct se_session *se_sess)
 {
-	/*
-	 * Drivers like iscsi and loop do not call target_stop_session
-	 * during session shutdown so we have to drop the ref taken at init
-	 * time here.
-	 */
-	if (!atomic_read(&se_sess->stopped))
-		percpu_ref_put(&se_sess->cmd_count);
-
-	percpu_ref_exit(&se_sess->cmd_count);
+	target_free_cmd_counter(se_sess->cmd_cnt);
 }
 
 /**
@@ -2970,9 +3000,16 @@ int target_get_sess_cmd(struct se_cmd *se_cmd, bool ack_kref)
 		se_cmd->se_cmd_flags |= SCF_ACK_KREF;
 	}
 
-	if (!percpu_ref_tryget_live(&se_sess->cmd_count))
-		ret = -ESHUTDOWN;
-
+	/*
+	 * Users like xcopy do not use counters since they never do a stop
+	 * and wait.
+	 */
+	if (se_sess->cmd_cnt) {
+		if (!percpu_ref_tryget_live(&se_sess->cmd_cnt->refcnt))
+			ret = -ESHUTDOWN;
+		else
+			se_cmd->cmd_cnt = se_sess->cmd_cnt;
+	}
 	if (ret && ack_kref)
 		target_put_sess_cmd(se_cmd);
 
@@ -2993,7 +3030,7 @@ static void target_free_cmd_mem(struct se_cmd *cmd)
 static void target_release_cmd_kref(struct kref *kref)
 {
 	struct se_cmd *se_cmd = container_of(kref, struct se_cmd, cmd_kref);
-	struct se_session *se_sess = se_cmd->se_sess;
+	struct target_cmd_counter *cmd_cnt = se_cmd->cmd_cnt;
 	struct completion *free_compl = se_cmd->free_compl;
 	struct completion *abrt_compl = se_cmd->abrt_compl;
 
@@ -3004,7 +3041,8 @@ static void target_release_cmd_kref(struct kref *kref)
 	if (abrt_compl)
 		complete(abrt_compl);
 
-	percpu_ref_put(&se_sess->cmd_count);
+	if (cmd_cnt)
+		percpu_ref_put(&cmd_cnt->refcnt);
 }
 
 /**
@@ -3123,46 +3161,65 @@ void target_show_cmd(const char *pfx, struct se_cmd *cmd)
 }
 EXPORT_SYMBOL(target_show_cmd);
 
-static void target_stop_session_confirm(struct percpu_ref *ref)
+static void target_stop_cmd_counter_confirm(struct percpu_ref *ref)
+{
+	struct target_cmd_counter *cmd_cnt = container_of(ref,
+						struct target_cmd_counter,
+						refcnt);
+	complete_all(&cmd_cnt->stop_done);
+}
+
+/**
+ * target_stop_cmd_counter - Stop new IO from being added to the counter.
+ * @cmd_cnt: counter to stop
+ */
+static void target_stop_cmd_counter(struct target_cmd_counter *cmd_cnt)
 {
-	struct se_session *se_sess = container_of(ref, struct se_session,
-						  cmd_count);
-	complete_all(&se_sess->stop_done);
+	pr_debug("Stopping command counter.\n");
+	if (!atomic_cmpxchg(&cmd_cnt->stopped, 0, 1))
+		percpu_ref_kill_and_confirm(&cmd_cnt->refcnt,
+					    target_stop_cmd_counter_confirm);
 }
 
 /**
  * target_stop_session - Stop new IO from being queued on the session.
- * @se_sess:    session to stop
+ * @se_sess: session to stop
  */
 void target_stop_session(struct se_session *se_sess)
 {
-	pr_debug("Stopping session queue.\n");
-	if (atomic_cmpxchg(&se_sess->stopped, 0, 1) == 0)
-		percpu_ref_kill_and_confirm(&se_sess->cmd_count,
-					    target_stop_session_confirm);
+	target_stop_cmd_counter(se_sess->cmd_cnt);
 }
 EXPORT_SYMBOL(target_stop_session);
 
 /**
- * target_wait_for_sess_cmds - Wait for outstanding commands
- * @se_sess:    session to wait for active I/O
+ * target_wait_for_cmds - Wait for outstanding cmds.
+ * @cmd_cnt: counter to wait for active I/O for.
  */
-void target_wait_for_sess_cmds(struct se_session *se_sess)
+static void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt)
 {
 	int ret;
 
-	WARN_ON_ONCE(!atomic_read(&se_sess->stopped));
+	WARN_ON_ONCE(!atomic_read(&cmd_cnt->stopped));
 
 	do {
 		pr_debug("Waiting for running cmds to complete.\n");
-		ret = wait_event_timeout(se_sess->cmd_count_wq,
-				percpu_ref_is_zero(&se_sess->cmd_count),
-				180 * HZ);
+		ret = wait_event_timeout(cmd_cnt->refcnt_wq,
+					 percpu_ref_is_zero(&cmd_cnt->refcnt),
+					 180 * HZ);
 	} while (ret <= 0);
 
-	wait_for_completion(&se_sess->stop_done);
+	wait_for_completion(&cmd_cnt->stop_done);
 	pr_debug("Waiting for cmds done.\n");
 }
+
+/**
+ * target_wait_for_sess_cmds - Wait for outstanding commands
+ * @se_sess: session to wait for active I/O
+ */
+void target_wait_for_sess_cmds(struct se_session *se_sess)
+{
+	target_wait_for_cmds(se_sess->cmd_cnt);
+}
 EXPORT_SYMBOL(target_wait_for_sess_cmds);
 
 /*
diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h
index 94d06ddfd80a..229118156a1f 100644
--- a/include/target/iscsi/iscsi_target_core.h
+++ b/include/target/iscsi/iscsi_target_core.h
@@ -600,6 +600,7 @@ struct iscsit_conn {
 	struct iscsi_tpg_np	*tpg_np;
 	/* Pointer to parent session */
 	struct iscsit_session	*sess;
+	struct target_cmd_counter *cmd_cnt;
 	int			bitmap_id;
 	int			rx_thread_active;
 	struct task_struct	*rx_thread;
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 12c9ba16217e..bd299790e99c 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -494,6 +494,7 @@ struct se_cmd {
 	struct se_lun		*se_lun;
 	/* Only used for internal passthrough and legacy TCM fabric modules */
 	struct se_session	*se_sess;
+	struct target_cmd_counter *cmd_cnt;
 	struct se_tmr_req	*se_tmr_req;
 	struct llist_node	se_cmd_list;
 	struct completion	*free_compl;
@@ -619,22 +620,26 @@ static inline struct se_node_acl *fabric_stat_to_nacl(struct config_item *item)
 			acl_fabric_stat_group);
 }
 
-struct se_session {
+struct target_cmd_counter {
+	struct percpu_ref	refcnt;
+	wait_queue_head_t	refcnt_wq;
+	struct completion	stop_done;
 	atomic_t		stopped;
+};
+
+struct se_session {
 	u64			sess_bin_isid;
 	enum target_prot_op	sup_prot_ops;
 	enum target_prot_type	sess_prot_type;
 	struct se_node_acl	*se_node_acl;
 	struct se_portal_group *se_tpg;
 	void			*fabric_sess_ptr;
-	struct percpu_ref	cmd_count;
 	struct list_head	sess_list;
 	struct list_head	sess_acl_list;
 	spinlock_t		sess_cmd_lock;
-	wait_queue_head_t	cmd_count_wq;
-	struct completion	stop_done;
 	void			*sess_cmd_map;
 	struct sbitmap_queue	sess_tag_pool;
+	struct target_cmd_counter *cmd_cnt;
 };
 
 struct se_device;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 02/13] scsi: target: Move cmd counter allocation
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
  2023-01-12  3:08 ` [PATCH v2 01/13] scsi: target: Move sess cmd counter to new struct Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 03/13] scsi: target: Pass in cmd counter to use during cmd setup Mike Christie
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

iSCSI needs to allocate its cmd counter per connection for MCS support
where we need to stop and wait on commands running on a connection instead
of per session. This moves the cmd counter allocation to
target_setup_session which is used by drivers that need the stop+wait
behavior per session.

xcopy doesn't need stop+wait at all, so we will be ok moving the cmd
counter allocation outside of transport_init_session.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/iscsi/iscsi_target_login.c | 10 +++++
 drivers/target/target_core_internal.h     |  1 -
 drivers/target/target_core_transport.c    | 55 +++++++++++------------
 drivers/target/target_core_xcopy.c        | 15 +------
 include/target/target_core_fabric.h       |  4 +-
 5 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 27e448c2d066..8ab6c0107d89 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -324,8 +324,18 @@ static int iscsi_login_zero_tsih_s1(
 		goto free_ops;
 	}
 
+	/*
+	 * This is temp for iser. It will be moved to per conn in later
+	 * patches for iscsi.
+	 */
+	sess->se_sess->cmd_cnt = target_alloc_cmd_counter();
+	if (!sess->se_sess->cmd_cnt)
+		goto free_se_sess;
+
 	return 0;
 
+free_se_sess:
+	transport_free_session(sess->se_sess);
 free_ops:
 	kfree(sess->sess_ops);
 free_id:
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 38a6d08f75b3..85e35cf582e5 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -138,7 +138,6 @@ int	init_se_kmem_caches(void);
 void	release_se_kmem_caches(void);
 u32	scsi_get_new_index(scsi_index_t);
 void	transport_subsystem_check_init(void);
-void	transport_uninit_session(struct se_session *);
 unsigned char *transport_dump_cmd_direction(struct se_cmd *);
 void	transport_dump_dev_state(struct se_device *, char *, int *);
 void	transport_dump_dev_info(struct se_device *, struct se_lun *,
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 3d6034f00dcd..60647a49a1d3 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -228,7 +228,7 @@ static void target_release_cmd_refcnt(struct percpu_ref *ref)
 	wake_up(&cmd_cnt->refcnt_wq);
 }
 
-static struct target_cmd_counter *target_alloc_cmd_counter(void)
+struct target_cmd_counter *target_alloc_cmd_counter(void)
 {
 	struct target_cmd_counter *cmd_cnt;
 	int rc;
@@ -252,6 +252,7 @@ static struct target_cmd_counter *target_alloc_cmd_counter(void)
 	kfree(cmd_cnt);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(target_alloc_cmd_counter);
 
 static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
 {
@@ -271,24 +272,14 @@ static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
  *
  * The caller must have zero-initialized @se_sess before calling this function.
  */
-int transport_init_session(struct se_session *se_sess)
+void transport_init_session(struct se_session *se_sess)
 {
 	INIT_LIST_HEAD(&se_sess->sess_list);
 	INIT_LIST_HEAD(&se_sess->sess_acl_list);
 	spin_lock_init(&se_sess->sess_cmd_lock);
-	se_sess->cmd_cnt = target_alloc_cmd_counter();
-	if (!se_sess->cmd_cnt)
-		return -ENOMEM;
-
-	return  0;
 }
 EXPORT_SYMBOL(transport_init_session);
 
-void transport_uninit_session(struct se_session *se_sess)
-{
-	target_free_cmd_counter(se_sess->cmd_cnt);
-}
-
 /**
  * transport_alloc_session - allocate a session object and initialize it
  * @sup_prot_ops: bitmask that defines which T10-PI modes are supported.
@@ -296,7 +287,6 @@ void transport_uninit_session(struct se_session *se_sess)
 struct se_session *transport_alloc_session(enum target_prot_op sup_prot_ops)
 {
 	struct se_session *se_sess;
-	int ret;
 
 	se_sess = kmem_cache_zalloc(se_sess_cache, GFP_KERNEL);
 	if (!se_sess) {
@@ -304,11 +294,7 @@ struct se_session *transport_alloc_session(enum target_prot_op sup_prot_ops)
 				" se_sess_cache\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	ret = transport_init_session(se_sess);
-	if (ret < 0) {
-		kmem_cache_free(se_sess_cache, se_sess);
-		return ERR_PTR(ret);
-	}
+	transport_init_session(se_sess);
 	se_sess->sup_prot_ops = sup_prot_ops;
 
 	return se_sess;
@@ -474,8 +460,13 @@ target_setup_session(struct se_portal_group *tpg,
 		     int (*callback)(struct se_portal_group *,
 				     struct se_session *, void *))
 {
+	struct target_cmd_counter *cmd_cnt;
 	struct se_session *sess;
+	int rc;
 
+	cmd_cnt = target_alloc_cmd_counter();
+	if (!cmd_cnt)
+		return ERR_PTR(-ENOMEM);
 	/*
 	 * If the fabric driver is using percpu-ida based pre allocation
 	 * of I/O descriptor tags, go ahead and perform that setup now..
@@ -485,29 +476,36 @@ target_setup_session(struct se_portal_group *tpg,
 	else
 		sess = transport_alloc_session(prot_op);
 
-	if (IS_ERR(sess))
-		return sess;
+	if (IS_ERR(sess)) {
+		rc = PTR_ERR(sess);
+		goto free_cnt;
+	}
+	sess->cmd_cnt = cmd_cnt;
 
 	sess->se_node_acl = core_tpg_check_initiator_node_acl(tpg,
 					(unsigned char *)initiatorname);
 	if (!sess->se_node_acl) {
-		transport_free_session(sess);
-		return ERR_PTR(-EACCES);
+		rc = -EACCES;
+		goto free_sess;
 	}
 	/*
 	 * Go ahead and perform any remaining fabric setup that is
 	 * required before transport_register_session().
 	 */
 	if (callback != NULL) {
-		int rc = callback(tpg, sess, private);
-		if (rc) {
-			transport_free_session(sess);
-			return ERR_PTR(rc);
-		}
+		rc = callback(tpg, sess, private);
+		if (rc)
+			goto free_sess;
 	}
 
 	transport_register_session(tpg, sess->se_node_acl, sess, private);
 	return sess;
+
+free_sess:
+	transport_free_session(sess);
+free_cnt:
+	target_free_cmd_counter(cmd_cnt);
+	return ERR_PTR(rc);
 }
 EXPORT_SYMBOL(target_setup_session);
 
@@ -632,7 +630,8 @@ void transport_free_session(struct se_session *se_sess)
 		sbitmap_queue_free(&se_sess->sess_tag_pool);
 		kvfree(se_sess->sess_cmd_map);
 	}
-	transport_uninit_session(se_sess);
+	if (se_sess->cmd_cnt)
+		target_free_cmd_counter(se_sess->cmd_cnt);
 	kmem_cache_free(se_sess_cache, se_sess);
 }
 EXPORT_SYMBOL(transport_free_session);
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c
index 49eaee022ef1..49a83500c8b7 100644
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -461,8 +461,6 @@ static const struct target_core_fabric_ops xcopy_pt_tfo = {
 
 int target_xcopy_setup_pt(void)
 {
-	int ret;
-
 	xcopy_wq = alloc_workqueue("xcopy_wq", WQ_MEM_RECLAIM, 0);
 	if (!xcopy_wq) {
 		pr_err("Unable to allocate xcopy_wq\n");
@@ -479,9 +477,7 @@ int target_xcopy_setup_pt(void)
 	INIT_LIST_HEAD(&xcopy_pt_nacl.acl_list);
 	INIT_LIST_HEAD(&xcopy_pt_nacl.acl_sess_list);
 	memset(&xcopy_pt_sess, 0, sizeof(struct se_session));
-	ret = transport_init_session(&xcopy_pt_sess);
-	if (ret < 0)
-		goto destroy_wq;
+	transport_init_session(&xcopy_pt_sess);
 
 	xcopy_pt_nacl.se_tpg = &xcopy_pt_tpg;
 	xcopy_pt_nacl.nacl_sess = &xcopy_pt_sess;
@@ -490,19 +486,12 @@ int target_xcopy_setup_pt(void)
 	xcopy_pt_sess.se_node_acl = &xcopy_pt_nacl;
 
 	return 0;
-
-destroy_wq:
-	destroy_workqueue(xcopy_wq);
-	xcopy_wq = NULL;
-	return ret;
 }
 
 void target_xcopy_release_pt(void)
 {
-	if (xcopy_wq) {
+	if (xcopy_wq)
 		destroy_workqueue(xcopy_wq);
-		transport_uninit_session(&xcopy_pt_sess);
-	}
 }
 
 /*
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index 38f0662476d1..65527174b8bc 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -133,7 +133,9 @@ struct se_session *target_setup_session(struct se_portal_group *,
 				struct se_session *, void *));
 void target_remove_session(struct se_session *);
 
-int transport_init_session(struct se_session *se_sess);
+struct target_cmd_counter *target_alloc_cmd_counter(void);
+
+void transport_init_session(struct se_session *se_sess);
 struct se_session *transport_alloc_session(enum target_prot_op);
 int transport_alloc_session_tags(struct se_session *, unsigned int,
 		unsigned int);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 03/13] scsi: target: Pass in cmd counter to use during cmd setup
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
  2023-01-12  3:08 ` [PATCH v2 01/13] scsi: target: Move sess cmd counter to new struct Mike Christie
  2023-01-12  3:08 ` [PATCH v2 02/13] scsi: target: Move cmd counter allocation Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 04/13] scsi: target: iscsit/isert: Alloc per conn cmd counter Mike Christie
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

This allows target_get_sess_cmd users to pass in the cmd counter they want
to use. Right now we pass in the session's cmd counter but in the next
patch iSCSI will switch from per session to per conn so this patch will be
needed for that conversion.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/iscsi/iscsi_target.c    | 10 +++++----
 drivers/target/target_core_transport.c | 28 ++++++++++++--------------
 drivers/target/target_core_xcopy.c     |  8 ++++----
 drivers/usb/gadget/function/f_tcm.c    |  4 ++--
 include/target/target_core_fabric.h    |  8 +++++---
 5 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index baf4da7bb3b4..87927a36f90d 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -1190,9 +1190,10 @@ int iscsit_setup_scsi_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd,
 	 * Initialize struct se_cmd descriptor from target_core_mod infrastructure
 	 */
 	__target_init_cmd(&cmd->se_cmd, &iscsi_ops,
-			 conn->sess->se_sess, be32_to_cpu(hdr->data_length),
-			 cmd->data_direction, sam_task_attr,
-			 cmd->sense_buffer + 2, scsilun_to_int(&hdr->lun));
+			  conn->sess->se_sess, be32_to_cpu(hdr->data_length),
+			  cmd->data_direction, sam_task_attr,
+			  cmd->sense_buffer + 2, scsilun_to_int(&hdr->lun),
+			  conn->sess->se_sess->cmd_cnt);
 
 	pr_debug("Got SCSI Command, ITT: 0x%08x, CmdSN: 0x%08x,"
 		" ExpXferLen: %u, Length: %u, CID: %hu\n", hdr->itt,
@@ -2055,7 +2056,8 @@ iscsit_handle_task_mgt_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd,
 	__target_init_cmd(&cmd->se_cmd, &iscsi_ops,
 			  conn->sess->se_sess, 0, DMA_NONE,
 			  TCM_SIMPLE_TAG, cmd->sense_buffer + 2,
-			  scsilun_to_int(&hdr->lun));
+			  scsilun_to_int(&hdr->lun),
+			  conn->sess->se_sess->cmd_cnt);
 
 	target_get_sess_cmd(&cmd->se_cmd, true);
 
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 60647a49a1d3..c395606ab1a9 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1441,14 +1441,12 @@ target_cmd_size_check(struct se_cmd *cmd, unsigned int size)
  *
  * Preserves the value of @cmd->tag.
  */
-void __target_init_cmd(
-	struct se_cmd *cmd,
-	const struct target_core_fabric_ops *tfo,
-	struct se_session *se_sess,
-	u32 data_length,
-	int data_direction,
-	int task_attr,
-	unsigned char *sense_buffer, u64 unpacked_lun)
+void __target_init_cmd(struct se_cmd *cmd,
+		       const struct target_core_fabric_ops *tfo,
+		       struct se_session *se_sess, u32 data_length,
+		       int data_direction, int task_attr,
+		       unsigned char *sense_buffer, u64 unpacked_lun,
+		       struct target_cmd_counter *cmd_cnt)
 {
 	INIT_LIST_HEAD(&cmd->se_delayed_node);
 	INIT_LIST_HEAD(&cmd->se_qf_node);
@@ -1468,6 +1466,7 @@ void __target_init_cmd(
 	cmd->sam_task_attr = task_attr;
 	cmd->sense_buffer = sense_buffer;
 	cmd->orig_fe_lun = unpacked_lun;
+	cmd->cmd_cnt = cmd_cnt;
 
 	if (!(cmd->se_cmd_flags & SCF_USE_CPUID))
 		cmd->cpuid = raw_smp_processor_id();
@@ -1687,7 +1686,8 @@ int target_init_cmd(struct se_cmd *se_cmd, struct se_session *se_sess,
 	 * target_core_fabric_ops->queue_status() callback
 	 */
 	__target_init_cmd(se_cmd, se_tpg->se_tpg_tfo, se_sess, data_length,
-			  data_dir, task_attr, sense, unpacked_lun);
+			  data_dir, task_attr, sense, unpacked_lun,
+			  se_sess->cmd_cnt);
 
 	/*
 	 * Obtain struct se_cmd->cmd_kref reference. A second kref_get here is
@@ -1982,7 +1982,8 @@ int target_submit_tmr(struct se_cmd *se_cmd, struct se_session *se_sess,
 	BUG_ON(!se_tpg);
 
 	__target_init_cmd(se_cmd, se_tpg->se_tpg_tfo, se_sess,
-			  0, DMA_NONE, TCM_SIMPLE_TAG, sense, unpacked_lun);
+			  0, DMA_NONE, TCM_SIMPLE_TAG, sense, unpacked_lun,
+			  se_sess->cmd_cnt);
 	/*
 	 * FIXME: Currently expect caller to handle se_cmd->se_tmr_req
 	 * allocation failure.
@@ -2986,7 +2987,6 @@ EXPORT_SYMBOL(transport_generic_free_cmd);
  */
 int target_get_sess_cmd(struct se_cmd *se_cmd, bool ack_kref)
 {
-	struct se_session *se_sess = se_cmd->se_sess;
 	int ret = 0;
 
 	/*
@@ -3003,11 +3003,9 @@ int target_get_sess_cmd(struct se_cmd *se_cmd, bool ack_kref)
 	 * Users like xcopy do not use counters since they never do a stop
 	 * and wait.
 	 */
-	if (se_sess->cmd_cnt) {
-		if (!percpu_ref_tryget_live(&se_sess->cmd_cnt->refcnt))
+	if (se_cmd->cmd_cnt) {
+		if (!percpu_ref_tryget_live(&se_cmd->cmd_cnt->refcnt))
 			ret = -ESHUTDOWN;
-		else
-			se_cmd->cmd_cnt = se_sess->cmd_cnt;
 	}
 	if (ret && ack_kref)
 		target_put_sess_cmd(se_cmd);
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c
index 49a83500c8b7..91ed015b588c 100644
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -591,8 +591,8 @@ static int target_xcopy_read_source(
 		(unsigned long long)src_lba, transfer_length_block, src_bytes);
 
 	__target_init_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, src_bytes,
-			  DMA_FROM_DEVICE, 0, &xpt_cmd.sense_buffer[0], 0);
-
+			  DMA_FROM_DEVICE, 0, &xpt_cmd.sense_buffer[0], 0,
+			  NULL);
 	rc = target_xcopy_setup_pt_cmd(&xpt_cmd, xop, src_dev, &cdb[0],
 				remote_port);
 	if (rc < 0) {
@@ -636,8 +636,8 @@ static int target_xcopy_write_destination(
 		(unsigned long long)dst_lba, transfer_length_block, dst_bytes);
 
 	__target_init_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, dst_bytes,
-			  DMA_TO_DEVICE, 0, &xpt_cmd.sense_buffer[0], 0);
-
+			  DMA_TO_DEVICE, 0, &xpt_cmd.sense_buffer[0], 0,
+			  NULL);
 	rc = target_xcopy_setup_pt_cmd(&xpt_cmd, xop, dst_dev, &cdb[0],
 				remote_port);
 	if (rc < 0) {
diff --git a/drivers/usb/gadget/function/f_tcm.c b/drivers/usb/gadget/function/f_tcm.c
index 658e2e21fdd0..c21acebe8aae 100644
--- a/drivers/usb/gadget/function/f_tcm.c
+++ b/drivers/usb/gadget/function/f_tcm.c
@@ -1054,7 +1054,7 @@ static void usbg_cmd_work(struct work_struct *work)
 				  tv_nexus->tvn_se_sess->se_tpg->se_tpg_tfo,
 				  tv_nexus->tvn_se_sess, cmd->data_len, DMA_NONE,
 				  cmd->prio_attr, cmd->sense_iu.sense,
-				  cmd->unpacked_lun);
+				  cmd->unpacked_lun, NULL);
 		goto out;
 	}
 
@@ -1183,7 +1183,7 @@ static void bot_cmd_work(struct work_struct *work)
 				  tv_nexus->tvn_se_sess->se_tpg->se_tpg_tfo,
 				  tv_nexus->tvn_se_sess, cmd->data_len, DMA_NONE,
 				  cmd->prio_attr, cmd->sense_iu.sense,
-				  cmd->unpacked_lun);
+				  cmd->unpacked_lun, NULL);
 		goto out;
 	}
 
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index 65527174b8bc..d507e7885f17 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -151,9 +151,11 @@ void	transport_deregister_session_configfs(struct se_session *);
 void	transport_deregister_session(struct se_session *);
 
 
-void	__target_init_cmd(struct se_cmd *,
-		const struct target_core_fabric_ops *,
-		struct se_session *, u32, int, int, unsigned char *, u64);
+void	__target_init_cmd(struct se_cmd *cmd,
+		const struct target_core_fabric_ops *tfo,
+		struct se_session *sess, u32 data_length, int data_direction,
+		int task_attr, unsigned char *sense_buffer, u64 unpacked_lun,
+		struct target_cmd_counter *cmd_cnt);
 int	target_init_cmd(struct se_cmd *se_cmd, struct se_session *se_sess,
 		unsigned char *sense, u64 unpacked_lun, u32 data_length,
 		int task_attr, int data_dir, int flags);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 04/13] scsi: target: iscsit/isert: Alloc per conn cmd counter
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (2 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 03/13] scsi: target: Pass in cmd counter to use during cmd setup Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 05/13] scsi: target: iscsit: stop/wait on cmds during conn close Mike Christie
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

This has iscsit allocate a per conn cmd counter and converts iscsit/isert
to use it instead of the per session one.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/infiniband/ulp/isert/ib_isert.c   |  4 ++--
 drivers/target/iscsi/iscsi_target.c       |  4 ++--
 drivers/target/iscsi/iscsi_target_login.c | 17 +++++++----------
 drivers/target/target_core_transport.c    |  9 ++++++---
 include/target/target_core_fabric.h       |  3 +++
 5 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 75404885cf98..f290cd49698e 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -2506,8 +2506,8 @@ isert_wait4cmds(struct iscsit_conn *conn)
 	isert_info("iscsit_conn %p\n", conn);
 
 	if (conn->sess) {
-		target_stop_session(conn->sess->se_sess);
-		target_wait_for_sess_cmds(conn->sess->se_sess);
+		target_stop_cmd_counter(conn->cmd_cnt);
+		target_wait_for_cmds(conn->cmd_cnt);
 	}
 }
 
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 87927a36f90d..11115c207844 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -1193,7 +1193,7 @@ int iscsit_setup_scsi_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd,
 			  conn->sess->se_sess, be32_to_cpu(hdr->data_length),
 			  cmd->data_direction, sam_task_attr,
 			  cmd->sense_buffer + 2, scsilun_to_int(&hdr->lun),
-			  conn->sess->se_sess->cmd_cnt);
+			  conn->cmd_cnt);
 
 	pr_debug("Got SCSI Command, ITT: 0x%08x, CmdSN: 0x%08x,"
 		" ExpXferLen: %u, Length: %u, CID: %hu\n", hdr->itt,
@@ -2057,7 +2057,7 @@ iscsit_handle_task_mgt_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd,
 			  conn->sess->se_sess, 0, DMA_NONE,
 			  TCM_SIMPLE_TAG, cmd->sense_buffer + 2,
 			  scsilun_to_int(&hdr->lun),
-			  conn->sess->se_sess->cmd_cnt);
+			  conn->cmd_cnt);
 
 	target_get_sess_cmd(&cmd->se_cmd, true);
 
diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 8ab6c0107d89..274bdd7845ca 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -324,18 +324,8 @@ static int iscsi_login_zero_tsih_s1(
 		goto free_ops;
 	}
 
-	/*
-	 * This is temp for iser. It will be moved to per conn in later
-	 * patches for iscsi.
-	 */
-	sess->se_sess->cmd_cnt = target_alloc_cmd_counter();
-	if (!sess->se_sess->cmd_cnt)
-		goto free_se_sess;
-
 	return 0;
 
-free_se_sess:
-	transport_free_session(sess->se_sess);
 free_ops:
 	kfree(sess->sess_ops);
 free_id:
@@ -1157,8 +1147,14 @@ static struct iscsit_conn *iscsit_alloc_conn(struct iscsi_np *np)
 		goto free_conn_cpumask;
 	}
 
+	conn->cmd_cnt = target_alloc_cmd_counter();
+	if (!conn->cmd_cnt)
+		goto free_conn_allowed_cpumask;
+
 	return conn;
 
+free_conn_allowed_cpumask:
+	free_cpumask_var(conn->allowed_cpumask);
 free_conn_cpumask:
 	free_cpumask_var(conn->conn_cpumask);
 free_conn_ops:
@@ -1172,6 +1168,7 @@ static struct iscsit_conn *iscsit_alloc_conn(struct iscsi_np *np)
 
 void iscsit_free_conn(struct iscsit_conn *conn)
 {
+	target_free_cmd_counter(conn->cmd_cnt);
 	free_cpumask_var(conn->allowed_cpumask);
 	free_cpumask_var(conn->conn_cpumask);
 	kfree(conn->conn_ops);
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index c395606ab1a9..86adff2a86ed 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -254,7 +254,7 @@ struct target_cmd_counter *target_alloc_cmd_counter(void)
 }
 EXPORT_SYMBOL_GPL(target_alloc_cmd_counter);
 
-static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
+void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
 {
 	/*
 	 * Drivers like loop do not call target_stop_session during session
@@ -265,6 +265,7 @@ static void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt)
 
 	percpu_ref_exit(&cmd_cnt->refcnt);
 }
+EXPORT_SYMBOL_GPL(target_free_cmd_counter);
 
 /**
  * transport_init_session - initialize a session object
@@ -3170,13 +3171,14 @@ static void target_stop_cmd_counter_confirm(struct percpu_ref *ref)
  * target_stop_cmd_counter - Stop new IO from being added to the counter.
  * @cmd_cnt: counter to stop
  */
-static void target_stop_cmd_counter(struct target_cmd_counter *cmd_cnt)
+void target_stop_cmd_counter(struct target_cmd_counter *cmd_cnt)
 {
 	pr_debug("Stopping command counter.\n");
 	if (!atomic_cmpxchg(&cmd_cnt->stopped, 0, 1))
 		percpu_ref_kill_and_confirm(&cmd_cnt->refcnt,
 					    target_stop_cmd_counter_confirm);
 }
+EXPORT_SYMBOL_GPL(target_stop_cmd_counter);
 
 /**
  * target_stop_session - Stop new IO from being queued on the session.
@@ -3192,7 +3194,7 @@ EXPORT_SYMBOL(target_stop_session);
  * target_wait_for_cmds - Wait for outstanding cmds.
  * @cmd_cnt: counter to wait for active I/O for.
  */
-static void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt)
+void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt)
 {
 	int ret;
 
@@ -3208,6 +3210,7 @@ static void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt)
 	wait_for_completion(&cmd_cnt->stop_done);
 	pr_debug("Waiting for cmds done.\n");
 }
+EXPORT_SYMBOL_GPL(target_wait_for_cmds);
 
 /**
  * target_wait_for_sess_cmds - Wait for outstanding commands
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index d507e7885f17..b188b1e90e1e 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -133,7 +133,10 @@ struct se_session *target_setup_session(struct se_portal_group *,
 				struct se_session *, void *));
 void target_remove_session(struct se_session *);
 
+void target_stop_cmd_counter(struct target_cmd_counter *cmd_cnt);
+void target_wait_for_cmds(struct target_cmd_counter *cmd_cnt);
 struct target_cmd_counter *target_alloc_cmd_counter(void);
+void target_free_cmd_counter(struct target_cmd_counter *cmd_cnt);
 
 void transport_init_session(struct se_session *se_sess);
 struct se_session *transport_alloc_session(enum target_prot_op);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 05/13] scsi: target: iscsit: stop/wait on cmds during conn close
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (3 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 04/13] scsi: target: iscsit/isert: Alloc per conn cmd counter Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 06/13] scsi: target: iscsit: Fix TAS handling during conn cleanup Mike Christie
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

This fixes a bug added in:

commit f36199355c64 ("scsi: target: iscsi: Fix cmd abort fabric stop
race")

If we have multiple sessions to the same se_device we can hit a race where
a LUN_RESET on one session cleans up the se_cmds from under another
session which is being closed. This results in the closing session freeing
its conn/session structs while they are still in use.

The bug is:

1. Session1 has IO se_cmd1.
2. Session2 can also have se_cmds for IO and optionally TMRs for ABORTS
but then gets a LUN_RESET.
3. The LUN_RESET on session2 sees the se_cmds on session1 and during
the drain stages marks them all with CMD_T_ABORTED.
4. session1 is now closed so iscsit_release_commands_from_conn only sees
se_cmds with the CMD_T_ABORTED bit set and returns immediately even
though we have outstanding commands.
5. session1's connection and session are freed.
6. The backend request for se_cmd1 completes and it accesses the freed
connection/session.

This hooks the iscsit layer into the cmd counter code, so we can wait for
all outstanding se_cmds before freeing the connection.

Fixes: f36199355c64 ("scsi: target: iscsi: Fix cmd abort fabric stop race")
Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/iscsi/iscsi_target.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 11115c207844..83b007141229 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -4245,6 +4245,16 @@ static void iscsit_release_commands_from_conn(struct iscsit_conn *conn)
 		iscsit_free_cmd(cmd, true);
 
 	}
+
+	/*
+	 * Wait on commands that were cleaned up via the aborted_task path.
+	 * LLDs that implement iscsit_wait_conn will already have waited for
+	 * commands.
+	 */
+	if (!conn->conn_transport->iscsit_wait_conn) {
+		target_stop_cmd_counter(conn->cmd_cnt);
+		target_wait_for_cmds(conn->cmd_cnt);
+	}
 }
 
 static void iscsit_stop_timers_for_cmds(
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 06/13] scsi: target: iscsit: Fix TAS handling during conn cleanup
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (4 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 05/13] scsi: target: iscsit: stop/wait on cmds during conn close Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 07/13] scsi: target: Fix multiple LUN_RESET handling Mike Christie
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

This fixes a bug added in:

commit f36199355c64 ("scsi: target: iscsi: Fix cmd abort fabric stop
race")

If CMD_T_TAS is set on the se_cmd we must call iscsit_free_cmd to do the
last put on the cmd and free it, because the connection is down and we
will not up sending the response and doing the put from the normal IO
path. This patch adds a check for CMD_T_TAS in
iscsit_release_commands_from_conn so we now detect this case.

Fixes: f36199355c64 ("scsi: target: iscsi: Fix cmd abort fabric stop race")
Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/iscsi/iscsi_target.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 83b007141229..2a011afa6dff 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -4222,7 +4222,8 @@ static void iscsit_release_commands_from_conn(struct iscsit_conn *conn)
 
 		if (se_cmd->se_tfo != NULL) {
 			spin_lock_irq(&se_cmd->t_state_lock);
-			if (se_cmd->transport_state & CMD_T_ABORTED) {
+			if (se_cmd->transport_state & CMD_T_ABORTED &&
+			    !(se_cmd->transport_state & CMD_T_TAS)) {
 				/*
 				 * LIO's abort path owns the cleanup for this,
 				 * so put it back on the list and let
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 07/13] scsi: target: Fix multiple LUN_RESET handling
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (5 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 06/13] scsi: target: iscsit: Fix TAS handling during conn cleanup Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 08/13] scsi: target: drop tas arg from __transport_wait_for_tasks Mike Christie
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

This fixes a bug where an initiator thinks a LUN_RESET has cleaned
up running commands when it hasn't. The bug was added in:

commit 51ec502a3266 ("target: Delete tmr from list before processing")

The problem occurs when:

1. We have N IO cmds running in the target layer spread over 2 sessions.
2. The initiator sends a LUN_RESET for each session.
3. session1's LUN_RESET loops over all the running commands from both
sessions and moves them to its local drain_task_list.
4. session2's LUN_RESET does not see the LUN_RESET from session1 because
the commit above has it remove itself. session2 also does not see any
commands since the other reset moved them off the state lists.
5. sessions2's LUN_RESET will then complete with a successful response.
6. sessions2's inititor believes the running commands on its session are
now cleaned up due to the successful response and cleans up the running
commands from its side. It then restarts them.
7. The commands do eventually complete on the backend and the target
starts to return aborted task statuses for them. The initiator will
either throw a invalid ITT error or might accidentally lookup a new task
if the ITT has been reallocated already.

This fixes the bug by reverting the patch.

Fixes: 51ec502a3266 ("target: Delete tmr from list before processing")
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
 drivers/target/target_core_tmr.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
index bac111456fa1..ba2a2c18dae9 100644
--- a/drivers/target/target_core_tmr.c
+++ b/drivers/target/target_core_tmr.c
@@ -188,9 +188,10 @@ static void core_tmr_drain_tmr_list(
 	 * LUN_RESET tmr..
 	 */
 	spin_lock_irqsave(&dev->se_tmr_lock, flags);
-	if (tmr)
-		list_del_init(&tmr->tmr_list);
 	list_for_each_entry_safe(tmr_p, tmr_pp, &dev->dev_tmr_list, tmr_list) {
+		if (tmr_p == tmr)
+			continue;
+
 		cmd = tmr_p->task_cmd;
 		if (!cmd) {
 			pr_err("Unable to locate struct se_cmd for TMR\n");
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 08/13] scsi: target: drop tas arg from __transport_wait_for_tasks
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (6 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 07/13] scsi: target: Fix multiple LUN_RESET handling Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login Mike Christie
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

The tas arg is no longer used by callers of __transport_wait_for_tasks
so drop it.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
 drivers/target/target_core_transport.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 86adff2a86ed..cb3fdc81ba3b 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2899,15 +2899,14 @@ static void transport_write_pending_qf(struct se_cmd *cmd)
 }
 
 static bool
-__transport_wait_for_tasks(struct se_cmd *, bool, bool *, bool *,
-			   unsigned long *flags);
+__transport_wait_for_tasks(struct se_cmd *, bool, bool *, unsigned long *flags);
 
-static void target_wait_free_cmd(struct se_cmd *cmd, bool *aborted, bool *tas)
+static void target_wait_free_cmd(struct se_cmd *cmd, bool *aborted)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&cmd->t_state_lock, flags);
-	__transport_wait_for_tasks(cmd, true, aborted, tas, &flags);
+	__transport_wait_for_tasks(cmd, true, aborted, &flags);
 	spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 }
 
@@ -2952,10 +2951,10 @@ int transport_generic_free_cmd(struct se_cmd *cmd, int wait_for_tasks)
 {
 	DECLARE_COMPLETION_ONSTACK(compl);
 	int ret = 0;
-	bool aborted = false, tas = false;
+	bool aborted = false;
 
 	if (wait_for_tasks)
-		target_wait_free_cmd(cmd, &aborted, &tas);
+		target_wait_free_cmd(cmd, &aborted);
 
 	if (cmd->se_cmd_flags & SCF_SE_LUN_CMD) {
 		/*
@@ -3234,7 +3233,7 @@ void transport_clear_lun_ref(struct se_lun *lun)
 
 static bool
 __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop,
-			   bool *aborted, bool *tas, unsigned long *flags)
+			   bool *aborted, unsigned long *flags)
 	__releases(&cmd->t_state_lock)
 	__acquires(&cmd->t_state_lock)
 {
@@ -3246,9 +3245,6 @@ __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop,
 	if (cmd->transport_state & CMD_T_ABORTED)
 		*aborted = true;
 
-	if (cmd->transport_state & CMD_T_TAS)
-		*tas = true;
-
 	if (!(cmd->se_cmd_flags & SCF_SE_LUN_CMD) &&
 	    !(cmd->se_cmd_flags & SCF_SCSI_TMR_CDB))
 		return false;
@@ -3289,10 +3285,10 @@ __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop,
 bool transport_wait_for_tasks(struct se_cmd *cmd)
 {
 	unsigned long flags;
-	bool ret, aborted = false, tas = false;
+	bool ret, aborted = false;
 
 	spin_lock_irqsave(&cmd->t_state_lock, flags);
-	ret = __transport_wait_for_tasks(cmd, false, &aborted, &tas, &flags);
+	ret = __transport_wait_for_tasks(cmd, false, &aborted, &flags);
 	spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 
 	return ret;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (7 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 08/13] scsi: target: drop tas arg from __transport_wait_for_tasks Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-13 14:08   ` Dmitry Bogdanov
  2023-01-12  3:08 ` [PATCH v2 10/13] scsi: target: iscsit: Add helper to check when cmd has failed Mike Christie
                   ` (4 subsequent siblings)
  13 siblings, 1 reply; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

If we get a disconnect event while logging in we can end up in a state
where will never be able to relogin. This happens when:

1. login thread has put us into TARG_CONN_STATE_IN_LOGIN
2. isert then does

isert_disconnected_handler -> iscsit_cause_connection_reinstatement

which sets the conn connection_reinstatement flag. Nothing else happens
because we are only in IN_LOGIN. The tx/rx threads are not running yet
so we can't start recovery from those contexts at this time.

3. The login thread finishes processing the login pdu and thinks login is
done. It sets us into TARG_CONN_STATE_LOGGED_IN/TARG_SESS_STATE_LOGGED_IN.
This starts the rx/tx threads.

4. The initiator thought it disconnected the connection at 2, and has
since sent a new connect which is now handled. This leads us to eventually
run:

iscsi_check_for_session_reinstatement-> iscsit_stop_session ->
iscsit_cause_connection_reinstatement

iscsit_stop_session sees the old conn and does
iscsit_cause_connection_reinstatement which sees connection_reinstatement
is set so it just returns instead of trying to kill the tx/rx threads
which would have caused recovery to start.

5. iscsit_stop_session then waits on session_wait_comp which will never
happen since we didn't kill the tx/rx threads.

This has the iscsit login code check if a fabric driver ran
iscsit_cause_connection_reinstatement during the login process similar
to how we check for the sk state for tcp based iscsit. This will prevent
us from getting into 3 and creating a ghost session.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/iscsi/iscsi_target_nego.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
index ff49c8f3fe24..2dd81752d4c9 100644
--- a/drivers/target/iscsi/iscsi_target_nego.c
+++ b/drivers/target/iscsi/iscsi_target_nego.c
@@ -350,6 +350,16 @@ static int iscsi_target_do_tx_login_io(struct iscsit_conn *conn, struct iscsi_lo
 					    ISCSI_LOGIN_STATUS_NO_RESOURCES);
 			return -1;
 		}
+
+		/*
+		 * isert doesn't know the iscsit state and uses
+		 * iscsit_cause_connection_reinstatement as a generic error
+		 * notification system. It may call it before we are in FFP.
+		 * Handle this now in case it signaled a failure before the
+		 * rx/tx threads were up and could start recovery.
+		 */
+		if (atomic_read(&conn->connection_reinstatement))
+			goto err;
 	}
 
 	if (conn->conn_transport->iscsit_put_login_tx(conn, login,
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 10/13] scsi: target: iscsit: Add helper to check when cmd has failed
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (8 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP Mike Christie
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

This moves the check in lio_queue_status to new helper so other code can
use it to check for commands that were failed by lio core or iscsit.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/iscsi/iscsi_target_configfs.c | 3 +--
 include/target/iscsi/iscsi_target_core.h     | 5 +++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index 5d0f51822414..82c1d335c369 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -1411,9 +1411,8 @@ static int lio_queue_status(struct se_cmd *se_cmd)
 
 	cmd->i_state = ISTATE_SEND_STATUS;
 
-	if (cmd->se_cmd.scsi_status || cmd->sense_reason) {
+	if (iscsit_cmd_failed(cmd))
 		return iscsit_add_cmd_to_response_queue(cmd, conn, cmd->i_state);
-	}
 	return conn->conn_transport->iscsit_queue_status(conn, cmd);
 }
 
diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h
index 229118156a1f..938dee8b7a51 100644
--- a/include/target/iscsi/iscsi_target_core.h
+++ b/include/target/iscsi/iscsi_target_core.h
@@ -913,6 +913,11 @@ static inline u32 session_get_next_ttt(struct iscsit_session *session)
 	return ttt;
 }
 
+static inline bool iscsit_cmd_failed(struct iscsit_cmd *cmd)
+{
+	return cmd->se_cmd.scsi_status || cmd->sense_reason;
+}
+
 extern struct iscsit_cmd *iscsit_find_cmd_from_itt(struct iscsit_conn *, itt_t);
 
 extern void iscsit_thread_check_cpumask(struct iscsit_conn *conn,
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (9 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 10/13] scsi: target: iscsit: Add helper to check when cmd has failed Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-13 14:15   ` Dmitry Bogdanov
  2023-01-12  3:08 ` [PATCH v2 12/13] scsi: target: iscsit: Cleanup isert commands at conn closure Mike Christie
                   ` (2 subsequent siblings)
  13 siblings, 1 reply; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
connection is down and it can't send/recv IO (tx/rx threads are killed
or the cleanup thread is run from the one thats up). It will then loop
over running commands and wait for LIO core to complete them or clean
them up if they were on an internal queue waiting to be sent or ackd.

Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
command but for isert we need to prevent LIO core from calling into
iscsit callouts when the connection is being brought down. If LIO core
queues commands to iscsit and it ends up adding to an internal queue
instead of passing back to the driver then we can end up hanging waiting
on command completion that never occurs because it's stuck on the internal
list (the tx thread is stopped at this time, so it will never loop over
the response list and call into isert). We also want to sync up on a
point where we no longer call into isert so it can cleanup it's structs.

This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
command execution and also fixes the locking around the
target_cmd_interrupted calls so we don't have a case where a command
is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
time.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/target_core_sbc.c       |  2 +-
 drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
index 7536ca797606..56136613767f 100644
--- a/drivers/target/target_core_sbc.c
+++ b/drivers/target/target_core_sbc.c
@@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
 		 * we don't have to perform the write operation.
 		 */
 		WARN_ON(!(cmd->transport_state &
-			(CMD_T_ABORTED | CMD_T_STOP)));
+			(CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
 		goto out;
 	}
 	/*
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index cb3fdc81ba3b..02a9476945dc 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
 	 * Determine if frontend context caller is requesting the stopping of
 	 * this command for frontend exceptions.
 	 */
-	if (cmd->transport_state & CMD_T_STOP) {
-		pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
+	if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
+		pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
 			__func__, __LINE__, cmd->tag);
 
 		spin_unlock_irqrestore(&cmd->t_state_lock, flags);
@@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
 		INIT_WORK(&cmd->work, target_abort_work);
 		queue_work(target_completion_wq, &cmd->work);
 		return true;
-	} else if (cmd->transport_state & CMD_T_STOP) {
+	} else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
 		if (cmd->transport_complete_callback)
 			cmd->transport_complete_callback(cmd, false, &post_ret);
 		complete_all(&cmd->t_transport_stop_comp);
@@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
 	int success, cpu;
 	unsigned long flags;
 
-	if (target_cmd_interrupted(cmd))
+	spin_lock_irqsave(&cmd->t_state_lock, flags);
+	if (target_cmd_interrupted(cmd)) {
+		spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 		return;
+	}
 
 	cmd->scsi_status = scsi_status;
 	cmd->sense_reason = sense_reason;
 
-	spin_lock_irqsave(&cmd->t_state_lock, flags);
 	switch (cmd->scsi_status) {
 	case SAM_STAT_CHECK_CONDITION:
 		if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE)
@@ -2277,10 +2279,12 @@ void target_execute_cmd(struct se_cmd *cmd)
 	 *
 	 * If the received CDB has already been aborted stop processing it here.
 	 */
-	if (target_cmd_interrupted(cmd))
+	spin_lock_irq(&cmd->t_state_lock);
+	if (target_cmd_interrupted(cmd)) {
+		spin_unlock_irq(&cmd->t_state_lock);
 		return;
+	}
 
-	spin_lock_irq(&cmd->t_state_lock);
 	cmd->t_state = TRANSPORT_PROCESSING;
 	cmd->transport_state |= CMD_T_ACTIVE | CMD_T_SENT;
 	spin_unlock_irq(&cmd->t_state_lock);
@@ -2847,9 +2851,9 @@ transport_generic_new_cmd(struct se_cmd *cmd)
 	 * Determine if frontend context caller is requesting the stopping of
 	 * this command for frontend exceptions.
 	 */
-	if (cmd->transport_state & CMD_T_STOP &&
+	if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP) &&
 	    !cmd->se_tfo->write_pending_must_be_called) {
-		pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
+		pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOPfor ITT: 0x%08llx\n",
 			 __func__, __LINE__, cmd->tag);
 
 		spin_unlock_irqrestore(&cmd->t_state_lock, flags);
@@ -2880,11 +2884,12 @@ static void transport_write_pending_qf(struct se_cmd *cmd)
 	bool stop;
 
 	spin_lock_irqsave(&cmd->t_state_lock, flags);
-	stop = (cmd->transport_state & (CMD_T_STOP | CMD_T_ABORTED));
+	stop = (cmd->transport_state &
+		(CMD_T_STOP | CMD_T_FABRIC_STOP | CMD_T_ABORTED));
 	spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 
 	if (stop) {
-		pr_debug("%s:%d CMD_T_STOP|CMD_T_ABORTED for ITT: 0x%08llx\n",
+		pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP|CMD_T_ABORTED for ITT: 0x%08llx\n",
 			__func__, __LINE__, cmd->tag);
 		complete_all(&cmd->t_transport_stop_comp);
 		return;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 12/13] scsi: target: iscsit: Cleanup isert commands at conn closure
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (10 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-12  3:08 ` [PATCH v2 13/13] IB/isert: Fix hang in target_wait_for_cmds Mike Christie
  2023-01-12  3:13 ` [PATCH v2 00/13] target task management fixes Mike Christie
  13 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

Currently, isert does target_wait_for_cmds before iscsit calls
iscsit_release_commands_from_conn because we can race where LIO core
calls into isert when it wants to cleanup the connection. The wait
prevents isert from freeing the connection while trying to post responses
but it can result in a hang during connection closure if there are se_cmds
on the iscsit response queue, because when isert calls
target_wait_for_cmds the tx thread is stopped or we are running the wait
from it.

For example this is hit when a command times out on the initiator,  the
initiator sends an ABORT, then the connection is closed. When the command
completes it will be placed on the response queue if TAS is set, and the
ABORT response will be placed on the response queue. So at the very
least we will hang waiting on the last put on the ABORT's se_cmd which
will never happen.

This patch adds support to iscsit so it can now handle isert and iscsit
running commands during connection closure so we can have a common place
for the code.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/iscsi/iscsi_target.c      | 33 ++++++++++++++++++------
 drivers/target/iscsi/iscsi_target_util.c |  8 +++++-
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 2a011afa6dff..a57527beb340 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -4231,6 +4231,15 @@ static void iscsit_release_commands_from_conn(struct iscsit_conn *conn)
 				 */
 				list_move_tail(&cmd->i_conn_node,
 					       &conn->conn_cmd_list);
+			} else if (conn->sess->sess_ops->RDMAExtensions &&
+				   (se_cmd->transport_state & CMD_T_COMPLETE) &&
+				   !iscsit_cmd_failed(cmd)) {
+				/*
+				 * isert is still handling these cmds so wait in
+				 * target_wait_for_cmds.
+				 */
+				list_move_tail(&cmd->i_conn_node,
+					       &conn->conn_cmd_list);
 			} else {
 				se_cmd->transport_state |= CMD_T_FABRIC_STOP;
 			}
@@ -4243,19 +4252,27 @@ static void iscsit_release_commands_from_conn(struct iscsit_conn *conn)
 		list_del_init(&cmd->i_conn_node);
 
 		iscsit_increment_maxcmdsn(cmd, sess);
+		/*
+		 * Free cmds that:
+		 * 1. we have not got acks for.
+		 * 2. are (or will be when the backend completes them) stuck
+		 * on the response/immediate queue (failed cmds, TMRs, iscsi
+		 * reqs).
+		 * 3. completed ok on the backend, but hit the CMD_T_FABRIC_STOP
+		 * or CMD_T_STOP checks.
+		 */
 		iscsit_free_cmd(cmd, true);
-
 	}
 
 	/*
-	 * Wait on commands that were cleaned up via the aborted_task path.
-	 * LLDs that implement iscsit_wait_conn will already have waited for
-	 * commands.
+	 * We need to wait:
+	 * 1. for commands that are being cleaned up via the aborted_task path.
+	 * 2. for isert we need to wait for iscsit_queue_status calls
+	 * that posted a response after the ib_drain_qp call returned but
+	 * have not yet called isert_send_done.
 	 */
-	if (!conn->conn_transport->iscsit_wait_conn) {
-		target_stop_cmd_counter(conn->cmd_cnt);
-		target_wait_for_cmds(conn->cmd_cnt);
-	}
+	target_stop_cmd_counter(conn->cmd_cnt);
+	target_wait_for_cmds(conn->cmd_cnt);
 }
 
 static void iscsit_stop_timers_for_cmds(
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 26dc8ed3045b..b0d7d6c73a1c 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -753,7 +753,13 @@ void iscsit_free_cmd(struct iscsit_cmd *cmd, bool shutdown)
 	if (se_cmd) {
 		rc = transport_generic_free_cmd(se_cmd, shutdown);
 		if (!rc && shutdown && se_cmd->se_sess) {
-			__iscsit_free_cmd(cmd, shutdown);
+			struct iscsit_conn *conn = cmd->conn;
+			/*
+			 * The command wasn't aborted via ABORT_TASK but didn't
+			 * reach the driver so allow it to cleanup resources
+			 * now.
+			 */
+			conn->conn_transport->iscsit_aborted_task(conn, cmd);
 			target_put_sess_cmd(se_cmd);
 		}
 	} else {
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 13/13] IB/isert: Fix hang in target_wait_for_cmds
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (11 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 12/13] scsi: target: iscsit: Cleanup isert commands at conn closure Mike Christie
@ 2023-01-12  3:08 ` Mike Christie
  2023-01-23  9:50   ` Sagi Grimberg
  2023-01-12  3:13 ` [PATCH v2 00/13] target task management fixes Mike Christie
  13 siblings, 1 reply; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:08 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel
  Cc: Mike Christie

This removes the target_wait_for_cmds call from isert, to fix a hang that
occurs when isert's calls target_wait_for_cmds to wait on running
commands, but also ends up waiting on failed SCSI commands or TMR
responses that are on the iscsit response queue. When isert_wait_conn is
called the tx thread is down, so the response queue will not be
processed and the target_wait_for_cmds call will never wake up.

This is safe because iscsit can now handle cleaning up both iscsit and
isert commands that are running/completing and stuck on the response
queue.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/infiniband/ulp/isert/ib_isert.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index f290cd49698e..516fa37494e1 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -2500,17 +2500,6 @@ isert_wait4logout(struct isert_conn *isert_conn)
 	}
 }
 
-static void
-isert_wait4cmds(struct iscsit_conn *conn)
-{
-	isert_info("iscsit_conn %p\n", conn);
-
-	if (conn->sess) {
-		target_stop_cmd_counter(conn->cmd_cnt);
-		target_wait_for_cmds(conn->cmd_cnt);
-	}
-}
-
 /**
  * isert_put_unsol_pending_cmds() - Drop commands waiting for
  *     unsolicitate dataout
@@ -2558,7 +2547,6 @@ static void isert_wait_conn(struct iscsit_conn *conn)
 
 	ib_drain_qp(isert_conn->qp);
 	isert_put_unsol_pending_cmds(conn);
-	isert_wait4cmds(conn);
 	isert_wait4logout(isert_conn);
 
 	queue_work(isert_release_wq, &isert_conn->release_work);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 00/13] target task management fixes
  2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
                   ` (12 preceding siblings ...)
  2023-01-12  3:08 ` [PATCH v2 13/13] IB/isert: Fix hang in target_wait_for_cmds Mike Christie
@ 2023-01-12  3:13 ` Mike Christie
  2023-01-23  9:54   ` Sagi Grimberg
  13 siblings, 1 reply; 26+ messages in thread
From: Mike Christie @ 2023-01-12  3:13 UTC (permalink / raw)
  To: mlombard, martin.petersen, mgurtovoy, sagi, d.bogdanov,
	linux-scsi, target-devel

On 1/11/23 9:08 PM, Mike Christie wrote:
> The following patches apply over Martin's 6.2 and 6.3 branches or
> Linus's tree. They fix a couple regressions in iscsit that occur when
> there are multiple sessions accessing the same se_device and TMRs are
> executing and a connection is closed. And they fix some bugs in isert
> for the single session case when there are TMRs executing and the
> connection is closed.
>

Oh yeah Sagi and Max,

I didn't post the write_data_done fix. Let me know how you want it
fixed and I can post it.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login
  2023-01-12  3:08 ` [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login Mike Christie
@ 2023-01-13 14:08   ` Dmitry Bogdanov
  2023-01-18  2:14     ` Mike Christie
  0 siblings, 1 reply; 26+ messages in thread
From: Dmitry Bogdanov @ 2023-01-13 14:08 UTC (permalink / raw)
  To: Mike Christie
  Cc: mlombard, martin.petersen, mgurtovoy, sagi, linux-scsi, target-devel

On Wed, Jan 11, 2023 at 09:08:28PM -0600, Mike Christie wrote:
> 
> If we get a disconnect event while logging in we can end up in a state
> where will never be able to relogin. This happens when:
> 
> 1. login thread has put us into TARG_CONN_STATE_IN_LOGIN
> 2. isert then does
> 
> isert_disconnected_handler -> iscsit_cause_connection_reinstatement
> 
> which sets the conn connection_reinstatement flag. Nothing else happens
> because we are only in IN_LOGIN. The tx/rx threads are not running yet
> so we can't start recovery from those contexts at this time.
> 
> 3. The login thread finishes processing the login pdu and thinks login is
> done. It sets us into TARG_CONN_STATE_LOGGED_IN/TARG_SESS_STATE_LOGGED_IN.
> This starts the rx/tx threads.
> 
> 4. The initiator thought it disconnected the connection at 2, and has
> since sent a new connect which is now handled. This leads us to eventually
> run:
> 
> iscsi_check_for_session_reinstatement-> iscsit_stop_session ->
> iscsit_cause_connection_reinstatement
> 
> iscsit_stop_session sees the old conn and does
> iscsit_cause_connection_reinstatement which sees connection_reinstatement
> is set so it just returns instead of trying to kill the tx/rx threads
> which would have caused recovery to start.
> 
> 5. iscsit_stop_session then waits on session_wait_comp which will never
> happen since we didn't kill the tx/rx threads.
> 
> This has the iscsit login code check if a fabric driver ran
> iscsit_cause_connection_reinstatement during the login process similar
> to how we check for the sk state for tcp based iscsit. This will prevent
> us from getting into 3 and creating a ghost session.
> 
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/target/iscsi/iscsi_target_nego.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
> index ff49c8f3fe24..2dd81752d4c9 100644
> --- a/drivers/target/iscsi/iscsi_target_nego.c
> +++ b/drivers/target/iscsi/iscsi_target_nego.c
> @@ -350,6 +350,16 @@ static int iscsi_target_do_tx_login_io(struct iscsit_conn *conn, struct iscsi_lo
>                                             ISCSI_LOGIN_STATUS_NO_RESOURCES);
>                         return -1;
>                 }
> +
> +               /*
> +                * isert doesn't know the iscsit state and uses
> +                * iscsit_cause_connection_reinstatement as a generic error
> +                * notification system. It may call it before we are in FFP.
> +                * Handle this now in case it signaled a failure before the
> +                * rx/tx threads were up and could start recovery.
> +                */
> +               if (atomic_read(&conn->connection_reinstatement))
> +                       goto err;

Why only for login->login_complete case? In other case the session will
not hang? Will it be droppped on login timeout or something else?

May be the root cause is point 2 itself - calling iscsit_cause_connection_reinstatement
in not ISER_CONN_FULL_FEATURE state where there are no TX_RX threads?
I mean that was a misuse of iscsit_cause_connection_reinstatement?

>         }
> 
>         if (conn->conn_transport->iscsit_put_login_tx(conn, login,
> --
> 2.31.1
> 
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP
  2023-01-12  3:08 ` [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP Mike Christie
@ 2023-01-13 14:15   ` Dmitry Bogdanov
  2023-01-17 11:52     ` Dmitry Bogdanov
  0 siblings, 1 reply; 26+ messages in thread
From: Dmitry Bogdanov @ 2023-01-13 14:15 UTC (permalink / raw)
  To: Mike Christie
  Cc: mlombard, martin.petersen, mgurtovoy, sagi, linux-scsi, target-devel

On Wed, Jan 11, 2023 at 09:08:30PM -0600, Mike Christie wrote:
> 
> iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
> connection is down and it can't send/recv IO (tx/rx threads are killed
> or the cleanup thread is run from the one thats up). It will then loop
> over running commands and wait for LIO core to complete them or clean
> them up if they were on an internal queue waiting to be sent or ackd.
> 
> Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
> command but for isert we need to prevent LIO core from calling into
> iscsit callouts when the connection is being brought down. If LIO core
> queues commands to iscsit and it ends up adding to an internal queue
> instead of passing back to the driver then we can end up hanging waiting
> on command completion that never occurs because it's stuck on the internal
> list (the tx thread is stopped at this time, so it will never loop over
> the response list and call into isert). We also want to sync up on a
> point where we no longer call into isert so it can cleanup it's structs.
> 
> This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
> command execution and also fixes the locking around the
> target_cmd_interrupted calls so we don't have a case where a command
> is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
> time.
> 
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/target/target_core_sbc.c       |  2 +-
>  drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
>  2 files changed, 17 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
> index 7536ca797606..56136613767f 100644
> --- a/drivers/target/target_core_sbc.c
> +++ b/drivers/target/target_core_sbc.c
> @@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
>                  * we don't have to perform the write operation.
>                  */
>                 WARN_ON(!(cmd->transport_state &
> -                       (CMD_T_ABORTED | CMD_T_STOP)));
> +                       (CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
>                 goto out;
>         }
>         /*
> diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
> index cb3fdc81ba3b..02a9476945dc 100644
> --- a/drivers/target/target_core_transport.c
> +++ b/drivers/target/target_core_transport.c
> @@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
>          * Determine if frontend context caller is requesting the stopping of
>          * this command for frontend exceptions.
>          */
> -       if (cmd->transport_state & CMD_T_STOP) {
> -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
> +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
>                         __func__, __LINE__, cmd->tag);
> 
>                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> @@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
>                 INIT_WORK(&cmd->work, target_abort_work);
>                 queue_work(target_completion_wq, &cmd->work);
>                 return true;
> -       } else if (cmd->transport_state & CMD_T_STOP) {
> +       } else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
>                 if (cmd->transport_complete_callback)
>                         cmd->transport_complete_callback(cmd, false, &post_ret);
>                 complete_all(&cmd->t_transport_stop_comp);
> @@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
>         int success, cpu;
>         unsigned long flags;
> 
> -       if (target_cmd_interrupted(cmd))
> +       spin_lock_irqsave(&cmd->t_state_lock, flags);

That leads to a hardlock because
target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
cmd->t_state_lock.

> +       if (target_cmd_interrupted(cmd)) {
> +               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
>                 return;
> +       }
> 
>         cmd->scsi_status = scsi_status;
>         cmd->sense_reason = sense_reason;
> 
> -       spin_lock_irqsave(&cmd->t_state_lock, flags);
>         switch (cmd->scsi_status) {
>         case SAM_STAT_CHECK_CONDITION:
>                 if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE)
> @@ -2277,10 +2279,12 @@ void target_execute_cmd(struct se_cmd *cmd)
>          *
>          * If the received CDB has already been aborted stop processing it here.
>          */
> -       if (target_cmd_interrupted(cmd))
> +       spin_lock_irq(&cmd->t_state_lock);
> +       if (target_cmd_interrupted(cmd)) {
> +               spin_unlock_irq(&cmd->t_state_lock);
>                 return;
> +       }
> 
> -       spin_lock_irq(&cmd->t_state_lock);
>         cmd->t_state = TRANSPORT_PROCESSING;
>         cmd->transport_state |= CMD_T_ACTIVE | CMD_T_SENT;
>         spin_unlock_irq(&cmd->t_state_lock);
> @@ -2847,9 +2851,9 @@ transport_generic_new_cmd(struct se_cmd *cmd)
>          * Determine if frontend context caller is requesting the stopping of
>          * this command for frontend exceptions.
>          */
> -       if (cmd->transport_state & CMD_T_STOP &&
> +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP) &&
>             !cmd->se_tfo->write_pending_must_be_called) {
> -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOPfor ITT: 0x%08llx\n",
>                          __func__, __LINE__, cmd->tag);
> 
>                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> @@ -2880,11 +2884,12 @@ static void transport_write_pending_qf(struct se_cmd *cmd)
>         bool stop;
> 
>         spin_lock_irqsave(&cmd->t_state_lock, flags);
> -       stop = (cmd->transport_state & (CMD_T_STOP | CMD_T_ABORTED));
> +       stop = (cmd->transport_state &
> +               (CMD_T_STOP | CMD_T_FABRIC_STOP | CMD_T_ABORTED));
>         spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> 
>         if (stop) {
> -               pr_debug("%s:%d CMD_T_STOP|CMD_T_ABORTED for ITT: 0x%08llx\n",
> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP|CMD_T_ABORTED for ITT: 0x%08llx\n",
>                         __func__, __LINE__, cmd->tag);
>                 complete_all(&cmd->t_transport_stop_comp);
>                 return;
> --
> 2.31.1
> 
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP
  2023-01-13 14:15   ` Dmitry Bogdanov
@ 2023-01-17 11:52     ` Dmitry Bogdanov
  2023-01-17 18:45       ` Mike Christie
  0 siblings, 1 reply; 26+ messages in thread
From: Dmitry Bogdanov @ 2023-01-17 11:52 UTC (permalink / raw)
  To: Mike Christie
  Cc: mlombard, martin.petersen, mgurtovoy, sagi, linux-scsi, target-devel

On Fri, Jan 13, 2023 at 05:15:12PM +0300, Dmitry Bogdanov wrote:
> On Wed, Jan 11, 2023 at 09:08:30PM -0600, Mike Christie wrote:
> > 
> > iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
> > connection is down and it can't send/recv IO (tx/rx threads are killed
> > or the cleanup thread is run from the one thats up). It will then loop
> > over running commands and wait for LIO core to complete them or clean
> > them up if they were on an internal queue waiting to be sent or ackd.
> > 
> > Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
> > command but for isert we need to prevent LIO core from calling into
> > iscsit callouts when the connection is being brought down. If LIO core
> > queues commands to iscsit and it ends up adding to an internal queue
> > instead of passing back to the driver then we can end up hanging waiting
> > on command completion that never occurs because it's stuck on the internal
> > list (the tx thread is stopped at this time, so it will never loop over
> > the response list and call into isert). We also want to sync up on a
> > point where we no longer call into isert so it can cleanup it's structs.
> > 
> > This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
> > command execution and also fixes the locking around the
> > target_cmd_interrupted calls so we don't have a case where a command
> > is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
> > time.
> > 
> > Signed-off-by: Mike Christie <michael.christie@oracle.com>
> > ---
> >  drivers/target/target_core_sbc.c       |  2 +-
> >  drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
> >  2 files changed, 17 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
> > index 7536ca797606..56136613767f 100644
> > --- a/drivers/target/target_core_sbc.c
> > +++ b/drivers/target/target_core_sbc.c
> > @@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
> >                  * we don't have to perform the write operation.
> >                  */
> >                 WARN_ON(!(cmd->transport_state &
> > -                       (CMD_T_ABORTED | CMD_T_STOP)));
> > +                       (CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
> >                 goto out;
> >         }
> >         /*
> > diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
> > index cb3fdc81ba3b..02a9476945dc 100644
> > --- a/drivers/target/target_core_transport.c
> > +++ b/drivers/target/target_core_transport.c
> > @@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
> >          * Determine if frontend context caller is requesting the stopping of
> >          * this command for frontend exceptions.
> >          */
> > -       if (cmd->transport_state & CMD_T_STOP) {
> > -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
> > +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> > +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
> >                         __func__, __LINE__, cmd->tag);
> > 
> >                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> > @@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
> >                 INIT_WORK(&cmd->work, target_abort_work);
> >                 queue_work(target_completion_wq, &cmd->work);
> >                 return true;
> > -       } else if (cmd->transport_state & CMD_T_STOP) {
> > +       } else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> >                 if (cmd->transport_complete_callback)
> >                         cmd->transport_complete_callback(cmd, false, &post_ret);
> >                 complete_all(&cmd->t_transport_stop_comp);
> > @@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
> >         int success, cpu;
> >         unsigned long flags;
> > 
> > -       if (target_cmd_interrupted(cmd))
> > +       spin_lock_irqsave(&cmd->t_state_lock, flags);
> 
> That leads to a hardlock because
> target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
> cmd->t_state_lock.
But the taking the lock for read+write of cmd->t*_state is absolutelly right!
It would be great if you manage to move transport_complete_callback()
into other thread/job.

> > +       if (target_cmd_interrupted(cmd)) {
> > +               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> >                 return;
> > +       }
> > 
> >         cmd->scsi_status = scsi_status;
> >         cmd->sense_reason = sense_reason;
> > 
> > -       spin_lock_irqsave(&cmd->t_state_lock, flags);
> >         switch (cmd->scsi_status) {
> >         case SAM_STAT_CHECK_CONDITION:
> >                 if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE)


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP
  2023-01-17 11:52     ` Dmitry Bogdanov
@ 2023-01-17 18:45       ` Mike Christie
  2023-01-17 19:55         ` Dmitry Bogdanov
  0 siblings, 1 reply; 26+ messages in thread
From: Mike Christie @ 2023-01-17 18:45 UTC (permalink / raw)
  To: Dmitry Bogdanov
  Cc: mlombard, martin.petersen, mgurtovoy, sagi, linux-scsi, target-devel

On 1/17/23 05:52, Dmitry Bogdanov wrote:
> On Fri, Jan 13, 2023 at 05:15:12PM +0300, Dmitry Bogdanov wrote:
>> On Wed, Jan 11, 2023 at 09:08:30PM -0600, Mike Christie wrote:
>>>
>>> iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
>>> connection is down and it can't send/recv IO (tx/rx threads are killed
>>> or the cleanup thread is run from the one thats up). It will then loop
>>> over running commands and wait for LIO core to complete them or clean
>>> them up if they were on an internal queue waiting to be sent or ackd.
>>>
>>> Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
>>> command but for isert we need to prevent LIO core from calling into
>>> iscsit callouts when the connection is being brought down. If LIO core
>>> queues commands to iscsit and it ends up adding to an internal queue
>>> instead of passing back to the driver then we can end up hanging waiting
>>> on command completion that never occurs because it's stuck on the internal
>>> list (the tx thread is stopped at this time, so it will never loop over
>>> the response list and call into isert). We also want to sync up on a
>>> point where we no longer call into isert so it can cleanup it's structs.
>>>
>>> This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
>>> command execution and also fixes the locking around the
>>> target_cmd_interrupted calls so we don't have a case where a command
>>> is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
>>> time.
>>>
>>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>>> ---
>>>  drivers/target/target_core_sbc.c       |  2 +-
>>>  drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
>>>  2 files changed, 17 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
>>> index 7536ca797606..56136613767f 100644
>>> --- a/drivers/target/target_core_sbc.c
>>> +++ b/drivers/target/target_core_sbc.c
>>> @@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
>>>                  * we don't have to perform the write operation.
>>>                  */
>>>                 WARN_ON(!(cmd->transport_state &
>>> -                       (CMD_T_ABORTED | CMD_T_STOP)));
>>> +                       (CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
>>>                 goto out;
>>>         }
>>>         /*
>>> diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
>>> index cb3fdc81ba3b..02a9476945dc 100644
>>> --- a/drivers/target/target_core_transport.c
>>> +++ b/drivers/target/target_core_transport.c
>>> @@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
>>>          * Determine if frontend context caller is requesting the stopping of
>>>          * this command for frontend exceptions.
>>>          */
>>> -       if (cmd->transport_state & CMD_T_STOP) {
>>> -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
>>> +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
>>> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
>>>                         __func__, __LINE__, cmd->tag);
>>>
>>>                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
>>> @@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
>>>                 INIT_WORK(&cmd->work, target_abort_work);
>>>                 queue_work(target_completion_wq, &cmd->work);
>>>                 return true;
>>> -       } else if (cmd->transport_state & CMD_T_STOP) {
>>> +       } else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
>>>                 if (cmd->transport_complete_callback)
>>>                         cmd->transport_complete_callback(cmd, false, &post_ret);
>>>                 complete_all(&cmd->t_transport_stop_comp);
>>> @@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
>>>         int success, cpu;
>>>         unsigned long flags;
>>>
>>> -       if (target_cmd_interrupted(cmd))
>>> +       spin_lock_irqsave(&cmd->t_state_lock, flags);
>>
>> That leads to a hardlock because
>> target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
>> cmd->t_state_lock.

We shouldn't lock up because for the failure cases the transport_complete_callback
functions don't take the lock. They just cleanup and return.

> But the taking the lock for read+write of cmd->t*_state is absolutelly right!
> It would be great if you manage to move transport_complete_callback()
> into other thread/job.

I'm not sure why we want to do that since none of the transport_complete_callback
sleep on failure. It seems to just add more complexity and we only have the 2
transport_complete_callback uses now, so it seems overkill.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP
  2023-01-17 18:45       ` Mike Christie
@ 2023-01-17 19:55         ` Dmitry Bogdanov
  2023-01-18 19:12           ` Mike Christie
  0 siblings, 1 reply; 26+ messages in thread
From: Dmitry Bogdanov @ 2023-01-17 19:55 UTC (permalink / raw)
  To: Mike Christie
  Cc: mlombard, martin.petersen, mgurtovoy, sagi, linux-scsi, target-devel

On Tue, Jan 17, 2023 at 12:45:56PM -0600, Mike Christie wrote:
> 
> On 1/17/23 05:52, Dmitry Bogdanov wrote:
> > On Fri, Jan 13, 2023 at 05:15:12PM +0300, Dmitry Bogdanov wrote:
> >> On Wed, Jan 11, 2023 at 09:08:30PM -0600, Mike Christie wrote:
> >>>
> >>> iscsit will set CMD_T_FABRIC_STOP on running commands when its transport
> >>> connection is down and it can't send/recv IO (tx/rx threads are killed
> >>> or the cleanup thread is run from the one thats up). It will then loop
> >>> over running commands and wait for LIO core to complete them or clean
> >>> them up if they were on an internal queue waiting to be sent or ackd.
> >>>
> >>> Currently, CMD_T_FABRIC_STOP only stops TMRs from operating on the
> >>> command but for isert we need to prevent LIO core from calling into
> >>> iscsit callouts when the connection is being brought down. If LIO core
> >>> queues commands to iscsit and it ends up adding to an internal queue
> >>> instead of passing back to the driver then we can end up hanging waiting
> >>> on command completion that never occurs because it's stuck on the internal
> >>> list (the tx thread is stopped at this time, so it will never loop over
> >>> the response list and call into isert). We also want to sync up on a
> >>> point where we no longer call into isert so it can cleanup it's structs.
> >>>
> >>> This has LIO core treat CMD_T_FABRIC_STOP like CMD_T_STOP during
> >>> command execution and also fixes the locking around the
> >>> target_cmd_interrupted calls so we don't have a case where a command
> >>> is marked CMD_T_COMPLETE and CMD_T_STOP|CMD_T_FABRIC_STOP at the same
> >>> time.
> >>>
> >>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> >>> ---
> >>>  drivers/target/target_core_sbc.c       |  2 +-
> >>>  drivers/target/target_core_transport.c | 27 +++++++++++++++-----------
> >>>  2 files changed, 17 insertions(+), 12 deletions(-)
> >>>
> >>> diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
> >>> index 7536ca797606..56136613767f 100644
> >>> --- a/drivers/target/target_core_sbc.c
> >>> +++ b/drivers/target/target_core_sbc.c
> >>> @@ -459,7 +459,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
> >>>                  * we don't have to perform the write operation.
> >>>                  */
> >>>                 WARN_ON(!(cmd->transport_state &
> >>> -                       (CMD_T_ABORTED | CMD_T_STOP)));
> >>> +                       (CMD_T_ABORTED | CMD_T_STOP | CMD_T_FABRIC_STOP)));
> >>>                 goto out;
> >>>         }
> >>>         /*
> >>> diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
> >>> index cb3fdc81ba3b..02a9476945dc 100644
> >>> --- a/drivers/target/target_core_transport.c
> >>> +++ b/drivers/target/target_core_transport.c
> >>> @@ -737,8 +737,8 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
> >>>          * Determine if frontend context caller is requesting the stopping of
> >>>          * this command for frontend exceptions.
> >>>          */
> >>> -       if (cmd->transport_state & CMD_T_STOP) {
> >>> -               pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
> >>> +       if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> >>> +               pr_debug("%s:%d CMD_T_STOP|CMD_T_FABRIC_STOP for ITT: 0x%08llx\n",
> >>>                         __func__, __LINE__, cmd->tag);
> >>>
> >>>                 spin_unlock_irqrestore(&cmd->t_state_lock, flags);
> >>> @@ -889,7 +889,7 @@ static bool target_cmd_interrupted(struct se_cmd *cmd)
> >>>                 INIT_WORK(&cmd->work, target_abort_work);
> >>>                 queue_work(target_completion_wq, &cmd->work);
> >>>                 return true;
> >>> -       } else if (cmd->transport_state & CMD_T_STOP) {
> >>> +       } else if (cmd->transport_state & (CMD_T_STOP | CMD_T_FABRIC_STOP)) {
> >>>                 if (cmd->transport_complete_callback)
> >>>                         cmd->transport_complete_callback(cmd, false, &post_ret);
> >>>                 complete_all(&cmd->t_transport_stop_comp);
> >>> @@ -907,13 +907,15 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status,
> >>>         int success, cpu;
> >>>         unsigned long flags;
> >>>
> >>> -       if (target_cmd_interrupted(cmd))
> >>> +       spin_lock_irqsave(&cmd->t_state_lock, flags);
> >>
> >> That leads to a hardlock because
> >> target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
> >> cmd->t_state_lock.
> 
> We shouldn't lock up because for the failure cases the transport_complete_callback
> functions don't take the lock. They just cleanup and return.

The second callback (compare_and_write_post) does take the lock always.
Although to be honest, that lock there has no sense.

> > But the taking the lock for read+write of cmd->t*_state is absolutelly right!
> > It would be great if you manage to move transport_complete_callback()
> > into other thread/job.
> 
> I'm not sure why we want to do that since none of the transport_complete_callback
> sleep on failure. It seems to just add more complexity and we only have the 2
> transport_complete_callback uses now, so it seems overkill.

I have a personal interest on that :) Of course, it is just one of the
ways to solve it.

BR,
 Dmitry


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login
  2023-01-13 14:08   ` Dmitry Bogdanov
@ 2023-01-18  2:14     ` Mike Christie
  2023-01-23  9:53       ` Sagi Grimberg
  0 siblings, 1 reply; 26+ messages in thread
From: Mike Christie @ 2023-01-18  2:14 UTC (permalink / raw)
  To: Dmitry Bogdanov
  Cc: mlombard, martin.petersen, mgurtovoy, sagi, linux-scsi, target-devel

On 1/13/23 08:08, Dmitry Bogdanov wrote:
> On Wed, Jan 11, 2023 at 09:08:28PM -0600, Mike Christie wrote:
>>
>> If we get a disconnect event while logging in we can end up in a state
>> where will never be able to relogin. This happens when:
>>
>> 1. login thread has put us into TARG_CONN_STATE_IN_LOGIN
>> 2. isert then does
>>
>> isert_disconnected_handler -> iscsit_cause_connection_reinstatement
>>
>> which sets the conn connection_reinstatement flag. Nothing else happens
>> because we are only in IN_LOGIN. The tx/rx threads are not running yet
>> so we can't start recovery from those contexts at this time.
>>
>> 3. The login thread finishes processing the login pdu and thinks login is
>> done. It sets us into TARG_CONN_STATE_LOGGED_IN/TARG_SESS_STATE_LOGGED_IN.
>> This starts the rx/tx threads.
>>
>> 4. The initiator thought it disconnected the connection at 2, and has
>> since sent a new connect which is now handled. This leads us to eventually
>> run:
>>
>> iscsi_check_for_session_reinstatement-> iscsit_stop_session ->
>> iscsit_cause_connection_reinstatement
>>
>> iscsit_stop_session sees the old conn and does
>> iscsit_cause_connection_reinstatement which sees connection_reinstatement
>> is set so it just returns instead of trying to kill the tx/rx threads
>> which would have caused recovery to start.
>>
>> 5. iscsit_stop_session then waits on session_wait_comp which will never
>> happen since we didn't kill the tx/rx threads.
>>
>> This has the iscsit login code check if a fabric driver ran
>> iscsit_cause_connection_reinstatement during the login process similar
>> to how we check for the sk state for tcp based iscsit. This will prevent
>> us from getting into 3 and creating a ghost session.
>>
>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>> ---
>>  drivers/target/iscsi/iscsi_target_nego.c | 10 ++++++++++
>>  1 file changed, 10 insertions(+)
>>
>> diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
>> index ff49c8f3fe24..2dd81752d4c9 100644
>> --- a/drivers/target/iscsi/iscsi_target_nego.c
>> +++ b/drivers/target/iscsi/iscsi_target_nego.c
>> @@ -350,6 +350,16 @@ static int iscsi_target_do_tx_login_io(struct iscsit_conn *conn, struct iscsi_lo
>>                                             ISCSI_LOGIN_STATUS_NO_RESOURCES);
>>                         return -1;
>>                 }
>> +
>> +               /*
>> +                * isert doesn't know the iscsit state and uses
>> +                * iscsit_cause_connection_reinstatement as a generic error
>> +                * notification system. It may call it before we are in FFP.
>> +                * Handle this now in case it signaled a failure before the
>> +                * rx/tx threads were up and could start recovery.
>> +                */
>> +               if (atomic_read(&conn->connection_reinstatement))
>> +                       goto err;
> 
> Why only for login->login_complete case? In other case the session will
> not hang? Will it be droppped on login timeout or something else?

It will not hang. If you hit an error with isert before we think we can go into
FFP then the login timeout currently cleans up the conn and session.

> 
> May be the root cause is point 2 itself - calling iscsit_cause_connection_reinstatement
> in not ISER_CONN_FULL_FEATURE state where there are no TX_RX threads?
> I mean that was a misuse of iscsit_cause_connection_reinstatement?

Let me drop this patch for now. After writing the response above about normally it just
times out, and thinking about your question here I think to really fix this we want to
fully integrate isert login into iscsit.

The root problem is that isert login is not integrated into iscsit, so there is really
no error handling. iscsit_cause_connection_reinstatement was supposed to do it, but it
doesn't do anything.

So we need to separate the LOGIN_FLAGS_CLOSED tests and setting from the iscsi_target_sk*
code. Add a helper to set that bit and do some state checks, and make the checks generic
in the login code (not tied to having a socket). Convert iscsit and isert to use the new
helper. Then handle the LOGIN_FLAGS_READ_ACTIVE/LOGIN_FLAGS_WRITE_ACTIVE and login_work
stuff. Then review cxgb?

I'll do that later after fixing the command cleanup stuff in this patchsdet and the
other patchsets I have outstanding.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP
  2023-01-17 19:55         ` Dmitry Bogdanov
@ 2023-01-18 19:12           ` Mike Christie
  0 siblings, 0 replies; 26+ messages in thread
From: Mike Christie @ 2023-01-18 19:12 UTC (permalink / raw)
  To: Dmitry Bogdanov
  Cc: mlombard, martin.petersen, mgurtovoy, sagi, linux-scsi, target-devel

On 1/17/23 13:55, Dmitry Bogdanov wrote
>>>>> -       if (target_cmd_interrupted(cmd))
>>>>> +       spin_lock_irqsave(&cmd->t_state_lock, flags);
>>>>
>>>> That leads to a hardlock because
>>>> target_cmd_interrupted() => cmd->transport_complete_callback() also tooks
>>>> cmd->t_state_lock.
>>
>> We shouldn't lock up because for the failure cases the transport_complete_callback
>> functions don't take the lock. They just cleanup and return.
> 
> The second callback (compare_and_write_post) does take the lock always.
> Although to be honest, that lock there has no sense.

You're right. I messed up. I removed it in my tree in a prep patch
but never posted it.

> 
>>> But the taking the lock for read+write of cmd->t*_state is absolutelly right!
>>> It would be great if you manage to move transport_complete_callback()
>>> into other thread/job.
>>
>> I'm not sure why we want to do that since none of the transport_complete_callback
>> sleep on failure. It seems to just add more complexity and we only have the 2
>> transport_complete_callback uses now, so it seems overkill.
> 
> I have a personal interest on that :) Of course, it is just one of the
> ways to solve it.

Ah ok. I'm going to do the more standard process and code for the what we
have right now since we don't know when your code will be pushed upstream.
It's also the more simple approach so we can always change it and make it more
complicated later on. We just really have to get these regressions fixed.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 13/13] IB/isert: Fix hang in target_wait_for_cmds
  2023-01-12  3:08 ` [PATCH v2 13/13] IB/isert: Fix hang in target_wait_for_cmds Mike Christie
@ 2023-01-23  9:50   ` Sagi Grimberg
  0 siblings, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2023-01-23  9:50 UTC (permalink / raw)
  To: Mike Christie, mlombard, martin.petersen, mgurtovoy, d.bogdanov,
	linux-scsi, target-devel

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

Mike aren't you also missing a fix with isert_put_unsol_pending_cmds
due to the mis-clearing of write_data_done  in isert_rdma_read_done ?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login
  2023-01-18  2:14     ` Mike Christie
@ 2023-01-23  9:53       ` Sagi Grimberg
  2023-01-24  2:40         ` michael.christie
  0 siblings, 1 reply; 26+ messages in thread
From: Sagi Grimberg @ 2023-01-23  9:53 UTC (permalink / raw)
  To: Mike Christie, Dmitry Bogdanov
  Cc: mlombard, martin.petersen, mgurtovoy, linux-scsi, target-devel



On 1/18/23 04:14, Mike Christie wrote:
> On 1/13/23 08:08, Dmitry Bogdanov wrote:
>> On Wed, Jan 11, 2023 at 09:08:28PM -0600, Mike Christie wrote:
>>>
>>> If we get a disconnect event while logging in we can end up in a state
>>> where will never be able to relogin. This happens when:
>>>
>>> 1. login thread has put us into TARG_CONN_STATE_IN_LOGIN
>>> 2. isert then does
>>>
>>> isert_disconnected_handler -> iscsit_cause_connection_reinstatement
>>>
>>> which sets the conn connection_reinstatement flag. Nothing else happens
>>> because we are only in IN_LOGIN. The tx/rx threads are not running yet
>>> so we can't start recovery from those contexts at this time.
>>>
>>> 3. The login thread finishes processing the login pdu and thinks login is
>>> done. It sets us into TARG_CONN_STATE_LOGGED_IN/TARG_SESS_STATE_LOGGED_IN.
>>> This starts the rx/tx threads.
>>>
>>> 4. The initiator thought it disconnected the connection at 2, and has
>>> since sent a new connect which is now handled. This leads us to eventually
>>> run:
>>>
>>> iscsi_check_for_session_reinstatement-> iscsit_stop_session ->
>>> iscsit_cause_connection_reinstatement
>>>
>>> iscsit_stop_session sees the old conn and does
>>> iscsit_cause_connection_reinstatement which sees connection_reinstatement
>>> is set so it just returns instead of trying to kill the tx/rx threads
>>> which would have caused recovery to start.
>>>
>>> 5. iscsit_stop_session then waits on session_wait_comp which will never
>>> happen since we didn't kill the tx/rx threads.
>>>
>>> This has the iscsit login code check if a fabric driver ran
>>> iscsit_cause_connection_reinstatement during the login process similar
>>> to how we check for the sk state for tcp based iscsit. This will prevent
>>> us from getting into 3 and creating a ghost session.
>>>
>>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>>> ---
>>>   drivers/target/iscsi/iscsi_target_nego.c | 10 ++++++++++
>>>   1 file changed, 10 insertions(+)
>>>
>>> diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
>>> index ff49c8f3fe24..2dd81752d4c9 100644
>>> --- a/drivers/target/iscsi/iscsi_target_nego.c
>>> +++ b/drivers/target/iscsi/iscsi_target_nego.c
>>> @@ -350,6 +350,16 @@ static int iscsi_target_do_tx_login_io(struct iscsit_conn *conn, struct iscsi_lo
>>>                                              ISCSI_LOGIN_STATUS_NO_RESOURCES);
>>>                          return -1;
>>>                  }
>>> +
>>> +               /*
>>> +                * isert doesn't know the iscsit state and uses
>>> +                * iscsit_cause_connection_reinstatement as a generic error
>>> +                * notification system. It may call it before we are in FFP.
>>> +                * Handle this now in case it signaled a failure before the
>>> +                * rx/tx threads were up and could start recovery.
>>> +                */
>>> +               if (atomic_read(&conn->connection_reinstatement))
>>> +                       goto err;
>>
>> Why only for login->login_complete case? In other case the session will
>> not hang? Will it be droppped on login timeout or something else?
> 
> It will not hang. If you hit an error with isert before we think we can go into
> FFP then the login timeout currently cleans up the conn and session.
> 
>>
>> May be the root cause is point 2 itself - calling iscsit_cause_connection_reinstatement
>> in not ISER_CONN_FULL_FEATURE state where there are no TX_RX threads?
>> I mean that was a misuse of iscsit_cause_connection_reinstatement?
> 
> Let me drop this patch for now. After writing the response above about normally it just
> times out, and thinking about your question here I think to really fix this we want to
> fully integrate isert login into iscsit.
> 
> The root problem is that isert login is not integrated into iscsit, so there is really
> no error handling. iscsit_cause_connection_reinstatement was supposed to do it, but it
> doesn't do anything.
> 
> So we need to separate the LOGIN_FLAGS_CLOSED tests and setting from the iscsi_target_sk*
> code. Add a helper to set that bit and do some state checks, and make the checks generic
> in the login code (not tied to having a socket). Convert iscsit and isert to use the new
> helper. Then handle the LOGIN_FLAGS_READ_ACTIVE/LOGIN_FLAGS_WRITE_ACTIVE and login_work
> stuff. Then review cxgb?
> 
> I'll do that later after fixing the command cleanup stuff in this patchsdet and the
> other patchsets I have outstanding.

Perhaps the fix is as simple as checking the iscsi state to be also 
full-featured and if not, do a simple error handling within
iser_disconnected_handler?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 00/13] target task management fixes
  2023-01-12  3:13 ` [PATCH v2 00/13] target task management fixes Mike Christie
@ 2023-01-23  9:54   ` Sagi Grimberg
  0 siblings, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2023-01-23  9:54 UTC (permalink / raw)
  To: Mike Christie, mlombard, martin.petersen, mgurtovoy, d.bogdanov,
	linux-scsi, target-devel



On 1/12/23 05:13, Mike Christie wrote:
> On 1/11/23 9:08 PM, Mike Christie wrote:
>> The following patches apply over Martin's 6.2 and 6.3 branches or
>> Linus's tree. They fix a couple regressions in iscsit that occur when
>> there are multiple sessions accessing the same se_device and TMRs are
>> executing and a connection is closed. And they fix some bugs in isert
>> for the single session case when there are TMRs executing and the
>> connection is closed.
>>
> 
> Oh yeah Sagi and Max,
> 
> I didn't post the write_data_done fix. Let me know how you want it
> fixed and I can post it.

I think just fixing the write_data_done to add the data read via
the rdma operation.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login
  2023-01-23  9:53       ` Sagi Grimberg
@ 2023-01-24  2:40         ` michael.christie
  0 siblings, 0 replies; 26+ messages in thread
From: michael.christie @ 2023-01-24  2:40 UTC (permalink / raw)
  To: Sagi Grimberg, Dmitry Bogdanov
  Cc: mlombard, martin.petersen, mgurtovoy, linux-scsi, target-devel

On 1/23/23 3:53 AM, Sagi Grimberg wrote:
> 
> 
> On 1/18/23 04:14, Mike Christie wrote:
>> On 1/13/23 08:08, Dmitry Bogdanov wrote:
>>> On Wed, Jan 11, 2023 at 09:08:28PM -0600, Mike Christie wrote:
>>>>
>>>> If we get a disconnect event while logging in we can end up in a state
>>>> where will never be able to relogin. This happens when:
>>>>
>>>> 1. login thread has put us into TARG_CONN_STATE_IN_LOGIN
>>>> 2. isert then does
>>>>
>>>> isert_disconnected_handler -> iscsit_cause_connection_reinstatement
>>>>
>>>> which sets the conn connection_reinstatement flag. Nothing else happens
>>>> because we are only in IN_LOGIN. The tx/rx threads are not running yet
>>>> so we can't start recovery from those contexts at this time.
>>>>
>>>> 3. The login thread finishes processing the login pdu and thinks login is
>>>> done. It sets us into TARG_CONN_STATE_LOGGED_IN/TARG_SESS_STATE_LOGGED_IN.
>>>> This starts the rx/tx threads.
>>>>
>>>> 4. The initiator thought it disconnected the connection at 2, and has
>>>> since sent a new connect which is now handled. This leads us to eventually
>>>> run:
>>>>
>>>> iscsi_check_for_session_reinstatement-> iscsit_stop_session ->
>>>> iscsit_cause_connection_reinstatement
>>>>
>>>> iscsit_stop_session sees the old conn and does
>>>> iscsit_cause_connection_reinstatement which sees connection_reinstatement
>>>> is set so it just returns instead of trying to kill the tx/rx threads
>>>> which would have caused recovery to start.
>>>>
>>>> 5. iscsit_stop_session then waits on session_wait_comp which will never
>>>> happen since we didn't kill the tx/rx threads.
>>>>
>>>> This has the iscsit login code check if a fabric driver ran
>>>> iscsit_cause_connection_reinstatement during the login process similar
>>>> to how we check for the sk state for tcp based iscsit. This will prevent
>>>> us from getting into 3 and creating a ghost session.
>>>>
>>>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>>>> ---
>>>>   drivers/target/iscsi/iscsi_target_nego.c | 10 ++++++++++
>>>>   1 file changed, 10 insertions(+)
>>>>
>>>> diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
>>>> index ff49c8f3fe24..2dd81752d4c9 100644
>>>> --- a/drivers/target/iscsi/iscsi_target_nego.c
>>>> +++ b/drivers/target/iscsi/iscsi_target_nego.c
>>>> @@ -350,6 +350,16 @@ static int iscsi_target_do_tx_login_io(struct iscsit_conn *conn, struct iscsi_lo
>>>>                                              ISCSI_LOGIN_STATUS_NO_RESOURCES);
>>>>                          return -1;
>>>>                  }
>>>> +
>>>> +               /*
>>>> +                * isert doesn't know the iscsit state and uses
>>>> +                * iscsit_cause_connection_reinstatement as a generic error
>>>> +                * notification system. It may call it before we are in FFP.
>>>> +                * Handle this now in case it signaled a failure before the
>>>> +                * rx/tx threads were up and could start recovery.
>>>> +                */
>>>> +               if (atomic_read(&conn->connection_reinstatement))
>>>> +                       goto err;
>>>
>>> Why only for login->login_complete case? In other case the session will
>>> not hang? Will it be droppped on login timeout or something else?
>>
>> It will not hang. If you hit an error with isert before we think we can go into
>> FFP then the login timeout currently cleans up the conn and session.
>>
>>>
>>> May be the root cause is point 2 itself - calling iscsit_cause_connection_reinstatement
>>> in not ISER_CONN_FULL_FEATURE state where there are no TX_RX threads?
>>> I mean that was a misuse of iscsit_cause_connection_reinstatement?
>>
>> Let me drop this patch for now. After writing the response above about normally it just
>> times out, and thinking about your question here I think to really fix this we want to
>> fully integrate isert login into iscsit.
>>
>> The root problem is that isert login is not integrated into iscsit, so there is really
>> no error handling. iscsit_cause_connection_reinstatement was supposed to do it, but it
>> doesn't do anything.
>>
>> So we need to separate the LOGIN_FLAGS_CLOSED tests and setting from the iscsi_target_sk*
>> code. Add a helper to set that bit and do some state checks, and make the checks generic
>> in the login code (not tied to having a socket). Convert iscsit and isert to use the new
>> helper. Then handle the LOGIN_FLAGS_READ_ACTIVE/LOGIN_FLAGS_WRITE_ACTIVE and login_work
>> stuff. Then review cxgb?
>>
>> I'll do that later after fixing the command cleanup stuff in this patchsdet and the
>> other patchsets I have outstanding.
> 
> Perhaps the fix is as simple as checking the iscsi state to be also full-featured and if not, do a simple error handling within
> iser_disconnected_handler?

Yeah that's what I'm thinking above. The catch was what is the simple error handler going to be.

For iscsit tcp it mostly just sets LOGIN_FLAGS_CLOSED to signal the failure. So I was saying
above if we could just expose that to isert, then all it has to do is call some helper iscsit
helper:

isert_disconnected_handler
...
        case ISER_CONN_BOUND:
		iscsit_login_failed();
		break;
	case ISER_CONN_FULL_FEATURE
		iscsit_cause_connection_reinstatement(isert_conn->conn, 0);

*I think isert_login_send_done would also need a change.

The iscsit code then would work similarly for both isert and iscsit tcp. There's then just
cxgb which I think just times out like isert.

The other option is for the simple case is just let it timeout like cxgb. And for isert
just do nothing:

        case ISER_CONN_BOUND:
		isert_err("Failure on unbound connection. Timing out login\n");
		break;
	case ISER_CONN_FULL_FEATURE
		iscsit_cause_connection_reinstatement(isert_conn->conn, 0);
		break;

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2023-01-24  2:41 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-12  3:08 [PATCH v2 00/13] target task management fixes Mike Christie
2023-01-12  3:08 ` [PATCH v2 01/13] scsi: target: Move sess cmd counter to new struct Mike Christie
2023-01-12  3:08 ` [PATCH v2 02/13] scsi: target: Move cmd counter allocation Mike Christie
2023-01-12  3:08 ` [PATCH v2 03/13] scsi: target: Pass in cmd counter to use during cmd setup Mike Christie
2023-01-12  3:08 ` [PATCH v2 04/13] scsi: target: iscsit/isert: Alloc per conn cmd counter Mike Christie
2023-01-12  3:08 ` [PATCH v2 05/13] scsi: target: iscsit: stop/wait on cmds during conn close Mike Christie
2023-01-12  3:08 ` [PATCH v2 06/13] scsi: target: iscsit: Fix TAS handling during conn cleanup Mike Christie
2023-01-12  3:08 ` [PATCH v2 07/13] scsi: target: Fix multiple LUN_RESET handling Mike Christie
2023-01-12  3:08 ` [PATCH v2 08/13] scsi: target: drop tas arg from __transport_wait_for_tasks Mike Christie
2023-01-12  3:08 ` [PATCH v2 09/13] scsi: target: iscsit: Fix isert disconnect handling during login Mike Christie
2023-01-13 14:08   ` Dmitry Bogdanov
2023-01-18  2:14     ` Mike Christie
2023-01-23  9:53       ` Sagi Grimberg
2023-01-24  2:40         ` michael.christie
2023-01-12  3:08 ` [PATCH v2 10/13] scsi: target: iscsit: Add helper to check when cmd has failed Mike Christie
2023-01-12  3:08 ` [PATCH v2 11/13] scsi: target: Treat CMD_T_FABRIC_STOP like CMD_T_STOP Mike Christie
2023-01-13 14:15   ` Dmitry Bogdanov
2023-01-17 11:52     ` Dmitry Bogdanov
2023-01-17 18:45       ` Mike Christie
2023-01-17 19:55         ` Dmitry Bogdanov
2023-01-18 19:12           ` Mike Christie
2023-01-12  3:08 ` [PATCH v2 12/13] scsi: target: iscsit: Cleanup isert commands at conn closure Mike Christie
2023-01-12  3:08 ` [PATCH v2 13/13] IB/isert: Fix hang in target_wait_for_cmds Mike Christie
2023-01-23  9:50   ` Sagi Grimberg
2023-01-12  3:13 ` [PATCH v2 00/13] target task management fixes Mike Christie
2023-01-23  9:54   ` Sagi Grimberg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).