* [PATCH 1/3] iw_cm: free cm_id resources on the last deref
  2016-07-18 21:58 ` Steve Wise
@ 2016-07-18 20:44     ` Steve Wise
  -1 siblings, 0 replies; 26+ messages in thread
From: Steve Wise @ 2016-07-18 20:44 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sagi-NQWnxTmZq1alnMjI0IkVqw, sean.hefty-ral2JQCrhuEAvxtiuMwx3w,
	mlin-DgEjT+Ai2ygdnm+yROfE0A, hch-jcswGhMUV9g,
	linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

Remove the complicated logic to free the cm_id resources in iw_cm event
handlers vs when an application thread destroys the device.  I'm not sure
why this code was written, but simply allowing the last deref to free
the memory is cleaner.  It also prevents a deadlock when applications
try to destroy cm_id's in their cm event handler function.

Signed-off-by: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
---
 drivers/infiniband/core/iwcm.c | 41 ++++++-----------------------------------
 1 file changed, 6 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index f057204..20a94ed 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -183,15 +183,14 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
 
 /*
  * Release a reference on cm_id. If the last reference is being
- * released, enable the waiting thread (in iw_destroy_cm_id) to
- * get woken up, and return 1 if a thread is already waiting.
+ * released, free the cm_id and return 1.
  */
 static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
 {
 	BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
 	if (atomic_dec_and_test(&cm_id_priv->refcount)) {
 		BUG_ON(!list_empty(&cm_id_priv->work_list));
-		complete(&cm_id_priv->destroy_comp);
+		free_cm_id(cm_id_priv);
 		return 1;
 	}
 
@@ -208,19 +207,10 @@ static void add_ref(struct iw_cm_id *cm_id)
 static void rem_ref(struct iw_cm_id *cm_id)
 {
 	struct iwcm_id_private *cm_id_priv;
-	int cb_destroy;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 
-	/*
-	 * Test bit before deref in case the cm_id gets freed on another
-	 * thread.
-	 */
-	cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-	if (iwcm_deref_id(cm_id_priv) && cb_destroy) {
-		BUG_ON(!list_empty(&cm_id_priv->work_list));
-		free_cm_id(cm_id_priv);
-	}
+	(void)iwcm_deref_id(cm_id_priv);
 }
 
 static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
@@ -433,13 +423,7 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id)
 	struct iwcm_id_private *cm_id_priv;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
-	BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
-
 	destroy_cm_id(cm_id);
-
-	wait_for_completion(&cm_id_priv->destroy_comp);
-
-	free_cm_id(cm_id_priv);
 }
 EXPORT_SYMBOL(iw_destroy_cm_id);
 
@@ -809,10 +793,7 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
 	ret = cm_id->cm_handler(cm_id, iw_event);
 	if (ret) {
 		iw_cm_reject(cm_id, NULL, 0);
-		set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-		destroy_cm_id(cm_id);
-		if (atomic_read(&cm_id_priv->refcount)==0)
-			free_cm_id(cm_id_priv);
+		iw_destroy_cm_id(cm_id);
 	}
 
 out:
@@ -1000,7 +981,6 @@ static void cm_work_handler(struct work_struct *_work)
 	unsigned long flags;
 	int empty;
 	int ret = 0;
-	int destroy_id;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	empty = list_empty(&cm_id_priv->work_list);
@@ -1014,19 +994,10 @@ static void cm_work_handler(struct work_struct *_work)
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 		ret = process_event(cm_id_priv, &levent);
-		if (ret) {
-			set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+		if (ret)
 			destroy_cm_id(&cm_id_priv->id);
-		}
-		BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
-		destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-		if (iwcm_deref_id(cm_id_priv)) {
-			if (destroy_id) {
-				BUG_ON(!list_empty(&cm_id_priv->work_list));
-				free_cm_id(cm_id_priv);
-			}
+		if (iwcm_deref_id(cm_id_priv))
 			return;
-		}
 		if (empty)
 			return;
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
-- 
2.7.0

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/3] iw_cxgb4: don't block in destroy_qp awaiting the last deref
  2016-07-18 21:58 ` Steve Wise
@ 2016-07-18 20:44     ` Steve Wise
  -1 siblings, 0 replies; 26+ messages in thread
From: Steve Wise @ 2016-07-18 20:44 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sagi-NQWnxTmZq1alnMjI0IkVqw, sean.hefty-ral2JQCrhuEAvxtiuMwx3w,
	mlin-DgEjT+Ai2ygdnm+yROfE0A, hch-jcswGhMUV9g,
	linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

Blocking in c4iw_destroy_qp() causes a deadlock when apps destroy a qp
or disconnect a cm_id from their cm event handler function.  There is
no need to block here anyway, so just replace the refcnt atomic with a
kref object and free the memory on the last put.

Signed-off-by: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
---
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |  2 +-
 drivers/infiniband/hw/cxgb4/qp.c       | 21 ++++++++++++++-------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index f6f34a7..0420c21 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -472,7 +472,7 @@ struct c4iw_qp {
 	struct t4_wq wq;
 	spinlock_t lock;
 	struct mutex mutex;
-	atomic_t refcnt;
+	struct kref kref;
 	wait_queue_head_t wait;
 	struct timer_list timer;
 	int sq_sig_all;
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 0d461f3..770531a3 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -683,17 +683,25 @@ static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr,
 	return 0;
 }
 
+void _free_qp(struct kref *kref)
+{
+	struct c4iw_qp *qhp;
+
+	qhp = container_of(kref, struct c4iw_qp, kref);
+	PDBG("%s qhp %p\n", __func__, qhp);
+	kfree(qhp);
+}
+
 void c4iw_qp_add_ref(struct ib_qp *qp)
 {
 	PDBG("%s ib_qp %p\n", __func__, qp);
-	atomic_inc(&(to_c4iw_qp(qp)->refcnt));
+	kref_get(&to_c4iw_qp(qp)->kref);
 }
 
 void c4iw_qp_rem_ref(struct ib_qp *qp)
 {
 	PDBG("%s ib_qp %p\n", __func__, qp);
-	if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt)))
-		wake_up(&(to_c4iw_qp(qp)->wait));
+	kref_put(&to_c4iw_qp(qp)->kref, _free_qp);
 }
 
 static void add_to_fc_list(struct list_head *head, struct list_head *entry)
@@ -1592,8 +1600,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
 	wait_event(qhp->wait, !qhp->ep);
 
 	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
-	atomic_dec(&qhp->refcnt);
-	wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
 
 	spin_lock_irq(&rhp->lock);
 	if (!list_empty(&qhp->db_fc_entry))
@@ -1606,8 +1612,9 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
 	destroy_qp(&rhp->rdev, &qhp->wq,
 		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
 
+	c4iw_qp_rem_ref(ib_qp);
+
 	PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid);
-	kfree(qhp);
 	return 0;
 }
 
@@ -1704,7 +1711,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
 	init_completion(&qhp->rq_drained);
 	mutex_init(&qhp->mutex);
 	init_waitqueue_head(&qhp->wait);
-	atomic_set(&qhp->refcnt, 1);
+	kref_init(&qhp->kref);
 
 	ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
 	if (ret)
-- 
2.7.0

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 3/3] nvme-rdma: Fix device removal handling
  2016-07-18 21:58 ` Steve Wise
@ 2016-07-18 20:44     ` Sagi Grimberg
  -1 siblings, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2016-07-18 20:44 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sagi-NQWnxTmZq1alnMjI0IkVqw, sean.hefty-ral2JQCrhuEAvxtiuMwx3w,
	mlin-DgEjT+Ai2ygdnm+yROfE0A, hch-jcswGhMUV9g,
	linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

Device removal sequence may have crashed because the
controller (and admin queue space) was freed before
we destroyed the admin queue resources. Thus we
want to destroy the admin queue and only then queue
controller deletion and wait for it to complete.

More specifically we:
1. own the controller deletion (make sure we are not
   competing with another deletion).
2. get rid of inflight reconnects, if any exist (which
   also destroy and create queues).
3. destroy the queue.
4. safely queue controller deletion (and wait for it
   to complete).

Signed-off-by: Sagi Grimberg <sagi-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
---
 drivers/nvme/host/rdma.c | 49 ++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3e3ce2b..0e58450 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -169,7 +169,6 @@ MODULE_PARM_DESC(register_always,
 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 		struct rdma_cm_event *event);
 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
-static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl);
 
 /* XXX: really should move to a generic header sooner or later.. */
 static inline void put_unaligned_le24(u32 val, u8 *p)
@@ -1318,37 +1317,43 @@ out_destroy_queue_ib:
  * that caught the event. Since we hold the callout until the controller
  * deletion is completed, we'll deadlock if the controller deletion will
  * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership
- * of destroying this queue before-hand, destroy the queue resources
- * after the controller deletion completed with the exception of destroying
- * the cm_id implicitely by returning a non-zero rc to the callout.
+ * of destroying this queue before-hand, destroy the queue resources,
+ * then queue the controller deletion which won't destroy this queue and
+ * we destroy the cm_id implicitely by returning a non-zero rc to the callout.
  */
 static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue)
 {
 	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
-	int ret, ctrl_deleted = 0;
+	int ret;
 
-	/* First disable the queue so ctrl delete won't free it */
-	if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
-		goto out;
+	/* Own the controller deletion */
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
+		return 0;
 
-	/* delete the controller */
-	ret = __nvme_rdma_del_ctrl(ctrl);
-	if (!ret) {
-		dev_warn(ctrl->ctrl.device,
-			"Got rdma device removal event, deleting ctrl\n");
-		flush_work(&ctrl->delete_work);
+	dev_warn(ctrl->ctrl.device,
+		"Got rdma device removal event, deleting ctrl\n");
 
-		/* Return non-zero so the cm_id will destroy implicitly */
-		ctrl_deleted = 1;
+	/* Get rid of reconnect work if its running */
+	cancel_delayed_work_sync(&ctrl->reconnect_work);
 
-		/* Free this queue ourselves */
-		rdma_disconnect(queue->cm_id);
-		ib_drain_qp(queue->qp);
-		nvme_rdma_destroy_queue_ib(queue);
+	/* Disable the queue so ctrl delete won't free it */
+	if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) {
+		ret = 0;
+		goto queue_delete;
 	}
 
-out:
-	return ctrl_deleted;
+	/* Free this queue ourselves */
+	nvme_rdma_stop_queue(queue);
+	nvme_rdma_destroy_queue_ib(queue);
+
+	/* Return non-zero so the cm_id will destroy implicitly */
+	ret = 1;
+
+queue_delete:
+	/* queue controller deletion */
+	queue_work(nvme_rdma_wq, &ctrl->delete_work);
+	flush_work(&ctrl->delete_work);
+	return ret;
 }
 
 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
-- 
2.7.0

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH RFC 0/3] iwarp device removal deadlock fix
@ 2016-07-18 21:58 ` Steve Wise
  0 siblings, 0 replies; 26+ messages in thread
From: Steve Wise @ 2016-07-18 21:58 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sagi-NQWnxTmZq1alnMjI0IkVqw, sean.hefty-ral2JQCrhuEAvxtiuMwx3w,
	mlin-DgEjT+Ai2ygdnm+yROfE0A, hch-jcswGhMUV9g,
	linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

This RFC series attempts to address the deadlock issue discovered
while testing nvmf/rdma handling rdma device removal events from
the rdma_cm.  For a discussion of the deadlock that can happen, see

http://lists.infradead.org/pipermail/linux-nvme/2016-July/005440.html.

For my description of the deadlock itself, see this post in the above thread:

http://lists.infradead.org/pipermail/linux-nvme/2016-July/005465.html

In a nutshell, iw_cxgb4 and the iw_cm block during qp/cm_id destruction
until all references are removed.  This combined with the iwarp CM passing
disconnect events up to the rdma_cm during disconnect and/or qp/cm_id destruction
leads to a deadlock.

My proposed solution is to remove the need for iw_cxgb4 and iw_cm to
block during object destruction waiting for the refcnts to reach 0, and
instead to defer freeing the object memory until the last deref is done.
This allows all the qps/cm_ids to be destroyed without blocking, and all
the object memory freeing ends up happening when the application's
device_remove event handler function returns to the rdma_cm.
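
To make the pattern concrete, here is a minimal sketch (generic example
code, not taken from the patches below; struct foo is made up):

#include <linux/kref.h>
#include <linux/slab.h>

struct foo {
        struct kref kref;
        /* ... object state ... */
};

/* Runs only on the last put, possibly from an event handler thread. */
static void foo_release(struct kref *kref)
{
        kfree(container_of(kref, struct foo, kref));
}

/* Drop a reference without ever sleeping; whoever holds the last
 * reference ends up freeing the memory. */
static void foo_put(struct foo *f)
{
        kref_put(&f->kref, foo_release);
}

That is essentially what patches 1 and 2 below do for the iw_cm_id and
the c4iw_qp, respectively.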

Sean, I was hoping you could have a look at the iwcm.c patch particularly,
to tell me why it's broken. :)  I spent some time trying to figure out
why we really need the CALLBACK_DESTROY flag, but I concluded it really
isn't needed.  The one side effect I see with my change is that the
application could possibly get a cm_id event after it has destroyed the
cm_id.  There probably is a way to discard events that have a reference
on the cm_id but get processed after the app has destroyed the cm_id, by
having a new flag indicating "destroyed by app".
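
Something along these lines, purely as a sketch -- the IWCM_F_DROP_EVENTS
flag and this helper are hypothetical, not part of this series:

/*
 * Hypothetical: iw_destroy_cm_id() would set IWCM_F_DROP_EVENTS, and the
 * event delivery path would then discard any event that raced with the
 * application's destroy instead of calling its handler after the fact.
 */
static void cm_deliver_event(struct iwcm_id_private *cm_id_priv,
                             struct iw_cm_event *event)
{
        if (test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
                (void)iwcm_deref_id(cm_id_priv);  /* drop the event's ref */
                return;
        }

        if (cm_id_priv->id.cm_handler(&cm_id_priv->id, event))
                destroy_cm_id(&cm_id_priv->id);
        (void)iwcm_deref_id(cm_id_priv);
}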

I've included Sagi's proposed nvme-rdma patch to fix the existing
touch-after-free problem with the device removal event handler function
that brought about this deadlock analysis.  Also this series would be
the way to submit the final fix for all this (assuming my proposal isn't
broken too badly).

Thanks,

Steve.

---

Sagi Grimberg (1):
  nvme-rdma: Fix device removal handling

Steve Wise (2):
  iw_cm: free cm_id resources on the last deref
  iw_cxgb4: don't block in destroy_qp awaiting the last deref

 drivers/infiniband/core/iwcm.c         | 41 +++++-----------------------
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |  2 +-
 drivers/infiniband/hw/cxgb4/qp.c       | 21 ++++++++++-----
 drivers/nvme/host/rdma.c               | 49 +++++++++++++++++++---------------
 4 files changed, 48 insertions(+), 65 deletions(-)

-- 
2.7.0

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC 0/3] iwarp device removal deadlock fix
  2016-07-18 21:58 ` Steve Wise
@ 2016-07-20  8:47     ` Sagi Grimberg
  -1 siblings, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2016-07-20  8:47 UTC (permalink / raw)
  To: Steve Wise, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

> This RFC series attempts to address the deadlock issue discovered
> while testing nvmf/rdma handling rdma device removal events from
> the rdma_cm.

Thanks for doing this Steve!

> For a discussion of the deadlock that can happen, see
>
> http://lists.infradead.org/pipermail/linux-nvme/2016-July/005440.html.
>
> For my description of the deadlock itself, see this post in the above thread:
>
> http://lists.infradead.org/pipermail/linux-nvme/2016-July/005465.html
>
> In a nutshell, iw_cxgb4 and the iw_cm block during qp/cm_id destruction
> until all references are removed.  This combined with the iwarp CM passing
> disconnect events up to the rdma_cm during disconnect and/or qp/cm_id destruction
> leads to a deadlock.
>
> My proposed solution is to remove the need for iw_cxgb4 and iw_cm to
> block during object destruction waiting for the refcnts to reach 0, and
> instead to defer freeing the object memory until the last deref is done.
> This allows all the qps/cm_ids to be destroyed without blocking, and all
> the object memory freeing ends up happening when the application's
> device_remove event handler function returns to the rdma_cm.

This sounds like a very good approach moving forward.

> Sean, I was hoping you could have a look at the iwcm.c patch particularly,
> to tell me why it's broken. :)  I spent some time trying to figure out
> why we really need the CALLBACK_DESTROY flag, but I concluded it really
> isn't needed.  The one side effect I see with my change is that the
> application could possibly get a cm_id event after it has destroyed the
> cm_id.  There probably is a way to discard events that have a reference
> on the cm_id but get processed after the app has destroyed the cm_id, by
> having a new flag indicating "destroyed by app".

That sounds easy enough. Does this mean that iwcm relies on the driver
to do this, or is it interoperable with the existing logic? If not, this
will need to be handled in all the iWARP drivers.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] iw_cm: free cm_id resources on the last deref
  2016-07-18 20:44     ` Steve Wise
@ 2016-07-20  8:51         ` Sagi Grimberg
  -1 siblings, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2016-07-20  8:51 UTC (permalink / raw)
  To: Steve Wise, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r


> Remove the complicated logic to free the cm_id resources in iw_cm event
> handlers vs when an application thread destroys the device.  I'm not sure
> why this code was written, but simply allowing the last deref to free
> the memory is cleaner.  It also prevents a deadlock when applications
> try to destroy cm_id's in their cm event handler function.

The description here is misleading. We can never destroy the cm_id
inside the cm_id handler. Also, I don't think the deadlock was on cm_id
removal but rather on the qp referenced by the cm_id. I think the change
log can be improved.

The patch looks fine to me.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] iw_cxgb4: don't block in destroy_qp awaiting the last deref
  2016-07-18 20:44     ` Steve Wise
@ 2016-07-20  8:52         ` Sagi Grimberg
  -1 siblings, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2016-07-20  8:52 UTC (permalink / raw)
  To: Steve Wise, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

Looks good Steve!

Reviewed-by: Sagi Grimberg <sagi-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH RFC 0/3] iwarp device removal deadlock fix
  2016-07-20  8:47     ` Sagi Grimberg
@ 2016-07-20 13:49         ` Steve Wise
  -1 siblings, 0 replies; 26+ messages in thread
From: Steve Wise @ 2016-07-20 13:49 UTC (permalink / raw)
  To: 'Sagi Grimberg', linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

> > This RFC series attempts to address the deadlock issue discovered
> > while testing nvmf/rdma handling rdma device removal events from
> > the rdma_cm.
> 
> Thanks for doing this Steve!
> 
> > For a discussion of the deadlock that can happen, see
> >
> > http://lists.infradead.org/pipermail/linux-nvme/2016-July/005440.html.
> >
> > For my description of the deadlock itself, see this post in the above thread:
> >
> > http://lists.infradead.org/pipermail/linux-nvme/2016-July/005465.html
> >
> > In a nutshell, iw_cxgb4 and the iw_cm block during qp/cm_id destruction
> > until all references are removed.  This combined with the iwarp CM passing
> > disconnect events up to the rdma_cm during disconnect and/or qp/cm_id
> > destruction leads to a deadlock.
> >
> > My proposed solution is to remove the need for iw_cxgb4 and iw_cm to
> > block during object destruction waiting for the refcnts to reach 0, and
> > instead to defer freeing the object memory until the last deref is done.
> > This allows all the qps/cm_ids to be destroyed without blocking, and all
> > the object memory freeing ends up happening when the application's
> > device_remove event handler function returns to the rdma_cm.
> 
> This sounds like a very good approach moving forward.
> 
> > Sean, I was hoping you could have a look at the iwcm.c patch particularly,
> > to tell me why it's broken. :)  I spent some time trying to figure out
> > why we really need the CALLBACK_DESTROY flag, but I concluded it really
> > isn't needed.  The one side effect I see with my change is that the
> > application could possibly get a cm_id event after it has destroyed the
> > cm_id.  There probably is a way to discard events that have a reference
> > on the cm_id but get processed after the app has destroyed the cm_id, by
> > having a new flag indicating "destroyed by app".
> 

By the way, I think Sean is on sabbatical until 9/12. 

> That sounds easy enough. Does this mean that iwcm relies on the driver
> to do this or is it inter-operable with the existing logic? If not this
> will need to take care of all the iWARP drivers.

This can be handled all in the iw_cm module.  In fact, I'm testing a new version
of the iw_cm patch now.

Steve.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH 1/3] iw_cm: free cm_id resources on the last deref
  2016-07-20  8:51         ` Sagi Grimberg
@ 2016-07-20 13:51             ` Steve Wise
  -1 siblings, 0 replies; 26+ messages in thread
From: Steve Wise @ 2016-07-20 13:51 UTC (permalink / raw)
  To: 'Sagi Grimberg', linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r


> 
> > Remove the complicated logic to free the cm_id resources in iw_cm event
> > handlers vs when an application thread destroys the device.  I'm not sure
> > why this code was written, but simply allowing the last deref to free
> > the memory is cleaner.  It also prevents a deadlock when applications
> > try to destroy cm_id's in their cm event handler function.
> 
> The description here is misleading. we can never destroy the cm_id
> inside the cm_id handler. Also, I don't think the deadlock was on cm_id
> removal but rather on the qp referenced by the cm_id. I think the change
> log can be improved.
>

I'll reword it. 
 
> The patch looks fine to me.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] nvme-rdma: Fix device removal handling
  2016-07-18 20:44     ` Sagi Grimberg
@ 2016-07-21  8:15         ` Christoph Hellwig
  -1 siblings, 0 replies; 26+ messages in thread
From: Christoph Hellwig @ 2016-07-21  8:15 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

On Mon, Jul 18, 2016 at 01:44:50PM -0700, Sagi Grimberg wrote:
> Device removal sequence may have crashed because the
> controller (and admin queue space) was freed before
> we destroyed the admin queue resources. Thus we
> want to destroy the admin queue and only then queue
> controller deletion and wait for it to complete.
> 
> More specifically we:
> 1. own the controller deletion (make sure we are not
>    competing with another deletion).
> 2. get rid of inflight reconnects, if any exist (which
>    also destroy and create queues).
> 3. destroy the queue.
> 4. safely queue controller deletion (and wait for it
>    to complete).
> 
> Signed-off-by: Sagi Grimberg <sagi-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
> ---
>  drivers/nvme/host/rdma.c | 49 ++++++++++++++++++++++++++----------------------
>  1 file changed, 27 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 3e3ce2b..0e58450 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -169,7 +169,6 @@ MODULE_PARM_DESC(register_always,
>  static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
>  		struct rdma_cm_event *event);
>  static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
> -static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl);
>  
>  /* XXX: really should move to a generic header sooner or later.. */
>  static inline void put_unaligned_le24(u32 val, u8 *p)
> @@ -1318,37 +1317,43 @@ out_destroy_queue_ib:
>   * that caught the event. Since we hold the callout until the controller
>   * deletion is completed, we'll deadlock if the controller deletion will
>   * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership
> - * of destroying this queue before-hand, destroy the queue resources
> - * after the controller deletion completed with the exception of destroying
> - * the cm_id implicitely by returning a non-zero rc to the callout.
> + * of destroying this queue before-hand, destroy the queue resources,
> + * then queue the controller deletion which won't destroy this queue and
> + * we destroy the cm_id implicitely by returning a non-zero rc to the callout.
>   */
>  static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue)
>  {
>  	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
> +	int ret;
>  
> +	/* Own the controller deletion */
> +	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
> +		return 0;
>  
> +	dev_warn(ctrl->ctrl.device,
> +		"Got rdma device removal event, deleting ctrl\n");
>  
> +	/* Get rid of reconnect work if its running */
> +	cancel_delayed_work_sync(&ctrl->reconnect_work);
>  
> +	/* Disable the queue so ctrl delete won't free it */
> +	if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) {
> +		ret = 0;
> +		goto queue_delete;
>  	}
>  
> +	/* Free this queue ourselves */
> +	nvme_rdma_stop_queue(queue);
> +	nvme_rdma_destroy_queue_ib(queue);
> +
> +	/* Return non-zero so the cm_id will destroy implicitly */
> +	ret = 1;
> +
> +queue_delete:
> +	/* queue controller deletion */
> +	queue_work(nvme_rdma_wq, &ctrl->delete_work);
> +	flush_work(&ctrl->delete_work);
> +	return ret;

Seems like we should be able to just skip the goto here:

	/* Disable the queue so ctrl delete won't free it */
	if (test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) {
		/* Free this queue ourselves */
		nvme_rdma_stop_queue(queue);
		nvme_rdma_destroy_queue_ib(queue);

		/* Return non-zero so the cm_id will destroy implicitly */
		ret = 1;
	}

	/* queue controller deletion */
	queue_work(nvme_rdma_wq, &ctrl->delete_work);
	flush_work(&ctrl->delete_work);
	return ret;
}


And that opportunity could be used to improve the comment for that
if-connected stop-queue case a bit as well.

Otherwise this looks reasonable to me.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH 1/3] iw_cm: free cm_id resources on the last deref
  2016-07-20 13:51             ` Steve Wise
@ 2016-07-21 14:17               ` Steve Wise
  -1 siblings, 0 replies; 26+ messages in thread
From: Steve Wise @ 2016-07-21 14:17 UTC (permalink / raw)
  To: 'Sagi Grimberg', linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

> > > Remove the complicated logic to free the cm_id resources in iw_cm event
> > > handlers vs when an application thread destroys the device.  I'm not sure
> > > why this code was written, but simply allowing the last deref to free
> > > the memory is cleaner.  It also prevents a deadlock when applications
> > > try to destroy cm_id's in their cm event handler function.
> >
> > The description here is misleading. we can never destroy the cm_id
> > inside the cm_id handler. Also, I don't think the deadlock was on cm_id
> > removal but rather on the qp referenced by the cm_id. I think the change
> > log can be improved.
> >
> 
> I'll reword it.

The nvme unplug handler does indeed destroy all the qps -and- cm_ids used for
the controllers on this device, with the exception of the cm_id handling the
event.  That is what causes this deadlock.  Once I fixed iw_cxgb4 (in patch 2)
to not block until the refcnt reaches 0 in c4iw_destroy_qp(), I then hit the
block in iw_destroy_cm_id(), which deadlocks the process because the iw_cm
worker thread is already stuck trying to post an event to the rdma_cm for the
cm_id handling the event.

Perhaps I should describe the deadlock in detail like I did in the email threads
leading up to this series?

While I'm rambling, there is still a condition that probably needs to be
addressed:  if the application event handler function disconnects the cm_id that
is handling the event, the iw_cm workq thread gets stuck posting an
IW_CM_EVENT_CLOSE to rdma_cm.  So the iw_cm workq thread is stuck in
cm_close_handler() calling cm_id_priv->id.cm_handler(), which is cma_iw_handler(),
which is blocked in cma_disable_callback() because the application is currently
running its event handler for this cm_id.  This block is released when the
application returns from its event handler function.
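
Roughly the shape of the code that blocks, as a simplified illustration
(not the actual cma.c source; example_id is made up):

#include <linux/mutex.h>

struct example_id {
        struct mutex handler_mutex;     /* held while the app's handler runs */
        int (*event_handler)(struct example_id *id, int event);
};

static int deliver_event(struct example_id *id, int event)
{
        int ret;

        /*
         * Sleeps here while another thread is still inside the
         * application's handler for this id -- which is where the
         * iw_cm workq thread gets stuck in the scenario above.
         */
        mutex_lock(&id->handler_mutex);
        ret = id->event_handler(id, event);
        mutex_unlock(&id->handler_mutex);
        return ret;
}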

But maybe cma_iw_handler() should queue the event if it cannot deliver it, vs
blocking the iw_cm workq thread?   

Steve.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH 1/3] iw_cm: free cm_id resources on the last deref
       [not found]             ` <045f01d1e35a$93618a60$ba249f20$@opengridcomputing.com>
@ 2016-07-21 15:45                 ` Steve Wise
  0 siblings, 0 replies; 26+ messages in thread
From: Steve Wise @ 2016-07-21 15:45 UTC (permalink / raw)
  To: 'Sagi Grimberg', linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

> While I'm rambling, there is still a condition that probably needs to be
> addressed:  if the application event handler function disconnects the
> cm_id that is handling the event, the iw_cm workq thread gets stuck
> posting a IW_CM_EVENT_CLOSE to rdma_cm.  So the iw_cm workq thread is
> stuck in cm_close_handler() calling cm_id_priv->id.cm_handler() which is
> cma_iw_handler() which is blocked in cma_disable_callback() because the
> application is currently running its event handler for this cm_id.  This
> block is released when the application returns from its event handler
> function.
> 
> But maybe cma_iw_handler() should queue the event if it cannot deliver it,
> vs blocking the iw_cm workq thread?
> 

Answering myself:  queueing the event seems to be a no-go because the rdma_cm
can return non-zero to the iw_cm to indicate it should destroy the iw_cm_id.
This cannot be done correctly if the event is queued/deferred.
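
The contract is roughly this on the iw_cm side (simplified pseudo-code,
not the actual source):

	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &event);
	if (ret) {
		/* the rdma_cm asked us to destroy this iw_cm_id now */
		destroy_cm_id(&cm_id_priv->id);
	}

If cma_iw_handler() just queued the event and returned 0, that "destroy
now" answer would not be available at the point where the iw_cm needs it.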






^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH 3/3] nvme-rdma: Fix device removal handling
  2016-07-18 20:44     ` Sagi Grimberg
@ 2016-07-22 18:37         ` Steve Wise
  -1 siblings, 0 replies; 26+ messages in thread
From: Steve Wise @ 2016-07-22 18:37 UTC (permalink / raw)
  To: 'Sagi Grimberg', linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sean.hefty-ral2JQCrhuEAvxtiuMwx3w, mlin-DgEjT+Ai2ygdnm+yROfE0A,
	hch-jcswGhMUV9g, linux-nvme-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

> Device removal sequence may have crashed because the
> controller (and admin queue space) was freed before
> we destroyed the admin queue resources. Thus we
> want to destroy the admin queue and only then queue
> controller deletion and wait for it to complete.
> 
> More specifically we:
> 1. own the controller deletion (make sure we are not
>    competing with another deletion).
> 2. get rid of inflight reconnects if they exist (which
>    also destroy and create queues).
> 3. destroy the queue.
> 4. safely queue controller deletion (and wait for it
>    to complete).
> 
> Signed-off-by: Sagi Grimberg <sagi-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
> ---
>  drivers/nvme/host/rdma.c | 49 ++++++++++++++++++++++++++----------------------
>  1 file changed, 27 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 3e3ce2b..0e58450 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -169,7 +169,6 @@ MODULE_PARM_DESC(register_always,
>  static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
>  		struct rdma_cm_event *event);
>  static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
> -static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl);
> 
>  /* XXX: really should move to a generic header sooner or later.. */
>  static inline void put_unaligned_le24(u32 val, u8 *p)
> @@ -1318,37 +1317,43 @@ out_destroy_queue_ib:
>   * that caught the event. Since we hold the callout until the controller
>   * deletion is completed, we'll deadlock if the controller deletion will
>   * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership
> - * of destroying this queue before-hand, destroy the queue resources
> - * after the controller deletion completed with the exception of destroying
> - * the cm_id implicitely by returning a non-zero rc to the callout.
> + * of destroying this queue before-hand, destroy the queue resources,
> + * then queue the controller deletion which won't destroy this queue and
> + * we destroy the cm_id implicitely by returning a non-zero rc to the callout.
>   */
>  static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue)
>  {
>  	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
> -	int ret, ctrl_deleted = 0;
> +	int ret;
> 
> -	/* First disable the queue so ctrl delete won't free it */
> -	if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
> -		goto out;
> +	/* Own the controller deletion */
> +	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
> +		return 0;
> 
> -	/* delete the controller */
> -	ret = __nvme_rdma_del_ctrl(ctrl);
> -	if (!ret) {
> -		dev_warn(ctrl->ctrl.device,
> -			"Got rdma device removal event, deleting ctrl\n");
> -		flush_work(&ctrl->delete_work);
> +	dev_warn(ctrl->ctrl.device,
> +		"Got rdma device removal event, deleting ctrl\n");
> 
> -		/* Return non-zero so the cm_id will destroy implicitly */
> -		ctrl_deleted = 1;
> +	/* Get rid of reconnect work if its running */
> +	cancel_delayed_work_sync(&ctrl->reconnect_work);
> 
> -		/* Free this queue ourselves */
> -		rdma_disconnect(queue->cm_id);
> -		ib_drain_qp(queue->qp);
> -		nvme_rdma_destroy_queue_ib(queue);
> +	/* Disable the queue so ctrl delete won't free it */
> +	if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) {
> +		ret = 0;
> +		goto queue_delete;
>  	}
> 
> -out:
> -	return ctrl_deleted;
> +	/* Free this queue ourselves */
> +	nvme_rdma_stop_queue(queue);
> +	nvme_rdma_destroy_queue_ib(queue);
> +
> +	/* Return non-zero so the cm_id will destroy implicitly */
> +	ret = 1;
> +
> +queue_delete:
> +	/* queue controller deletion */
> +	queue_work(nvme_rdma_wq, &ctrl->delete_work);
> +	flush_work(&ctrl->delete_work);

Actually, since queue_work() fires off the delete of the controller and its
resources on the workq thread (potentially on another CPU), I think the
flush_work() could end up being a touch-after-free because it accesses
*ctrl, no?
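
i.e. something like this (simplified):

	queue_work(nvme_rdma_wq, &ctrl->delete_work);
	/* the delete work can run on another CPU and free ctrl here ... */
	flush_work(&ctrl->delete_work);	/* <- reads the (possibly freed)
					   work item embedded in *ctrl */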



^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2016-07-22 18:37 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-18 21:58 [PATCH RFC 0/3] iwarp device removal deadlock fix Steve Wise
2016-07-18 21:58 ` Steve Wise
     [not found] ` <cover.1468879135.git.swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2016-07-18 20:44   ` [PATCH 1/3] iw_cm: free cm_id resources on the last deref Steve Wise
2016-07-18 20:44     ` Steve Wise
     [not found]     ` <93c3c47c16406ef00184011948424a9597e4c6b8.1468879135.git.swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2016-07-20  8:51       ` Sagi Grimberg
2016-07-20  8:51         ` Sagi Grimberg
     [not found]         ` <578F3B92.2050803-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
2016-07-20 13:51           ` Steve Wise
2016-07-20 13:51             ` Steve Wise
2016-07-21 14:17             ` Steve Wise
2016-07-21 14:17               ` Steve Wise
     [not found]             ` <045f01d1e35a$93618a60$ba249f20$@opengridcomputing.com>
2016-07-21 15:45               ` Steve Wise
2016-07-21 15:45                 ` Steve Wise
2016-07-18 20:44   ` [PATCH 2/3] iw_cxgb4: don't block in destroy_qp awaiting " Steve Wise
2016-07-18 20:44     ` Steve Wise
     [not found]     ` <90b07add78b64320f5a4f99b8f71633214c1823c.1468879135.git.swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2016-07-20  8:52       ` Sagi Grimberg
2016-07-20  8:52         ` Sagi Grimberg
2016-07-18 20:44   ` [PATCH 3/3] nvme-rdma: Fix device removal handling Sagi Grimberg
2016-07-18 20:44     ` Sagi Grimberg
     [not found]     ` <0cb1ccaa920b3ec48dd94ea49fa0f0b7c5520d38.1468879135.git.swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2016-07-21  8:15       ` Christoph Hellwig
2016-07-21  8:15         ` Christoph Hellwig
2016-07-22 18:37       ` Steve Wise
2016-07-22 18:37         ` Steve Wise
2016-07-20  8:47   ` [PATCH RFC 0/3] iwarp device removal deadlock fix Sagi Grimberg
2016-07-20  8:47     ` Sagi Grimberg
     [not found]     ` <578F3A90.1000208-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
2016-07-20 13:49       ` Steve Wise
2016-07-20 13:49         ` Steve Wise
