From: Krishnamraju Eraparaju <krishna2@chelsio.com>
To: Sagi Grimberg <sagi@grimberg.me>, Bernard Metzler <BMT@zurich.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>,
Steve Wise <larrystevenwise@gmail.com>,
"linux-rdma@vger.kernel.org" <linux-rdma@vger.kernel.org>,
Nirranjan Kirubaharan <nirranjan@chelsio.com>,
"'Potnuri Bharat Teja'" <bharat@chelsio.com>
Subject: Re: [PATCH v3] iwcm: don't hold the irq disabled lock on iw_rem_ref
Date: Sun, 22 Sep 2019 00:05:08 +0530
Message-ID: <20190921183507.GA1622@chelsio.com>
In-Reply-To: <20190918134324.GA5734@chelsio.com>
On Wednesday, September 18, 2019 at 19:13:25 +0530, Krishnamraju Eraparaju wrote:
Hi,
I have restructured the iw_cxgb4 driver to support the proposed iwcm
changes. Issues addressed by this patch:
-all iw_rem_ref() calls from iwcm are now outside the spinlock critical
section.
-fixes the siw issue of freeing siw_base_qp too early.
-ib_destroy_qp() for iw_cxgb4 no longer blocks, even when called before
the CM is destroyed.
We are currently testing these changes and will submit the patch next
week. A minimal sketch of the reference-drop pattern applied throughout
the iwcm changes is shown below, ahead of the patch.
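The pattern, as a standalone sketch (the helper name drop_qp_ref is
hypothetical; each call site in the patch open-codes this): take and
clear the QP pointer while holding the lock, unlock, then release the
reference, so the driver's release path never runs with the
irq-disabled lock held.

/*
 * Hypothetical helper, not part of the patch: illustrates the
 * take-under-lock / release-after-unlock pattern used below.
 */
static void drop_qp_ref(struct iwcm_id_private *cm_id_priv)
{
	struct ib_qp *qp;
	unsigned long flags;

	spin_lock_irqsave(&cm_id_priv->lock, flags);
	qp = cm_id_priv->qp;	/* this path now owns the reference */
	cm_id_priv->qp = NULL;	/* so it cannot be dropped twice */
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);

	if (qp)
		cm_id_priv->id.device->ops.iw_rem_ref(qp);
}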
PATCH:
-----
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 72141c5..7e1d834 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -372,6 +372,7 @@ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
static void destroy_cm_id(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
+ struct ib_qp *qp;
unsigned long flags;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
@@ -389,6 +390,9 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
spin_lock_irqsave(&cm_id_priv->lock, flags);
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
+
switch (cm_id_priv->state) {
case IW_CM_STATE_LISTEN:
cm_id_priv->state = IW_CM_STATE_DESTROYING;
@@ -401,7 +405,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
cm_id_priv->state = IW_CM_STATE_DESTROYING;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
/* Abrupt close of the connection */
- (void)iwcm_modify_qp_err(cm_id_priv->qp);
+ (void)iwcm_modify_qp_err(qp);
spin_lock_irqsave(&cm_id_priv->lock, flags);
break;
case IW_CM_STATE_IDLE:
@@ -426,11 +430,9 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
BUG();
break;
}
- if (cm_id_priv->qp) {
- cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
- cm_id_priv->qp = NULL;
- }
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id_priv->id.device->ops.iw_rem_ref(qp);
if (cm_id->mapped) {
iwpm_remove_mapinfo(&cm_id->local_addr,
&cm_id->m_local_addr);
@@ -671,11 +673,11 @@ int iw_cm_accept(struct iw_cm_id *cm_id,
BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
cm_id_priv->state = IW_CM_STATE_IDLE;
spin_lock_irqsave(&cm_id_priv->lock, flags);
- if (cm_id_priv->qp) {
- cm_id->device->ops.iw_rem_ref(qp);
- cm_id_priv->qp = NULL;
- }
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id->device->ops.iw_rem_ref(qp);
clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
wake_up_all(&cm_id_priv->connect_wait);
}
@@ -696,7 +698,7 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
struct iwcm_id_private *cm_id_priv;
int ret;
unsigned long flags;
- struct ib_qp *qp;
+ struct ib_qp *qp = NULL;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
@@ -730,13 +732,13 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
return 0; /* success */
spin_lock_irqsave(&cm_id_priv->lock, flags);
- if (cm_id_priv->qp) {
- cm_id->device->ops.iw_rem_ref(qp);
- cm_id_priv->qp = NULL;
- }
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
cm_id_priv->state = IW_CM_STATE_IDLE;
err:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id->device->ops.iw_rem_ref(qp);
clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
wake_up_all(&cm_id_priv->connect_wait);
return ret;
@@ -878,6 +880,7 @@ static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
struct iw_cm_event *iw_event)
{
+ struct ib_qp *qp = NULL;
unsigned long flags;
int ret;
@@ -896,11 +899,13 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
} else {
/* REJECTED or RESET */
- cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
+ qp = cm_id_priv->qp;
cm_id_priv->qp = NULL;
cm_id_priv->state = IW_CM_STATE_IDLE;
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id_priv->id.device->ops.iw_rem_ref(qp);
ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
if (iw_event->private_data_len)
@@ -942,14 +947,13 @@ static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
struct iw_cm_event *iw_event)
{
+ struct ib_qp *qp;
unsigned long flags;
int ret = 0;
spin_lock_irqsave(&cm_id_priv->lock, flags);
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
- if (cm_id_priv->qp) {
- cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
- cm_id_priv->qp = NULL;
- }
switch (cm_id_priv->state) {
case IW_CM_STATE_ESTABLISHED:
case IW_CM_STATE_CLOSING:
@@ -965,6 +969,8 @@ static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id_priv->id.device->ops.iw_rem_ref(qp);
return ret;
}
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index a8b9548..330483d 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -747,6 +747,7 @@ void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
struct list_head *pos, *nxt;
struct c4iw_qid_list *entry;
+ wait_event(uctx->wait, refcount_read(&uctx->refcnt) == 1);
mutex_lock(&uctx->lock);
list_for_each_safe(pos, nxt, &uctx->qpids) {
entry = list_entry(pos, struct c4iw_qid_list, entry);
@@ -775,6 +776,8 @@ void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
INIT_LIST_HEAD(&uctx->qpids);
INIT_LIST_HEAD(&uctx->cqids);
mutex_init(&uctx->lock);
+ init_waitqueue_head(&uctx->wait);
+ refcount_set(&uctx->refcnt, 1);
}
/* Caller takes care of locking if needed */
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 7d06b0f..441adc6 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -110,6 +110,8 @@ struct c4iw_dev_ucontext {
struct list_head cqids;
struct mutex lock;
struct kref kref;
+ wait_queue_head_t wait;
+ refcount_t refcnt;
};
enum c4iw_rdev_flags {
@@ -490,13 +492,13 @@ struct c4iw_qp {
struct t4_wq wq;
spinlock_t lock;
struct mutex mutex;
+ struct kref kref;
wait_queue_head_t wait;
int sq_sig_all;
struct c4iw_srq *srq;
+ struct work_struct free_work;
struct c4iw_ucontext *ucontext;
struct c4iw_wr_wait *wr_waitp;
- struct completion qp_rel_comp;
- refcount_t qp_refcnt;
};
static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
@@ -531,6 +533,7 @@ struct c4iw_ucontext {
spinlock_t mmap_lock;
struct list_head mmaps;
bool is_32b_cqe;
+ struct completion qp_comp;
};
static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c)
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index eb9368b..ca61193 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -889,17 +889,44 @@ static int build_inv_stag(union t4_wr *wqe, const struct ib_send_wr *wr,
return 0;
}
+static void free_qp_work(struct work_struct *work)
+{
+ struct c4iw_dev_ucontext *uctx;
+ struct c4iw_qp *qhp;
+ struct c4iw_dev *rhp;
+
+ qhp = container_of(work, struct c4iw_qp, free_work);
+ rhp = qhp->rhp;
+ uctx = qhp->ucontext ? &qhp->ucontext->uctx : &rhp->rdev.uctx;
+
+ pr_debug("qhp %p ucontext %p\n", qhp, qhp->ucontext);
+ destroy_qp(&rhp->rdev, &qhp->wq, uctx, !qhp->srq);
+
+ c4iw_put_wr_wait(qhp->wr_waitp);
+ kfree(qhp);
+ refcount_dec(&uctx->refcnt);
+ wake_up(&uctx->wait);
+}
+
+static void queue_qp_free(struct kref *kref)
+{
+ struct c4iw_qp *qhp;
+
+ qhp = container_of(kref, struct c4iw_qp, kref);
+ pr_debug("qhp %p\n", qhp);
+ queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work);
+}
+
void c4iw_qp_add_ref(struct ib_qp *qp)
{
pr_debug("ib_qp %p\n", qp);
- refcount_inc(&to_c4iw_qp(qp)->qp_refcnt);
+ kref_get(&to_c4iw_qp(qp)->kref);
}
void c4iw_qp_rem_ref(struct ib_qp *qp)
{
pr_debug("ib_qp %p\n", qp);
- if (refcount_dec_and_test(&to_c4iw_qp(qp)->qp_refcnt))
- complete(&to_c4iw_qp(qp)->qp_rel_comp);
+ kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free);
}
static void add_to_fc_list(struct list_head *head, struct list_head *entry)
@@ -2071,12 +2098,10 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
{
struct c4iw_dev *rhp;
struct c4iw_qp *qhp;
- struct c4iw_ucontext *ucontext;
struct c4iw_qp_attributes attrs;
qhp = to_c4iw_qp(ib_qp);
rhp = qhp->rhp;
- ucontext = qhp->ucontext;
attrs.next_state = C4IW_QP_STATE_ERROR;
if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
@@ -2094,17 +2119,7 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
c4iw_qp_rem_ref(ib_qp);
- wait_for_completion(&qhp->qp_rel_comp);
-
pr_debug("ib_qp %p qpid 0x%0x\n", ib_qp, qhp->wq.sq.qid);
- pr_debug("qhp %p ucontext %p\n", qhp, ucontext);
-
- destroy_qp(&rhp->rdev, &qhp->wq,
- ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq);
-
- c4iw_put_wr_wait(qhp->wr_waitp);
-
- kfree(qhp);
return 0;
}
@@ -2214,8 +2229,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
spin_lock_init(&qhp->lock);
mutex_init(&qhp->mutex);
init_waitqueue_head(&qhp->wait);
- init_completion(&qhp->qp_rel_comp);
- refcount_set(&qhp->qp_refcnt, 1);
+ kref_init(&qhp->kref);
+ INIT_WORK(&qhp->free_work, free_qp_work);
ret = xa_insert_irq(&rhp->qps, qhp->wq.sq.qid, qhp, GFP_KERNEL);
if (ret)
@@ -2335,6 +2350,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
if (attrs->srq)
qhp->srq = to_c4iw_srq(attrs->srq);
INIT_LIST_HEAD(&qhp->db_fc_entry);
+ refcount_inc(ucontext ? &ucontext->uctx.refcnt :
+ &rhp->rdev.uctx.refcnt);
pr_debug("sq id %u size %u memsize %zu num_entries %u rq id %u
size %u memsize %zu num_entries %u\n",
qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize,
attrs->cap.max_send_wr, qhp->wq.rq.qid,
qhp->wq.rq.size,
diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
index 0990307..743d3d2 100644
--- a/drivers/infiniband/sw/siw/siw_qp.c
+++ b/drivers/infiniband/sw/siw/siw_qp.c
@@ -1305,6 +1305,7 @@ int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
void siw_free_qp(struct kref *ref)
{
struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
+ struct siw_base_qp *siw_base_qp = to_siw_base_qp(qp->ib_qp);
struct siw_device *sdev = qp->sdev;
unsigned long flags;
@@ -1327,4 +1328,5 @@ void siw_free_qp(struct kref *ref)
atomic_dec(&sdev->num_qp);
siw_dbg_qp(qp, "free QP\n");
kfree_rcu(qp, rcu);
+ kfree(siw_base_qp);
}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 03176a3..35eee3f 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -605,7 +605,6 @@ int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
{
struct siw_qp *qp = to_siw_qp(base_qp);
- struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
struct siw_ucontext *uctx =
rdma_udata_to_drv_context(udata, struct siw_ucontext,
base_ucontext);
@@ -642,7 +641,6 @@ int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
qp->scq = qp->rcq = NULL;
siw_qp_put(qp);
- kfree(siw_base_qp);
return 0;
}
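The cxgb4 side of the patch works because the last QP reference may now
be dropped from atomic context (for example from iwcm event handlers):
the kref release callback only queues work, and the sleeping teardown
runs later in process context. The ucontext pairing (refcount_inc() at
QP create, refcount_dec() plus wake_up() in free_qp_work(), wait_event()
in c4iw_release_dev_ucontext()) then guarantees all deferred frees
finish before the ucontext goes away. The scheme in isolation, as a
generic sketch with a hypothetical "foo" object (not cxgb4 code):

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
	struct kref kref;
	struct work_struct free_work;
};

static void foo_free_work(struct work_struct *work)
{
	struct foo *f = container_of(work, struct foo, free_work);

	/* Process context: sleeping teardown is safe here. */
	kfree(f);
}

static void foo_release(struct kref *kref)
{
	struct foo *f = container_of(kref, struct foo, kref);

	/* May run in atomic context: defer the sleeping free. */
	queue_work(system_wq, &f->free_work);
}

static struct foo *foo_alloc(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (f) {
		kref_init(&f->kref);
		INIT_WORK(&f->free_work, foo_free_work);
	}
	return f;
}

static void foo_put(struct foo *f)
{
	kref_put(&f->kref, foo_release);	/* order independent */
}

With this shape, c4iw_destroy_qp() no longer needs to wait out the iwcm
reference; whichever holder drops the last reference triggers the
queued free.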
> On Tuesday, September 17, 2019 at 22:50:27 +0530, Sagi Grimberg wrote:
> >
> > >> To: "Krishnamraju Eraparaju" <krishna2@chelsio.com>
> > >> From: "Jason Gunthorpe" <jgg@ziepe.ca>
> > >> Date: 09/16/2019 06:28PM
> > >> Cc: "Steve Wise" <larrystevenwise@gmail.com>, "Bernard Metzler"
> > >> <BMT@zurich.ibm.com>, "Sagi Grimberg" <sagi@grimberg.me>,
> > >> "linux-rdma@vger.kernel.org" <linux-rdma@vger.kernel.org>
> > >> Subject: [EXTERNAL] Re: Re: [PATCH v3] iwcm: don't hold the irq
> > >> disabled lock on iw_rem_ref
> > >>
> > >> On Wed, Sep 11, 2019 at 09:28:16PM +0530, Krishnamraju Eraparaju
> > >> wrote:
> > >>> Hi Steve & Bernard,
> > >>>
> > >>> Thanks for the review comments.
> > >>> I will do those formating changes.
> > >>
> > >> I don't see anything in patchworks, but the consensus is to drop
> > >> Sagi's patch pending this future patch?
> > >>
> > >> Jason
> > >>
> > > This is my impression as well. But consensus should be
> > > explicit...Sagi, what do you think?
> >
> > I don't really care, but given that the changes from Krishnamraju cause
> > other problems, I'd ask whether my version also fails his test.
> Hi Sagi,
> Your version covers only the rdma_destroy_id() path, but there are
> other places where iw_rem_ref() is called while holding spinlocks. Here is
> the call trace when iw_rem_ref() is called in the cm_close_handler() path:
> [ 627.716339] Call Trace:
> [ 627.716339] ? load_new_mm_cr3+0xc0/0xc0
> [ 627.716339] on_each_cpu+0x23/0x40
> [ 627.716339] flush_tlb_kernel_range+0x74/0x80
> [ 627.716340] free_unmap_vmap_area+0x2a/0x40
> [ 627.716340] remove_vm_area+0x59/0x60
> [ 627.716340] __vunmap+0x46/0x210
> [ 627.716340] siw_free_qp+0x88/0x120 [siw]
> [ 627.716340] cm_work_handler+0x5b8/0xd00 <=====
> [ 627.716340] process_one_work+0x155/0x380
> [ 627.716341] worker_thread+0x41/0x3b0
> [ 627.716341] kthread+0xf3/0x130
> [ 627.716341] ? process_one_work+0x380/0x380
> [ 627.716341] ? kthread_bind+0x10/0x10
> [ 627.716341] ret_from_fork+0x35/0x40
>
> Hence, I moved all the occurrences of iw_rem_ref() outside the spinlock
> critical sections.
> >
> > In general, I do not think that calling resource free routines (whether
> > explicitly or implicitly via a ref decrement) under a spinlock is a
> > robust design.
> >
> > I would first make it clear and documented what cm_id_priv->lock is
> > protecting. In my mind, it should protect *its own* mutations of
> > cm_id_priv and by design leave all the ops calls outside the lock.
> >
> > I don't understand what is causing the observed Chelsio issue, but
> > it looks like c4iw_destroy_qp blocks on a completion that depends on a
> > refcount that is also taken by iwcm, which means that I cannot call
> > ib_destroy_qp if the cm is not destroyed as well?
> >
> > If that is mandatory, I'd say that instead of blocking on this
> > completion, we can simply convert c4iw_qp_rem_ref to use a kref,
> > which is not order dependent.
>
> I agree.
> I'm exploring the best possible ways to address this issue, and will post
> my final resolution by this Friday.