* [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
@ 2022-08-24  9:42 Cheng Xu
  2022-08-24  9:42 ` [PATCH for-next 1/2] RDMA/erdma: Introduce internal post_send/post_recv for qp drain Cheng Xu
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Cheng Xu @ 2022-08-24  9:42 UTC (permalink / raw)
  To: jgg, leon; +Cc: linux-rdma, KaiShen

Hi,

This series introduces erdma's implementation of drain_sq and drain_rq.
Our hardware will stop processing any new WRs once the QP state is error.
So the default __ib_drain_sq and __ib_drain_rq in the core code cannot
work for erdma. For this reason, we implement the drain_sq and drain_rq
interfaces.
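
For reference, the core-code drain moves the QP to the error state first
and only then posts its drain WR, relying on hardware to flush WRs posted
in the error state. A rough sketch of that flow, abridged from our reading
of drivers/infiniband/core/verbs.c (details may differ across kernel
versions):

static void __ib_drain_sq(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_drain_cqe sdrain;
	struct ib_rdma_wr swr = {
		.wr = {
			.wr_cqe = &sdrain.cqe,
			.opcode = IB_WR_RDMA_WRITE,
		},
	};

	/* 1. Move the QP to the error state first ... */
	if (ib_modify_qp(qp, &attr, IB_QP_STATE))
		return;

	sdrain.cqe.done = ib_drain_qp_done;
	init_completion(&sdrain.done);

	/* 2. ... then post a marker WR, which the device is expected to
	 * flush. Our hardware drops it instead, so the flush CQE never
	 * arrives.
	 */
	if (ib_post_send(qp, &swr.wr, NULL))
		return;

	/* 3. The marker's flush CQE means the SQ is fully drained. */
	wait_for_completion(&sdrain.done);
}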

When draining the SQ or RQ, we post both a drain send WR and a drain
recv WR, and then modify the QP to the error state. Finally, each
interface waits for its corresponding completion.

The first patch introduces internal post_send/post_recv for qp drain, and
the second patch implements the drain_sq and drain_rq of erdma.

Thanks,
Cheng Xu

Cheng Xu (2):
  RDMA/erdma: Introduce internal post_send/post_recv for qp drain
  RDMA/erdma: Add drain_sq and drain_rq support

 drivers/infiniband/hw/erdma/erdma_main.c  |   4 +-
 drivers/infiniband/hw/erdma/erdma_qp.c    | 116 +++++++++++++++++++++-
 drivers/infiniband/hw/erdma/erdma_verbs.h |  27 ++++-
 3 files changed, 136 insertions(+), 11 deletions(-)

-- 
2.27.0



* [PATCH for-next 1/2] RDMA/erdma: Introduce internal post_send/post_recv for qp drain
  2022-08-24  9:42 [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq Cheng Xu
@ 2022-08-24  9:42 ` Cheng Xu
  2022-08-24 12:10   ` Leon Romanovsky
  2022-08-24  9:42 ` [PATCH for-next 2/2] RDMA/erdma: Add drain_sq and drain_rq support Cheng Xu
  2022-08-24 14:08 ` [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq Tom Talpey
  2 siblings, 1 reply; 18+ messages in thread
From: Cheng Xu @ 2022-08-24  9:42 UTC (permalink / raw)
  To: jgg, leon; +Cc: linux-rdma, KaiShen

For erdma, the hardware won't process newly posted send WRs or recv WRs
after the QP state changes to error, and no flush CQEs will be generated
for them. So, internal post_send and post_recv functions are introduced
to reject new send WRs or recv WRs.

Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
---
 drivers/infiniband/hw/erdma/erdma_main.c  |  4 +-
 drivers/infiniband/hw/erdma/erdma_qp.c    | 45 ++++++++++++++++++++---
 drivers/infiniband/hw/erdma/erdma_verbs.h | 17 +++++++--
 3 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c
index 07e743d24847..4921ebc1286d 100644
--- a/drivers/infiniband/hw/erdma/erdma_main.c
+++ b/drivers/infiniband/hw/erdma/erdma_main.c
@@ -460,8 +460,8 @@ static const struct ib_device_ops erdma_device_ops = {
 	.mmap = erdma_mmap,
 	.mmap_free = erdma_mmap_free,
 	.modify_qp = erdma_modify_qp,
-	.post_recv = erdma_post_recv,
-	.post_send = erdma_post_send,
+	.post_recv = erdma_post_recv_nodrain,
+	.post_send = erdma_post_send_nodrain,
 	.poll_cq = erdma_poll_cq,
 	.query_device = erdma_query_device,
 	.query_gid = erdma_query_gid,
diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c
index bc3ec22a62c5..abf8b134d076 100644
--- a/drivers/infiniband/hw/erdma/erdma_qp.c
+++ b/drivers/infiniband/hw/erdma/erdma_qp.c
@@ -475,8 +475,8 @@ static void kick_sq_db(struct erdma_qp *qp, u16 pi)
 	writeq(db_data, qp->kern_qp.hw_sq_db);
 }
 
-int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
-		    const struct ib_send_wr **bad_send_wr)
+static int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
+			   const struct ib_send_wr **bad_send_wr, bool drain)
 {
 	struct erdma_qp *qp = to_eqp(ibqp);
 	int ret = 0;
@@ -488,6 +488,16 @@ int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
 		return -EINVAL;
 
 	spin_lock_irqsave(&qp->lock, flags);
+
+	if (unlikely(qp->flags & ERDMA_QP_FLAGS_TX_STOPPED)) {
+		*bad_send_wr = send_wr;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (unlikely(drain))
+		qp->flags |= ERDMA_QP_FLAGS_TX_STOPPED;
+
 	sq_pi = qp->kern_qp.sq_pi;
 
 	while (wr) {
@@ -507,11 +517,19 @@ int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
 
 		wr = wr->next;
 	}
-	spin_unlock_irqrestore(&qp->lock, flags);
 
+out:
+	spin_unlock_irqrestore(&qp->lock, flags);
 	return ret;
 }
 
+int erdma_post_send_nodrain(struct ib_qp *ibqp,
+			    const struct ib_send_wr *send_wr,
+			    const struct ib_send_wr **bad_send_wr)
+{
+	return erdma_post_send(ibqp, send_wr, bad_send_wr, false);
+}
+
 static int erdma_post_recv_one(struct erdma_qp *qp,
 			       const struct ib_recv_wr *recv_wr)
 {
@@ -542,8 +560,8 @@ static int erdma_post_recv_one(struct erdma_qp *qp,
 	return 0;
 }
 
-int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
-		    const struct ib_recv_wr **bad_recv_wr)
+static int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
+			   const struct ib_recv_wr **bad_recv_wr, bool drain)
 {
 	const struct ib_recv_wr *wr = recv_wr;
 	struct erdma_qp *qp = to_eqp(ibqp);
@@ -552,6 +570,15 @@ int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
 
 	spin_lock_irqsave(&qp->lock, flags);
 
+	if (unlikely(qp->flags & ERDMA_QP_FLAGS_RX_STOPPED)) {
+		ret = -EINVAL;
+		*bad_recv_wr = recv_wr;
+		goto out;
+	}
+
+	if (unlikely(drain))
+		qp->flags |= ERDMA_QP_FLAGS_RX_STOPPED;
+
 	while (wr) {
 		ret = erdma_post_recv_one(qp, wr);
 		if (ret) {
@@ -561,6 +588,14 @@ int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
 		wr = wr->next;
 	}
 
+out:
 	spin_unlock_irqrestore(&qp->lock, flags);
 	return ret;
 }
+
+int erdma_post_recv_nodrain(struct ib_qp *ibqp,
+			    const struct ib_recv_wr *recv_wr,
+			    const struct ib_recv_wr **bad_recv_wr)
+{
+	return erdma_post_recv(ibqp, recv_wr, bad_recv_wr, false);
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h
index c7baddb1f292..f4148fbac878 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.h
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.h
@@ -195,6 +195,12 @@ struct erdma_qp_attrs {
 	u8 pd_len;
 };
 
+enum erdma_qp_flags {
+	ERDMA_QP_FLAGS_DRAIN_ISSUED = (1 << 0),
+	ERDMA_QP_FLAGS_TX_STOPPED = (1 << 1),
+	ERDMA_QP_FLAGS_RX_STOPPED = (1 << 2),
+};
+
 struct erdma_qp {
 	struct ib_qp ibqp;
 	struct kref ref;
@@ -202,6 +208,7 @@ struct erdma_qp {
 	struct erdma_dev *dev;
 	struct erdma_cep *cep;
 	struct rw_semaphore state_lock;
+	unsigned long flags;
 
 	union {
 		struct erdma_kqp kern_qp;
@@ -328,10 +335,12 @@ void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry);
 void erdma_qp_get_ref(struct ib_qp *ibqp);
 void erdma_qp_put_ref(struct ib_qp *ibqp);
 struct ib_qp *erdma_get_ibqp(struct ib_device *dev, int id);
-int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
-		    const struct ib_send_wr **bad_send_wr);
-int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
-		    const struct ib_recv_wr **bad_recv_wr);
+int erdma_post_send_nodrain(struct ib_qp *ibqp,
+			    const struct ib_send_wr *send_wr,
+			    const struct ib_send_wr **bad_send_wr);
+int erdma_post_recv_nodrain(struct ib_qp *ibqp,
+			    const struct ib_recv_wr *recv_wr,
+			    const struct ib_recv_wr **bad_recv_wr);
 int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 				u32 max_num_sg);
-- 
2.27.0



* [PATCH for-next 2/2] RDMA/erdma: Add drain_sq and drain_rq support
  2022-08-24  9:42 [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq Cheng Xu
  2022-08-24  9:42 ` [PATCH for-next 1/2] RDMA/erdma: Introduce internal post_send/post_recv for qp drain Cheng Xu
@ 2022-08-24  9:42 ` Cheng Xu
  2022-08-24 12:10   ` Leon Romanovsky
  2022-08-25  1:59   ` Cheng Xu
  2022-08-24 14:08 ` [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq Tom Talpey
  2 siblings, 2 replies; 18+ messages in thread
From: Cheng Xu @ 2022-08-24  9:42 UTC (permalink / raw)
  To: jgg, leon; +Cc: linux-rdma, KaiShen

For erdma, the hardware won't process any WRs after the QP state is
modified to error, so the default __ib_drain_sq and __ib_drain_rq cannot
work for the erdma device. Here, we introduce a custom implementation of
the drain_sq and drain_rq interfaces to fit the erdma hardware.

Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
---
 drivers/infiniband/hw/erdma/erdma_qp.c    | 71 +++++++++++++++++++++++
 drivers/infiniband/hw/erdma/erdma_verbs.h | 10 ++++
 2 files changed, 81 insertions(+)

diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c
index abf8b134d076..57fdb946fbfd 100644
--- a/drivers/infiniband/hw/erdma/erdma_qp.c
+++ b/drivers/infiniband/hw/erdma/erdma_qp.c
@@ -599,3 +599,74 @@ int erdma_post_recv_nodrain(struct ib_qp *ibqp,
 {
 	return erdma_post_recv(ibqp, recv_wr, bad_recv_wr, false);
 }
+
+static void erdma_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct erdma_drain_cqe *cqe =
+		container_of(wc->wr_cqe, struct erdma_drain_cqe, cqe);
+
+	complete(&cqe->done);
+}
+
+static void erdma_drain_qp_common(struct ib_qp *ibqp, struct completion *comp,
+				  struct ib_cq *ibcq)
+{
+	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+	struct erdma_qp *qp = to_eqp(ibqp);
+	const struct ib_send_wr *bad_swr;
+	const struct ib_recv_wr *bad_rwr;
+	struct ib_rdma_wr swr = {
+		.wr = {
+			.next = NULL,
+			{ .wr_cqe   = &qp->kern_qp.sdrain.cqe, },
+			.opcode = IB_WR_RDMA_WRITE,
+			.send_flags = IB_SEND_SIGNALED,
+		},
+	};
+	struct ib_recv_wr rwr = {
+		.next = NULL,
+		.wr_cqe = &qp->kern_qp.rdrain.cqe,
+		.num_sge = 0,
+	};
+
+	if (qp->flags & ERDMA_QP_FLAGS_DRAIN_ISSUED)
+		goto wait_for_completion;
+
+	qp->flags |= ERDMA_QP_FLAGS_DRAIN_ISSUED;
+
+	qp->kern_qp.rdrain.cqe.done = erdma_drain_qp_done;
+	init_completion(&qp->kern_qp.rdrain.done);
+
+	qp->kern_qp.sdrain.cqe.done = erdma_drain_qp_done;
+	init_completion(&qp->kern_qp.sdrain.done);
+
+	if (erdma_post_recv(ibqp, &rwr, &bad_rwr, true))
+		return;
+
+	if (erdma_post_send(ibqp, &swr.wr, &bad_swr, true))
+		return;
+
+	if (ib_modify_qp(ibqp, &attr, IB_QP_STATE))
+		return;
+
+wait_for_completion:
+	if (ibcq->poll_ctx == IB_POLL_DIRECT)
+		while (wait_for_completion_timeout(comp, HZ / 10) <= 0)
+			ib_process_cq_direct(ibcq, -1);
+	else
+		wait_for_completion(comp);
+}
+
+void erdma_drain_sq(struct ib_qp *ibqp)
+{
+	struct erdma_qp *qp = to_eqp(ibqp);
+
+	erdma_drain_qp_common(ibqp, &qp->kern_qp.sdrain.done, ibqp->send_cq);
+}
+
+void erdma_drain_rq(struct ib_qp *ibqp)
+{
+	struct erdma_qp *qp = to_eqp(ibqp);
+
+	erdma_drain_qp_common(ibqp, &qp->kern_qp.rdrain.done, ibqp->recv_cq);
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h
index f4148fbac878..4cec92c8a737 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.h
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.h
@@ -133,6 +133,11 @@ struct erdma_uqp {
 	u32 rq_offset;
 };
 
+struct erdma_drain_cqe {
+	struct ib_cqe cqe;
+	struct completion done;
+};
+
 struct erdma_kqp {
 	u16 sq_pi;
 	u16 sq_ci;
@@ -155,6 +160,9 @@ struct erdma_kqp {
 	void *sq_db_info;
 	void *rq_db_info;
 
+	struct erdma_drain_cqe sdrain;
+	struct erdma_drain_cqe rdrain;
+
 	u8 sig_all;
 };
 
@@ -341,6 +349,8 @@ int erdma_post_send_nodrain(struct ib_qp *ibqp,
 int erdma_post_recv_nodrain(struct ib_qp *ibqp,
 			    const struct ib_recv_wr *recv_wr,
 			    const struct ib_recv_wr **bad_recv_wr);
+void erdma_drain_sq(struct ib_qp *ibqp);
+void erdma_drain_rq(struct ib_qp *ibqp);
 int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 				u32 max_num_sg);
-- 
2.27.0



* Re: [PATCH for-next 1/2] RDMA/erdma: Introduce internal post_send/post_recv for qp drain
  2022-08-24  9:42 ` [PATCH for-next 1/2] RDMA/erdma: Introduce internal post_send/post_recv for qp drain Cheng Xu
@ 2022-08-24 12:10   ` Leon Romanovsky
  0 siblings, 0 replies; 18+ messages in thread
From: Leon Romanovsky @ 2022-08-24 12:10 UTC (permalink / raw)
  To: Cheng Xu; +Cc: jgg, linux-rdma, KaiShen

On Wed, Aug 24, 2022 at 05:42:50PM +0800, Cheng Xu wrote:
> For erdma, the hardware won't process newly posted send WRs or recv WRs
> after the QP state changes to error, and no flush CQEs will be generated
> for them. So, internal post_send and post_recv functions are introduced
> to reject new send WRs or recv WRs.
> 
> Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
> ---
>  drivers/infiniband/hw/erdma/erdma_main.c  |  4 +-
>  drivers/infiniband/hw/erdma/erdma_qp.c    | 45 ++++++++++++++++++++---
>  drivers/infiniband/hw/erdma/erdma_verbs.h | 17 +++++++--
>  3 files changed, 55 insertions(+), 11 deletions(-)
> 

Thanks,
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>


* Re: [PATCH for-next 2/2] RDMA/erdma: Add drain_sq and drain_rq support
  2022-08-24  9:42 ` [PATCH for-next 2/2] RDMA/erdma: Add drain_sq and drain_rq support Cheng Xu
@ 2022-08-24 12:10   ` Leon Romanovsky
  2022-08-25  1:59   ` Cheng Xu
  1 sibling, 0 replies; 18+ messages in thread
From: Leon Romanovsky @ 2022-08-24 12:10 UTC (permalink / raw)
  To: Cheng Xu; +Cc: jgg, linux-rdma, KaiShen

On Wed, Aug 24, 2022 at 05:42:51PM +0800, Cheng Xu wrote:
> For erdma, the hardware won't process any WRs after the QP state is
> modified to error, so the default __ib_drain_sq and __ib_drain_rq cannot
> work for the erdma device. Here, we introduce a custom implementation of
> the drain_sq and drain_rq interfaces to fit the erdma hardware.
> 
> Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
> ---
>  drivers/infiniband/hw/erdma/erdma_qp.c    | 71 +++++++++++++++++++++++
>  drivers/infiniband/hw/erdma/erdma_verbs.h | 10 ++++
>  2 files changed, 81 insertions(+)
> 

Thanks,
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-24  9:42 [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq Cheng Xu
  2022-08-24  9:42 ` [PATCH for-next 1/2] RDMA/erdma: Introduce internal post_send/post_recv for qp drain Cheng Xu
  2022-08-24  9:42 ` [PATCH for-next 2/2] RDMA/erdma: Add drain_sq and drain_rq support Cheng Xu
@ 2022-08-24 14:08 ` Tom Talpey
  2022-08-24 14:56   ` Bernard Metzler
  2022-08-25  1:54   ` Cheng Xu
  2 siblings, 2 replies; 18+ messages in thread
From: Tom Talpey @ 2022-08-24 14:08 UTC (permalink / raw)
  To: Cheng Xu, jgg, leon; +Cc: linux-rdma, KaiShen

On 8/24/2022 5:42 AM, Cheng Xu wrote:
> Hi,
> 
> This series introduces erdma's implementation of drain_sq and drain_rq.
> Our hardware will stop processing any new WRs once the QP state is error.

Doesn't this violate the IB specification? Failing newly posted WRs
before older WRs have flushed to the CQ means that ordering is not
preserved. Many upper layers depend on this.

Tom.

> So the default __ib_drain_sq and __ib_drain_rq in the core code cannot
> work for erdma. For this reason, we implement the drain_sq and drain_rq
> interfaces.
> 
> When draining the SQ or RQ, we post both a drain send WR and a drain
> recv WR, and then modify the QP to the error state. Finally, each
> interface waits for its corresponding completion.
> 
> The first patch introduces internal post_send/post_recv for qp drain, and
> the second patch implements the drain_sq and drain_rq of erdma.
> 
> Thanks,
> Cheng Xu
> 
> Cheng Xu (2):
>    RDMA/erdma: Introduce internal post_send/post_recv for qp drain
>    RDMA/erdma: Add drain_sq and drain_rq support
> 
>   drivers/infiniband/hw/erdma/erdma_main.c  |   4 +-
>   drivers/infiniband/hw/erdma/erdma_qp.c    | 116 +++++++++++++++++++++-
>   drivers/infiniband/hw/erdma/erdma_verbs.h |  27 ++++-
>   3 files changed, 136 insertions(+), 11 deletions(-)
> 


* RE: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-24 14:08 ` [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq Tom Talpey
@ 2022-08-24 14:56   ` Bernard Metzler
  2022-08-25  1:54   ` Cheng Xu
  1 sibling, 0 replies; 18+ messages in thread
From: Bernard Metzler @ 2022-08-24 14:56 UTC (permalink / raw)
  To: Tom Talpey, Cheng Xu, jgg, leon; +Cc: linux-rdma, KaiShen

> -----Original Message-----
> From: Tom Talpey <tom@talpey.com>
> Sent: Wednesday, 24 August 2022 16:09
> To: Cheng Xu <chengyou@linux.alibaba.com>; jgg@ziepe.ca; leon@kernel.org
> Cc: linux-rdma@vger.kernel.org; KaiShen@linux.alibaba.com
> Subject: [EXTERNAL] Re: [PATCH for-next 0/2] RDMA/erdma: Introduce
> custom implementation of drain_sq and drain_rq
> 
> On 8/24/2022 5:42 AM, Cheng Xu wrote:
> > Hi,
> >
> > This series introduces erdma's implementation of drain_sq and drain_rq.
> > Our hardware will stop processing any new WRs once the QP state is error.
> 
> Doesn't this violate the IB specification? Failing newly posted WRs
> before older WRs have flushed to the CQ means that ordering is not
> preserved. Many upper layers depend on this.
> 

It would be ok to synchronously fail the post_send()/post_recv() call
if the QP is in error, or the WR is malformed. In that case, 
the WR does not translate into a WQE and will not produce a
work completion.
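
In consumer terms, assuming those semantics, a minimal sketch:

	const struct ib_send_wr *bad_wr;
	int ret;

	ret = ib_post_send(qp, wr, &bad_wr);
	if (ret) {
		/*
		 * Synchronous failure: bad_wr and everything chained
		 * after it never became WQEs, so no work completion
		 * will follow; the caller reclaims those WRs directly.
		 */
	}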

Bernard.


> Tom.
> 
> > So the default __ib_drain_sq and __ib_drain_rq in the core code
> > cannot work for erdma. For this reason, we implement the drain_sq
> > and drain_rq interfaces.
> >
> > When draining the SQ or RQ, we post both a drain send WR and a drain
> > recv WR, and then modify the QP to the error state. Finally, each
> > interface waits for its corresponding completion.
> >
> > The first patch introduces internal post_send/post_recv for qp drain,
> > and the second patch implements the drain_sq and drain_rq of erdma.
> >
> > Thanks,
> > Cheng Xu
> >
> > Cheng Xu (2):
> >    RDMA/erdma: Introduce internal post_send/post_recv for qp drain
> >    RDMA/erdma: Add drain_sq and drain_rq support
> >
> >   drivers/infiniband/hw/erdma/erdma_main.c  |   4 +-
> >   drivers/infiniband/hw/erdma/erdma_qp.c    | 116
> +++++++++++++++++++++-
> >   drivers/infiniband/hw/erdma/erdma_verbs.h |  27 ++++-
> >   3 files changed, 136 insertions(+), 11 deletions(-)
> >


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-24 14:08 ` [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq Tom Talpey
  2022-08-24 14:56   ` Bernard Metzler
@ 2022-08-25  1:54   ` Cheng Xu
  2022-08-25 16:37     ` Tom Talpey
  1 sibling, 1 reply; 18+ messages in thread
From: Cheng Xu @ 2022-08-25  1:54 UTC (permalink / raw)
  To: Tom Talpey, jgg, leon; +Cc: linux-rdma, KaiShen



On 8/24/22 10:08 PM, Tom Talpey wrote:
> On 8/24/2022 5:42 AM, Cheng Xu wrote:
>> Hi,
>>
>> This series introduces erdma's implementation of drain_sq and drain_rq.
>> Our hardware will stop processing any new WRs once the QP state is error.
> 
> Doesn't this violate the IB specification? Failing newly posted WRs
> before older WRs have flushed to the CQ means that ordering is not
> preserved.

I agree with Bernard's point.

I'm not very familiar with the IB specification. But for RNIC/iWarp [1],
posting a WR in the Error state has two optional actions: "Post WQE, and
then Flush it" or "Return an Immediate Error" (shown in Figure 10). So, I
think failing newly posted WRs is reasonable.

> Many upper layers depend on this.

For kernel verbs, erdma currently supports NoF. We tested the NoF cases
and found that new WRs were never posted after the QP changed to error;
the drain_qp should be the terminal step of the IO process.

Also, in userspace, I find that many providers prevent new WRs if the QP
state is not right.

So, it seems ok in practice.

Thanks,
Cheng Xu


[1] http://www.rdmaconsortium.org/home/draft-hilland-iwarp-verbs-v1.0-RDMAC.pdf



* Re: [PATCH for-next 2/2] RDMA/erdma: Add drain_sq and drain_rq support
  2022-08-24  9:42 ` [PATCH for-next 2/2] RDMA/erdma: Add drain_sq and drain_rq support Cheng Xu
  2022-08-24 12:10   ` Leon Romanovsky
@ 2022-08-25  1:59   ` Cheng Xu
  1 sibling, 0 replies; 18+ messages in thread
From: Cheng Xu @ 2022-08-25  1:59 UTC (permalink / raw)
  To: jgg, leon; +Cc: linux-rdma, KaiShen



On 8/24/22 5:42 PM, Cheng Xu wrote:
> For erdma, the hardware won't process any WRs after the QP state is
> modified to error, so the default __ib_drain_sq and __ib_drain_rq cannot
> work for the erdma device. Here, we introduce a custom implementation of
> the drain_sq and drain_rq interfaces to fit the erdma hardware.
> 
> Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
> ---
>  drivers/infiniband/hw/erdma/erdma_qp.c    | 71 +++++++++++++++++++++++
>  drivers/infiniband/hw/erdma/erdma_verbs.h | 10 ++++
>  2 files changed, 81 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c
> index abf8b134d076..57fdb946fbfd 100644
> --- a/drivers/infiniband/hw/erdma/erdma_qp.c
> +++ b/drivers/infiniband/hw/erdma/erdma_qp.c
> @@ -599,3 +599,74 @@ int erdma_post_recv_nodrain(struct ib_qp *ibqp,
>  {
>  	return erdma_post_recv(ibqp, recv_wr, bad_recv_wr, false);
>  }
> +
> +static void erdma_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +	struct erdma_drain_cqe *cqe =
> +		container_of(wc->wr_cqe, struct erdma_drain_cqe, cqe);
> +
> +	complete(&cqe->done);
> +}
> +
> +static void erdma_drain_qp_common(struct ib_qp *ibqp, struct completion *comp,
> +				  struct ib_cq *ibcq)
> +{
> +	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> +	struct erdma_qp *qp = to_eqp(ibqp);
> +	const struct ib_send_wr *bad_swr;
> +	const struct ib_recv_wr *bad_rwr;
> +	struct ib_rdma_wr swr = {
> +		.wr = {
> +			.next = NULL,
> +			{ .wr_cqe   = &qp->kern_qp.sdrain.cqe, },
> +			.opcode = IB_WR_RDMA_WRITE,
> +			.send_flags = IB_SEND_SIGNALED,
> +		},
> +	};
> +	struct ib_recv_wr rwr = {
> +		.next = NULL,
> +		.wr_cqe = &qp->kern_qp.rdrain.cqe,
> +		.num_sge = 0,
> +	};
> +
> +	if (qp->flags & ERDMA_QP_FLAGS_DRAIN_ISSUED)
> +		goto wait_for_completion;
> +
> +	qp->flags |= ERDMA_QP_FLAGS_DRAIN_ISSUED;
> +
> +	qp->kern_qp.rdrain.cqe.done = erdma_drain_qp_done;
> +	init_completion(&qp->kern_qp.rdrain.done);
> +
> +	qp->kern_qp.sdrain.cqe.done = erdma_drain_qp_done;
> +	init_completion(&qp->kern_qp.sdrain.done);
> +
> +	if (erdma_post_recv(ibqp, &rwr, &bad_rwr, true))
> +		return;
> +
> +	if (erdma_post_send(ibqp, &swr.wr, &bad_swr, true))
> +		return;
> +
> +	if (ib_modify_qp(ibqp, &attr, IB_QP_STATE))
> +		return;
> +
> +wait_for_completion:
> +	if (ibcq->poll_ctx == IB_POLL_DIRECT)
> +		while (wait_for_completion_timeout(comp, HZ / 10) <= 0)
> +			ib_process_cq_direct(ibcq, -1);
> +	else
> +		wait_for_completion(comp);
> +}
> +
> +void erdma_drain_sq(struct ib_qp *ibqp)
> +{
> +	struct erdma_qp *qp = to_eqp(ibqp);
> +
> +	erdma_drain_qp_common(ibqp, &qp->kern_qp.sdrain.done, ibqp->send_cq);
> +}
> +
> +void erdma_drain_rq(struct ib_qp *ibqp)
> +{
> +	struct erdma_qp *qp = to_eqp(ibqp);
> +
> +	erdma_drain_qp_common(ibqp, &qp->kern_qp.rdrain.done, ibqp->recv_cq);
> +}
> diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h
> index f4148fbac878..4cec92c8a737 100644
> --- a/drivers/infiniband/hw/erdma/erdma_verbs.h
> +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h
> @@ -133,6 +133,11 @@ struct erdma_uqp {
>  	u32 rq_offset;
>  };
>  
> +struct erdma_drain_cqe {
> +	struct ib_cqe cqe;
> +	struct completion done;
> +};
> +
>  struct erdma_kqp {
>  	u16 sq_pi;
>  	u16 sq_ci;
> @@ -155,6 +160,9 @@ struct erdma_kqp {
>  	void *sq_db_info;
>  	void *rq_db_info;
>  
> +	struct erdma_drain_cqe sdrain;
> +	struct erdma_drain_cqe rdrain;
> +
>  	u8 sig_all;
>  };
>  
> @@ -341,6 +349,8 @@ int erdma_post_send_nodrain(struct ib_qp *ibqp,
>  int erdma_post_recv_nodrain(struct ib_qp *ibqp,
>  			    const struct ib_recv_wr *recv_wr,
>  			    const struct ib_recv_wr **bad_recv_wr);
> +void erdma_drain_sq(struct ib_qp *ibqp);
> +void erdma_drain_rq(struct ib_qp *ibqp);
>  int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
>  struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
>  				u32 max_num_sg);

I'm sorry that I forgot to include the corresponding assignments in struct
ib_device_ops in this patch. I will send a v2 to fix this.


diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c
index 4921ebc1286d..e4ce77607f10 100644
--- a/drivers/infiniband/hw/erdma/erdma_main.c
+++ b/drivers/infiniband/hw/erdma/erdma_main.c
@@ -446,6 +446,8 @@ static const struct ib_device_ops erdma_device_ops = {
        .dereg_mr = erdma_dereg_mr,
        .destroy_cq = erdma_destroy_cq,
        .destroy_qp = erdma_destroy_qp,
+       .drain_rq = erdma_drain_rq,
+       .drain_sq = erdma_drain_sq,
        .get_dma_mr = erdma_get_dma_mr,
        .get_port_immutable = erdma_get_port_immutable,
        .iw_accept = erdma_accept,






* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-25  1:54   ` Cheng Xu
@ 2022-08-25 16:37     ` Tom Talpey
  2022-08-26  3:21       ` Cheng Xu
  0 siblings, 1 reply; 18+ messages in thread
From: Tom Talpey @ 2022-08-25 16:37 UTC (permalink / raw)
  To: Cheng Xu, jgg, leon; +Cc: linux-rdma, KaiShen

On 8/24/2022 9:54 PM, Cheng Xu wrote:
> 
> 
> On 8/24/22 10:08 PM, Tom Talpey wrote:
>> On 8/24/2022 5:42 AM, Cheng Xu wrote:
>>> Hi,
>>>
>>> This series introduces erdma's implementation of drain_sq and drain_rq.
>>> Our hardware will stop processing any new WRs once the QP state is error.
>>
>> Doesn't this violate the IB specification? Failing newly posted WRs
>> before older WRs have flushed to the CQ means that ordering is not
>> preserved.
> 
> I agree with Bernard's point.
> 
> I'm not very familiar with the IB specification. But for RNIC/iWarp [1],
> posting a WR in the Error state has two optional actions: "Post WQE, and
> then Flush it" or "Return an Immediate Error" (shown in Figure 10). So, I
> think failing newly posted WRs is reasonable.

Both IB and iWARP allow new post-WR calls to fail synchronously if
the QP is in the ERROR state. But the QP can only enter ERROR once the
SQ and RQ are fully drained to the CQ(s). Until that happens, the
WRs need to flush through.

Your code seems to start failing WRs when the TX_STOPPED or RX_STOPPED
bits are set. But those bits are being set when the drain *begins*, not
when it flushes through. That seems wrong to me.

>> Many upper layers depend on this.
> 
> For kernel verbs, erdma currently supports NoF. We tested the NoF cases
> and found that new WRs were never posted after the QP changed to error;
> the drain_qp should be the terminal step of the IO process.
> 
> Also, in userspace, I find that many providers prevent new WRs if the QP
> state is not right.

Sure, but your new STOPPED bits don't propagate up to userspace, right?
The post calls will fail unexpectedly, because the QP is not yet in
ERROR state.

I'm also concerned how consumers who are posting their own "drain" WQEs
will behave. This is a common approach that many already take. But now
they will see different behavior when posting them...

Tom.


> 
> So, it seems ok in practice.
> 
> Thanks,
> Cheng Xu
> 
> 
> [1] http://www.rdmaconsortium.org/home/draft-hilland-iwarp-verbs-v1.0-RDMAC.pdf
> 
> 


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-25 16:37     ` Tom Talpey
@ 2022-08-26  3:21       ` Cheng Xu
  2022-08-26 13:11         ` Tom Talpey
  0 siblings, 1 reply; 18+ messages in thread
From: Cheng Xu @ 2022-08-26  3:21 UTC (permalink / raw)
  To: Tom Talpey, jgg, leon; +Cc: linux-rdma, KaiShen



On 8/26/22 12:37 AM, Tom Talpey wrote:
> On 8/24/2022 9:54 PM, Cheng Xu wrote:
>>
>>
>> On 8/24/22 10:08 PM, Tom Talpey wrote:
>>> On 8/24/2022 5:42 AM, Cheng Xu wrote:
>>>> Hi,
>>>>
>>>> This series introduces erdma's implementation of drain_sq and drain_rq.
>>>> Our hardware will stop processing any new WRs once the QP state is error.
>>>
>>> Doesn't this violate the IB specification? Failing newly posted WRs
>>> before older WRs have flushed to the CQ means that ordering is not
>>> preserved.
>>
>> I agree with Bernard's point.
>>
>> I'm not very familiar with the IB specification. But for RNIC/iWarp [1],
>> posting a WR in the Error state has two optional actions: "Post WQE, and
>> then Flush it" or "Return an Immediate Error" (shown in Figure 10). So, I
>> think failing newly posted WRs is reasonable.
> 
> <...> But the QP can only enter ERROR once the
> SQ and RQ are fully drained to the CQ(s). Until that happens, the
> WRs need to flush through.
> 

Emm, let's put erdma aside first; it seems that the specification does not
require this. According to "6.2.4 Error State" in the document [1]:

 The following is done on entry into the Error state:
 * The RI MUST flush any incomplete WRs on the SQ or RQ. 
   .....
 * At some point in the execution of the flushing operation, the RI
   MUST begin to return an Immediate Error for any attempt to post
   a WR to a Work Queue;
   ....

As the second point says, the flushing operation and the behavior of
returning an Immediate Error are asynchronous. What you mentioned is not
guaranteed: failing the post_send/post_recv may happen at any time during
modify_qp to error.

[1] http://www.rdmaconsortium.org/home/draft-hilland-iwarp-verbs-v1.0-RDMAC.pdf

> Your code seems to start failing WRs when the TX_STOPPED or RX_STOPPED
> bits are set. But those bits are being set when the drain *begins*, not
> when it flushes through. That seems wrong to me.
> 

Back to erdma's scenario: as I explained above, I think failing immediately
when flushing begins does not violate the specification.


>>> Many upper layers depend on this.
>>
>> For kernel verbs, erdma currently supports NoF. We tested the NoF cases
>> and found that new WRs were never posted after the QP changed to error;
>> the drain_qp should be the terminal step of the IO process.
>>
>> Also, in userspace, I find that many providers prevent new WRs if the QP
>> state is not right.
> 
> Sure, but your new STOPPED bits don't propagate up to userspace, right?

Yes, they are only used for kernel QPs. The bits are used to set an
accurate point in time for rejecting new WRs when modifying the QP to
error.

> The post calls will fail unexpectedly, because the QP is not yet in
> ERROR state.

Do you mean this happens in userspace or in the kernel? The new bits do not
influence userspace, and erdma will have the same behavior as other
providers in userspace.

In the kernel, this is only used in the drain_qp scenario. Posting WRs and
draining the QP concurrently leads to uncertain results. This is also
mentioned in the comment of ib_drain_qp:

 * ensure that there are no other contexts that are posting WRs concurrently.
 * Otherwise the drain is not guaranteed.

> I'm also concerned how consumers who are posting their own "drain" WQEs
> will behave. This is a common approach that many already take. But now
> they will see different behavior when posting them...
> 

For userspace, erdma is not special.
For the kernel, I think the standard way to drain WRs is using ib_drain_qp,
not a custom implementation. I'm not sure whether any ULPs in the kernel
have their own drain flow.

Thanks,
Cheng Xu

> Tom.
> 
> 
>>
>> So, it seems ok in practice.
>>
>> Thanks,
>> Cheng Xu
>>
>>
>> [1] http://www.rdmaconsortium.org/home/draft-hilland-iwarp-verbs-v1.0-RDMAC.pdf
>>
>>


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-26  3:21       ` Cheng Xu
@ 2022-08-26 13:11         ` Tom Talpey
  2022-08-26 13:57           ` Jason Gunthorpe
  2022-08-29  3:37           ` Cheng Xu
  0 siblings, 2 replies; 18+ messages in thread
From: Tom Talpey @ 2022-08-26 13:11 UTC (permalink / raw)
  To: Cheng Xu, jgg, leon; +Cc: linux-rdma, KaiShen

On 8/25/2022 11:21 PM, Cheng Xu wrote:
> On 8/26/22 12:37 AM, Tom Talpey wrote:
>> On 8/24/2022 9:54 PM, Cheng Xu wrote:
>>>
>>>
>>> On 8/24/22 10:08 PM, Tom Talpey wrote:
>>>> On 8/24/2022 5:42 AM, Cheng Xu wrote:
>>>>> Hi,
>>>>>
>>>>> This series introduces erdma's implementation of drain_sq and drain_rq.
>>>>> Our hardware will stop processing any new WRs once the QP state is error.
>>>>
>>>> Doesn't this violate the IB specification? Failing newly posted WRs
>>>> before older WRs have flushed to the CQ means that ordering is not
>>>> preserved.
>>>
>>> I agree with Bernard's point.
>>>
>>> I'm not very familiar with the IB specification. But for RNIC/iWarp [1],
>>> posting a WR in the Error state has two optional actions: "Post WQE, and
>>> then Flush it" or "Return an Immediate Error" (shown in Figure 10). So, I
>>> think failing newly posted WRs is reasonable.
>>
>> <...> But the QP can only enter ERROR once the
>> SQ and RQ are fully drained to the CQ(s). Until that happens, the
>> WRs need to flush through.
>>
> 
> Emm, let's put erdma aside first; it seems that the specification does not
> require this. According to "6.2.4 Error State" in the document [1]:
> 
>   The following is done on entry into the Error state:
>   * The RI MUST flush any incomplete WRs on the SQ or RQ.
>     .....
>   * At some point in the execution of the flushing operation, the RI
>     MUST begin to return an Immediate Error for any attempt to post
>     a WR to a Work Queue;
>     ....
> 
> As the second point says, the flushing operation and the behavior of
> returning an Immediate Error are asynchronous. What you mentioned is not
> guaranteed: failing the post_send/post_recv may happen at any time during
> modify_qp to error.
> 
> [1] http://www.rdmaconsortium.org/home/draft-hilland-iwarp-verbs-v1.0-RDMAC.pdf

Well, that language is very imprecise, "at some point" is not exactly
definitive. I'll explain one scenario that makes it problematic.

>> Your code seems to start failing WRs when the TX_STOPPED or RX_STOPPED
>> bits are set. But those bits are being set when the drain *begins*, not
>> when it flushes through. That seems wrong to me.
>>
> 
> Back to erdma's scenario: as I explained above, I think failing immediately
> when flushing begins does not violate the specification.

Consider a consumer which posts with a mix of IB_SEND_SIGNALED and
also unsignaled WRs, for example, fast-memory registration followed
by a send, a very typical storage consumer operation.

- post_wr(memreg, !signaled) => post success
-      => operation success, no completion generated
- ...  <= provider detects error here
- post_wr(send, signaled) => post fail (new in your patch)
- ...  <= provider notifies async error, etc.

The consumer now knows there's an error, and needs to tear down.
It must remove the DMA mapping before proceeding, but the hardware
may still be using it. How does it determine the status of that
first post_wr, so it may proceed?
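
Roughly, in code (a hypothetical consumer sketch, not any particular
ULP; mr and req_cqe are assumed to exist):

	struct ib_reg_wr reg_wr = {
		.wr = { .opcode = IB_WR_REG_MR },	/* unsignaled */
		.mr = mr,
		.key = mr->rkey,
		.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ,
	};
	struct ib_send_wr send_wr = {
		.opcode = IB_WR_SEND,
		.send_flags = IB_SEND_SIGNALED,
		.wr_cqe = &req_cqe,
	};

	ib_post_send(qp, &reg_wr.wr, NULL);	/* succeeds, no CQE */
	/* ... provider detects the QP error here ... */
	ib_post_send(qp, &send_wr, NULL);	/* now fails at post time */

	/*
	 * No flush completion will ever cover the memreg, so the
	 * consumer cannot tell when the DMA mapping is safe to remove.
	 */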

The IB spec explicitly states that the post verb can only return
the immediate error after the QP has exited the ERROR state, which
includes all pending WRs having been flushed and made visible on
the CQ. Here is an excerpt from the Post Send Request section
11.4.1.1 specifying its output modifiers:

-> Invalid QP state.
-> Note: This error is returned only when the QP is in the Reset,
-> Init, or RTR states. It is not returned when the QP is in the Error
-> or Send Queue Error states due to race conditions that could
-> result in indeterminate behavior. Work Requests posted to the
-> Send Queue while the QP is in the Error or Send Queue Error
-> states are completed with a flush error.

So, the consumer will post a new, signaled, work request, and wait
for it to "flush through" by polling the CQ. Because WR's always
complete in-order, this final completion must appear after *all*
prior WR's, and this gives the consumer the green light to proceed.

With your change, ERDMA will pre-emptively fail such a newly posted
request, and generate no new completion. The consumer is left in limbo
on the status of its prior requests. Providers must not override this.

Tom.

>>>> Many upper layers depend on this.
>>>
>>> For kernel verbs, erdma currently supports NoF. We tested the NoF cases
>>> and found that new WRs were never posted after the QP changed to error;
>>> the drain_qp should be the terminal step of the IO process.
>>>
>>> Also, in userspace, I find that many providers prevent new WRs if the QP
>>> state is not right.
>>
>> Sure, but your new STOPPED bits don't propagate up to userspace, right?
> 
> Yes, they are only used for kernel QPs. The bits are used to set an
> accurate point in time for rejecting new WRs when modifying the QP to
> error.
> 
>> The post calls will fail unexpectedly, because the QP is not yet in
>> ERROR state.
> 
> Do you mean this happens in userspace or in the kernel? The new bits do not
> influence userspace, and erdma will have the same behavior as other
> providers in userspace.
> 
> In the kernel, this is only used in the drain_qp scenario. Posting WRs and
> draining the QP concurrently leads to uncertain results. This is also
> mentioned in the comment of ib_drain_qp:
> 
>   * ensure that there are no other contexts that are posting WRs concurrently.
>   * Otherwise the drain is not guaranteed.
> 
>> I'm also concerned how consumers who are posting their own "drain" WQEs
>> will behave. This is a common approach that many already take. But now
>> they will see different behavior when posting them...
>>
> 
> For userspace, erdma is not special.
> For the kernel, I think the standard way to drain WRs is using ib_drain_qp,
> not a custom implementation. I'm not sure whether any ULPs in the kernel
> have their own drain flow.
> 
> Thanks,
> Cheng Xu
> 
>> Tom.
>>
>>
>>>
>>> So, it seems ok in practice.
>>>
>>> Thanks,
>>> Cheng Xu
>>>
>>>
>>> [1] http://www.rdmaconsortium.org/home/draft-hilland-iwarp-verbs-v1.0-RDMAC.pdf
>>>
>>>
> 


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-26 13:11         ` Tom Talpey
@ 2022-08-26 13:57           ` Jason Gunthorpe
  2022-08-29  4:01             ` Cheng Xu
  2022-08-29  3:37           ` Cheng Xu
  1 sibling, 1 reply; 18+ messages in thread
From: Jason Gunthorpe @ 2022-08-26 13:57 UTC (permalink / raw)
  To: Tom Talpey; +Cc: Cheng Xu, leon, linux-rdma, KaiShen

On Fri, Aug 26, 2022 at 09:11:25AM -0400, Tom Talpey wrote:

> With your change, ERDMA will pre-emptively fail such a newly posted
> request, and generate no new completion. The consumer is left in limbo
> on the status of its prior requests. Providers must not override this.

Yeah, I tend to agree with Tom.

And I also want to point out that Linux RDMA verbs do not follow the
SW specifications of either IBTA or the iWarp group. We have our own
expectations for how these APIs work that our own ULPs rely on.

So pedantically debating what a software spec we don't follow says is
not relevant. The utility is to understand the intention and use cases
and ensure we cover the same. Usually this means we follow the spec :)

Jason


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-26 13:11         ` Tom Talpey
  2022-08-26 13:57           ` Jason Gunthorpe
@ 2022-08-29  3:37           ` Cheng Xu
  1 sibling, 0 replies; 18+ messages in thread
From: Cheng Xu @ 2022-08-29  3:37 UTC (permalink / raw)
  To: Tom Talpey, jgg, leon; +Cc: linux-rdma, KaiShen



On 8/26/22 9:11 PM, Tom Talpey wrote:
> On 8/25/2022 11:21 PM, Cheng Xu wrote:
>> On 8/26/22 12:37 AM, Tom Talpey wrote:
>>> On 8/24/2022 9:54 PM, Cheng Xu wrote:
>>>>
>>>>
>>>> On 8/24/22 10:08 PM, Tom Talpey wrote:
>>>>> On 8/24/2022 5:42 AM, Cheng Xu wrote:
>>>>>> Hi,
>>>>>>
>>>>>> This series introduces erdma's implementation of drain_sq and drain_rq.
>>>>>> Our hardware will stop processing any new WRs once the QP state is error.
>>>>>
>>>>> Doesn't this violate the IB specification? Failing newly posted WRs
>>>>> before older WRs have flushed to the CQ means that ordering is not
>>>>> preserved.
>>>>
>>>> I agree with Bernard's point.
>>>>
>>>> I'm not very familiar with the IB specification. But for RNIC/iWarp [1],
>>>> posting a WR in the Error state has two optional actions: "Post WQE, and
>>>> then Flush it" or "Return an Immediate Error" (shown in Figure 10). So, I
>>>> think failing newly posted WRs is reasonable.
>>>
>>> <...> But the QP can only enter ERROR once the
>>> SQ and RQ are fully drained to the CQ(s). Until that happens, the
>>> WRs need to flush through.
>>>
>>
>> Emm, let's put erdma aside first; it seems that the specification does not
>> require this. According to "6.2.4 Error State" in the document [1]:
>>
>>   The following is done on entry into the Error state:
>>   * The RI MUST flush any incomplete WRs on the SQ or RQ.
>>     .....
>>   * At some point in the execution of the flushing operation, the RI
>>     MUST begin to return an Immediate Error for any attempt to post
>>     a WR to a Work Queue;
>>     ....
>>
>> As the second point says, the flushing operation and the behavior of
>> returning an Immediate Error are asynchronous. What you mentioned is not
>> guaranteed: failing the post_send/post_recv may happen at any time during
>> modify_qp to error.
>>
>> [1] http://www.rdmaconsortium.org/home/draft-hilland-iwarp-verbs-v1.0-RDMAC.pdf
> 
> Well, that language is very imprecise, "at some point" is not exactly
> definitive. I'll explain one scenario that makes it problematic.
> 
>>> Your code seems to start failing WRs when the TX_STOPPED or RX_STOPPED
>>> bits are set. But those bits are being set when the drain *begins*, not
>>> when it flushes through. That seems wrong to me.
>>>
>>
>> Back to erdma's scenario: as I explained above, I think failing immediately
>> when flushing begins does not violate the specification.
> 
> Consider a consumer which posts with a mix of IB_SEND_SIGNALED and
> also unsignaled WRs, for example, fast-memory registration followed
> by a send, a very typical storage consumer operation.
> 
> - post_wr(memreg, !signaled) => post success
> -      => operation success, no completion generated
> - ...  <= provider detects error here
> - post_wr(send, signaled) => post fail (new in your patch)
> - ...  <= provider notifies async error, etc.
> 
> The consumer now knows there's an error, and needs to tear down.
> It must remove the DMA mapping before proceeding, but the hardware
> may still be using it. How does it determine the status of that
> first post_wr, so it may proceed?
> 
> The IB spec explicitly states that the post verb can only return
> the immediate error after the QP has exited the ERROR state, which
> includes all pending WRs having been flushed and made visible on
> the CQ. Here is an excerpt from the Post Send Request section
> 11.4.1.1 specifying its output modifiers:
> 
> -> Invalid QP state.
> -> Note: This error is returned only when the QP is in the Reset,
> -> Init, or RTR states. It is not returned when the QP is in the Error
> -> or Send Queue Error states due to race conditions that could
> -> result in indeterminate behavior. Work Requests posted to the
> -> Send Queue while the QP is in the Error or Send Queue Error
> -> states are completed with a flush error.
> 

Got it, thanks. The IB spec seems to be clearer.

> So, the consumer will post a new, signaled, work request, and wait
> for it to "flush through" by polling the CQ. Because WR's always
> complete in-order, this final completion must appear after *all*
> prior WR's, and this gives the consumer the green light to proceed.
> 

Yeah, this is right, and the default ib_drain_qp does it this way.

> With your change, ERDMA will pre-emptively fail such a newly posted
> request, and generate no new completion. The consumer is left in limbo
> on the status of its prior requests. Providers must not override this.

For the ULPs that do not use the ib_drain_qp interface, we will have a
problem.

But currently it seems that almost all the ULPs in the kernel call
ib_drain_qp to finish the drain flow, and ib_drain_qp allows vendors to
provide custom implementations that are invisible to ULPs.

Thanks,
Cheng Xu


> Tom.
> 



* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-26 13:57           ` Jason Gunthorpe
@ 2022-08-29  4:01             ` Cheng Xu
  2022-08-30 18:45               ` Tom Talpey
  0 siblings, 1 reply; 18+ messages in thread
From: Cheng Xu @ 2022-08-29  4:01 UTC (permalink / raw)
  To: Jason Gunthorpe, Tom Talpey; +Cc: leon, linux-rdma, KaiShen



On 8/26/22 9:57 PM, Jason Gunthorpe wrote:
> On Fri, Aug 26, 2022 at 09:11:25AM -0400, Tom Talpey wrote:
> 
>> With your change, ERDMA will pre-emptively fail such a newly posted
>> request, and generate no new completion. The consumer is left in limbo
>> on the status of its prior requests. Providers must not override this.
> 
> Yeah, I tend to agree with Tom.
> 
> And I also want to point out that Linux RDMA verbs do not follow the
> SW specifications of either IBTA or the iWarp group. We have our own
> expectations for how these APIs work that our own ULPs rely on.
> 
> So pedantically debating what a software spec we don't follow says is
> not relevant. The utility is to understand the intention and use cases
> and ensure we cover the same. Usually this means we follow the spec :)
> 

Yeah, I totally agree with this.

Actually, I thought that ULPs are not concerned with the details of how
the flushing and modify_qp are performed in the drivers. The drain flow is
handled by a single ib_drain_qp call for ULPs, and while the ib_drain_qp
API allows a vendor-custom implementation, this is invisible to ULPs.

ULPs which implement their own drain flow instead of using ib_drain_qp
(I think this is rare in the kernel) will fail on erdma.

Anyway, since our implementation is disputed, we'd like to keep the same
behavior as other vendors. Maybe a firmware update without driver changes,
or software flushing in the driver, will fix this.

Thanks,
Cheng Xu


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-29  4:01             ` Cheng Xu
@ 2022-08-30 18:45               ` Tom Talpey
  2022-08-31  2:08                 ` Cheng Xu
  2022-08-31  2:52                 ` Cheng Xu
  0 siblings, 2 replies; 18+ messages in thread
From: Tom Talpey @ 2022-08-30 18:45 UTC (permalink / raw)
  To: Cheng Xu, Jason Gunthorpe; +Cc: leon, linux-rdma, KaiShen

On 8/29/2022 12:01 AM, Cheng Xu wrote:
> 
> 
> On 8/26/22 9:57 PM, Jason Gunthorpe wrote:
>> On Fri, Aug 26, 2022 at 09:11:25AM -0400, Tom Talpey wrote:
>>
>>> With your change, ERDMA will pre-emptively fail such a newly posted
>>> request, and generate no new completion. The consumer is left in limbo
>>> on the status of its prior requests. Providers must not override this.
>>
>> Yeah, I tend to agree with Tom.
>>
>> And I also want to point out that Linux RDMA verbs do not follow the
>> SW specifications of either IBTA or the iWarp group. We have our own
>> expectations for how these APIs work that our own ULPs rely on.
>>
>> So pedantically debating what a software spec we don't follow says is
>> not relevant. The utility is to understand the intention and use cases
>> and ensure we cover the same. Usually this means we follow the spec :)
>>
> 
> Yeah, I totally agree with this.
> 
> Actually, I thought that ULPs are not concerned with the details of how
> the flushing and modify_qp are performed in the drivers. The drain flow is
> handled by a single ib_drain_qp call for ULPs, and while the ib_drain_qp
> API allows a vendor-custom implementation, this is invisible to ULPs.
> 
> ULPs which implement their own drain flow instead of using ib_drain_qp
> (I think this is rare in the kernel) will fail on erdma.
> 
> Anyway, since our implementation is disputed, we'd like to keep the same
> behavior as other vendors. Maybe a firmware update without driver changes,
> or software flushing in the driver, will fix this.

To be clear, my concern is about the ordering of CQE flushes with
respect to the WR posting failures. Draining the CQs in whatever way
you choose to optimize for your device is not the issue, although
it seems odd to me that you need such a thing.

The problem is that your patch started failing the new requests
_before_ the drain could be used to clean up. This introduced
two new provider behaviors that consumers would not expect:

- first error detected in a post call (on the fast path!)
- inability to determine if prior requests were complete

I'd really suggest getting a copy of the full IB spec and examining
the difference between QP "Error" and "SQ Error" states. They are
subtle but important.

Tom.


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-30 18:45               ` Tom Talpey
@ 2022-08-31  2:08                 ` Cheng Xu
  2022-08-31  2:52                 ` Cheng Xu
  1 sibling, 0 replies; 18+ messages in thread
From: Cheng Xu @ 2022-08-31  2:08 UTC (permalink / raw)
  To: Tom Talpey, Jason Gunthorpe; +Cc: leon, linux-rdma, KaiShen



On 8/31/22 2:45 AM, Tom Talpey wrote:
> On 8/29/2022 12:01 AM, Cheng Xu wrote:
>>
>>
>> On 8/26/22 9:57 PM, Jason Gunthorpe wrote:
>>> On Fri, Aug 26, 2022 at 09:11:25AM -0400, Tom Talpey wrote:
>>>
>>>> With your change, ERDMA will pre-emptively fail such a newly posted
>>>> request, and generate no new completion. The consumer is left in limbo
>>>> on the status of its prior requests. Providers must not override this.
>>>
>>> Yeah, I tend to agree with Tom.
>>>
>>> And I also want to point out that Linux RDMA verbs do not follow the
>>> SW specifications of either IBTA or the iWarp group. We have our own
>>> expectations for how these APIs work that our own ULPs rely on.
>>>
>>> So pedantically debating what a software spec we don't follow says is
>>> not relevant. The utility is to understand the intention and use cases
>>> and ensure we cover the same. Usually this means we follow the spec :)
>>>
>>
>> Yeah, I totally agree with this.
>>
>> Actually, I thought that ULPs are not concerned with the details of how
>> the flushing and modify_qp are performed in the drivers. The drain flow is
>> handled by a single ib_drain_qp call for ULPs, and while the ib_drain_qp
>> API allows a vendor-custom implementation, this is invisible to ULPs.
>>
>> ULPs which implement their own drain flow instead of using ib_drain_qp
>> (I think this is rare in the kernel) will fail on erdma.
>>
>> Anyway, since our implementation is disputed, we'd like to keep the same
>> behavior as other vendors. Maybe a firmware update without driver changes,
>> or software flushing in the driver, will fix this.
> 
> To be clear, my concern is about the ordering of CQE flushes with
> respect to the WR posting failures. Draining the CQs in whatever way
> you choose to optimize for your device is not the issue, although
> it seems odd to me that you need such a thing.
> 

Yeah, I understand what you are concerned about. I'm sorry that there may
have been ambiguity in my last reply.

After discussing internally, we would like to drop this patch (i.e., failing
WRs before the drain, or failing WRs in the QP Error state), because it
indeed has problems in the cases you mentioned. We are seeking new
solutions. The new solutions will not fail the WRs in drain cases, so erdma
will have the same behavior as other vendors.

Moreover, the reason we introduced this patch is that our hardware currently
does not flush new WRs in the QP Error state. So the new solution could be:
 - Let our hardware flush new WRs, or
 - Flush WRs in our driver if the hardware does not flush (a rough sketch
   of this option follows below).
Either of them will eliminate the odd logic in this patch. For now, we
prefer the first option.
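
A purely illustrative sketch of the second option (erdma_gen_flush_cqe()
is a made-up helper, not existing driver code):

	/*
	 * Instead of rejecting a WR posted while the QP is stopped,
	 * generate a flush-error completion for it in software so that
	 * CQE ordering is preserved for the consumer.
	 */
	if (unlikely(qp->flags & ERDMA_QP_FLAGS_TX_STOPPED)) {
		while (wr) {
			ret = erdma_gen_flush_cqe(qp->ibqp.send_cq,
						  wr->wr_cqe,
						  IB_WC_WR_FLUSH_ERR);
			if (ret)
				break;
			wr = wr->next;
		}
		goto out;
	}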

Thanks,
Cheng Xu


> The problem is that your patch started failing the new requests
> _before_ the drain could be used to clean up. This introduced
> two new provider behaviors that consumers would not expect:
> 
> - first error detected in a post call (on the fast path!)
> - inability to determine if prior requests were complete
> 
> I'd really suggest getting a copy of the full IB spec and examining
> the difference between QP "Error" and "SQ Error" states. They are
> subtle but important.
> 
> Tom.


* Re: [PATCH for-next 0/2] RDMA/erdma: Introduce custom implementation of drain_sq and drain_rq
  2022-08-30 18:45               ` Tom Talpey
  2022-08-31  2:08                 ` Cheng Xu
@ 2022-08-31  2:52                 ` Cheng Xu
  1 sibling, 0 replies; 18+ messages in thread
From: Cheng Xu @ 2022-08-31  2:52 UTC (permalink / raw)
  To: Tom Talpey, Jason Gunthorpe; +Cc: leon, linux-rdma, KaiShen



On 8/31/22 2:45 AM, Tom Talpey wrote:
> On 8/29/2022 12:01 AM, Cheng Xu wrote:
>>
>>
>> On 8/26/22 9:57 PM, Jason Gunthorpe wrote:
>>> On Fri, Aug 26, 2022 at 09:11:25AM -0400, Tom Talpey wrote:
>>>
>>>> With your change, ERDMA will pre-emptively fail such a newly posted
>>>> request, and generate no new completion. The consumer is left in limbo
>>>> on the status of its prior requests. Providers must not override this.
>>>
>>> Yeah, I tend to agree with Tom.
>>>
>>> And I also want to point out that Linux RDMA verbs do not follow the
>>> SW specifications of either IBTA or the iWarp group. We have our own
>>> expectations for how these APIs work that our own ULPs rely on.
>>>
>>> So pedantically debating what a software spec we don't follow says is
>>> not relevant. The utility is to understand the intention and use cases
>>> and ensure we cover the same. Usually this means we follow the spec :)
>>>
>>
>> Yeah, I totally agree with this.
>>
>> Actually, I thought that ULPs are not concerned with the details of how
>> the flushing and modify_qp are performed in the drivers. The drain flow is
>> handled by a single ib_drain_qp call for ULPs, and while the ib_drain_qp
>> API allows a vendor-custom implementation, this is invisible to ULPs.
>>
>> ULPs which implement their own drain flow instead of using ib_drain_qp
>> (I think this is rare in the kernel) will fail on erdma.
>>
>> Anyway, since our implementation is disputed, we'd like to keep the same
>> behavior as other vendors. Maybe a firmware update without driver changes,
>> or software flushing in the driver, will fix this.
> 
> To be clear, my concern is about the ordering of CQE flushes with
> respect to the WR posting failures. Draining the CQs in whatever way
> you choose to optimize for your device is not the issue, although
> it seems odd to me that you need such a thing.
> 
> The problem is that your patch started failing the new requests
> _before_ the drain could be used to clean up. This introduced
> two new provider behaviors that consumers would not expect:
> 
> - first error detected in a post call (on the fast path!)
> - inability to determine if prior requests were complete
> 
Yes, you are right. As I replied, we will drop this patch and follow the
common behavior, as other providers do.

> I'd really suggest getting a copy of the full IB spec and examining
> the difference between QP "Error" and "SQ Error" states. They are
> subtle but important.

Yeah, I'm already doing this. Thanks very much.

Cheng Xu


> Tom.

