* [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
@ 2021-12-30 12:14 Xiao Yang
  2021-12-30 12:14 ` [RFC PATCH 1/2] RDMA/rxe: Rename send_atomic_ack() and atomic member of struct resp_res Xiao Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 33+ messages in thread
From: Xiao Yang @ 2021-12-30 12:14 UTC (permalink / raw)
  To: linux-rdma
  Cc: yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian,
	tomasz.gromadzki, Xiao Yang

The IB Spec v1.5[1][2] added new RDMA operations (Atomic Write and Flush).
This patchset makes SoftRoCE support the new RDMA Atomic Write operation
on the RC service.

I added an RDMA Atomic Write API and an rdma_atomic_write example to my
rdma-core repository[3].  You can verify the patchset by building and
running the rdma_atomic_write example:
server:
$ ./rdma_atomic_write_server -s [server_address] -p [port_number]
client:
$ ./rdma_atomic_write_client -s [server_address] -p [port_number]
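
For reference, the client side of the example boils down to something
like this (a sketch against the API proposed in [3]; mr, remote_addr and
rkey come from the usual connection setup, and the exact helper
signature may still change):

    /* local 8-byte source buffer, registered as mr beforehand */
    uint64_t val = 0x1234567890abcdefULL;
    struct ibv_sge sge = {
        .addr   = (uintptr_t)&val,
        .length = sizeof(val),
        .lkey   = mr->lkey,
    };

    ret = rdma_post_atomic_writev(id, NULL, &sge, 1, IBV_SEND_SIGNALED,
                                  remote_addr, rkey);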

[1]: https://www.infinibandta.org/ibta-specification/ # login required
[2]: https://www.infinibandta.org/wp-content/uploads/2021/08/IBTA-Overview-of-IBTA-Volume-1-Release-1.5-and-MPE-2021-08-17-Secure.pptx
[3]: https://github.com/yangx-jy/rdma-core

BTW: This patchset also needs the following fix.
https://www.spinics.net/lists/linux-rdma/msg107838.html

Xiao Yang (2):
  RDMA/rxe: Rename send_atomic_ack() and atomic member of struct
    resp_res
  RDMA/rxe: Add RDMA Atomic Write operation

 drivers/infiniband/sw/rxe/rxe_comp.c   |  4 ++
 drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++
 drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
 drivers/infiniband/sw/rxe/rxe_qp.c     |  5 ++-
 drivers/infiniband/sw/rxe/rxe_req.c    | 10 +++--
 drivers/infiniband/sw/rxe/rxe_resp.c   | 59 ++++++++++++++++++++------
 drivers/infiniband/sw/rxe/rxe_verbs.h  |  2 +-
 include/rdma/ib_pack.h                 |  2 +
 include/rdma/ib_verbs.h                |  2 +
 include/uapi/rdma/ib_user_verbs.h      |  2 +
 10 files changed, 88 insertions(+), 19 deletions(-)

-- 
2.23.0





* [RFC PATCH 1/2] RDMA/rxe: Rename send_atomic_ack() and atomic member of struct resp_res
  2021-12-30 12:14 [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
@ 2021-12-30 12:14 ` Xiao Yang
  2021-12-30 12:14 ` [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 33+ messages in thread
From: Xiao Yang @ 2021-12-30 12:14 UTC (permalink / raw)
  To: linux-rdma
  Cc: yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian,
	tomasz.gromadzki, Xiao Yang

send_atomic_ack() and the atomic member of struct resp_res will be
shared with other response types in the future, so rename them to
send_resp() and resp respectively.

Signed-off-by: Xiao Yang <yangx.jy@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe_qp.c    |  2 +-
 drivers/infiniband/sw/rxe/rxe_resp.c  | 10 +++++-----
 drivers/infiniband/sw/rxe/rxe_verbs.h |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 54b8711321c1..f3eec350f95c 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -136,7 +136,7 @@ static void free_rd_atomic_resources(struct rxe_qp *qp)
 void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
 {
 	if (res->type == RXE_ATOMIC_MASK) {
-		kfree_skb(res->atomic.skb);
+		kfree_skb(res->resp.skb);
 	} else if (res->type == RXE_READ_MASK) {
 		if (res->read.mr)
 			rxe_drop_ref(res->read.mr);
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 380934e38923..738e6b6335e5 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -959,7 +959,7 @@ static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
 	return err;
 }
 
-static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
 			   u8 syndrome)
 {
 	int rc = 0;
@@ -981,7 +981,7 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
 
 	skb_get(skb);
 	res->type = RXE_ATOMIC_MASK;
-	res->atomic.skb = skb;
+	res->resp.skb = skb;
 	res->first_psn = ack_pkt.psn;
 	res->last_psn  = ack_pkt.psn;
 	res->cur_psn   = ack_pkt.psn;
@@ -1004,7 +1004,7 @@ static enum resp_states acknowledge(struct rxe_qp *qp,
 	if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
 		send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
 	else if (pkt->mask & RXE_ATOMIC_MASK)
-		send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED);
+		send_resp(qp, pkt, AETH_ACK_UNLIMITED);
 	else if (bth_ack(pkt))
 		send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
 
@@ -1113,9 +1113,9 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
 		/* Find the operation in our list of responder resources. */
 		res = find_resource(qp, pkt->psn);
 		if (res) {
-			skb_get(res->atomic.skb);
+			skb_get(res->resp.skb);
 			/* Resend the result. */
-			rc = rxe_xmit_packet(qp, pkt, res->atomic.skb);
+			rc = rxe_xmit_packet(qp, pkt, res->resp.skb);
 			if (rc) {
 				pr_err("Failed resending result. This flow is not handled - skb ignored\n");
 				rc = RESPST_CLEANUP;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 35e041450090..e0606e5f9962 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -155,7 +155,7 @@ struct resp_res {
 	union {
 		struct {
 			struct sk_buff	*skb;
-		} atomic;
+		} resp;
 		struct {
 			struct rxe_mr	*mr;
 			u64		va_org;
-- 
2.23.0





* [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 12:14 [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
  2021-12-30 12:14 ` [RFC PATCH 1/2] RDMA/rxe: Rename send_atomic_ack() and atomic member of struct resp_res Xiao Yang
@ 2021-12-30 12:14 ` Xiao Yang
  2021-12-30 21:39   ` Tom Talpey
  2021-12-31  3:01   ` lizhijian
  2021-12-30 19:21 ` [RFC PATCH 0/2] " Gromadzki, Tomasz
  2022-02-17  3:50 ` yangx.jy
  3 siblings, 2 replies; 33+ messages in thread
From: Xiao Yang @ 2021-12-30 12:14 UTC (permalink / raw)
  To: linux-rdma
  Cc: yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian,
	tomasz.gromadzki, Xiao Yang

This patch implements the RDMA Atomic Write operation for the RC service.

Signed-off-by: Xiao Yang <yangx.jy@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe_comp.c   |  4 +++
 drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++
 drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
 drivers/infiniband/sw/rxe/rxe_qp.c     |  3 +-
 drivers/infiniband/sw/rxe/rxe_req.c    | 10 ++++--
 drivers/infiniband/sw/rxe/rxe_resp.c   | 49 +++++++++++++++++++++-----
 include/rdma/ib_pack.h                 |  2 ++
 include/rdma/ib_verbs.h                |  2 ++
 include/uapi/rdma/ib_user_verbs.h      |  2 ++
 9 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index d771ba8449a1..5a5c0c3501de 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -104,6 +104,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
 	case IB_WR_LOCAL_INV:			return IB_WC_LOCAL_INV;
 	case IB_WR_REG_MR:			return IB_WC_REG_MR;
 	case IB_WR_BIND_MW:			return IB_WC_BIND_MW;
+	case IB_WR_RDMA_ATOMIC_WRITE:		return IB_WC_RDMA_ATOMIC_WRITE;
 
 	default:
 		return 0xff;
@@ -256,6 +257,9 @@ static inline enum comp_state check_ack(struct rxe_qp *qp,
 		if ((syn & AETH_TYPE_MASK) != AETH_ACK)
 			return COMPST_ERROR;
 
+		if (wqe->wr.opcode == IB_WR_RDMA_ATOMIC_WRITE)
+			return COMPST_WRITE_SEND;
+
 		fallthrough;
 		/* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH)
 		 */
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c
index 3ef5a10a6efd..bd3639534d5a 100644
--- a/drivers/infiniband/sw/rxe/rxe_opcode.c
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.c
@@ -103,6 +103,12 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
 			[IB_QPT_UC]	= WR_LOCAL_OP_MASK,
 		},
 	},
+	[IB_WR_RDMA_ATOMIC_WRITE]                       = {
+		.name   = "IB_WR_RDMA_ATOMIC_WRITE",
+		.mask   = {
+			[IB_QPT_RC]     = WR_INLINE_MASK | WR_ATOMIC_WRITE_MASK,
+		},
+	},
 };
 
 struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
@@ -379,6 +385,18 @@ struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
 						+ RXE_IETH_BYTES,
 		}
 	},
+	[IB_OPCODE_RC_RDMA_ATOMIC_WRITE]                        = {
+		.name   = "IB_OPCODE_RC_RDMA_ATOMIC_WRITE",
+		.mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+				| RXE_ATOMIC_WRITE_MASK | RXE_START_MASK
+				| RXE_END_MASK,
+		.length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+		.offset = {
+			[RXE_BTH]       = 0,
+			[RXE_RETH]      = RXE_BTH_BYTES,
+			[RXE_PAYLOAD]   = RXE_BTH_BYTES + RXE_RETH_BYTES,
+		}
+	},
 
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]			= {
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
index 8f9aaaf260f2..a470e9b0b884 100644
--- a/drivers/infiniband/sw/rxe/rxe_opcode.h
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
@@ -20,6 +20,7 @@ enum rxe_wr_mask {
 	WR_READ_MASK			= BIT(3),
 	WR_WRITE_MASK			= BIT(4),
 	WR_LOCAL_OP_MASK		= BIT(5),
+	WR_ATOMIC_WRITE_MASK		= BIT(7),
 
 	WR_READ_OR_WRITE_MASK		= WR_READ_MASK | WR_WRITE_MASK,
 	WR_WRITE_OR_SEND_MASK		= WR_WRITE_MASK | WR_SEND_MASK,
@@ -81,6 +82,8 @@ enum rxe_hdr_mask {
 
 	RXE_LOOPBACK_MASK	= BIT(NUM_HDR_TYPES + 12),
 
+	RXE_ATOMIC_WRITE_MASK   = BIT(NUM_HDR_TYPES + 14),
+
 	RXE_READ_OR_ATOMIC_MASK	= (RXE_READ_MASK | RXE_ATOMIC_MASK),
 	RXE_WRITE_OR_SEND_MASK	= (RXE_WRITE_MASK | RXE_SEND_MASK),
 	RXE_READ_OR_WRITE_MASK	= (RXE_READ_MASK | RXE_WRITE_MASK),
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index f3eec350f95c..22a1c4bcfa60 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -135,7 +135,8 @@ static void free_rd_atomic_resources(struct rxe_qp *qp)
 
 void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
 {
-	if (res->type == RXE_ATOMIC_MASK) {
+	if (res->type == RXE_ATOMIC_MASK ||
+			res->type == RXE_ATOMIC_WRITE_MASK) {
 		kfree_skb(res->resp.skb);
 	} else if (res->type == RXE_READ_MASK) {
 		if (res->read.mr)
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index 0c9d2af15f3d..f024fb6e67c9 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -240,6 +240,10 @@ static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
 		else
 			return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
 				IB_OPCODE_RC_SEND_FIRST;
+
+	case IB_WR_RDMA_ATOMIC_WRITE:
+		return IB_OPCODE_RC_RDMA_ATOMIC_WRITE;
+
 	case IB_WR_REG_MR:
 	case IB_WR_LOCAL_INV:
 		return opcode;
@@ -463,7 +467,7 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 	if (err)
 		return err;
 
-	if (pkt->mask & RXE_WRITE_OR_SEND_MASK) {
+	if (pkt->mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) {
 		if (wqe->wr.send_flags & IB_SEND_INLINE) {
 			u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
 
@@ -680,13 +684,13 @@ int rxe_requester(void *arg)
 	}
 
 	mask = rxe_opcode[opcode].mask;
-	if (unlikely(mask & RXE_READ_OR_ATOMIC_MASK)) {
+	if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK))) {
 		if (check_init_depth(qp, wqe))
 			goto exit;
 	}
 
 	mtu = get_mtu(qp);
-	payload = (mask & RXE_WRITE_OR_SEND_MASK) ? wqe->dma.resid : 0;
+	payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ? wqe->dma.resid : 0;
 	if (payload > mtu) {
 		if (qp_type(qp) == IB_QPT_UD) {
 			/* C10-93.1.1: If the total sum of all the buffer lengths specified for a
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 738e6b6335e5..dc9af8b06bd1 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -258,7 +258,7 @@ static enum resp_states check_op_valid(struct rxe_qp *qp,
 	case IB_QPT_RC:
 		if (((pkt->mask & RXE_READ_MASK) &&
 		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) ||
-		    ((pkt->mask & RXE_WRITE_MASK) &&
+		    ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) &&
 		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) ||
 		    ((pkt->mask & RXE_ATOMIC_MASK) &&
 		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) {
@@ -362,7 +362,7 @@ static enum resp_states check_resource(struct rxe_qp *qp,
 		}
 	}
 
-	if (pkt->mask & RXE_READ_OR_ATOMIC_MASK) {
+	if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) {
 		/* it is the requesters job to not send
 		 * too many read/atomic ops, we just
 		 * recycle the responder resource queue
@@ -413,7 +413,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
 	enum resp_states state;
 	int access;
 
-	if (pkt->mask & RXE_READ_OR_WRITE_MASK) {
+	if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
 		if (pkt->mask & RXE_RETH_MASK) {
 			qp->resp.va = reth_va(pkt);
 			qp->resp.offset = 0;
@@ -479,7 +479,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
 		goto err;
 	}
 
-	if (pkt->mask & RXE_WRITE_MASK)	 {
+	if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
 		if (resid > mtu) {
 			if (pktlen != mtu || bth_pad(pkt)) {
 				state = RESPST_ERR_LENGTH;
@@ -591,6 +591,32 @@ static enum resp_states process_atomic(struct rxe_qp *qp,
 	return ret;
 }
 
+
+/* Guarantee atomicity of atomic write operations at the machine level. */
+static DEFINE_SPINLOCK(atomic_write_ops_lock);
+
+static enum resp_states process_atomic_write(struct rxe_qp *qp,
+					     struct rxe_pkt_info *pkt)
+{
+	u64 *src, *dst;
+	struct rxe_mr *mr = qp->resp.mr;
+
+	src = payload_addr(pkt);
+
+	dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64));
+	if (!dst || (uintptr_t)dst & 7)
+		return RESPST_ERR_MISALIGNED_ATOMIC;
+
+	spin_lock_bh(&atomic_write_ops_lock);
+	*dst = *src;
+	spin_unlock_bh(&atomic_write_ops_lock);
+
+	/* decrease resp.resid to zero */
+	qp->resp.resid -= sizeof(u64);
+
+	return RESPST_NONE;
+}
+
 static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
 					  struct rxe_pkt_info *pkt,
 					  struct rxe_pkt_info *ack,
@@ -801,6 +827,10 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
 		err = process_atomic(qp, pkt);
 		if (err)
 			return err;
+	} else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
+		err = process_atomic_write(qp, pkt);
+		if (err)
+			return err;
 	} else {
 		/* Unreachable */
 		WARN_ON_ONCE(1);
@@ -967,9 +997,12 @@ static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
 	struct sk_buff *skb;
 	struct resp_res *res;
 
+	int opcode = pkt->mask & RXE_ATOMIC_MASK ?
+				IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE :
+				IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
+
 	skb = prepare_ack_packet(qp, pkt, &ack_pkt,
-				 IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
-				 syndrome);
+				 opcode, 0, pkt->psn, syndrome);
 	if (!skb) {
 		rc = -ENOMEM;
 		goto out;
@@ -980,7 +1013,7 @@ static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
 	rxe_advance_resp_resource(qp);
 
 	skb_get(skb);
-	res->type = RXE_ATOMIC_MASK;
+	res->type = pkt->mask & (RXE_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK);
 	res->resp.skb = skb;
 	res->first_psn = ack_pkt.psn;
 	res->last_psn  = ack_pkt.psn;
@@ -1003,7 +1036,7 @@ static enum resp_states acknowledge(struct rxe_qp *qp,
 
 	if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
 		send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
-	else if (pkt->mask & RXE_ATOMIC_MASK)
+	else if (pkt->mask & (RXE_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK))
 		send_resp(qp, pkt, AETH_ACK_UNLIMITED);
 	else if (bth_ack(pkt))
 		send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index a9162f25beaf..bd12e1049ef8 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -84,6 +84,7 @@ enum {
 	/* opcode 0x15 is reserved */
 	IB_OPCODE_SEND_LAST_WITH_INVALIDATE         = 0x16,
 	IB_OPCODE_SEND_ONLY_WITH_INVALIDATE         = 0x17,
+	IB_OPCODE_RDMA_ATOMIC_WRITE                 = 0x19,
 
 	/* real constants follow -- see comment about above IB_OPCODE()
 	   macro for more details */
@@ -112,6 +113,7 @@ enum {
 	IB_OPCODE(RC, FETCH_ADD),
 	IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
 	IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+	IB_OPCODE(RC, RDMA_ATOMIC_WRITE),
 
 	/* UC */
 	IB_OPCODE(UC, SEND_FIRST),
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6e9ad656ecb7..949b3586d35b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -971,6 +971,7 @@ enum ib_wc_opcode {
 	IB_WC_REG_MR,
 	IB_WC_MASKED_COMP_SWAP,
 	IB_WC_MASKED_FETCH_ADD,
+	IB_WC_RDMA_ATOMIC_WRITE = IB_UVERBS_WC_RDMA_ATOMIC_WRITE,
 /*
  * Set value of IB_WC_RECV so consumers can test if a completion is a
  * receive by testing (opcode & IB_WC_RECV).
@@ -1311,6 +1312,7 @@ enum ib_wr_opcode {
 		IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
 	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
 		IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+	IB_WR_RDMA_ATOMIC_WRITE = IB_UVERBS_WR_RDMA_ATOMIC_WRITE,
 
 	/* These are kernel only and can not be issued by userspace */
 	IB_WR_REG_MR = 0x20,
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 7ee73a0652f1..3b0b509fb96f 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -466,6 +466,7 @@ enum ib_uverbs_wc_opcode {
 	IB_UVERBS_WC_BIND_MW = 5,
 	IB_UVERBS_WC_LOCAL_INV = 6,
 	IB_UVERBS_WC_TSO = 7,
+	IB_UVERBS_WC_RDMA_ATOMIC_WRITE = 9,
 };
 
 struct ib_uverbs_wc {
@@ -784,6 +785,7 @@ enum ib_uverbs_wr_opcode {
 	IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
 	IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
 	IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
+	IB_UVERBS_WR_RDMA_ATOMIC_WRITE = 15,
 	/* Review enum ib_wr_opcode before modifying this */
 };
 
-- 
2.23.0





* RE: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 12:14 [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
  2021-12-30 12:14 ` [RFC PATCH 1/2] RDMA/rxe: Rename send_atomic_ack() and atomic member of struct resp_res Xiao Yang
  2021-12-30 12:14 ` [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
@ 2021-12-30 19:21 ` Gromadzki, Tomasz
  2021-12-30 21:42   ` Tom Talpey
  2022-02-17  3:50 ` yangx.jy
  3 siblings, 1 reply; 33+ messages in thread
From: Gromadzki, Tomasz @ 2021-12-30 19:21 UTC (permalink / raw)
  To: Xiao Yang, linux-rdma; +Cc: yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian

1)
> rdma_post_atomic_writev(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl,
>			int nsge, int flags, uint64_t remote_addr, uint32_t rkey)

Do we need this API at all?
Other atomic operations (compare_swap/add) do not use struct ibv_sge at all but have a dedicated place in 
struct ib_send_wr {
...
		struct {
			uint64_t	remote_addr;
			uint64_t	compare_add;
			uint64_t	swap;
			uint32_t	rkey;
		} atomic;
...
}

Would it be better to reuse (extend) any existing field?
i.e.
		struct {
			uint64_t	remote_addr;
			uint64_t	compare_add;
			uint64_t	swap_write;
			uint32_t	rkey;
		} atomic;
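
With that change an atomic write could be posted much like a
compare-and-swap is today, e.g. (hypothetical opcode and field names,
only to illustrate the shape):

    struct ibv_send_wr wr = {0}, *bad_wr;

    wr.opcode                = IBV_WR_RDMA_ATOMIC_WRITE; /* proposed */
    wr.send_flags            = IBV_SEND_SIGNALED;
    wr.wr.atomic.remote_addr = remote_addr;
    wr.wr.atomic.rkey        = rkey;
    wr.wr.atomic.swap_write  = value; /* the 8 bytes to be written */

    ret = ibv_post_send(qp, &wr, &bad_wr);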

Thanks
Tomasz

> -----Original Message-----
> From: Xiao Yang <yangx.jy@fujitsu.com>
> Sent: Thursday, December 30, 2021 1:14 PM
> To: linux-rdma@vger.kernel.org
> Cc: yanjun.zhu@linux.dev; rpearsonhpe@gmail.com; jgg@nvidia.com; y-
> goto@fujitsu.com; lizhijian@fujitsu.com; Gromadzki, Tomasz
> <tomasz.gromadzki@intel.com>; Xiao Yang <yangx.jy@fujitsu.com>
> Subject: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
> 
> The IB Spec v1.5[1][2] added new RDMA operations (Atomic Write and
> Flush).
> This patchset makes SoftRoCE support the new RDMA Atomic Write operation
> on the RC service.
>
> I added an RDMA Atomic Write API and an rdma_atomic_write example to my
> rdma-core repository[3].  You can verify the patchset by building and
> running the rdma_atomic_write example:
> server:
> $ ./rdma_atomic_write_server -s [server_address] -p [port_number]
> client:
> $ ./rdma_atomic_write_client -s [server_address] -p [port_number]
> 
> [1]: https://www.infinibandta.org/ibta-specification/ # login required
> [2]: https://www.infinibandta.org/wp-content/uploads/2021/08/IBTA-
> Overview-of-IBTA-Volume-1-Release-1.5-and-MPE-2021-08-17-Secure.pptx
> [3]: https://github.com/yangx-jy/rdma-core
> 
> BTW: This patchset also needs the following fix.
> https://www.spinics.net/lists/linux-rdma/msg107838.html
> 
> Xiao Yang (2):
>   RDMA/rxe: Rename send_atomic_ack() and atomic member of struct
>     resp_res
>   RDMA/rxe: Add RDMA Atomic Write operation
> 
>  drivers/infiniband/sw/rxe/rxe_comp.c   |  4 ++
>  drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++
> drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>  drivers/infiniband/sw/rxe/rxe_qp.c     |  5 ++-
>  drivers/infiniband/sw/rxe/rxe_req.c    | 10 +++--
>  drivers/infiniband/sw/rxe/rxe_resp.c   | 59 ++++++++++++++++++++------
>  drivers/infiniband/sw/rxe/rxe_verbs.h  |  2 +-
>  include/rdma/ib_pack.h                 |  2 +
>  include/rdma/ib_verbs.h                |  2 +
>  include/uapi/rdma/ib_user_verbs.h      |  2 +
>  10 files changed, 88 insertions(+), 19 deletions(-)
> 
> --
> 2.23.0
> 
> 




* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 12:14 ` [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
@ 2021-12-30 21:39   ` Tom Talpey
  2021-12-31  8:29     ` yangx.jy
  2022-01-05 23:53     ` Jason Gunthorpe
  2021-12-31  3:01   ` lizhijian
  1 sibling, 2 replies; 33+ messages in thread
From: Tom Talpey @ 2021-12-30 21:39 UTC (permalink / raw)
  To: Xiao Yang, linux-rdma
  Cc: yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian, tomasz.gromadzki


On 12/30/2021 7:14 AM, Xiao Yang wrote:
> This patch implements the RDMA Atomic Write operation for the RC service.
> 
> Signed-off-by: Xiao Yang <yangx.jy@fujitsu.com>
> ---
>   drivers/infiniband/sw/rxe/rxe_comp.c   |  4 +++
>   drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++
>   drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>   drivers/infiniband/sw/rxe/rxe_qp.c     |  3 +-
>   drivers/infiniband/sw/rxe/rxe_req.c    | 10 ++++--
>   drivers/infiniband/sw/rxe/rxe_resp.c   | 49 +++++++++++++++++++++-----
>   include/rdma/ib_pack.h                 |  2 ++
>   include/rdma/ib_verbs.h                |  2 ++
>   include/uapi/rdma/ib_user_verbs.h      |  2 ++
>   9 files changed, 81 insertions(+), 12 deletions(-)
> 

<snip>

> +/* Guarantee atomicity of atomic write operations at the machine level. */
> +static DEFINE_SPINLOCK(atomic_write_ops_lock);
> +
> +static enum resp_states process_atomic_write(struct rxe_qp *qp,
> +					     struct rxe_pkt_info *pkt)
> +{
> +	u64 *src, *dst;
> +	struct rxe_mr *mr = qp->resp.mr;
> +
> +	src = payload_addr(pkt);
> +
> +	dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64));
> +	if (!dst || (uintptr_t)dst & 7)
> +		return RESPST_ERR_MISALIGNED_ATOMIC;
> +
> +	spin_lock_bh(&atomic_write_ops_lock);
> +	*dst = *src;
> +	spin_unlock_bh(&atomic_write_ops_lock);

Spinlock protection is insufficient! Using a spinlock protects only
the atomicity of the store operation with respect to another remote
atomic_write operation. But the semantics of RDMA_ATOMIC_WRITE go much
further. The operation requires a fully atomic bus transaction, across
the memory complex and with respect to CPU, PCI, and other sources.
And this guarantee needs to apply to all architectures, including ones
with noncoherent caches (software consistency).

Because RXE is a software provider, I believe the most natural approach
here is to use an atomic64_set(dst, *src). But I'm not certain this
is supported on all the target architectures, therefore it may require
some additional failure checks, such as stricter alignment than testing
(dst & 7), or returning a failure if atomic64 operations are not
available. I especially think the ARM and PPC platforms will need
an expert review.
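
Roughly, and only as an untested sketch of the idea, the store in
process_atomic_write() would become:

    dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64));
    if (!dst || (uintptr_t)dst & 7)
        return RESPST_ERR_MISALIGNED_ATOMIC;

    /* one machine-level 64-bit store, no provider-private lock */
    atomic64_set((atomic64_t *)dst, *src);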

IOW, nak!

Tom.


* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 19:21 ` [RFC PATCH 0/2] " Gromadzki, Tomasz
@ 2021-12-30 21:42   ` Tom Talpey
  2021-12-31  6:30     ` yangx.jy
  0 siblings, 1 reply; 33+ messages in thread
From: Tom Talpey @ 2021-12-30 21:42 UTC (permalink / raw)
  To: Gromadzki, Tomasz, Xiao Yang, linux-rdma
  Cc: yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian

On 12/30/2021 2:21 PM, Gromadzki, Tomasz wrote:
> 1)
>> rdma_post_atomic_writev(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl,
>> 			int nsge, int flags, uint64_t remote_addr, uint32_t rkey)
> 
> Do we need this API at all?
> Other atomic operations (compare_swap/add) do not use struct ibv_sge at all but have a dedicated place in
> struct ib_send_wr {
> ...
> 		struct {
> 			uint64_t	remote_addr;
> 			uint64_t	compare_add;
> 			uint64_t	swap;
> 			uint32_t	rkey;
> 		} atomic;
> ...
> }
> 
> Would it be better to reuse (extend) any existing field?
> i.e.
> 		struct {
> 			uint64_t	remote_addr;
> 			uint64_t	compare_add;
> 			uint64_t	swap_write;
> 			uint32_t	rkey;
> 		} atomic;

Agreed. Passing the data to be written as an SGE is unnatural
since it is always exactly 64 bits. Tweaking the existing atomic
parameter block as Tomasz suggests is the best approach.

Tom.

> Thanks
> Tomasz
> 
>> -----Original Message-----
>> From: Xiao Yang <yangx.jy@fujitsu.com>
>> Sent: Thursday, December 30, 2021 1:14 PM
>> To: linux-rdma@vger.kernel.org
>> Cc: yanjun.zhu@linux.dev; rpearsonhpe@gmail.com; jgg@nvidia.com; y-
>> goto@fujitsu.com; lizhijian@fujitsu.com; Gromadzki, Tomasz
>> <tomasz.gromadzki@intel.com>; Xiao Yang <yangx.jy@fujitsu.com>
>> Subject: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
>>
>> The IB Spec v1.5[1][2] added new RDMA operations (Atomic Write and
>> Flush).
>> This patchset makes SoftRoCE support the new RDMA Atomic Write operation
>> on the RC service.
>>
>> I added an RDMA Atomic Write API and an rdma_atomic_write example to my
>> rdma-core repository[3].  You can verify the patchset by building and
>> running the rdma_atomic_write example:
>> server:
>> $ ./rdma_atomic_write_server -s [server_address] -p [port_number]
>> client:
>> $ ./rdma_atomic_write_client -s [server_address] -p [port_number]
>>
>> [1]: https://www.infinibandta.org/ibta-specification/ # login required
>> [2]: https://www.infinibandta.org/wp-content/uploads/2021/08/IBTA-
>> Overview-of-IBTA-Volume-1-Release-1.5-and-MPE-2021-08-17-Secure.pptx
>> [3]: https://github.com/yangx-jy/rdma-core
>>
>> BTW: This patchset also needs the following fix.
>> https://www.spinics.net/lists/linux-rdma/msg107838.html
>>
>> Xiao Yang (2):
>>    RDMA/rxe: Rename send_atomic_ack() and atomic member of struct
>>      resp_res
>>    RDMA/rxe: Add RDMA Atomic Write operation
>>
>>   drivers/infiniband/sw/rxe/rxe_comp.c   |  4 ++
>>   drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++
>> drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>>   drivers/infiniband/sw/rxe/rxe_qp.c     |  5 ++-
>>   drivers/infiniband/sw/rxe/rxe_req.c    | 10 +++--
>>   drivers/infiniband/sw/rxe/rxe_resp.c   | 59 ++++++++++++++++++++------
>>   drivers/infiniband/sw/rxe/rxe_verbs.h  |  2 +-
>>   include/rdma/ib_pack.h                 |  2 +
>>   include/rdma/ib_verbs.h                |  2 +
>>   include/uapi/rdma/ib_user_verbs.h      |  2 +
>>   10 files changed, 88 insertions(+), 19 deletions(-)
>>
>> --
>> 2.23.0
>>
>>


* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 12:14 ` [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
  2021-12-30 21:39   ` Tom Talpey
@ 2021-12-31  3:01   ` lizhijian
  2021-12-31  6:02     ` yangx.jy
  1 sibling, 1 reply; 33+ messages in thread
From: lizhijian @ 2021-12-31  3:01 UTC (permalink / raw)
  To: yangx.jy, linux-rdma
  Cc: yanjun.zhu, rpearsonhpe, jgg, y-goto, tomasz.gromadzki



On 30/12/2021 20:14, Xiao Yang wrote:
> This patch implements the RDMA Atomic Write operation for the RC service.
>
> Signed-off-by: Xiao Yang <yangx.jy@fujitsu.com>
> ---
>   drivers/infiniband/sw/rxe/rxe_comp.c   |  4 +++
>   drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++
>   drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>   drivers/infiniband/sw/rxe/rxe_qp.c     |  3 +-
>   drivers/infiniband/sw/rxe/rxe_req.c    | 10 ++++--
>   drivers/infiniband/sw/rxe/rxe_resp.c   | 49 +++++++++++++++++++++-----
>   include/rdma/ib_pack.h                 |  2 ++
>   include/rdma/ib_verbs.h                |  2 ++
>   include/uapi/rdma/ib_user_verbs.h      |  2 ++
>   9 files changed, 81 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
> index d771ba8449a1..5a5c0c3501de 100644
> --- a/drivers/infiniband/sw/rxe/rxe_comp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_comp.c
> @@ -104,6 +104,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
>   	case IB_WR_LOCAL_INV:			return IB_WC_LOCAL_INV;
>   	case IB_WR_REG_MR:			return IB_WC_REG_MR;
>   	case IB_WR_BIND_MW:			return IB_WC_BIND_MW;
> +	case IB_WR_RDMA_ATOMIC_WRITE:		return IB_WC_RDMA_ATOMIC_WRITE;
>   
>   	default:
>   		return 0xff;
> @@ -256,6 +257,9 @@ static inline enum comp_state check_ack(struct rxe_qp *qp,
>   		if ((syn & AETH_TYPE_MASK) != AETH_ACK)
>   			return COMPST_ERROR;
>   
> +		if (wqe->wr.opcode == IB_WR_RDMA_ATOMIC_WRITE)
> +			return COMPST_WRITE_SEND;
> +
>   		fallthrough;
>   		/* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH)
>   		 */
> diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c
> index 3ef5a10a6efd..bd3639534d5a 100644
> --- a/drivers/infiniband/sw/rxe/rxe_opcode.c
> +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c
> @@ -103,6 +103,12 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
>   			[IB_QPT_UC]	= WR_LOCAL_OP_MASK,
>   		},
>   	},
> +	[IB_WR_RDMA_ATOMIC_WRITE]                       = {
> +		.name   = "IB_WR_RDMA_ATOMIC_WRITE",
> +		.mask   = {
> +			[IB_QPT_RC]     = WR_INLINE_MASK | WR_ATOMIC_WRITE_MASK,
> +		},
> +	},
>   };
>   
>   struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
> @@ -379,6 +385,18 @@ struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
>   						+ RXE_IETH_BYTES,
>   		}
>   	},
> +	[IB_OPCODE_RC_RDMA_ATOMIC_WRITE]                        = {
> +		.name   = "IB_OPCODE_RC_RDMA_ATOMIC_WRITE",
> +		.mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
> +				| RXE_ATOMIC_WRITE_MASK | RXE_START_MASK
> +				| RXE_END_MASK,
> +		.length = RXE_BTH_BYTES + RXE_RETH_BYTES,
> +		.offset = {
> +			[RXE_BTH]       = 0,
> +			[RXE_RETH]      = RXE_BTH_BYTES,
> +			[RXE_PAYLOAD]   = RXE_BTH_BYTES + RXE_RETH_BYTES,
> +		}
> +	},
>   
>   	/* UC */
>   	[IB_OPCODE_UC_SEND_FIRST]			= {
> diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
> index 8f9aaaf260f2..a470e9b0b884 100644
> --- a/drivers/infiniband/sw/rxe/rxe_opcode.h
> +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
> @@ -20,6 +20,7 @@ enum rxe_wr_mask {
>   	WR_READ_MASK			= BIT(3),
>   	WR_WRITE_MASK			= BIT(4),
>   	WR_LOCAL_OP_MASK		= BIT(5),
> +	WR_ATOMIC_WRITE_MASK		= BIT(7),
>   
>   	WR_READ_OR_WRITE_MASK		= WR_READ_MASK | WR_WRITE_MASK,
>   	WR_WRITE_OR_SEND_MASK		= WR_WRITE_MASK | WR_SEND_MASK,
> @@ -81,6 +82,8 @@ enum rxe_hdr_mask {
>   
>   	RXE_LOOPBACK_MASK	= BIT(NUM_HDR_TYPES + 12),
>   
> +	RXE_ATOMIC_WRITE_MASK   = BIT(NUM_HDR_TYPES + 14),
> +
>   	RXE_READ_OR_ATOMIC_MASK	= (RXE_READ_MASK | RXE_ATOMIC_MASK),
>   	RXE_WRITE_OR_SEND_MASK	= (RXE_WRITE_MASK | RXE_SEND_MASK),
>   	RXE_READ_OR_WRITE_MASK	= (RXE_READ_MASK | RXE_WRITE_MASK),
> diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
> index f3eec350f95c..22a1c4bcfa60 100644
> --- a/drivers/infiniband/sw/rxe/rxe_qp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_qp.c
> @@ -135,7 +135,8 @@ static void free_rd_atomic_resources(struct rxe_qp *qp)
>   
>   void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
>   {
> -	if (res->type == RXE_ATOMIC_MASK) {
> +	if (res->type == RXE_ATOMIC_MASK ||
> +			res->type == RXE_ATOMIC_WRITE_MASK) {
>   		kfree_skb(res->resp.skb);
>   	} else if (res->type == RXE_READ_MASK) {
>   		if (res->read.mr)
> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> index 0c9d2af15f3d..f024fb6e67c9 100644
> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> @@ -240,6 +240,10 @@ static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
>   		else
>   			return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
>   				IB_OPCODE_RC_SEND_FIRST;
> +
> +	case IB_WR_RDMA_ATOMIC_WRITE:
> +		return IB_OPCODE_RC_RDMA_ATOMIC_WRITE;
> +
>   	case IB_WR_REG_MR:
>   	case IB_WR_LOCAL_INV:
>   		return opcode;
> @@ -463,7 +467,7 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
>   	if (err)
>   		return err;
>   
> -	if (pkt->mask & RXE_WRITE_OR_SEND_MASK) {
> +	if (pkt->mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) {
>   		if (wqe->wr.send_flags & IB_SEND_INLINE) {
>   			u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
>   
> @@ -680,13 +684,13 @@ int rxe_requester(void *arg)
>   	}
>   
>   	mask = rxe_opcode[opcode].mask;
> -	if (unlikely(mask & RXE_READ_OR_ATOMIC_MASK)) {
> +	if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK))) {
>   		if (check_init_depth(qp, wqe))
>   			goto exit;
>   	}
>   
>   	mtu = get_mtu(qp);
> -	payload = (mask & RXE_WRITE_OR_SEND_MASK) ? wqe->dma.resid : 0;
> +	payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ? wqe->dma.resid : 0;
>   	if (payload > mtu) {
>   		if (qp_type(qp) == IB_QPT_UD) {
>   			/* C10-93.1.1: If the total sum of all the buffer lengths specified for a
> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
> index 738e6b6335e5..dc9af8b06bd1 100644
> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
> @@ -258,7 +258,7 @@ static enum resp_states check_op_valid(struct rxe_qp *qp,
>   	case IB_QPT_RC:
>   		if (((pkt->mask & RXE_READ_MASK) &&
>   		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) ||
> -		    ((pkt->mask & RXE_WRITE_MASK) &&
> +		    ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) &&
>   		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) ||
>   		    ((pkt->mask & RXE_ATOMIC_MASK) &&
>   		     !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) {
> @@ -362,7 +362,7 @@ static enum resp_states check_resource(struct rxe_qp *qp,
>   		}
>   	}
>   
> -	if (pkt->mask & RXE_READ_OR_ATOMIC_MASK) {
> +	if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) {
>   		/* it is the requesters job to not send
>   		 * too many read/atomic ops, we just
>   		 * recycle the responder resource queue
> @@ -413,7 +413,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
>   	enum resp_states state;
>   	int access;
>   
> -	if (pkt->mask & RXE_READ_OR_WRITE_MASK) {
> +	if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
>   		if (pkt->mask & RXE_RETH_MASK) {
>   			qp->resp.va = reth_va(pkt);
>   			qp->resp.offset = 0;
> @@ -479,7 +479,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
>   		goto err;
>   	}
>   
> -	if (pkt->mask & RXE_WRITE_MASK)	 {
> +	if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
>   		if (resid > mtu) {
>   			if (pktlen != mtu || bth_pad(pkt)) {
>   				state = RESPST_ERR_LENGTH;
> @@ -591,6 +591,32 @@ static enum resp_states process_atomic(struct rxe_qp *qp,
>   	return ret;
>   }
>   
> +
> +/* Guarantee atomicity of atomic write operations at the machine level. */
> +static DEFINE_SPINLOCK(atomic_write_ops_lock);
> +
> +static enum resp_states process_atomic_write(struct rxe_qp *qp,
> +					     struct rxe_pkt_info *pkt)
> +{
> +	u64 *src, *dst;
> +	struct rxe_mr *mr = qp->resp.mr;
> +
> +	src = payload_addr(pkt);
> +
> +	dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64));
> +	if (!dst || (uintptr_t)dst & 7)
> +		return RESPST_ERR_MISALIGNED_ATOMIC;
> +
> +	spin_lock_bh(&atomic_write_ops_lock);
> +	*dst = *src;
> +	spin_unlock_bh(&atomic_write_ops_lock);
> +
> +	/* decrease resp.resid to zero */
> +	qp->resp.resid -= sizeof(u64);
> +
> +	return RESPST_NONE;
> +}
> +
>   static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
>   					  struct rxe_pkt_info *pkt,
>   					  struct rxe_pkt_info *ack,
> @@ -801,6 +827,10 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
>   		err = process_atomic(qp, pkt);
>   		if (err)
>   			return err;
> +	} else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
> +		err = process_atomic_write(qp, pkt);
> +		if (err)
> +			return err;
>   	} else {
>   		/* Unreachable */
>   		WARN_ON_ONCE(1);
> @@ -967,9 +997,12 @@ static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
>   	struct sk_buff *skb;
>   	struct resp_res *res;
>   
> +	int opcode = pkt->mask & RXE_ATOMIC_MASK ?
> +				IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE :
> +				IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
> +
>   	skb = prepare_ack_packet(qp, pkt, &ack_pkt,
> -				 IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
> -				 syndrome);
> +				 opcode, 0, pkt->psn, syndrome);
>   	if (!skb) {
>   		rc = -ENOMEM;
>   		goto out;
> @@ -980,7 +1013,7 @@ static int send_resp(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
>   	rxe_advance_resp_resource(qp);
>   
>   	skb_get(skb);
> -	res->type = RXE_ATOMIC_MASK;
> +	res->type = pkt->mask & (RXE_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK);
>   	res->resp.skb = skb;
>   	res->first_psn = ack_pkt.psn;
>   	res->last_psn  = ack_pkt.psn;
> @@ -1003,7 +1036,7 @@ static enum resp_states acknowledge(struct rxe_qp *qp,
>   
>   	if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
>   		send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
> -	else if (pkt->mask & RXE_ATOMIC_MASK)
> +	else if (pkt->mask & (RXE_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK))
>   		send_resp(qp, pkt, AETH_ACK_UNLIMITED);
>   	else if (bth_ack(pkt))
>   		send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
> diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
> index a9162f25beaf..bd12e1049ef8 100644
> --- a/include/rdma/ib_pack.h
> +++ b/include/rdma/ib_pack.h
> @@ -84,6 +84,7 @@ enum {
>   	/* opcode 0x15 is reserved */
>   	IB_OPCODE_SEND_LAST_WITH_INVALIDATE         = 0x16,
>   	IB_OPCODE_SEND_ONLY_WITH_INVALIDATE         = 0x17,
> +	IB_OPCODE_RDMA_ATOMIC_WRITE                 = 0x19,
I think you didn't follow the SPEC.

SPEC:
ATOMIC WRITE BTH shall hold the Opcode = 0x1D (valid only for RC, RD
and XRC, reserved for unreliable transport services).

Thanks
Zhijian

>   
>   	/* real constants follow -- see comment about above IB_OPCODE()
>   	   macro for more details */
> @@ -112,6 +113,7 @@ enum {
>   	IB_OPCODE(RC, FETCH_ADD),
>   	IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
>   	IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
> +	IB_OPCODE(RC, RDMA_ATOMIC_WRITE),
>   
>   	/* UC */
>   	IB_OPCODE(UC, SEND_FIRST),
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 6e9ad656ecb7..949b3586d35b 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -971,6 +971,7 @@ enum ib_wc_opcode {
>   	IB_WC_REG_MR,
>   	IB_WC_MASKED_COMP_SWAP,
>   	IB_WC_MASKED_FETCH_ADD,
> +	IB_WC_RDMA_ATOMIC_WRITE = IB_UVERBS_WC_RDMA_ATOMIC_WRITE,
>   /*
>    * Set value of IB_WC_RECV so consumers can test if a completion is a
>    * receive by testing (opcode & IB_WC_RECV).
> @@ -1311,6 +1312,7 @@ enum ib_wr_opcode {
>   		IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
>   	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
>   		IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
> +	IB_WR_RDMA_ATOMIC_WRITE = IB_UVERBS_WR_RDMA_ATOMIC_WRITE,
>   
>   	/* These are kernel only and can not be issued by userspace */
>   	IB_WR_REG_MR = 0x20,
> diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
> index 7ee73a0652f1..3b0b509fb96f 100644
> --- a/include/uapi/rdma/ib_user_verbs.h
> +++ b/include/uapi/rdma/ib_user_verbs.h
> @@ -466,6 +466,7 @@ enum ib_uverbs_wc_opcode {
>   	IB_UVERBS_WC_BIND_MW = 5,
>   	IB_UVERBS_WC_LOCAL_INV = 6,
>   	IB_UVERBS_WC_TSO = 7,
> +	IB_UVERBS_WC_RDMA_ATOMIC_WRITE = 9,
>   };
>   
>   struct ib_uverbs_wc {
> @@ -784,6 +785,7 @@ enum ib_uverbs_wr_opcode {
>   	IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
>   	IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
>   	IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
> +	IB_UVERBS_WR_RDMA_ATOMIC_WRITE = 15,
>   	/* Review enum ib_wr_opcode before modifying this */
>   };
>   


* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-31  3:01   ` lizhijian
@ 2021-12-31  6:02     ` yangx.jy
  0 siblings, 0 replies; 33+ messages in thread
From: yangx.jy @ 2021-12-31  6:02 UTC (permalink / raw)
  To: lizhijian
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, tomasz.gromadzki

On 2021/12/31 11:01, Li, Zhijian/李 智坚 wrote:
> I think you didn't follow the SPEC.
>
> SPEC:
> ATOMIC WRITE BTH shall hold the Opcode = 0x1D (valid only for RC, RD
> and XRC, reserved for unreliable transport services).
Hi,

Good catch, I will correct the value in the next version.

Best Regards,
Xiao Yang


* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 21:42   ` Tom Talpey
@ 2021-12-31  6:30     ` yangx.jy
  2022-01-04  9:28       ` yangx.jy
  0 siblings, 1 reply; 33+ messages in thread
From: yangx.jy @ 2021-12-31  6:30 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Gromadzki, Tomasz, linux-rdma, yanjun.zhu, rpearsonhpe, jgg,
	y-goto, lizhijian

On 2021/12/31 5:42, Tom Talpey wrote:
> On 12/30/2021 2:21 PM, Gromadzki, Tomasz wrote:
>> 1)
>>> rdma_post_atomic_writev(struct rdma_cm_id *id, void *context, struct 
>>> ibv_sge *sgl,
>>>             int nsge, int flags, uint64_t remote_addr, uint32_t rkey)
>>
>> Do we need this API at all?
>> Other atomic operations (compare_swap/add) do not use struct ibv_sge 
>> at all but have a dedicated place in
>> struct ib_send_wr {
>> ...
>>         struct {
>>             uint64_t    remote_addr;
>>             uint64_t    compare_add;
>>             uint64_t    swap;
>>             uint32_t    rkey;
>>         } atomic;
>> ...
>> }
>>
>> Would it be better to reuse (extend) any existing field?
>> i.e.
>>         struct {
>>             uint64_t    remote_addr;
>>             uint64_t    compare_add;
>>             uint64_t    swap_write;
>>             uint32_t    rkey;
>>         } atomic;
>
> Agreed. Passing the data to be written as an SGE is unnatural
> since it is always exactly 64 bits. Tweaking the existing atomic
> parameter block as Tomasz suggests is the best approach.
Hi Tomasz, Tom

Thanks for your quick reply.

If we want to pass the 8-byte value by tweaking struct atomic in user
space, why don't we transfer the 8-byte value in an ATOMIC Extended
Transport Header (AtomicETH) in kernel space?
PS: IBTA defines that the 8-byte value is transferred by an RDMA Extended
Transport Header (RETH) + payload.

Is it inconsistent to use struct atomic in user space and RETH + payload
in kernel space?
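
To spell out the two encodings being compared (as I understand the spec):

Existing atomics: BTH + AtomicETH (VA, rkey, swap/add data, compare data)
Atomic Write:     BTH + RETH (VA, rkey, DMA length = 8) + 8-byte payload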

Best Regards,
Xiao Yang
>
> Tom. 


* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 21:39   ` Tom Talpey
@ 2021-12-31  8:29     ` yangx.jy
  2021-12-31 15:09       ` Tom Talpey
  2022-01-05 23:53     ` Jason Gunthorpe
  1 sibling, 1 reply; 33+ messages in thread
From: yangx.jy @ 2021-12-31  8:29 UTC (permalink / raw)
  To: Tom Talpey
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian,
	tomasz.gromadzki

On 2021/12/31 5:39, Tom Talpey wrote:
>
> On 12/30/2021 7:14 AM, Xiao Yang wrote:
>> This patch implements the RDMA Atomic Write operation for the RC service.
>>
>> Signed-off-by: Xiao Yang <yangx.jy@fujitsu.com>
>> ---
>>   drivers/infiniband/sw/rxe/rxe_comp.c   |  4 +++
>>   drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++
>>   drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>>   drivers/infiniband/sw/rxe/rxe_qp.c     |  3 +-
>>   drivers/infiniband/sw/rxe/rxe_req.c    | 10 ++++--
>>   drivers/infiniband/sw/rxe/rxe_resp.c   | 49 +++++++++++++++++++++-----
>>   include/rdma/ib_pack.h                 |  2 ++
>>   include/rdma/ib_verbs.h                |  2 ++
>>   include/uapi/rdma/ib_user_verbs.h      |  2 ++
>>   9 files changed, 81 insertions(+), 12 deletions(-)
>>
>
> <snip>
>
>> +/* Guarantee atomicity of atomic write operations at the machine 
>> level. */
>> +static DEFINE_SPINLOCK(atomic_write_ops_lock);
>> +
>> +static enum resp_states process_atomic_write(struct rxe_qp *qp,
>> +                         struct rxe_pkt_info *pkt)
>> +{
>> +    u64 *src, *dst;
>> +    struct rxe_mr *mr = qp->resp.mr;
>> +
>> +    src = payload_addr(pkt);
>> +
>> +    dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, 
>> sizeof(u64));
>> +    if (!dst || (uintptr_t)dst & 7)
>> +        return RESPST_ERR_MISALIGNED_ATOMIC;
>> +
>> +    spin_lock_bh(&atomic_write_ops_lock);
>> +    *dst = *src;
>> +    spin_unlock_bh(&atomic_write_ops_lock);
>
> Spinlock protection is insufficient! Using a spinlock protects only
> the atomicity of the store operation with respect to another remote
> atomic_write operation. But the semantics of RDMA_ATOMIC_WRITE go much
> further. The operation requires a fully atomic bus transaction, across
> the memory complex and with respect to CPU, PCI, and other sources.
> And this guarantee needs to apply to all architectures, including ones
> with noncoherent caches (software consistency).
>
> Because RXE is a software provider, I believe the most natural approach
> here is to use an atomic64_set(dst, *src). But I'm not certain this
> is supported on all the target architectures, therefore it may require
> some additional failure checks, such as stricter alignment than testing
> (dst & 7), or returning a failure if atomic64 operations are not
> available. I especially think the ARM and PPC platforms will need
> an expert review.
Hi Tom,

Thanks a lot for the detailed suggestion.

I will try to use atomic64_set() and add additional failure checks.
By the way, process_atomic() uses the same method (spinlock + assignment
expression), so do you think we also need to update it by using
atomic64_set()?

Best Regards,
Xiao Yang
>
> IOW, nak!
>
> Tom.


* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-31  8:29     ` yangx.jy
@ 2021-12-31 15:09       ` Tom Talpey
       [not found]         ` <61D563B4.2070106@fujitsu.com>
  0 siblings, 1 reply; 33+ messages in thread
From: Tom Talpey @ 2021-12-31 15:09 UTC (permalink / raw)
  To: yangx.jy
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian,
	tomasz.gromadzki

On 12/31/2021 3:29 AM, yangx.jy@fujitsu.com wrote:
> On 2021/12/31 5:39, Tom Talpey wrote:
>>
>> On 12/30/2021 7:14 AM, Xiao Yang wrote:
>>> This patch implements the RDMA Atomic Write operation for the RC service.
>>>
>>> Signed-off-by: Xiao Yang <yangx.jy@fujitsu.com>
>>> ---
>>>    drivers/infiniband/sw/rxe/rxe_comp.c   |  4 +++
>>>    drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++
>>>    drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>>>    drivers/infiniband/sw/rxe/rxe_qp.c     |  3 +-
>>>    drivers/infiniband/sw/rxe/rxe_req.c    | 10 ++++--
>>>    drivers/infiniband/sw/rxe/rxe_resp.c   | 49 +++++++++++++++++++++-----
>>>    include/rdma/ib_pack.h                 |  2 ++
>>>    include/rdma/ib_verbs.h                |  2 ++
>>>    include/uapi/rdma/ib_user_verbs.h      |  2 ++
>>>    9 files changed, 81 insertions(+), 12 deletions(-)
>>>
>>
>> <snip>
>>
>>> +/* Guarantee atomicity of atomic write operations at the machine
>>> level. */
>>> +static DEFINE_SPINLOCK(atomic_write_ops_lock);
>>> +
>>> +static enum resp_states process_atomic_write(struct rxe_qp *qp,
>>> +                         struct rxe_pkt_info *pkt)
>>> +{
>>> +    u64 *src, *dst;
>>> +    struct rxe_mr *mr = qp->resp.mr;
>>> +
>>> +    src = payload_addr(pkt);
>>> +
>>> +    dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset,
>>> sizeof(u64));
>>> +    if (!dst || (uintptr_t)dst & 7)
>>> +        return RESPST_ERR_MISALIGNED_ATOMIC;
>>> +
>>> +    spin_lock_bh(&atomic_write_ops_lock);
>>> +    *dst = *src;
>>> +    spin_unlock_bh(&atomic_write_ops_lock);
>>
>> Spinlock protection is insufficient! Using a spinlock protects only
>> the atomicity of the store operation with respect to another remote
>> atomic_write operation. But the semantics of RDMA_ATOMIC_WRITE go much
>> further. The operation requires a fully atomic bus transaction, across
>> the memory complex and with respect to CPU, PCI, and other sources.
>> And this guarantee needs to apply to all architectures, including ones
>> with noncoherent caches (software consistency).
>>
>> Because RXE is a software provider, I believe the most natural approach
>> here is to use an atomic64_set(dst, *src). But I'm not certain this
>> is supported on all the target architectures, therefore it may require
>> some additional failure checks, such as stricter alignment than testing
>> (dst & 7), or returning a failure if atomic64 operations are not
>> available. I especially think the ARM and PPC platforms will need
>> an expert review.
> Hi Tom,
> 
> Thanks a lot for the detailed suggestion.
> 
> I will try to use atomic64_set() and add additional failure checks.
> By the way, process_atomic() uses the same method (spinlock + assignment
> expression), so do you think we also need to update it by using
> atomic64_set()?

It is *criticial* for you to understand that the ATOMIC_WRITE has a
much stronger semantic than ordinary RDMA atomics.

The ordinary atomics are only atomic from the perspective of a single
connection and a single adapter. And the result is not placed in memory
atomically; only its execution in the adapter is processed that way.
And the final result is only consistently visible to the application
after a completion event is delivered. This rather huge compromise is
because when atomics were first designed, there were no PCI primitives
which supported atomics.

An RDMA atomic_write operation is atomic all the way to memory. It
must be implemented as a platform operation which is similarly
atomic. If implemented by a PCIe adapter, it would use the newer
PCIe atomics to perform a locked read-modify-write. If implemented
in software as you are doing, it must use a similar local r-m-w
instruction, or the platform equivalent if necessary.
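
In rxe terms the difference might look like this (illustration only,
with made-up variable names):

    /* ordinary atomic: adapter-scope serialization is acceptable */
    spin_lock_bh(&atomic_ops_lock);
    *vaddr += add_value;                /* r-m-w under a lock */
    spin_unlock_bh(&atomic_ops_lock);

    /* atomic_write: the placement itself must be a single atomic
     * machine-level 64-bit store, visible to CPUs and devices alike */
    atomic64_set((atomic64_t *)vaddr, value);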

Tom.


* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-31  6:30     ` yangx.jy
@ 2022-01-04  9:28       ` yangx.jy
  2022-01-04 15:17         ` Tom Talpey
  0 siblings, 1 reply; 33+ messages in thread
From: yangx.jy @ 2022-01-04  9:28 UTC (permalink / raw)
  To: Tom Talpey, Gromadzki, Tomasz
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian

On 2021/12/31 14:30, yangx.jy@fujitsu.com wrote:
> On 2021/12/31 5:42, Tom Talpey wrote:
>> On 12/30/2021 2:21 PM, Gromadzki, Tomasz wrote:
>>> 1)
>>>> rdma_post_atomic_writev(struct rdma_cm_id *id, void *context, struct
>>>> ibv_sge *sgl,
>>>>              int nsge, int flags, uint64_t remote_addr, uint32_t rkey)
>>> Do we need this API at all?
>>> Other atomic operations (compare_swap/add) do not use struct ibv_sge
>>> at all but have a dedicated place in
>>> struct ib_send_wr {
>>> ...
>>>          struct {
>>>              uint64_t    remote_addr;
>>>              uint64_t    compare_add;
>>>              uint64_t    swap;
>>>              uint32_t    rkey;
>>>          } atomic;
>>> ...
>>> }
>>>
>>> Would it be better to reuse (extend) any existing field?
>>> i.e.
>>>          struct {
>>>              uint64_t    remote_addr;
>>>              uint64_t    compare_add;
>>>              uint64_t    swap_write;
>>>              uint32_t    rkey;
>>>          } atomic;
>> Agreed. Passing the data to be written as an SGE is unnatural
>> since it is always exactly 64 bits. Tweaking the existing atomic
>> parameter block as Tomasz suggests is the best approach.
> Hi Tomasz, Tom
>
> Thanks for your quick reply.
>
> If we want to pass the 8-byte value by tweaking struct atomic in user
> space, why don't we transfer the 8-byte value in an ATOMIC Extended
> Transport Header (AtomicETH) in kernel space?
> PS: IBTA defines that the 8-byte value is transferred by an RDMA Extended
> Transport Header (RETH) + payload.
>
> Is it inconsistent to use struct atomic in user space and RETH + payload
> in kernel space?
Hi Tomasz, Tom

I think the following rules are right:
RDMA READ/WRITE should use struct rdma in libibverbs and RETH + payload
in the kernel.
RDMA Atomic should use struct atomic in libibverbs and AtomicETH in the
kernel.

According to IBTA's definition, RDMA Atomic Write should use struct rdma
in libibverbs.
How about adding a member to struct rdma? For example:
struct {
     uint64_t    remote_addr;
     uint32_t    rkey;
     uint64_t    wr_value;
} rdma;
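
A consumer would then post it much like an ordinary RDMA WRITE
(hypothetical opcode and field names again, just to show the shape):

    struct ibv_send_wr wr = {0}, *bad_wr;

    wr.opcode              = IBV_WR_RDMA_ATOMIC_WRITE; /* proposed */
    wr.send_flags          = IBV_SEND_SIGNALED;
    wr.wr.rdma.remote_addr = remote_addr;
    wr.wr.rdma.rkey        = rkey;
    wr.wr.rdma.wr_value    = value; /* proposed 8-byte immediate */

    ret = ibv_post_send(qp, &wr, &bad_wr);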

Best Regards,
Xiao Yang
> Best Regards,
> Xiao Yang
>> Tom.


* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-04  9:28       ` yangx.jy
@ 2022-01-04 15:17         ` Tom Talpey
  2022-01-05  1:00           ` yangx.jy
  2022-02-11 13:18           ` Gromadzki, Tomasz
  0 siblings, 2 replies; 33+ messages in thread
From: Tom Talpey @ 2022-01-04 15:17 UTC (permalink / raw)
  To: yangx.jy, Gromadzki, Tomasz
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian


On 1/4/2022 4:28 AM, yangx.jy@fujitsu.com wrote:
> On 2021/12/31 14:30, yangx.jy@fujitsu.com wrote:
>> On 2021/12/31 5:42, Tom Talpey wrote:
>>> On 12/30/2021 2:21 PM, Gromadzki, Tomasz wrote:
>>>> 1)
>>>>> rdma_post_atomic_writev(struct rdma_cm_id *id, void *context, struct
>>>>> ibv_sge *sgl,
>>>>>               int nsge, int flags, uint64_t remote_addr, uint32_t rkey)
>>>> Do we need this API at all?
>>>> Other atomic operations (compare_swap/add) do not use struct ibv_sge
>>>> at all but have a dedicated place in
>>>> struct ib_send_wr {
>>>> ...
>>>>           struct {
>>>>               uint64_t    remote_addr;
>>>>               uint64_t    compare_add;
>>>>               uint64_t    swap;
>>>>               uint32_t    rkey;
>>>>           } atomic;
>>>> ...
>>>> }
>>>>
>>>> Would it be better to reuse (extend) any existing field?
>>>> i.e.
>>>>           struct {
>>>>               uint64_t    remote_addr;
>>>>               uint64_t    compare_add;
>>>>               uint64_t    swap_write;
>>>>               uint32_t    rkey;
>>>>           } atomic;
>>> Agreed. Passing the data to be written as an SGE is unnatural
>>> since it is always exactly 64 bits. Tweaking the existing atomic
>>> parameter block as Tomasz suggests is the best approach.
>> Hi Tomasz, Tom
>>
>> Thanks for your quick reply.
>>
>> If we want to pass the 8-byte value by tweaking struct atomic in user
>> space, why don't we transfer the 8-byte value by the ATOMIC Extended
>> Transport Header (AtomicETH) in kernel space?
>> PS: IBTA defines that the 8-byte value is transferred by RDMA Extended
>> Transport Header (RETH) + payload.
>>
>> Is it inconsistent to use struct atomic in user space and RETH + payload
>> in kernel space?
> Hi Tomasz, Tom
> 
> I think the following rules are right:
> RDMA READ/WRITE should use struct rdma in libibverbs and RETH + payload
> in the kernel.
> RDMA Atomic should use struct atomic in libibverbs and AtomicETH in the kernel.
> 
> According to IBTA's definition, RDMA Atomic Write should use struct rdma
> in libibverbs.

I don't quite understand this statement; the IBTA defines the protocol
but does not define the API at such a level.

I do however agree with this:

> How about adding a member in struct rdma? for example:
> struct {
>       uint64_t    remote_addr;
>       uint32_t    rkey;
>       uint64_t    wr_value;
> } rdma;

Yes, that's what Tomasz and I were suggesting - a new template for the
ATOMIC_WRITE request payload. The three fields are to be supplied by
the verb consumer when posting the work request.

Tom.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-04 15:17         ` Tom Talpey
@ 2022-01-05  1:00           ` yangx.jy
  2022-01-06  0:01             ` Jason Gunthorpe
  2022-02-11 13:18           ` Gromadzki, Tomasz
  1 sibling, 1 reply; 33+ messages in thread
From: yangx.jy @ 2022-01-05  1:00 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Gromadzki, Tomasz, linux-rdma, yanjun.zhu, rpearsonhpe, jgg,
	y-goto, lizhijian

On 2022/1/4 23:17, Tom Talpey wrote:
>
> On 1/4/2022 4:28 AM, yangx.jy@fujitsu.com wrote:
>> On 2021/12/31 14:30, yangx.jy@fujitsu.com wrote:
>>> On 2021/12/31 5:42, Tom Talpey wrote:
>>>> On 12/30/2021 2:21 PM, Gromadzki, Tomasz wrote:
>>>>> 1)
>>>>>> rdma_post_atomic_writev(struct rdma_cm_id *id, void *context, struct
>>>>>> ibv_sge *sgl,
>>>>>>               int nsge, int flags, uint64_t remote_addr, uint32_t 
>>>>>> rkey)
>>>>> Do we need this API at all?
>>>>> Other atomic operations (compare_swap/add) do not use struct ibv_sge
>>>>> at all but have a dedicated place in
>>>>> struct ib_send_wr {
>>>>> ...
>>>>>           struct {
>>>>>               uint64_t    remote_addr;
>>>>>               uint64_t    compare_add;
>>>>>               uint64_t    swap;
>>>>>               uint32_t    rkey;
>>>>>           } atomic;
>>>>> ...
>>>>> }
>>>>>
>>>>> Would it be better to reuse (extend) any existing field?
>>>>> i.e.
>>>>>           struct {
>>>>>               uint64_t    remote_addr;
>>>>>               uint64_t    compare_add;
>>>>>               uint64_t    swap_write;
>>>>>               uint32_t    rkey;
>>>>>           } atomic;
>>>> Agreed. Passing the data to be written as an SGE is unnatural
>>>> since it is always exactly 64 bits. Tweaking the existing atomic
>>>> parameter block as Tomasz suggests is the best approach.
>>> Hi Tomasz, Tom
>>>
>>> Thanks for your quick reply.
>>>
>>> If we want to pass the 8-byte value by tweaking struct atomic in user
>>> space, why don't we transfer the 8-byte value by the ATOMIC Extended
>>> Transport Header (AtomicETH) in kernel space?
>>> PS: IBTA defines that the 8-byte value is transferred by RDMA Extended
>>> Transport Header (RETH) + payload.
>>>
>>> Is it inconsistent to use struct atomic in user space and RETH +
>>> payload in kernel space?
>> Hi Tomasz, Tom
>>
>> I think the following rules are right:
>> RDMA READ/WRITE should use struct rdma in libibverbs and RETH + payload
>> in the kernel.
>> RDMA Atomic should use struct atomic in libibverbs and AtomicETH in the
>> kernel.
>>
>> According to IBTA's definition, RDMA Atomic Write should use struct rdma
>> in libibverbs.
>
> I don't quite understand this statement; the IBTA defines the protocol
> but does not define the API at such a level.
Hi Tom,

1) In kernel, current SoftRoCE copies the content of struct rdma to RETH 
and copies the content of struct atomic to AtomicETH.
2) IBTA defines that RDMA Atomic Write uses RETH + payload.
For these two reasons, I prefer to tweak the existing struct rdma.

>
> I do however agree with this:
>
>> How about adding a member in struct rdma? for example:
>> struct {
>>       uint64_t    remote_addr;
>>       uint32_t    rkey;
>>       uint64_t    wr_value;
>> } rdma;
>
> Yes, that's what Tomasz and I were suggesting - a new template for the
> ATOMIC_WRITE request payload. The three fields are to be supplied by
> the verb consumer when posting the work request.

OK, I will update the patch in this way.

Best Regards,
Xiao Yang
>
> Tom.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 21:39   ` Tom Talpey
  2021-12-31  8:29     ` yangx.jy
@ 2022-01-05 23:53     ` Jason Gunthorpe
  2022-01-06 10:52       ` yangx.jy
  1 sibling, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2022-01-05 23:53 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Xiao Yang, linux-rdma, yanjun.zhu, rpearsonhpe, y-goto,
	lizhijian, tomasz.gromadzki

On Thu, Dec 30, 2021 at 04:39:01PM -0500, Tom Talpey wrote:

> Because RXE is a software provider, I believe the most natural approach
> here is to use an atomic64_set(dst, *src).

A smp_store_release() is most likely sufficient.

The spec word 'atomic' is pretty confusing. What it is asking for, in
the modern memory model vernacular, is 'write once' for the entire
payload and 'release' to only be observable after other stores made by
the same QP.

'release' semantics will depend on the CPU complex having a release
dependency chain throughout all CPUs that completed any prior response
on the QP. It looks like this is achieved through tasklet scheduling..
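
In rxe terms, a minimal sketch of that (reusing the helper names from the
RFC patch; illustrative only, not a tested change):

        u64 *src = payload_addr(pkt);
        u64 *dst = iova_to_vaddr(qp->resp.mr, qp->resp.va + qp->resp.offset,
                                 sizeof(u64));

        if (!dst || (uintptr_t)dst & 7)
                return RESPST_ERR_MISALIGNED_ATOMIC;

        /* a single untorn 64-bit store, ordered after all prior stores
         * made on behalf of this QP */
        smp_store_release(dst, *src);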

Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-05  1:00           ` yangx.jy
@ 2022-01-06  0:01             ` Jason Gunthorpe
  2022-01-06  1:54               ` yangx.jy
  0 siblings, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2022-01-06  0:01 UTC (permalink / raw)
  To: yangx.jy
  Cc: Tom Talpey, Gromadzki, Tomasz, linux-rdma, yanjun.zhu,
	rpearsonhpe, y-goto, lizhijian

On Wed, Jan 05, 2022 at 01:00:42AM +0000, yangx.jy@fujitsu.com wrote:

> 1) In kernel, current SoftRoCE copies the content of struct rdma to RETH 
> and copies the content of struct atomic to AtomicETH.
> 2) IBTA defines that RDMA Atomic Write uses RETH + payload.
> For these two reasons, I prefer to tweak the existing struct rdma.

No this is basically meaningless

The wr struct is designed as a 'tagged union' where the op specified
which union is in effect.

It turns out that the op generally specifies the network headers as
well, but that is just a side effect.

> >> How about adding a member in struct rdma? for example:
> >> struct {
> >>       uint64_t    remote_addr;
> >>       uint32_t    rkey;
> >>       uint64_t    wr_value;
> >> } rdma;
> >
> > Yes, that's what Tomasz and I were suggesting - a new template for the
> > ATOMIC_WRITE request payload. The three fields are to be supplied by
> > the verb consumer when posting the work request.
> 
> OK, I will update the patch in this way.

We are not extending the ib_send_wr anymore anyhow.

You should implement new ops inside struct ibv_qp_ex as function
calls.

Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-06  0:01             ` Jason Gunthorpe
@ 2022-01-06  1:54               ` yangx.jy
  2022-01-10 15:42                 ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: yangx.jy @ 2022-01-06  1:54 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Tom Talpey, Gromadzki, Tomasz, linux-rdma, yanjun.zhu,
	rpearsonhpe, y-goto, lizhijian

On 2022/1/6 8:01, Jason Gunthorpe wrote:
> On Wed, Jan 05, 2022 at 01:00:42AM +0000, yangx.jy@fujitsu.com wrote:
>
>> 1) In kernel, current SoftRoCE copies the content of struct rdma to RETH
>> and copies the content of struct atomic to AtomicETH.
>> 2) IBTA defines that RDMA Atomic Write uses RETH + payload.
>> For these two reasons, I prefer to tweak the existing struct rdma.
> No this is basically meaningless
>
> The wr struct is designed as a 'tagged union' where the op specified
> which union is in effect.
>
> It turns out that the op generally specifies the network headers as
> well, but that is just a side effect.
>
>>>> How about adding a member in struct rdma? for example:
>>>> struct {
>>>>        uint64_t    remote_addr;
>>>>        uint32_t    rkey;
>>>>        uint64_t    wr_value;
>>>> } rdma;
>>> Yes, that's what Tomasz and I were suggesting - a new template for the
>>> ATOMIC_WRITE request payload. The three fields are to be supplied by
>>> the verb consumer when posting the work request.
>> OK, I will update the patch in this way.
> We are not extending the ib_send_wr anymore anyhow.
>
> You should implement new ops inside struct ibv_qp_ex as function
> calls.

Hi Jason,

For SoftRoCE, do you mean that I only need to extend struct rxe_send_wr 
and add ibv_wr_rdma_atomic_write()?
----------------------------------------------------------------------------------------------------------------------
struct rxe_send_wr {
     ...
         struct {
             __aligned_u64 remote_addr;
+           __aligned_u64 atomic_wr;
             __u32   rkey;
             __u32   reserved;
         } rdma;
     ...
}

static inline void ibv_wr_rdma_atomic_write(struct ibv_qp_ex *qp,
                                            uint32_t rkey,
                                            uint64_t remote_addr)
{
        qp->wr_rdma_atomic_write(qp, rkey, remote_addr);
}
--------------------------------------------------------------------------------------------------------------------
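
A hypothetical caller of such an extended-QP verb, for illustration only
(my_wr_id is made up, and this draft signature still lacks the 8-byte
value parameter):

        ibv_wr_start(qp);
        qp->wr_id = my_wr_id;
        qp->wr_flags = IBV_SEND_SIGNALED;
        ibv_wr_rdma_atomic_write(qp, rkey, remote_addr);
        ret = ibv_wr_complete(qp);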

Besides, could you tell me why we cannot extend struct ibv_send_wr for 
ibv_post_send()?

Best Regards,
Xiao Yang
> Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-05 23:53     ` Jason Gunthorpe
@ 2022-01-06 10:52       ` yangx.jy
  2022-01-06 13:00         ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: yangx.jy @ 2022-01-06 10:52 UTC (permalink / raw)
  To: Jason Gunthorpe, Tom Talpey
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, y-goto, lizhijian, tomasz.gromadzki

On 2022/1/6 7:53, Jason Gunthorpe wrote:
> On Thu, Dec 30, 2021 at 04:39:01PM -0500, Tom Talpey wrote:
>
>> Because RXE is a software provider, I believe the most natural approach
>> here is to use an atomic64_set(dst, *src).
> A smp_store_release() is most likely sufficient.
Hi Jason, Tom

Is smp_store_mb() better here? It calls WRITE_ONCE + smp_mb()/barrier().
I think the semantics of 'atomic write' is to do an atomic write and make
the 8-byte data reach memory.

Best Regards,
Xiao Yang
> The spec word 'atomic' is pretty confusing. What it is asking for, in
> the modern memory model vernacular, is 'write once' for the entire
> payload and 'release' to only be observable after other stores made by
> the same QP.
>
> 'release' semantics will depend on the CPU complex having a release
> dependency chain throughout all CPUs that completed any prior response
> on the QP. It looks like this is achieved through tasklet scheduling..
>
> Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-06 10:52       ` yangx.jy
@ 2022-01-06 13:00         ` Jason Gunthorpe
  2022-01-07  2:15           ` yangx.jy
  0 siblings, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2022-01-06 13:00 UTC (permalink / raw)
  To: yangx.jy
  Cc: Tom Talpey, linux-rdma, yanjun.zhu, rpearsonhpe, y-goto,
	lizhijian, tomasz.gromadzki

On Thu, Jan 06, 2022 at 10:52:47AM +0000, yangx.jy@fujitsu.com wrote:
> On 2022/1/6 7:53, Jason Gunthorpe wrote:
> > On Thu, Dec 30, 2021 at 04:39:01PM -0500, Tom Talpey wrote:
> >
> >> Because RXE is a software provider, I believe the most natural approach
> >> here is to use an atomic64_set(dst, *src).
> > A smp_store_release() is most likely sufficient.
> Hi Jason, Tom
> 
> Is smp_store_mb() better here? It calls WRITE_ONCE + smp_mb()/barrier().
> I think the semantics of 'atomic write' is to do an atomic write and make
> the 8-byte data reach memory.

No, it is not 'data reach memory'; it is a 'release', in that if the CPU
later does an 'acquire' on the written data it is guaranteed to see
all the preceding writes.

Memory barriers are always paired with a read and a write; it is important
to understand what the read half of the pair is.
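
A minimal illustration of the pairing, with made-up names:

        /* writer: the release half */
        data = 42;
        smp_store_release(&flag, 1);

        /* reader: the acquire half */
        if (smp_load_acquire(&flag))
                use(data);      /* guaranteed to see data == 42 */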

Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-06 13:00         ` Jason Gunthorpe
@ 2022-01-07  2:15           ` yangx.jy
  2022-01-07 12:22             ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: yangx.jy @ 2022-01-07  2:15 UTC (permalink / raw)
  To: Jason Gunthorpe, Tom Talpey
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, y-goto, lizhijian, tomasz.gromadzki

On 2022/1/6 21:00, Jason Gunthorpe wrote:
> On Thu, Jan 06, 2022 at 10:52:47AM +0000, yangx.jy@fujitsu.com wrote:
>> On 2022/1/6 7:53, Jason Gunthorpe wrote:
>>> On Thu, Dec 30, 2021 at 04:39:01PM -0500, Tom Talpey wrote:
>>>
>>>> Because RXE is a software provider, I believe the most natural approach
>>>> here is to use an atomic64_set(dst, *src).
>>> A smp_store_release() is most likely sufficient.
>> Hi Jason, Tom
>>
>> Is smp_store_mb() better here? It calls WRITE_ONCE + smp_mb()/barrier().
>> I think the semantics of 'atomic write' is to do an atomic write and make
>> the 8-byte data reach memory.
> > No, it is not 'data reach memory'; it is a 'release', in that if the CPU
> > later does an 'acquire' on the written data it is guaranteed to see
> > all the preceding writes.
Hi Jason, Tom

Sorry for the wrong statement. I mean that the semantics of 'atomic 
write' is to write an 8-byte value atomically and make the 8-byte value 
visible for all CPUs.
'smp_store_release' makes all the preceding writes visible for all CPUs 
before doing an atomic write. I think this guarantee should be done by 
the preceding 'flush'.
'smp_store_mb' does an atomic write and then makes the atomic write 
visible for all CPUs. Subsequent 'flush' is only used to make the atomic 
write persistent.
Please correct me if my understanding is wrong.

Best Regards,
Xiao Yang
> Memory barriers are always paired with a read and a write; it is important
> to understand what the read half of the pair is.
>
> Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-07  2:15           ` yangx.jy
@ 2022-01-07 12:22             ` Jason Gunthorpe
  2022-01-07 15:38               ` Tom Talpey
  0 siblings, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2022-01-07 12:22 UTC (permalink / raw)
  To: yangx.jy
  Cc: Tom Talpey, linux-rdma, yanjun.zhu, rpearsonhpe, y-goto,
	lizhijian, tomasz.gromadzki

On Fri, Jan 07, 2022 at 02:15:25AM +0000, yangx.jy@fujitsu.com wrote:
> On 2022/1/6 21:00, Jason Gunthorpe wrote:
> > On Thu, Jan 06, 2022 at 10:52:47AM +0000, yangx.jy@fujitsu.com wrote:
> >> On 2022/1/6 7:53, Jason Gunthorpe wrote:
> >>> On Thu, Dec 30, 2021 at 04:39:01PM -0500, Tom Talpey wrote:
> >>>
> >>>> Because RXE is a software provider, I believe the most natural approach
> >>>> here is to use an atomic64_set(dst, *src).
> >>> A smp_store_release() is most likely sufficient.
> >> Hi Jason, Tom
> >>
> >> Is smp_store_mb() better here? It calls WRITE_ONCE + smp_mb()/barrier().
> >> I think the semantics of 'atomic write' is to do an atomic write and make
> >> the 8-byte data reach memory.
> > No, it is not 'data reach memory'; it is a 'release', in that if the CPU
> > later does an 'acquire' on the written data it is guaranteed to see
> > all the preceding writes.
> Hi Jason, Tom
> 
> Sorry for the wrong statement. I mean that the semantics of 'atomic 
> write' is to write an 8-byte value atomically and make the 8-byte value 
> visible for all CPUs.
> 'smp_store_release' makes all the preceding writes visible for all CPUs 
> before doing an atomic write. I think this guarantee should be done by 
> the preceding 'flush'.

That isn't what the spec says by my reading, and it would be a useless
primitive to allow the ATOMIC WRITE to become visible before any data
it might be guarding.

> 'smp_store_mb' does an atomic write and then makes the atomic write 
> visible for all CPUs. Subsequent 'flush' is only used to make the atomic 
> write persistent.

persistent is a different subject.

Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-07 12:22             ` Jason Gunthorpe
@ 2022-01-07 15:38               ` Tom Talpey
  2022-01-07 19:28                 ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: Tom Talpey @ 2022-01-07 15:38 UTC (permalink / raw)
  To: Jason Gunthorpe, yangx.jy
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, y-goto, lizhijian, tomasz.gromadzki


On 1/7/2022 7:22 AM, Jason Gunthorpe wrote:
> On Fri, Jan 07, 2022 at 02:15:25AM +0000, yangx.jy@fujitsu.com wrote:
>> On 2022/1/6 21:00, Jason Gunthorpe wrote:
>>> On Thu, Jan 06, 2022 at 10:52:47AM +0000, yangx.jy@fujitsu.com wrote:
>>>> On 2022/1/6 7:53, Jason Gunthorpe wrote:
>>>>> On Thu, Dec 30, 2021 at 04:39:01PM -0500, Tom Talpey wrote:
>>>>>
>>>>>> Because RXE is a software provider, I believe the most natural approach
>>>>>> here is to use an atomic64_set(dst, *src).
>>>>> A smp_store_release() is most likely sufficient.
>>>> Hi Jason, Tom
>>>>
>>>> Is smp_store_mb() better here? It calls WRITE_ONCE + smp_mb()/barrier().
>>>> I think the semantics of 'atomic write' is to do an atomic write and make
>>>> the 8-byte data reach memory.
>>> No, it is not 'data reach memory'; it is a 'release', in that if the CPU
>>> later does an 'acquire' on the written data it is guaranteed to see
>>> all the preceding writes.
>> Hi Jason, Tom
>>
>> Sorry for the wrong statement. I mean that the semantics of 'atomic
>> write' is to write an 8-byte value atomically and make the 8-byte value
>> visible for all CPUs.
>> 'smp_store_release' makes all the preceding writes visible for all CPUs
>> before doing an atomic write. I think this guarantee should be done by
>> the preceding 'flush'.

An ATOMIC_WRITE is not required to provide visibility for prior writes,
but it *must* be ordered after those writes. If a FLUSH is used prior to
ATOMIC_WRITE, then there's nothing to do. But in other workloads, it is
still mandatory to provide the ordering. It's probably easiest, and no
less expensive, to just wmb() before processing the ATOMIC_WRITE.
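
Roughly this, as a sketch against the same src/dst as the RFC patch:

        wmb();                  /* order all prior payload stores */
        WRITE_ONCE(*dst, *src); /* then the untorn 64-bit store */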

Xiao Yang - where do you see the spec requiring that the ATOMIC_WRITE
64-bit payload be made globally visible as part of its execution? That
was not the intention. It is true, however, that some platforms (x86)
may provide visibility as a side effect.

> That isn't what the spec says by my reading, and it would be a useless
> primitive to allow the ATOMIC WRITE to become visible before any data
> it might be guarding.

Right. Visibility of the ATOMIC_WRITE would be even worse than if it
were misordered before the prior writes!

>> 'smp_store_mb' does an atomic write and then makes the atomic write
>> visible for all CPUs. Subsequent 'flush' is only used to make the atomic
>> write persistent.
> 
> persistent is a different subject.

Yep, absolutely.

Tom.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
       [not found]         ` <61D563B4.2070106@fujitsu.com>
@ 2022-01-07 15:50           ` Tom Talpey
  2022-01-07 17:11             ` Jason Gunthorpe
  2022-01-12  9:24             ` yangx.jy
  0 siblings, 2 replies; 33+ messages in thread
From: Tom Talpey @ 2022-01-07 15:50 UTC (permalink / raw)
  To: yangx.jy
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian,
	tomasz.gromadzki


On 1/5/2022 4:24 AM, yangx.jy@fujitsu.com wrote:
> On 2021/12/31 23:09, Tom Talpey wrote:
>> On 12/31/2021 3:29 AM, yangx.jy@fujitsu.com wrote:
>>> On 2021/12/31 5:39, Tom Talpey wrote:
>>>>
>>>> On 12/30/2021 7:14 AM, Xiao Yang wrote:
>>>>> This patch implements RDMA Atomic Write operation for RC service.
>>>>>
>>>>> Signed-off-by: Xiao Yang <yangx.jy@fujitsu.com>
>>>>> ---
>>>>>    drivers/infiniband/sw/rxe/rxe_comp.c   |  4 +++
>>>>>    drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++
>>>>>    drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>>>>>    drivers/infiniband/sw/rxe/rxe_qp.c     |  3 +-
>>>>>    drivers/infiniband/sw/rxe/rxe_req.c    | 10 ++++--
>>>>>    drivers/infiniband/sw/rxe/rxe_resp.c   | 49 
>>>>> +++++++++++++++++++++-----
>>>>>    include/rdma/ib_pack.h                 |  2 ++
>>>>>    include/rdma/ib_verbs.h                |  2 ++
>>>>>    include/uapi/rdma/ib_user_verbs.h      |  2 ++
>>>>>    9 files changed, 81 insertions(+), 12 deletions(-)
>>>>>
>>>>
>>>> <snip>
>>>>
>>>>> +/* Guarantee atomicity of atomic write operations at the machine
>>>>> level. */
>>>>> +static DEFINE_SPINLOCK(atomic_write_ops_lock);
>>>>> +
>>>>> +static enum resp_states process_atomic_write(struct rxe_qp *qp,
>>>>> +                         struct rxe_pkt_info *pkt)
>>>>> +{
>>>>> +    u64 *src, *dst;
>>>>> +    struct rxe_mr *mr = qp->resp.mr;
>>>>> +
>>>>> +    src = payload_addr(pkt);
>>>>> +
>>>>> +    dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset,
>>>>> sizeof(u64));
>>>>> +    if (!dst || (uintptr_t)dst & 7)
>>>>> +        return RESPST_ERR_MISALIGNED_ATOMIC;
>>>>> +
>>>>> +    spin_lock_bh(&atomic_write_ops_lock);
>>>>> +    *dst = *src;
>>>>> +    spin_unlock_bh(&atomic_write_ops_lock);
>>>>
>>>> Spinlock protection is insufficient! Using a spinlock protects only
>>>> the atomicity of the store operation with respect to another remote
>>>> atomic_write operation. But the semantics of RDMA_ATOMIC_WRITE go much
>>>> further. The operation requires a fully atomic bus transaction, across
>>>> the memory complex and with respect to CPU, PCI, and other sources.
>>>> And this guarantee needs to apply to all architectures, including ones
>>>> with noncoherent caches (software consistency).
>>>>
>>>> Because RXE is a software provider, I believe the most natural approach
>>>> here is to use an atomic64_set(dst, *src). But I'm not certain this
>>>> is supported on all the target architectures, therefore it may require
>>>> some additional failure checks, such as stricter alignment than testing
>>>> (dst & 7), or returning a failure if atomic64 operations are not
>>>> available. I especially think the ARM and PPC platforms will need
>>>> an expert review.
>>> Hi Tom,
>>>
>>> Thanks a lot for the detailed suggestion.
>>>
>>> I will try to use atomic64_set() and add additional failure checks.
>>> By the way, process_atomic() uses the same method (spinlock + assignment
>>> expression),
>>> so do you think we also need to update it by using atomic64_set()?
>>
>> It is *criticial* for you to understand that the ATOMIC_WRITE has a
>> much stronger semantic than ordinary RDMA atomics.
>>
>> The ordinary atomics are only atomic from the perspective of a single
>> connection and a single adapter. And the result is not placed in memory
>> atomically, only its execution in the adapter is processed that way.
>> And the final result is only consistently visible to the application
>> after a completion event is delivered. This rather huge compromise is
>> because when atomics were first designed, there were no PCI primitives
>> which supported atomics.
>>
>> An RDMA atomic_write operation is atomic all the way to memory. It
>> must be implemented in a platform operation which is similarly
>> atomic. If implemented by a PCIe adapter, it would use the newer
>> PCIe atomics to perform a locked read-modify-write. If implemented
>> in software as you are doing, it must use a similar local r-m-w
>> instruction, or the platform equivalent if necessary.
> Hi Tom,
> 
> It's OK to replace "spinlock + assignment expression" with
> atomic64_set(), but atomic64_set() likewise only ensures the atomicity
> of the write operation (in other words, it cannot by itself ensure that
> the data is written out to memory).
> Let's look at the definition of atomic64_set() on x86 and arm64:
> ----------------------------------------------------------------
> #define __WRITE_ONCE(x, val)                                            \
> do {                                                                    \
>          *(volatile typeof(x) *)&(x) = (val);                            \
> } while (0)
> 
> x86:
> static inline void arch_atomic64_set(atomic64_t *v, s64 i)
> {
>          __WRITE_ONCE(v->counter, i);
> }
> 
> arm64:
> #define arch_atomic64_set                       arch_atomic_set
> #define arch_atomic_set(v, i)                   
> __WRITE_ONCE(((v)->counter), (i))
> ----------------------------------------------------------------
> We may need both atomic64_set() and wmb() here. Do you think so?
> ----------------------------------------------------------------
> atomic64_set(dst, *src);
> wmb();
> ----------------------------------------------------------------

No, I don't think an explicit wmb() is required *after* the store. The
only requirement is that the ATOMIC_WRITE 64-bit payload is not torn.
It's not necessary to order it with respect to subsequent writes from
other QPs, or even the same one.

> In addition, I think we don't need to check if atomic64_set() is
> available because all arches support atomic64_set().
> Many arches have their own atomic64_set() and the others use the generic
> atomic64_set(), for example:
> ----------------------------------------------------------------
> x86:
> static inline void arch_atomic64_set(atomic64_t *v, s64 i)
> {
>          __WRITE_ONCE(v->counter, i);
> }
> 
> generic:
> void generic_atomic64_set(atomic64_t *v, s64 i)
> {
>          unsigned long flags;
>          raw_spinlock_t *lock = lock_addr(v);
> 
>          raw_spin_lock_irqsave(lock, flags);
>          v->counter = i;
>          raw_spin_unlock_irqrestore(lock, flags);
> }
> EXPORT_SYMBOL(generic_atomic64_set);
> ----------------------------------------------------------------

We're discussing this in the other fork of the thread. I think Jason's
suggestion of using smp_store_mb() has good merit.

> Finally, could you tell me how to add a stricter alignment check than
> testing (dst & 7)?

Alignment is only part of the issue. The instruction set, type of
mapping, and the width of the bus are also important to determine if
a torn write might occur.

Off the top of my head, an uncached mapping would be a problem on an
architecture which did not support 64-bit stores. A cache will merge
32-bit writes, which won't happen if it's disabled. I guess it could
be argued this is uninteresting, in a modern platform, but...

A second might be MMIO space, or a similar dedicated memory block such
as a GPU. It's completely a platform question whether these can provide
an untorn 64-bit write semantic.

Keep in mind that ATOMIC_WRITE allows an application to poll a 64-bit
location to receive a result, without reaping any completion first. The
value must appear after all prior operations have completed. And the
entire 64-bit value must be intact. That's it.
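
A sketch of such a userspace poller, with hypothetical names (seqp points
into the registered buffer that the peer's ATOMIC_WRITE targets):

        uint64_t v;

        do {
                v = __atomic_load_n(seqp, __ATOMIC_ACQUIRE);
        } while (v != expected);
        /* everything written before the peer's ATOMIC_WRITE is visible */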

Tom.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-07 15:50           ` Tom Talpey
@ 2022-01-07 17:11             ` Jason Gunthorpe
  2022-01-12  9:24             ` yangx.jy
  1 sibling, 0 replies; 33+ messages in thread
From: Jason Gunthorpe @ 2022-01-07 17:11 UTC (permalink / raw)
  To: Tom Talpey
  Cc: yangx.jy, linux-rdma, yanjun.zhu, rpearsonhpe, y-goto, lizhijian,
	tomasz.gromadzki

On Fri, Jan 07, 2022 at 10:50:47AM -0500, Tom Talpey wrote:

> Keep in mind that ATOMIC_WRITE allows an application to poll a 64-bit
> location to receive a result, without reaping any completion first. The
> value must appear after all prior operations have completed. And the
> entire 64-bit value must be intact. That's it.

This is exactly why it is a release operation.

Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-07 15:38               ` Tom Talpey
@ 2022-01-07 19:28                 ` Jason Gunthorpe
  2022-01-07 20:11                   ` Tom Talpey
  0 siblings, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2022-01-07 19:28 UTC (permalink / raw)
  To: Tom Talpey
  Cc: yangx.jy, linux-rdma, yanjun.zhu, rpearsonhpe, y-goto, lizhijian,
	tomasz.gromadzki

On Fri, Jan 07, 2022 at 10:38:30AM -0500, Tom Talpey wrote:
> 
> On 1/7/2022 7:22 AM, Jason Gunthorpe wrote:
> > On Fri, Jan 07, 2022 at 02:15:25AM +0000, yangx.jy@fujitsu.com wrote:
> > > On 2022/1/6 21:00, Jason Gunthorpe wrote:
> > > > On Thu, Jan 06, 2022 at 10:52:47AM +0000, yangx.jy@fujitsu.com wrote:
> > > > > On 2022/1/6 7:53, Jason Gunthorpe wrote:
> > > > > > On Thu, Dec 30, 2021 at 04:39:01PM -0500, Tom Talpey wrote:
> > > > > > 
> > > > > > > Because RXE is a software provider, I believe the most natural approach
> > > > > > > here is to use an atomic64_set(dst, *src).
> > > > > > A smp_store_release() is most likely sufficient.
> > > > > Hi Jason, Tom
> > > > > 
> > > > > Is smp_store_mb() better here? It calls WRITE_ONCE + smp_mb()/barrier().
> > > > > I think the semantics of 'atomic write' is to do an atomic write and make
> > > > > the 8-byte data reach memory.
> > > > No, it is not 'data reach memory'; it is a 'release', in that if the CPU
> > > > later does an 'acquire' on the written data it is guaranteed to see
> > > > all the preceding writes.
> > > Hi Jason, Tom
> > > 
> > > Sorry for the wrong statement. I mean that the semantics of 'atomic
> > > write' is to write an 8-byte value atomically and make the 8-byte value
> > > visible for all CPUs.
> > > 'smp_store_release' makes all the preceding writes visible for all CPUs
> > > before doing an atomic write. I think this guarantee should be done by
> > > the preceding 'flush'.
> 
> An ATOMIC_WRITE is not required to provide visibility for prior writes,
> but it *must* be ordered after those writes. 

It doesn't make much sense to really talk about "visibility"; it is
very rare that something would need to fully stop until other
things can see it.

What we generally talk about these days is only order.

This is what release/acquire is about. smp_store_release() says that
someone doing smp_load_acquire() on the same data is guaranteed to
observe the previous writes if it observes the data that was written.

E.g. if you release a head pointer in a queue, then acquiring the new
head pointer value also guarantees that all data in the queue is
visible to you.

However, release doesn't say anything about *when* other observers may
have this visibility, and it certainly doesn't stop and wait until all
observers are guaranteed to see the new data.

> ATOMIC_WRITE, then there's nothing to do. But in other workloads, it is
> still mandatory to provide the ordering. It's probably easiest, and no
> less expensive, to just wmb() before processing the ATOMIC_WRITE.

Which is what smp_store_release() does:

#define __smp_store_release(p, v)                                       \
do {                                                                    \
        __smp_mb();                                                     \
        WRITE_ONCE(*p, v);                                              \
} while (0)

Notice this is the opposite of what smp_store_mb() does:

#define __smp_store_mb(var, value)  \
do { \
        WRITE_ONCE(var, value); \
        __smp_mb(); \
} while (0)

Which is *not* a release and does *not* guarantee order properties. It
is very similar to what FLUSH would provide in IBA, and very few
things benefit from this. (Indeed, I suspect many of the users in the
kernel are wrong, looking at you SIW..)

> Xiao Yang - where do you see the spec requiring that the ATOMIC_WRITE
> 64-bit payload be made globally visible as part of its execution? 

I don't see this either. I don't think IBA contemplates something
analogous to 'sequentially consistent ordering'.

Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-07 19:28                 ` Jason Gunthorpe
@ 2022-01-07 20:11                   ` Tom Talpey
  0 siblings, 0 replies; 33+ messages in thread
From: Tom Talpey @ 2022-01-07 20:11 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: yangx.jy, linux-rdma, yanjun.zhu, rpearsonhpe, y-goto, lizhijian,
	tomasz.gromadzki

On 1/7/2022 2:28 PM, Jason Gunthorpe wrote:
> On Fri, Jan 07, 2022 at 10:38:30AM -0500, Tom Talpey wrote:
>>
>> On 1/7/2022 7:22 AM, Jason Gunthorpe wrote:
>>> On Fri, Jan 07, 2022 at 02:15:25AM +0000, yangx.jy@fujitsu.com wrote:
>>>> On 2022/1/6 21:00, Jason Gunthorpe wrote:
>>>>> On Thu, Jan 06, 2022 at 10:52:47AM +0000, yangx.jy@fujitsu.com wrote:
>>>>>> On 2022/1/6 7:53, Jason Gunthorpe wrote:
>>>>>>> On Thu, Dec 30, 2021 at 04:39:01PM -0500, Tom Talpey wrote:
>>>>>>>
>>>>>>>> Because RXE is a software provider, I believe the most natural approach
>>>>>>>> here is to use an atomic64_set(dst, *src).
>>>>>>> A smp_store_release() is most likely sufficient.
>>>>>> Hi Jason, Tom
>>>>>>
>>>>>> Is smp_store_mb() better here? It calls WRITE_ONCE + smp_mb()/barrier().
>>>>>> I think the semantics of 'atomic write' is to do an atomic write and make
>>>>>> the 8-byte data reach memory.
>>>>> No, it is not 'data reach memory'; it is a 'release', in that if the CPU
>>>>> later does an 'acquire' on the written data it is guaranteed to see
>>>>> all the preceding writes.
>>>> Hi Jason, Tom
>>>>
>>>> Sorry for the wrong statement. I mean that the semantics of 'atomic
>>>> write' is to write an 8-byte value atomically and make the 8-byte value
>>>> visible for all CPUs.
>>>> 'smp_store_release' makes all the preceding writes visible for all CPUs
>>>> before doing an atomic write. I think this guarantee should be done by
>>>> the preceding 'flush'.
>>
>> An ATOMIC_WRITE is not required to provide visibility for prior writes,
>> but it *must* be ordered after those writes.
> 
> It doesn't make much sense to really talk about "visibility"; it is
> very rare that something would need to fully stop until other
> things can see it.

Jason, sorry I wasn't precise enough in my terminology. Yes, the spec
is not concerned with "visibility" full stop. The new concept of
"global visibility" is added, and that is what I was pointing out is *not*
required here.

> What we generally talk about these days is only order.
> 
> This is what release/acquire is about. smp_store_release() says that
> someone doing smp_load_acquire() on the same data is guaranteed to
> observe the previous writes if it observes the data that was written.

I like it! This definitely looks like the proper semantic here.

Tom.

> E.g. if you release a head pointer in a queue, then acquiring the new
> head pointer value also guarantees that all data in the queue is
> visible to you.
> 
> However, release doesn't say anything about *when* other observers may
> have this visibility, and it certainly doesn't stop and wait until all
> observers are guaranteed to see the new data.
> 
>> ATOMIC_WRITE, then there's nothing to do. But in other workloads, it is
>> still mandatory to provide the ordering. It's probably easiest, and no
>> less expensive, to just wmb() before processing the ATOMIC_WRITE.
> 
> Which is what smp_store_release() does:
> 
> #define __smp_store_release(p, v)                                       \
> do {                                                                    \
>          __smp_mb();                                                     \
>          WRITE_ONCE(*p, v);                                              \
> } while (0)
> 
> Notice this is the opposite of what smp_store_mb() does:
> 
> #define __smp_store_mb(var, value)  \
> do { \
>          WRITE_ONCE(var, value); \
>          __smp_mb(); \
> } while (0)
> 
> Which is *not* a release and does *not* guarantee order properties. It
> is very similar to what FLUSH would provide in IBA, and very few
> things benefit from this. (Indeed, I suspect many of the users in the
> kernel are wrong, looking at you SIW..)
> 
>> Xiao Yang - where do you see the spec requiring that the ATOMIC_WRITE
>> 64-bit payload be made globally visible as part of its execution?
> 
> I don't see this either. I don't think IBA contemplates something
> analogous to 'sequentially consistent ordering'.
> 
> Jason
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-06  1:54               ` yangx.jy
@ 2022-01-10 15:42                 ` Jason Gunthorpe
  2022-01-11  2:34                   ` yangx.jy
  0 siblings, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2022-01-10 15:42 UTC (permalink / raw)
  To: yangx.jy
  Cc: Tom Talpey, Gromadzki, Tomasz, linux-rdma, yanjun.zhu,
	rpearsonhpe, y-goto, lizhijian

On Thu, Jan 06, 2022 at 01:54:30AM +0000, yangx.jy@fujitsu.com wrote:
> On 2022/1/6 8:01, Jason Gunthorpe wrote:
> > On Wed, Jan 05, 2022 at 01:00:42AM +0000, yangx.jy@fujitsu.com wrote:
> >
> >> 1) In kernel, current SoftRoCE copies the content of struct rdma to RETH
> >> and copies the content of struct atomic to AtomicETH.
> >> 2) IBTA defines that RDMA Atomic Write uses RETH + payload.
> >> For these two reasons, I prefer to tweak the existing struct rdma.
> > No this is basically meaningless
> >
> > The wr struct is designed as a 'tagged union' where the op specified
> > which union is in effect.
> >
> > It turns out that the op generally specifies the network headers as
> > well, but that is just a side effect.
> >
> >>>> How about adding a member in struct rdma? for example:
> >>>> struct {
> >>>>        uint64_t    remote_addr;
> >>>>        uint32_t    rkey;
> >>>>        uint64_t    wr_value;
> >>>> } rdma;
> >>> Yes, that's what Tomasz and I were suggesting - a new template for the
> >>> ATOMIC_WRITE request payload. The three fields are to be supplied by
> >>> the verb consumer when posting the work request.
> >> OK, I will update the patch in this way.
> > We are not extending the ib_send_wr anymore anyhow.
> >
> > You should implement new ops inside struct ibv_qp_ex as function
> > calls.
> 
> Hi Jason,
> 
> For SoftRoCE, do you mean that I only need to extend struct rxe_send_wr 
> and add ibv_wr_rdma_atomic_write()?
> struct rxe_send_wr {
>      ...
>          struct {
>              __aligned_u64 remote_addr;
> +           __aligned_u64 atomic_wr;
>              __u32   rkey;
>              __u32   reserved;
>          } rdma;

You can't make a change like this to anything in
include/uapi/rdma/rdma_user_rxe.h; it has to remain compatible.

 
> static inline void ibv_wr_rdma_atomic_write(struct ibv_qp_ex *qp,
>                                             uint32_t rkey,
>                                             uint64_t remote_addr)
> {
>         qp->wr_rdma_atomic_write(qp, rkey, remote_addr);
> }

Yes, something like that
 
> Besides, could you tell me why we cannot extend struct ibv_send_wr for 
> ibv_post_send()?

The ABI is not allowed to change, so what is doable with it is very
limited.

Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-10 15:42                 ` Jason Gunthorpe
@ 2022-01-11  2:34                   ` yangx.jy
  2022-01-11 23:29                     ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: yangx.jy @ 2022-01-11  2:34 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Tom Talpey, Gromadzki, Tomasz, linux-rdma, yanjun.zhu,
	rpearsonhpe, y-goto, lizhijian

On 2022/1/10 23:42, Jason Gunthorpe wrote:
> On Thu, Jan 06, 2022 at 01:54:30AM +0000, yangx.jy@fujitsu.com wrote:
>> On 2022/1/6 8:01, Jason Gunthorpe wrote:
>>> On Wed, Jan 05, 2022 at 01:00:42AM +0000, yangx.jy@fujitsu.com wrote:
>>>
>>>> 1) In kernel, current SoftRoCE copies the content of struct rdma to RETH
>>>> and copies the content of struct atomic to AtomicETH.
>>>> 2) IBTA defines that RDMA Atomic Write uses RETH + payload.
>>>> For these two reasons, I prefer to tweak the existing struct rdma.
>>> No this is basically meaningless
>>>
>>> The wr struct is designed as a 'tagged union' where the op specified
>>> which union is in effect.
>>>
>>> It turns out that the op generally specifies the network headers as
>>> well, but that is just a side effect.
>>>
>>>>>> How about adding a member in struct rdma? for example:
>>>>>> struct {
>>>>>>         uint64_t    remote_addr;
>>>>>>         uint32_t    rkey;
>>>>>>         uint64_t    wr_value;
>>>>>> } rdma;
>>>>> Yes, that's what Tomasz and I were suggesting - a new template for the
>>>>> ATOMIC_WRITE request payload. The three fields are to be supplied by
>>>>> the verb consumer when posting the work request.
>>>> OK, I will update the patch in this way.
>>> We are not extending the ib_send_wr anymore anyhow.
>>>
>>> You should implement new ops inside struct ibv_qp_ex as function
>>> calls.
>> Hi Jason,
>>
>> For SoftRoCE, do you mean that I only need to extend struct rxe_send_wr
>> and add ibv_wr_rdma_atomic_write()?
>> struct rxe_send_wr {
>>       ...
>>           struct {
>>               __aligned_u64 remote_addr;
>> +           __aligned_u64 atomic_wr;
>>               __u32   rkey;
>>               __u32   reserved;
>>           } rdma;
> You can't make a change like this to anything in
> include/uapi/rdma/rdma_user_rxe.h; it has to remain compatible.
Hi Jason,

How about adding an atomic_wr member at the end of struct rdma? Like this:
------------------------------------------------------
struct rxe_send_wr {
     ...
         struct {
             __aligned_u64 remote_addr;
             __u32   rkey;
             __u32   reserved;
+          __aligned_u64 atomic_wr;
         } rdma;
     ...
}
------------------------------------------------------
If that is also wrong, how should struct rdma be extended?
>
>> static inline void ibv_wr_rdma_atomic_write(struct ibv_qp_ex *qp,
>>                                             uint32_t rkey,
>>                                             uint64_t remote_addr)
>> {
>>         qp->wr_rdma_atomic_write(qp, rkey, remote_addr);
>> }
> Yes, something like that
>
>> Besides, could you tell me why we cannot extend struct ibv_send_wr for
>> ibv_post_send()?
> The ABI is not allowed to change, so what is doable with it is very
> limited.
Thanks for your explanation.

Best Regards,
Xiao Yang
> Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-11  2:34                   ` yangx.jy
@ 2022-01-11 23:29                     ` Jason Gunthorpe
  0 siblings, 0 replies; 33+ messages in thread
From: Jason Gunthorpe @ 2022-01-11 23:29 UTC (permalink / raw)
  To: yangx.jy
  Cc: Tom Talpey, Gromadzki, Tomasz, linux-rdma, yanjun.zhu,
	rpearsonhpe, y-goto, lizhijian

On Tue, Jan 11, 2022 at 02:34:51AM +0000, yangx.jy@fujitsu.com wrote:

> struct rxe_send_wr {
>      ...
>          struct {
>              __aligned_u64 remote_addr;
>              __u32   rkey;
>              __u32   reserved;
> +          __aligned_u64 atomic_wr;
>          } rdma;
>      ...
> }
> If it is also wrong, how to extend struct rdma?

I think that could be OK; you need to confirm that the size of
rxe_send_wr didn't change.
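
(e.g. with a build-time assertion somewhere in rxe; the size below is
only a placeholder, not the real value:)

        BUILD_BUG_ON(sizeof(struct rxe_send_wr) != 64);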

Jason

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-07 15:50           ` Tom Talpey
  2022-01-07 17:11             ` Jason Gunthorpe
@ 2022-01-12  9:24             ` yangx.jy
  1 sibling, 0 replies; 33+ messages in thread
From: yangx.jy @ 2022-01-12  9:24 UTC (permalink / raw)
  To: Tom Talpey
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian,
	tomasz.gromadzki

On 2022/1/7 23:50, Tom Talpey wrote:
> Alignment is only part of the issue. The instruction set, type of
> mapping, and the width of the bus are also important to determine if
> a torn write might occur.
>
> Off the top of my head, an uncached mapping would be a problem on an
> architecture which did not support 64-bit stores. A cache will merge
> 32-bit writes, which won't happen if it's disabled. I guess it could
> be argued this is uninteresting, in a modern platform, but...
>
> A second might be MMIO space, or a similar dedicated memory block such
> as a GPU. It's completely a platform question whether these can provide
> an untorn 64-bit write semantic. 
Hi Tom,

These issues come from different arches or devices.
Either atomic64_set() or smp_store_release() returns void, so how can we
check for these issues in the SoftRoCE driver?
Sorry, it's not clear to me which stricter checks to add.

Best Regards,
Xiao Yang

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-01-04 15:17         ` Tom Talpey
  2022-01-05  1:00           ` yangx.jy
@ 2022-02-11 13:18           ` Gromadzki, Tomasz
  1 sibling, 0 replies; 33+ messages in thread
From: Gromadzki, Tomasz @ 2022-02-11 13:18 UTC (permalink / raw)
  To: Tom Talpey, yangx.jy
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian

Hi

I have one question related to the atomic write API and the ibv_wr extension:

> > void (*wr_rdma_atomic_write)(struct ibv_qp_ex *qp, uint32_t rkey,
> >			     uint64_t remote_addr, uint64_t atomic_wr);

> > struct {
> >       uint64_t    remote_addr;
> >       uint32_t    rkey;
>       uint64_t    wr_value;
> > } rdma;


In both places, the atomic value is defined as uint64, but the IBTA spec defines it as an 8-byte array.

"The message size must be 8 bytes and is written to a contiguous range of the destination QP's virtual address space"

Would it be better to have
uint8_t atomic_wr[8] in the wr_rdma_atomic_write(...) declaration

and

uint8_t wr_value[8] in struct { ... } rdma?

I have checked CmpSwap and FetchAdd, and in those cases the IBTA spec explicitly speaks of 64-bit values.
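
That is, something along these lines, as a sketch of the idea:

static inline void ibv_wr_rdma_atomic_write(struct ibv_qp_ex *qp,
                                            uint32_t rkey,
                                            uint64_t remote_addr,
                                            const uint8_t atomic_wr[8]);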

Regards
Tomasz


> -----Original Message-----
> From: Tom Talpey <tom@talpey.com>
> Sent: Tuesday, January 4, 2022 4:17 PM
> To: yangx.jy@fujitsu.com; Gromadzki, Tomasz
> <tomasz.gromadzki@intel.com>
> Cc: linux-rdma@vger.kernel.org; yanjun.zhu@linux.dev;
> rpearsonhpe@gmail.com; jgg@nvidia.com; y-goto@fujitsu.com;
> lizhijian@fujitsu.com
> Subject: Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
> 
> 
> On 1/4/2022 4:28 AM, yangx.jy@fujitsu.com wrote:
> > On 2021/12/31 14:30, yangx.jy@fujitsu.com wrote:
> >> On 2021/12/31 5:42, Tom Talpey wrote:
> >>> On 12/30/2021 2:21 PM, Gromadzki, Tomasz wrote:
> >>>> 1)
> >>>>> rdma_post_atomic_writev(struct rdma_cm_id *id, void *context,
> >>>>> struct ibv_sge *sgl,
> >>>>>               int nsge, int flags, uint64_t remote_addr, uint32_t
> >>>>> rkey)
> >>>> Do we need this API at all?
> >>>> Other atomic operations (compare_swap/add) do not use struct
> >>>> ibv_sge at all but have a dedicated place in struct ib_send_wr {
> >>>> ...
> >>>>           struct {
> >>>>               uint64_t    remote_addr;
> >>>>               uint64_t    compare_add;
> >>>>               uint64_t    swap;
> >>>>               uint32_t    rkey;
> >>>>           } atomic;
> >>>> ...
> >>>> }
> >>>>
> >>>> Would it be better to reuse (extend) any existing field?
> >>>> i.e.
> >>>>           struct {
> >>>>               uint64_t    remote_addr;
> >>>>               uint64_t    compare_add;
> >>>>               uint64_t    swap_write;
> >>>>               uint32_t    rkey;
> >>>>           } atomic;
> >>> Agreed. Passing the data to be written as an SGE is unnatural since
> >>> it is always exactly 64 bits. Tweaking the existing atomic parameter
> >>> block as Tomasz suggests is the best approach.
> >> Hi Tomasz, Tom
> >>
> >> Thanks for your quick reply.
> >>
> >> If we want to pass the 8-byte value by tweaking struct atomic in user
> >> space, why don't we transfer the 8-byte value by the ATOMIC Extended
> >> Transport Header (AtomicETH) in kernel space?
> >> PS: IBTA defines that the 8-byte value is transferred by RDMA Extended
> >> Transport Header (RETH) + payload.
> >>
> >> Is it inconsistent to use struct atomic in user space and RETH +
> >> payload in kernel space?
> > Hi Tomasz, Tom
> >
> > I think the following rules are right:
> > RDMA READ/WRITE should use struct rdma in libibverbs and RETH + payload
> > in the kernel.
> > RDMA Atomic should use struct atomic in libibverbs and AtomicETH in the kernel.
> >
> > According to IBTA's definition, RDMA Atomic Write should use struct
> > rdma in libibverbs.
> 
> I don't quite understand this statement, the IBTA defines the protocol but
> does not define the API at such a level.
> 
> I do however agree with this:
> 
> > How about adding a member in struct rdma? for example:
> > struct {
> >       uint64_t    remote_addr;
> >       uint32_t    rkey;
> >       uint64_t    wr_value;
> > } rdma;
> 
> Yes, that's what Tomasz and I were suggesting - a new template for the
> ATOMIC_WRITE request payload. The three fields are to be supplied by the
> verb consumer when posting the work request.
> 
> Tom.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2021-12-30 12:14 [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
                   ` (2 preceding siblings ...)
  2021-12-30 19:21 ` [RFC PATCH 0/2] " Gromadzki, Tomasz
@ 2022-02-17  3:50 ` yangx.jy
  2022-02-19 10:37   ` Leon Romanovsky
  3 siblings, 1 reply; 33+ messages in thread
From: yangx.jy @ 2022-02-17  3:50 UTC (permalink / raw)
  To: linux-rdma, leon
  Cc: yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian, tomasz.gromadzki

On 2021/12/30 20:14, Xiao Yang wrote:
> The IB SPEC v1.5[1][2] added new RDMA operations (Atomic Write and Flush).
> This patchset makes SoftRoCE support new RDMA Atomic Write on RC service.
>
> I added RDMA Atomic Write API and a rdma_atomic_write example on my
> rdma-core repository[3].  You can verify the patchset by building and
> running the rdma_atomic_write example.
> server:
> $ ./rdma_atomic_write_server -s [server_address] -p [port_number]
> client:
> $ ./rdma_atomic_write_client -s [server_address] -p [port_number]

Hi Leon,

When do you think I should post the userspace patchset to rdma-core?

I'm not sure whether I should post it to rdma-core as a PR before the
kernel patchset is merged.

Best Regards,

Xiao Yang

>
> [1]: https://www.infinibandta.org/ibta-specification/ # login required
> [2]: https://www.infinibandta.org/wp-content/uploads/2021/08/IBTA-Overview-of-IBTA-Volume-1-Release-1.5-and-MPE-2021-08-17-Secure.pptx
> [3]: https://github.com/yangx-jy/rdma-core
>
> BTW: This patchset also needs the following fix.
> https://www.spinics.net/lists/linux-rdma/msg107838.html
>
> Xiao Yang (2):
>    RDMA/rxe: Rename send_atomic_ack() and atomic member of struct
>      resp_res
>    RDMA/rxe: Add RDMA Atomic Write operation
>
>   drivers/infiniband/sw/rxe/rxe_comp.c   |  4 ++
>   drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++
>   drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
>   drivers/infiniband/sw/rxe/rxe_qp.c     |  5 ++-
>   drivers/infiniband/sw/rxe/rxe_req.c    | 10 +++--
>   drivers/infiniband/sw/rxe/rxe_resp.c   | 59 ++++++++++++++++++++------
>   drivers/infiniband/sw/rxe/rxe_verbs.h  |  2 +-
>   include/rdma/ib_pack.h                 |  2 +
>   include/rdma/ib_verbs.h                |  2 +
>   include/uapi/rdma/ib_user_verbs.h      |  2 +
>   10 files changed, 88 insertions(+), 19 deletions(-)
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation
  2022-02-17  3:50 ` yangx.jy
@ 2022-02-19 10:37   ` Leon Romanovsky
  0 siblings, 0 replies; 33+ messages in thread
From: Leon Romanovsky @ 2022-02-19 10:37 UTC (permalink / raw)
  To: yangx.jy
  Cc: linux-rdma, yanjun.zhu, rpearsonhpe, jgg, y-goto, lizhijian,
	tomasz.gromadzki

On Thu, Feb 17, 2022 at 03:50:02AM +0000, yangx.jy@fujitsu.com wrote:
> On 2021/12/30 20:14, Xiao Yang wrote:
> > The IB SPEC v1.5[1][2] added new RDMA operations (Atomic Write and Flush).
> > This patchset makes SoftRoCE support new RDMA Atomic Write on RC service.
> >
> > I added RDMA Atomic Write API and a rdma_atomic_write example on my
> > rdma-core repository[3].  You can verify the patchset by building and
> > running the rdma_atomic_write example.
> > server:
> > $ ./rdma_atomic_write_server -s [server_address] -p [port_number]
> > client:
> > $ ./rdma_atomic_write_client -s [server_address] -p [port_number]
> 
> Hi Leon,
> 
> When do you think I should post the userspace patchset to rdma-core?
> 
> I'm not sure whether I should post it to rdma-core as a PR before the
> kernel patchset is merged.

For any UAPI changes, we require the actual user space part to be
presented. So the PR to rdma-core is needed in order to merge the kernel
series.

In this PR, the first patch is created with the "kernel-headers/update
--not-final" script against a local version of the kernel headers. Once
the kernel part is merged, you will update that first patch and
force-push the rdma-core PR.
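
(e.g. something like this from the rdma-core tree, where the kernel tree
path is only an example:)

$ ./kernel-headers/update --not-final ~/src/linux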

Thanks

> 
> Best Regards,
> 
> Xiao Yang
> 
> >
> > [1]: https://www.infinibandta.org/ibta-specification/ # login required
> > [2]: https://www.infinibandta.org/wp-content/uploads/2021/08/IBTA-Overview-of-IBTA-Volume-1-Release-1.5-and-MPE-2021-08-17-Secure.pptx
> > [3]: https://github.com/yangx-jy/rdma-core
> >
> > BTW: This patchset also needs the following fix.
> > https://www.spinics.net/lists/linux-rdma/msg107838.html
> >
> > Xiao Yang (2):
> >    RDMA/rxe: Rename send_atomic_ack() and atomic member of struct
> >      resp_res
> >    RDMA/rxe: Add RDMA Atomic Write operation
> >
> >   drivers/infiniband/sw/rxe/rxe_comp.c   |  4 ++
> >   drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++
> >   drivers/infiniband/sw/rxe/rxe_opcode.h |  3 ++
> >   drivers/infiniband/sw/rxe/rxe_qp.c     |  5 ++-
> >   drivers/infiniband/sw/rxe/rxe_req.c    | 10 +++--
> >   drivers/infiniband/sw/rxe/rxe_resp.c   | 59 ++++++++++++++++++++------
> >   drivers/infiniband/sw/rxe/rxe_verbs.h  |  2 +-
> >   include/rdma/ib_pack.h                 |  2 +
> >   include/rdma/ib_verbs.h                |  2 +
> >   include/uapi/rdma/ib_user_verbs.h      |  2 +
> >   10 files changed, 88 insertions(+), 19 deletions(-)
> >

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2022-02-19 10:37 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-30 12:14 [RFC PATCH 0/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
2021-12-30 12:14 ` [RFC PATCH 1/2] RDMA/rxe: Rename send_atomic_ack() and atomic member of struct resp_res Xiao Yang
2021-12-30 12:14 ` [RFC PATCH 2/2] RDMA/rxe: Add RDMA Atomic Write operation Xiao Yang
2021-12-30 21:39   ` Tom Talpey
2021-12-31  8:29     ` yangx.jy
2021-12-31 15:09       ` Tom Talpey
     [not found]         ` <61D563B4.2070106@fujitsu.com>
2022-01-07 15:50           ` Tom Talpey
2022-01-07 17:11             ` Jason Gunthorpe
2022-01-12  9:24             ` yangx.jy
2022-01-05 23:53     ` Jason Gunthorpe
2022-01-06 10:52       ` yangx.jy
2022-01-06 13:00         ` Jason Gunthorpe
2022-01-07  2:15           ` yangx.jy
2022-01-07 12:22             ` Jason Gunthorpe
2022-01-07 15:38               ` Tom Talpey
2022-01-07 19:28                 ` Jason Gunthorpe
2022-01-07 20:11                   ` Tom Talpey
2021-12-31  3:01   ` lizhijian
2021-12-31  6:02     ` yangx.jy
2021-12-30 19:21 ` [RFC PATCH 0/2] " Gromadzki, Tomasz
2021-12-30 21:42   ` Tom Talpey
2021-12-31  6:30     ` yangx.jy
2022-01-04  9:28       ` yangx.jy
2022-01-04 15:17         ` Tom Talpey
2022-01-05  1:00           ` yangx.jy
2022-01-06  0:01             ` Jason Gunthorpe
2022-01-06  1:54               ` yangx.jy
2022-01-10 15:42                 ` Jason Gunthorpe
2022-01-11  2:34                   ` yangx.jy
2022-01-11 23:29                     ` Jason Gunthorpe
2022-02-11 13:18           ` Gromadzki, Tomasz
2022-02-17  3:50 ` yangx.jy
2022-02-19 10:37   ` Leon Romanovsky
