* [PATCH v2 0/4] NTB : Introduce message library
@ 2018-05-06 19:20 Atul Raut
  2018-05-06 19:20 ` [PATCH v2 1/4] " Atul Raut
                   ` (4 more replies)
  0 siblings, 5 replies; 25+ messages in thread
From: Atul Raut @ 2018-05-06 19:20 UTC (permalink / raw)
  To: linux-ntb; +Cc: Atul Raut

Hi All,

This is v2 of the cleanup series, which enables support in the ntb_transport
layer for message-register-based devices. It refactors the ntb_perf module to
pull a library out of it, so that other client modules can make use of it.

This cleanup addresses some of the comments from Allen and Dave.

Thanks & Regards,
Atul Raut

Atul Raut (4):
  NTB : Introduce message library
  NTB :  Add message library NTB API
  NTB : Modification to ntb_perf module
  NTB : Add support to message registers based devices

 drivers/ntb/ntb.c           | 222 +++++++++++++++++++++++++++
 drivers/ntb/ntb_transport.c | 357 ++++++++++++++++++++++++++++++++------------
 drivers/ntb/test/ntb_perf.c | 347 +++++-------------------------------------
 include/linux/ntb.h         | 163 ++++++++++++++++++++
 4 files changed, 685 insertions(+), 404 deletions(-)

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



* [PATCH v2 1/4] NTB : Introduce message library
  2018-05-06 19:20 [PATCH v2 0/4] NTB : Introduce message library Atul Raut
@ 2018-05-06 19:20 ` Atul Raut
  2018-05-07  5:29   ` Logan Gunthorpe
  2018-05-11 22:40   ` Serge Semin
  2018-05-06 19:20 ` [PATCH v2 2/4] NTB : Add message library NTB API Atul Raut
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 25+ messages in thread
From: Atul Raut @ 2018-05-06 19:20 UTC (permalink / raw)
  To: linux-ntb; +Cc: Atul Raut

Create a library by refactoring common code out of the
ntb_perf module so that all clients can make use of it.
The library is built on top of the scratchpad- and
message-register-based APIs.

Signed-off-by: Atul Raut <araut@codeaurora.org>
---
 include/linux/ntb.h | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 181d166..19fe973 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -58,6 +58,8 @@
 
 #include <linux/completion.h>
 #include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/io.h>
 
 struct ntb_client;
 struct ntb_dev;
@@ -163,6 +165,56 @@ enum ntb_default_port {
 #define NTB_DEF_PEER_CNT	(1)
 #define NTB_DEF_PEER_IDX	(0)
 
+enum nt_cmd {
+	NT_CMD_INVAL = -1,/* invalid spad command */
+	NT_CMD_SSIZE = 0, /* send out buffer size */
+	NT_CMD_RSIZE = 1, /* recv in  buffer size */
+	NT_CMD_SXLAT = 2, /* send in  buffer xlat */
+	NT_CMD_RXLAT = 3, /* recv out buffer xlat */
+	NT_CMD_CLEAR = 4, /* clear allocated memory */
+	NT_STS_DONE  = 5, /* init is done */
+	NT_STS_LNKUP = 6, /* link up state flag */
+	NT_QP_LINKS        = 7, /* available QP link */
+	NT_CMD_NUM_MWS        = 8, /* number of memory windows */
+	NT_CMD_NUM_QPS        = 9, /* number of QP */
+	NT_CMD_NTB_VERSION    = 10, /* ntb version */
+};
+
+/* Scratchpad/Message IO operations */
+struct msg_type {
+	int (*cmd_send)(struct ntb_dev *nt, int pidx, enum nt_cmd cmd,
+			int cmd_wid, u64 data);
+	int (*cmd_recv)(struct ntb_dev *nt, int *pidx, enum nt_cmd *cmd,
+			int *cmd_wid, u64 *data);
+};
+
+#define MSG_TRIES		50
+#define MSG_UDELAY_LOW		1000
+#define MSG_UDELAY_HIGH		2000
+
+/*
+ * Scratchpad-based commands interface
+ */
+#define NT_SPAD_CNT(_pcnt) \
+	(3*((_pcnt) + 1))
+#define NT_SPAD_CMD(_gidx) \
+	(3*(_gidx))
+#define NT_SPAD_LDATA(_gidx) \
+	(3*(_gidx) + 1)
+#define NT_SPAD_HDATA(_gidx) \
+	(3*(_gidx) + 2)
+#define NT_SPAD_NOTIFY(_gidx) \
+	(BIT_ULL(_gidx))
+
+/*
+ * Message-based commands interface
+ */
+#define NT_MSG_CMD		0
+#define NT_MSG_CMD_WID	        1
+#define NT_MSG_LDATA		2
+#define NT_MSG_HDATA		3
+#define NT_MSG_CNT		4
+
 /**
  * struct ntb_client_ops - ntb client operations
  * @probe:		Notify client of a new device.
@@ -1502,4 +1554,115 @@ static inline int ntb_peer_msg_write(struct ntb_dev *ntb, int pidx, int midx,
 	return ntb->ops->peer_msg_write(ntb, pidx, midx, msg);
 }
 
+/**
+ * nt_spad_cmd_send() - send a command to a peer using scratchpad registers.
+ * @ntb:	NTB device context.
+ * @pidx:	Port index of the peer device.
+ * @cmd:	NTB command to send.
+ * @cmd_gidx:	Global device index.
+ * @data:	Message data.
+ *
+ * Send data to the port-specific scratchpad.
+ *
+ * A predefined number of attempts is performed before giving up.
+ * The data is written to the port-specific scratchpad in order to
+ * prevent a multi-port access race condition. Additionally there is
+ * no need for local locking, since only the thread-safe service work
+ * uses this method.
+ *
+ * The peer doorbell is set to signal that the data is ready.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int nt_spad_cmd_send(struct ntb_dev *ntb, int pidx, enum nt_cmd cmd,
+		     int cmd_gidx, u64 data);
+
+/**
+ * nt_spad_cmd_recv() - receive a command using scratchpad registers.
+ * @ntb:	NTB device context.
+ * @pidx:	Port index of the peer device a message is received from.
+ * @cmd:	Received NTB command.
+ * @cmd_wid:	Global device index.
+ * @data:	Received data.
+ *
+ * Clear bits in the peer doorbell register, arming the bits for the next
+ * doorbell.
+ *
+ * Scanning starts from the beginning every time, since the cleared DB may
+ * have been set by any peer. This means a peer with a smaller index is
+ * serviced with greater priority, but it is convenient for scratchpad and
+ * message code unification and simplicity.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int nt_spad_cmd_recv(struct ntb_dev *ntb, int *pidx,
+		     enum nt_cmd *cmd, int *cmd_wid, u64 *data);
+
+/**
+ * nt_msg_cmd_send() - send a command to a peer using message registers.
+ * @nt:		NTB device context.
+ * @pidx:	Port index of the peer device.
+ * @cmd:	NTB command to send.
+ * @cmd_wid:	Memory window index.
+ * @data:	Message data.
+ *
+ * A predefined number of attempts is performed before giving up. Message
+ * registers are free of race conditions when accessed from different
+ * ports, so there is no need to split the registers by global device
+ * index. There is also no local locking, since the method is only used
+ * from the service work.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int nt_msg_cmd_send(struct ntb_dev *nt, int pidx, enum nt_cmd cmd,
+		    int cmd_wid, u64 data);
+
+/**
+ * nt_msg_cmd_recv() - receive a command using message registers.
+ * @nt:		NTB device context.
+ * @pidx:	Port index of the peer device a message is received from.
+ * @cmd:	Received NTB command.
+ * @cmd_wid:	Memory window index.
+ * @data:	Received data.
+ *
+ * Read back the command, the memory window index and the data.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int nt_msg_cmd_recv(struct ntb_dev *nt, int *pidx,
+		    enum nt_cmd *cmd, int *cmd_wid, u64 *data);
+
+/**
+ * nt_enable_messaging() - enable messaging support.
+ * @ndev:	NTB device context.
+ * @gidx:	Global device index.
+ *
+ * Check which messaging interface is available and enable it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int nt_enable_messaging(struct ntb_dev *ndev, int gidx);
+
+/**
+ * nt_disable_messaging() - disable messaging support.
+ * @ndev:	NTB device context.
+ * @gidx:	Global device index.
+ *
+ * Check the message type (spad/message) and disable messaging support.
+ *
+ */
+void nt_disable_messaging(struct ntb_dev *ndev, int gidx);
+
+/**
+ * nt_init_messaging() - initialize messaging.
+ * @ndev:	NTB device context.
+ * @msg_ptr:	Handle filled with the Scratchpad or Message IO callbacks.
+ *
+ * Select and hook up the Scratchpad- or Message-based IO operations
+ * depending on what the device supports.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+int nt_init_messaging(struct ntb_dev *ndev, struct msg_type *msg_ptr);
+
 #endif
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



* [PATCH v2 2/4] NTB :  Add message library NTB API
  2018-05-06 19:20 [PATCH v2 0/4] NTB : Introduce message library Atul Raut
  2018-05-06 19:20 ` [PATCH v2 1/4] " Atul Raut
@ 2018-05-06 19:20 ` Atul Raut
  2018-05-11 22:44   ` Serge Semin
  2018-05-06 19:20 ` [PATCH v2 3/4] NTB : Modification to ntb_perf module Atul Raut
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 25+ messages in thread
From: Atul Raut @ 2018-05-06 19:20 UTC (permalink / raw)
  To: linux-ntb; +Cc: Atul Raut

This patch brings in the function definitions for
the NTB library API.

Signed-off-by: Atul Raut <araut@codeaurora.org>
---
 drivers/ntb/ntb.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)

diff --git a/drivers/ntb/ntb.c b/drivers/ntb/ntb.c
index 2581ab7..c025e03 100644
--- a/drivers/ntb/ntb.c
+++ b/drivers/ntb/ntb.c
@@ -258,6 +258,228 @@ int ntb_default_peer_port_idx(struct ntb_dev *ntb, int port)
 }
 EXPORT_SYMBOL(ntb_default_peer_port_idx);
 
+int nt_spad_cmd_send(struct ntb_dev *ntb, int pidx, enum nt_cmd cmd,
+			    int cmd_gidx, u64 data)
+{
+	int try;
+	u32 sts;
+	int gidx = ntb_port_number(ntb);
+
+	for (try = 0; try < MSG_TRIES; try++) {
+		if (!ntb_link_is_up(ntb, NULL, NULL))
+			return -ENOLINK;
+
+		sts = ntb_peer_spad_read(ntb, pidx,
+					 NT_SPAD_CMD(gidx));
+		if (sts != NT_CMD_INVAL) {
+			usleep_range(MSG_UDELAY_LOW, MSG_UDELAY_HIGH);
+			continue;
+		}
+
+		ntb_peer_spad_write(ntb, pidx,
+				    NT_SPAD_LDATA(gidx),
+				    lower_32_bits(data));
+		ntb_peer_spad_write(ntb, pidx,
+				    NT_SPAD_HDATA(gidx),
+				    upper_32_bits(data));
+		mmiowb();
+		ntb_peer_spad_write(ntb, pidx,
+				    NT_SPAD_CMD(gidx),
+				    cmd);
+
+		ntb_peer_db_set(ntb, NT_SPAD_NOTIFY(cmd_gidx));
+
+		break;
+	}
+
+	return try < MSG_TRIES ? 0 : -EAGAIN;
+}
+EXPORT_SYMBOL(nt_spad_cmd_send);
+
+int nt_spad_cmd_recv(struct ntb_dev *ntb, int *pidx,
+			enum nt_cmd *cmd, int *cmd_wid, u64 *data)
+{
+	u32 val;
+	int gidx = 0;
+	int key = ntb_port_number(ntb);
+
+	ntb_db_clear(ntb, NT_SPAD_NOTIFY(key));
+
+	for (*pidx = 0; *pidx < ntb_peer_port_count(ntb); (*pidx)++, gidx++) {
+		if ((*pidx) == key)
+			++gidx;
+
+		if (!ntb_link_is_up(ntb, NULL, NULL))
+			continue;
+
+		val = ntb_spad_read(ntb, NT_SPAD_CMD(gidx));
+		if (val == NT_CMD_INVAL)
+			continue;
+
+		*cmd = val;
+
+		val = ntb_spad_read(ntb, NT_SPAD_LDATA(gidx));
+		*data = val;
+
+		val = ntb_spad_read(ntb, NT_SPAD_HDATA(gidx));
+		*data |= (u64)val << 32;
+
+		/* Next command can be retrieved from now */
+		ntb_spad_write(ntb, NT_SPAD_CMD(gidx),
+			NT_CMD_INVAL);
+
+		return 0;
+	}
+
+	return -ENODATA;
+}
+EXPORT_SYMBOL(nt_spad_cmd_recv);
+
+int nt_msg_cmd_send(struct ntb_dev *nt, int pidx, enum nt_cmd cmd,
+		    int cmd_wid, u64 data)
+{
+	int try, ret;
+	u64 outbits;
+
+	outbits = ntb_msg_outbits(nt);
+	for (try = 0; try < MSG_TRIES; try++) {
+		if (!ntb_link_is_up(nt, NULL, NULL))
+			return -ENOLINK;
+
+		ret = ntb_msg_clear_sts(nt, outbits);
+		if (ret)
+			return ret;
+
+		ntb_peer_msg_write(nt, pidx, NT_MSG_LDATA,
+			cpu_to_le32(lower_32_bits(data)));
+
+		if (ntb_msg_read_sts(nt) & outbits) {
+			usleep_range(MSG_UDELAY_LOW, MSG_UDELAY_HIGH);
+			continue;
+		}
+
+		ntb_peer_msg_write(nt, pidx, NT_MSG_HDATA,
+			cpu_to_le32(upper_32_bits(data)));
+		mmiowb();
+
+		ntb_peer_msg_write(nt, pidx, NT_MSG_CMD_WID,
+			cpu_to_le32(cmd_wid));
+
+		/* This call shall trigger peer message event */
+		ntb_peer_msg_write(nt, pidx, NT_MSG_CMD,
+			cpu_to_le32(cmd));
+
+		break;
+	}
+
+	return try < MSG_TRIES ? 0 : -EAGAIN;
+}
+EXPORT_SYMBOL(nt_msg_cmd_send);
+
+int nt_msg_cmd_recv(struct ntb_dev *nt, int *pidx,
+			enum nt_cmd *cmd, int *cmd_wid, u64 *data)
+{
+	u64 inbits;
+	u32 val;
+
+	inbits = ntb_msg_inbits(nt);
+
+	if (hweight64(ntb_msg_read_sts(nt) & inbits) < 4)
+		return -ENODATA;
+
+	val = ntb_msg_read(nt, pidx, NT_MSG_CMD);
+	*cmd = le32_to_cpu(val);
+
+	val = ntb_msg_read(nt, pidx, NT_MSG_CMD_WID);
+	*cmd_wid = le32_to_cpu(val);
+
+	val = ntb_msg_read(nt, pidx, NT_MSG_LDATA);
+	*data = le32_to_cpu(val);
+
+	val = ntb_msg_read(nt, pidx, NT_MSG_HDATA);
+	*data |= (u64)le32_to_cpu(val) << 32;
+
+	/* Next command can be retrieved from now */
+	ntb_msg_clear_sts(nt, inbits);
+
+	return 0;
+}
+EXPORT_SYMBOL(nt_msg_cmd_recv);
+
+int nt_enable_messaging(struct ntb_dev *ndev, int gidx)
+{
+	u64 mask, incmd_bit;
+	int ret, sidx, scnt;
+
+	mask = ntb_db_valid_mask(ndev);
+	(void)ntb_db_set_mask(ndev, mask);
+
+	if (ntb_msg_count(ndev) >= NT_MSG_CNT) {
+		u64 inbits, outbits;
+
+		inbits = ntb_msg_inbits(ndev);
+		outbits = ntb_msg_outbits(ndev);
+		(void)ntb_msg_set_mask(ndev, inbits | outbits);
+
+		incmd_bit = BIT_ULL(__ffs64(inbits));
+		ret = ntb_msg_clear_mask(ndev, incmd_bit);
+	} else {
+		scnt = ntb_spad_count(ndev);
+		for (sidx = 0; sidx < scnt; sidx++)
+			ntb_spad_write(ndev, sidx, NT_CMD_INVAL);
+		incmd_bit = NT_SPAD_NOTIFY(gidx);
+		ret = ntb_db_clear_mask(ndev, incmd_bit);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(nt_enable_messaging);
+
+void nt_disable_messaging(struct ntb_dev *ndev, int gidx)
+{
+	if (ntb_msg_count(ndev) >= NT_MSG_CNT) {
+		u64 inbits;
+
+		inbits = ntb_msg_inbits(ndev);
+		(void)ntb_msg_set_mask(ndev, inbits);
+	} else {
+		(void)ntb_db_set_mask(ndev, NT_SPAD_NOTIFY(gidx));
+	}
+
+}
+EXPORT_SYMBOL(nt_disable_messaging);
+
+int nt_init_messaging(struct ntb_dev *ndev, struct msg_type *msg_ptr)
+{
+	u64 mask;
+	int pcnt = ntb_peer_port_count(ndev);
+
+	if (ntb_peer_mw_count(ndev) < (pcnt + 1)) {
+		dev_err(&ndev->dev, "Not enough memory windows\n");
+		return -EINVAL;
+	}
+
+	if (ntb_msg_count(ndev) >= NT_MSG_CNT) {
+		msg_ptr->cmd_send = nt_msg_cmd_send;
+		msg_ptr->cmd_recv = nt_msg_cmd_recv;
+
+		return 0;
+	}
+
+	mask = GENMASK_ULL(pcnt, 0);
+	if (ntb_spad_count(ndev) >= NT_SPAD_CNT(pcnt) &&
+	    (ntb_db_valid_mask(ndev) & mask) == mask) {
+		msg_ptr->cmd_send = nt_spad_cmd_send;
+		msg_ptr->cmd_recv = nt_spad_cmd_recv;
+
+		return 0;
+	}
+	dev_err(&ndev->dev, "Command services unsupported\n");
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(nt_init_messaging);
+
 static int ntb_probe(struct device *dev)
 {
 	struct ntb_dev *ntb;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



* [PATCH v2 3/4] NTB : Modification to ntb_perf module
  2018-05-06 19:20 [PATCH v2 0/4] NTB : Introduce message library Atul Raut
  2018-05-06 19:20 ` [PATCH v2 1/4] " Atul Raut
  2018-05-06 19:20 ` [PATCH v2 2/4] NTB : Add message library NTB API Atul Raut
@ 2018-05-06 19:20 ` Atul Raut
  2018-05-06 19:20 ` [PATCH v2 4/4] NTB : Add support to message registers based devices Atul Raut
  2018-05-11 22:39 ` [PATCH v2 0/4] NTB : Introduce message library Serge Semin
  4 siblings, 0 replies; 25+ messages in thread
From: Atul Raut @ 2018-05-06 19:20 UTC (permalink / raw)
  To: linux-ntb; +Cc: Atul Raut

Refactor the ntb_perf module to move common code into
the library so that other clients can make use of it.

Signed-off-by: Atul Raut <araut@codeaurora.org>
---
 drivers/ntb/test/ntb_perf.c | 347 +++++---------------------------------------
 1 file changed, 40 insertions(+), 307 deletions(-)

diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
index 2a9d6b0..c65f81e 100644
--- a/drivers/ntb/test/ntb_perf.c
+++ b/drivers/ntb/test/ntb_perf.c
@@ -100,10 +100,6 @@
 #define DMA_TRIES		100
 #define DMA_MDELAY		10
 
-#define MSG_TRIES		500
-#define MSG_UDELAY_LOW		1000
-#define MSG_UDELAY_HIGH		2000
-
 #define PERF_BUF_LEN 1024
 
 static unsigned long max_mw_size;
@@ -127,17 +123,6 @@
  *==============================================================================
  */
 
-enum perf_cmd {
-	PERF_CMD_INVAL = -1,/* invalid spad command */
-	PERF_CMD_SSIZE = 0, /* send out buffer size */
-	PERF_CMD_RSIZE = 1, /* recv in  buffer size */
-	PERF_CMD_SXLAT = 2, /* send in  buffer xlat */
-	PERF_CMD_RXLAT = 3, /* recv out buffer xlat */
-	PERF_CMD_CLEAR = 4, /* clear allocated memory */
-	PERF_STS_DONE  = 5, /* init is done */
-	PERF_STS_LNKUP = 6, /* link up state flag */
-};
-
 struct perf_ctx;
 
 struct perf_peer {
@@ -197,36 +182,11 @@ struct perf_ctx {
 	struct perf_peer *test_peer;
 	struct perf_thread threads[MAX_THREADS_CNT];
 
-	/* Scratchpad/Message IO operations */
-	int (*cmd_send)(struct perf_peer *peer, enum perf_cmd cmd, u64 data);
-	int (*cmd_recv)(struct perf_ctx *perf, int *pidx, enum perf_cmd *cmd,
-			u64 *data);
+	struct msg_type handle;
 
 	struct dentry *dbgfs_dir;
 };
 
-/*
- * Scratchpads-base commands interface
- */
-#define PERF_SPAD_CNT(_pcnt) \
-	(3*((_pcnt) + 1))
-#define PERF_SPAD_CMD(_gidx) \
-	(3*(_gidx))
-#define PERF_SPAD_LDATA(_gidx) \
-	(3*(_gidx) + 1)
-#define PERF_SPAD_HDATA(_gidx) \
-	(3*(_gidx) + 2)
-#define PERF_SPAD_NOTIFY(_gidx) \
-	(BIT_ULL(_gidx))
-
-/*
- * Messages-base commands interface
- */
-#define PERF_MSG_CNT		3
-#define PERF_MSG_CMD		0
-#define PERF_MSG_LDATA		1
-#define PERF_MSG_HDATA		2
-
 /*==============================================================================
  *                           Static data declarations
  *==============================================================================
@@ -251,192 +211,27 @@ static inline bool perf_link_is_up(struct perf_peer *peer)
 	return !!(link & BIT_ULL_MASK(peer->pidx));
 }
 
-static int perf_spad_cmd_send(struct perf_peer *peer, enum perf_cmd cmd,
-			      u64 data)
-{
-	struct perf_ctx *perf = peer->perf;
-	int try;
-	u32 sts;
-
-	dev_dbg(&perf->ntb->dev, "CMD send: %d 0x%llx\n", cmd, data);
-
-	/*
-	 * Perform predefined number of attempts before give up.
-	 * We are sending the data to the port specific scratchpad, so
-	 * to prevent a multi-port access race-condition. Additionally
-	 * there is no need in local locking since only thread-safe
-	 * service work is using this method.
-	 */
-	for (try = 0; try < MSG_TRIES; try++) {
-		if (!perf_link_is_up(peer))
-			return -ENOLINK;
-
-		sts = ntb_peer_spad_read(perf->ntb, peer->pidx,
-					 PERF_SPAD_CMD(perf->gidx));
-		if (sts != PERF_CMD_INVAL) {
-			usleep_range(MSG_UDELAY_LOW, MSG_UDELAY_HIGH);
-			continue;
-		}
-
-		ntb_peer_spad_write(perf->ntb, peer->pidx,
-				    PERF_SPAD_LDATA(perf->gidx),
-				    lower_32_bits(data));
-		ntb_peer_spad_write(perf->ntb, peer->pidx,
-				    PERF_SPAD_HDATA(perf->gidx),
-				    upper_32_bits(data));
-		mmiowb();
-		ntb_peer_spad_write(perf->ntb, peer->pidx,
-				    PERF_SPAD_CMD(perf->gidx),
-				    cmd);
-		mmiowb();
-		ntb_peer_db_set(perf->ntb, PERF_SPAD_NOTIFY(peer->gidx));
-
-		dev_dbg(&perf->ntb->dev, "DB ring peer %#llx\n",
-			PERF_SPAD_NOTIFY(peer->gidx));
-
-		break;
-	}
-
-	return try < MSG_TRIES ? 0 : -EAGAIN;
-}
-
-static int perf_spad_cmd_recv(struct perf_ctx *perf, int *pidx,
-			      enum perf_cmd *cmd, u64 *data)
-{
-	struct perf_peer *peer;
-	u32 val;
-
-	ntb_db_clear(perf->ntb, PERF_SPAD_NOTIFY(perf->gidx));
-
-	/*
-	 * We start scanning all over, since cleared DB may have been set
-	 * by any peer. Yes, it makes peer with smaller index being
-	 * serviced with greater priority, but it's convenient for spad
-	 * and message code unification and simplicity.
-	 */
-	for (*pidx = 0; *pidx < perf->pcnt; (*pidx)++) {
-		peer = &perf->peers[*pidx];
-
-		if (!perf_link_is_up(peer))
-			continue;
-
-		val = ntb_spad_read(perf->ntb, PERF_SPAD_CMD(peer->gidx));
-		if (val == PERF_CMD_INVAL)
-			continue;
-
-		*cmd = val;
-
-		val = ntb_spad_read(perf->ntb, PERF_SPAD_LDATA(peer->gidx));
-		*data = val;
-
-		val = ntb_spad_read(perf->ntb, PERF_SPAD_HDATA(peer->gidx));
-		*data |= (u64)val << 32;
-
-		/* Next command can be retrieved from now */
-		ntb_spad_write(perf->ntb, PERF_SPAD_CMD(peer->gidx),
-			       PERF_CMD_INVAL);
-
-		dev_dbg(&perf->ntb->dev, "CMD recv: %d 0x%llx\n", *cmd, *data);
-
-		return 0;
-	}
-
-	return -ENODATA;
-}
-
-static int perf_msg_cmd_send(struct perf_peer *peer, enum perf_cmd cmd,
-			     u64 data)
-{
-	struct perf_ctx *perf = peer->perf;
-	int try, ret;
-	u64 outbits;
-
-	dev_dbg(&perf->ntb->dev, "CMD send: %d 0x%llx\n", cmd, data);
-
-	/*
-	 * Perform predefined number of attempts before give up. Message
-	 * registers are free of race-condition problem when accessed
-	 * from different ports, so we don't need splitting registers
-	 * by global device index. We also won't have local locking,
-	 * since the method is used from service work only.
-	 */
-	outbits = ntb_msg_outbits(perf->ntb);
-	for (try = 0; try < MSG_TRIES; try++) {
-		if (!perf_link_is_up(peer))
-			return -ENOLINK;
-
-		ret = ntb_msg_clear_sts(perf->ntb, outbits);
-		if (ret)
-			return ret;
-
-		ntb_peer_msg_write(perf->ntb, peer->pidx, PERF_MSG_LDATA,
-				   lower_32_bits(data));
-
-		if (ntb_msg_read_sts(perf->ntb) & outbits) {
-			usleep_range(MSG_UDELAY_LOW, MSG_UDELAY_HIGH);
-			continue;
-		}
-
-		ntb_peer_msg_write(perf->ntb, peer->pidx, PERF_MSG_HDATA,
-				   upper_32_bits(data));
-		mmiowb();
-
-		/* This call shall trigger peer message event */
-		ntb_peer_msg_write(perf->ntb, peer->pidx, PERF_MSG_CMD, cmd);
-
-		break;
-	}
-
-	return try < MSG_TRIES ? 0 : -EAGAIN;
-}
-
-static int perf_msg_cmd_recv(struct perf_ctx *perf, int *pidx,
-			     enum perf_cmd *cmd, u64 *data)
-{
-	u64 inbits;
-	u32 val;
-
-	inbits = ntb_msg_inbits(perf->ntb);
-
-	if (hweight64(ntb_msg_read_sts(perf->ntb) & inbits) < 3)
-		return -ENODATA;
-
-	val = ntb_msg_read(perf->ntb, pidx, PERF_MSG_CMD);
-	*cmd = val;
-
-	val = ntb_msg_read(perf->ntb, pidx, PERF_MSG_LDATA);
-	*data = val;
-
-	val = ntb_msg_read(perf->ntb, pidx, PERF_MSG_HDATA);
-	*data |= (u64)val << 32;
-
-	/* Next command can be retrieved from now */
-	ntb_msg_clear_sts(perf->ntb, inbits);
-
-	dev_dbg(&perf->ntb->dev, "CMD recv: %d 0x%llx\n", *cmd, *data);
-
-	return 0;
-}
-
-static int perf_cmd_send(struct perf_peer *peer, enum perf_cmd cmd, u64 data)
+static int perf_cmd_send(struct perf_peer *peer, enum nt_cmd cmd,
+		int cmd_wid, u64 data)
 {
 	struct perf_ctx *perf = peer->perf;
 
-	if (cmd == PERF_CMD_SSIZE || cmd == PERF_CMD_SXLAT)
-		return perf->cmd_send(peer, cmd, data);
+	if (cmd == NT_CMD_SSIZE || cmd == NT_CMD_SXLAT)
+		return perf->handle.cmd_send(perf->ntb, peer->pidx,
+			cmd, cmd_wid, data);
 
 	dev_err(&perf->ntb->dev, "Send invalid command\n");
 	return -EINVAL;
 }
 
-static int perf_cmd_exec(struct perf_peer *peer, enum perf_cmd cmd)
+static int perf_cmd_exec(struct perf_peer *peer, enum nt_cmd cmd)
 {
 	switch (cmd) {
-	case PERF_CMD_SSIZE:
-	case PERF_CMD_RSIZE:
-	case PERF_CMD_SXLAT:
-	case PERF_CMD_RXLAT:
-	case PERF_CMD_CLEAR:
+	case NT_CMD_SSIZE:
+	case NT_CMD_RSIZE:
+	case NT_CMD_SXLAT:
+	case NT_CMD_RXLAT:
+	case NT_CMD_CLEAR:
 		break;
 	default:
 		dev_err(&peer->perf->ntb->dev, "Exec invalid command\n");
@@ -456,19 +251,20 @@ static int perf_cmd_exec(struct perf_peer *peer, enum perf_cmd cmd)
 static int perf_cmd_recv(struct perf_ctx *perf)
 {
 	struct perf_peer *peer;
-	int ret, pidx, cmd;
+	int ret, pidx, cmd, cmd_wid;
 	u64 data;
 
-	while (!(ret = perf->cmd_recv(perf, &pidx, &cmd, &data))) {
+	while (!(ret = perf->handle.cmd_recv(perf->ntb, &pidx, &cmd,
+			&cmd_wid, &data))) {
 		peer = &perf->peers[pidx];
 
 		switch (cmd) {
-		case PERF_CMD_SSIZE:
+		case NT_CMD_SSIZE:
 			peer->inbuf_size = data;
-			return perf_cmd_exec(peer, PERF_CMD_RSIZE);
-		case PERF_CMD_SXLAT:
+			return perf_cmd_exec(peer, NT_CMD_RSIZE);
+		case NT_CMD_SXLAT:
 			peer->outbuf_xlat = data;
-			return perf_cmd_exec(peer, PERF_CMD_RXLAT);
+			return perf_cmd_exec(peer, NT_CMD_RXLAT);
 		default:
 			dev_err(&perf->ntb->dev, "Recv invalid command\n");
 			return -EINVAL;
@@ -492,11 +288,11 @@ static void perf_link_event(void *ctx)
 		lnk_up = perf_link_is_up(peer);
 
 		if (lnk_up &&
-		    !test_and_set_bit(PERF_STS_LNKUP, &peer->sts)) {
-			perf_cmd_exec(peer, PERF_CMD_SSIZE);
+		    !test_and_set_bit(NT_STS_LNKUP, &peer->sts)) {
+			perf_cmd_exec(peer, NT_CMD_SSIZE);
 		} else if (!lnk_up &&
-			   test_and_clear_bit(PERF_STS_LNKUP, &peer->sts)) {
-			perf_cmd_exec(peer, PERF_CMD_CLEAR);
+			   test_and_clear_bit(NT_STS_LNKUP, &peer->sts)) {
+			perf_cmd_exec(peer, NT_CMD_CLEAR);
 		}
 	}
 }
@@ -548,7 +344,7 @@ static int perf_setup_outbuf(struct perf_peer *peer)
 	}
 
 	/* Initialization is finally done */
-	set_bit(PERF_STS_DONE, &peer->sts);
+	set_bit(NT_STS_DONE, &peer->sts);
 
 	return 0;
 }
@@ -612,7 +408,7 @@ static int perf_setup_inbuf(struct perf_peer *peer)
 	 * the code architecture, even though this method is called from service
 	 * work itself so the command will be executed right after it returns.
 	 */
-	(void)perf_cmd_exec(peer, PERF_CMD_SXLAT);
+	(void)perf_cmd_exec(peer, NT_CMD_SXLAT);
 
 	return 0;
 
@@ -626,20 +422,21 @@ static void perf_service_work(struct work_struct *work)
 {
 	struct perf_peer *peer = to_peer_service(work);
 
-	if (test_and_clear_bit(PERF_CMD_SSIZE, &peer->sts))
-		perf_cmd_send(peer, PERF_CMD_SSIZE, peer->outbuf_size);
+	if (test_and_clear_bit(NT_CMD_SSIZE, &peer->sts))
+		perf_cmd_send(peer, NT_CMD_SSIZE, peer->gidx,
+			peer->outbuf_size);
 
-	if (test_and_clear_bit(PERF_CMD_RSIZE, &peer->sts))
+	if (test_and_clear_bit(NT_CMD_RSIZE, &peer->sts))
 		perf_setup_inbuf(peer);
 
-	if (test_and_clear_bit(PERF_CMD_SXLAT, &peer->sts))
-		perf_cmd_send(peer, PERF_CMD_SXLAT, peer->inbuf_xlat);
+	if (test_and_clear_bit(NT_CMD_SXLAT, &peer->sts))
+		perf_cmd_send(peer, NT_CMD_SXLAT, peer->gidx, peer->inbuf_xlat);
 
-	if (test_and_clear_bit(PERF_CMD_RXLAT, &peer->sts))
+	if (test_and_clear_bit(NT_CMD_RXLAT, &peer->sts))
 		perf_setup_outbuf(peer);
 
-	if (test_and_clear_bit(PERF_CMD_CLEAR, &peer->sts)) {
-		clear_bit(PERF_STS_DONE, &peer->sts);
+	if (test_and_clear_bit(NT_CMD_CLEAR, &peer->sts)) {
+		clear_bit(NT_STS_DONE, &peer->sts);
 		if (test_bit(0, &peer->perf->busy_flag) &&
 		    peer == peer->perf->test_peer) {
 			dev_warn(&peer->perf->ntb->dev,
@@ -651,44 +448,6 @@ static void perf_service_work(struct work_struct *work)
 	}
 }
 
-static int perf_init_service(struct perf_ctx *perf)
-{
-	u64 mask;
-
-	if (ntb_peer_mw_count(perf->ntb) < perf->pcnt + 1) {
-		dev_err(&perf->ntb->dev, "Not enough memory windows\n");
-		return -EINVAL;
-	}
-
-	if (ntb_msg_count(perf->ntb) >= PERF_MSG_CNT) {
-		perf->cmd_send = perf_msg_cmd_send;
-		perf->cmd_recv = perf_msg_cmd_recv;
-
-		dev_dbg(&perf->ntb->dev, "Message service initialized\n");
-
-		return 0;
-	}
-
-	dev_dbg(&perf->ntb->dev, "Message service unsupported\n");
-
-	mask = GENMASK_ULL(perf->pcnt, 0);
-	if (ntb_spad_count(perf->ntb) >= PERF_SPAD_CNT(perf->pcnt) &&
-	    (ntb_db_valid_mask(perf->ntb) & mask) == mask) {
-		perf->cmd_send = perf_spad_cmd_send;
-		perf->cmd_recv = perf_spad_cmd_recv;
-
-		dev_dbg(&perf->ntb->dev, "Scratchpad service initialized\n");
-
-		return 0;
-	}
-
-	dev_dbg(&perf->ntb->dev, "Scratchpad service unsupported\n");
-
-	dev_err(&perf->ntb->dev, "Command services unsupported\n");
-
-	return -EINVAL;
-}
-
 static int perf_enable_service(struct perf_ctx *perf)
 {
 	u64 mask, incmd_bit;
@@ -701,26 +460,7 @@ static int perf_enable_service(struct perf_ctx *perf)
 	if (ret)
 		return ret;
 
-	if (perf->cmd_send == perf_msg_cmd_send) {
-		u64 inbits, outbits;
-
-		inbits = ntb_msg_inbits(perf->ntb);
-		outbits = ntb_msg_outbits(perf->ntb);
-		(void)ntb_msg_set_mask(perf->ntb, inbits | outbits);
-
-		incmd_bit = BIT_ULL(__ffs64(inbits));
-		ret = ntb_msg_clear_mask(perf->ntb, incmd_bit);
-
-		dev_dbg(&perf->ntb->dev, "MSG sts unmasked %#llx\n", incmd_bit);
-	} else {
-		scnt = ntb_spad_count(perf->ntb);
-		for (sidx = 0; sidx < scnt; sidx++)
-			ntb_spad_write(perf->ntb, sidx, PERF_CMD_INVAL);
-		incmd_bit = PERF_SPAD_NOTIFY(perf->gidx);
-		ret = ntb_db_clear_mask(perf->ntb, incmd_bit);
-
-		dev_dbg(&perf->ntb->dev, "DB bits unmasked %#llx\n", incmd_bit);
-	}
+	ret = nt_enable_messaging(perf->ntb, perf->gidx);
 	if (ret) {
 		ntb_clear_ctx(perf->ntb);
 		return ret;
@@ -739,19 +479,12 @@ static void perf_disable_service(struct perf_ctx *perf)
 
 	ntb_link_disable(perf->ntb);
 
-	if (perf->cmd_send == perf_msg_cmd_send) {
-		u64 inbits;
-
-		inbits = ntb_msg_inbits(perf->ntb);
-		(void)ntb_msg_set_mask(perf->ntb, inbits);
-	} else {
-		(void)ntb_db_set_mask(perf->ntb, PERF_SPAD_NOTIFY(perf->gidx));
-	}
+	nt_disable_messaging(perf->ntb, perf->gidx);
 
 	ntb_clear_ctx(perf->ntb);
 
 	for (pidx = 0; pidx < perf->pcnt; pidx++)
-		perf_cmd_exec(&perf->peers[pidx], PERF_CMD_CLEAR);
+		perf_cmd_exec(&perf->peers[pidx], NT_CMD_CLEAR);
 
 	for (pidx = 0; pidx < perf->pcnt; pidx++)
 		flush_work(&perf->peers[pidx].service);
@@ -1046,7 +779,7 @@ static int perf_submit_test(struct perf_peer *peer)
 	struct perf_thread *pthr;
 	int tidx, ret;
 
-	if (!test_bit(PERF_STS_DONE, &peer->sts))
+	if (!test_bit(NT_STS_DONE, &peer->sts))
 		return -ENOLINK;
 
 	if (test_and_set_bit_lock(0, &perf->busy_flag))
@@ -1184,7 +917,7 @@ static ssize_t perf_dbgfs_read_info(struct file *filep, char __user *ubuf,
 
 		pos += scnprintf(buf + pos, buf_size - pos,
 			"\tLink status: %s\n",
-			test_bit(PERF_STS_LNKUP, &peer->sts) ? "up" : "down");
+			test_bit(NT_STS_LNKUP, &peer->sts) ? "up" : "down");
 
 		pos += scnprintf(buf + pos, buf_size - pos,
 			"\tOut buffer addr 0x%pK\n", peer->outbuf);
@@ -1443,7 +1176,7 @@ static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
 
 	perf_init_threads(perf);
 
-	ret = perf_init_service(perf);
+	ret = nt_init_messaging(ntb, &perf->handle);
 	if (ret)
 		return ret;
 
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



* [PATCH v2 4/4] NTB : Add support to message registers based devices
  2018-05-06 19:20 [PATCH v2 0/4] NTB : Introduce message library Atul Raut
                   ` (2 preceding siblings ...)
  2018-05-06 19:20 ` [PATCH v2 3/4] NTB : Modification to ntb_perf module Atul Raut
@ 2018-05-06 19:20 ` Atul Raut
  2018-05-11 22:39 ` [PATCH v2 0/4] NTB : Introduce message library Serge Semin
  4 siblings, 0 replies; 25+ messages in thread
From: Atul Raut @ 2018-05-06 19:20 UTC (permalink / raw)
  To: linux-ntb; +Cc: Atul Raut

The ntb_transport driver works only with scratchpad-based devices.
This patch adds support for devices which use message registers
for data exchange.

Signed-off-by: Atul Raut <araut@codeaurora.org>
---
 drivers/ntb/ntb_transport.c | 357 ++++++++++++++++++++++++++++++++------------
 1 file changed, 260 insertions(+), 97 deletions(-)

diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index 9878c48..b8dcd29 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -47,8 +47,8 @@
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
  */
+
 #include <linux/debugfs.h>
-#include <linux/delay.h>
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/errno.h>
@@ -189,6 +189,7 @@ struct ntb_transport_qp {
 };
 
 struct ntb_transport_mw {
+	u64 outbuf_xlat;
 	phys_addr_t phys_addr;
 	resource_size_t phys_size;
 	void __iomem *vbase;
@@ -222,6 +223,16 @@ struct ntb_transport_ctx {
 	struct work_struct link_cleanup;
 
 	struct dentry *debugfs_node_dir;
+	struct msg_type handle;
+
+	unsigned int peer_mw_count;
+	unsigned int peer_qp_count;
+	unsigned int peer_qp_links;
+	u32 peer_ntb_version;
+
+	/* NTB connection setup service */
+	struct work_struct	service;
+	unsigned long	sts;
 };
 
 enum {
@@ -254,6 +265,9 @@ enum {
 #define NTB_QP_DEF_NUM_ENTRIES	100
 #define NTB_LINK_DOWN_TIMEOUT	10
 
+#define to_ntb_transport_service(__work) \
+	container_of(__work, struct ntb_transport_ctx, service)
+
 static void ntb_transport_rxc_db(unsigned long data);
 static const struct ntb_ctx_ops ntb_transport_ops;
 static struct ntb_client ntb_transport_client;
@@ -263,7 +277,6 @@ static int ntb_async_tx_submit(struct ntb_transport_qp *qp,
 static int ntb_async_rx_submit(struct ntb_queue_entry *entry, void *offset);
 static void ntb_memcpy_rx(struct ntb_queue_entry *entry, void *offset);
 
-
 static int ntb_transport_bus_match(struct device *dev,
 				   struct device_driver *drv)
 {
@@ -679,19 +692,50 @@ static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw)
 	mw->virt_addr = NULL;
 }
 
-static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw,
-		      resource_size_t size)
+static int ntb_transport_cmd_exec(struct ntb_transport_ctx *nt, enum nt_cmd cmd)
+{
+	struct pci_dev *pdev = nt->ndev->pdev;
+
+	switch (cmd) {
+	case NT_CMD_SSIZE:
+	case NT_CMD_RSIZE:
+	case NT_CMD_SXLAT:
+	case NT_CMD_RXLAT:
+	case NT_CMD_CLEAR:
+	case NT_CMD_NUM_MWS:
+	case NT_CMD_NUM_QPS:
+	case NT_CMD_NTB_VERSION:
+		break;
+	default:
+		dev_err(&pdev->dev, "Exec invalid command\n");
+		return -EINVAL;
+	}
+
+	/* No need for a memory barrier, since bit ops have an internal lock */
+	set_bit(cmd, &nt->sts);
+
+	dev_dbg(&pdev->dev, "CMD exec: %d\n", cmd);
+
+	(void)queue_work(system_highpri_wq, &nt->service);
+
+	return 0;
+}
+
+static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw)
 {
 	struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
 	struct pci_dev *pdev = nt->ndev->pdev;
 	size_t xlat_size, buff_size;
 	resource_size_t xlat_align;
 	resource_size_t xlat_align_size;
+	resource_size_t size;
 	int rc;
 
+	size = mw->buff_size;
 	if (!size)
 		return -EINVAL;
 
+	/* Get inbound MW parameters */
 	rc = ntb_mw_get_align(nt->ndev, PIDX, num_mw, &xlat_align,
 			      &xlat_align_size, NULL);
 	if (rc)
@@ -743,9 +787,72 @@ static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw,
 		return -EIO;
 	}
 
+	if (num_mw == (nt->mw_count - 1)) {
+		set_bit(NT_STS_DONE, &nt->sts);
+		dev_dbg(&pdev->dev, "NT_STS_DONE sts = %lu\n", nt->sts);
+		(void)ntb_transport_cmd_exec(nt, NT_CMD_SXLAT);
+	}
+
 	return 0;
 }
 
+static int ntb_transport_cmd_send(struct ntb_transport_ctx *nt, enum nt_cmd cmd,
+		int cmd_wid, u64 data)
+{
+	struct pci_dev *pdev = nt->ndev->pdev;
+	struct ntb_dev *ndev = nt->ndev;
+
+	if (cmd == NT_CMD_SSIZE || cmd == NT_CMD_SXLAT || cmd == NT_CMD_NUM_MWS
+		|| cmd == NT_CMD_NUM_QPS || cmd == NT_CMD_NTB_VERSION
+		|| cmd == NT_QP_LINKS)
+		return nt->handle.cmd_send(ndev, PIDX, cmd, cmd_wid, data);
+
+	dev_err(&pdev->dev, "Send invalid command\n");
+	return -EINVAL;
+}
+
+static int ntb_transport_cmd_recv(struct ntb_transport_ctx *nt)
+{
+	struct pci_dev *pdev = nt->ndev->pdev;
+	struct ntb_dev *ndev = nt->ndev;
+	int ret, pidx, cmd, cmd_wid;
+	u64 data;
+
+	while (!(ret = nt->handle.cmd_recv(ndev, &pidx, &cmd, &cmd_wid,
+			&data))) {
+		switch (cmd) {
+		case NT_CMD_SSIZE:
+			nt->mw_vec[cmd_wid].buff_size = data;
+			return ntb_transport_cmd_exec(nt, NT_CMD_RSIZE);
+		case NT_CMD_SXLAT:
+			nt->mw_vec[cmd_wid].outbuf_xlat = data;
+			if (cmd_wid == (nt->mw_count - 1))
+				return ntb_transport_cmd_exec(nt, NT_CMD_RXLAT);
+			break;
+		case NT_CMD_NUM_MWS:
+			nt->peer_mw_count = data;
+			break;
+		case NT_CMD_NUM_QPS:
+			nt->peer_qp_count = data;
+			break;
+		case NT_CMD_NTB_VERSION:
+			if (data == NTB_TRANSPORT_VERSION)
+				nt->peer_ntb_version  = data;
+			break;
+		case NT_QP_LINKS:
+			nt->peer_qp_links = data;
+			break;
+		default:
+			dev_dbg(&pdev->dev, "[%s] Recv invalid command cmd-> %d\n",
+				__func__, cmd);
+			return -EINVAL;
+		}
+	}
+
+	/* Return 0 if no data left to process, otherwise an error */
+	return ret == -ENODATA ? 0 : ret;
+}
+
 static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
 {
 	qp->link_is_up = false;
@@ -839,6 +946,94 @@ static void ntb_transport_link_cleanup_work(struct work_struct *work)
 	ntb_transport_link_cleanup(nt);
 }
 
+static int ntb_transport_setup_outbuf(struct ntb_transport_ctx *nt, int num_mw)
+{
+	struct ntb_dev *ndev = nt->ndev;
+	int ret;
+
+	/* Outbuf size can be unaligned due to custom max_mw_size */
+	ret = ntb_peer_mw_set_trans(nt->ndev, PIDX, num_mw,
+		nt->mw_vec[num_mw].outbuf_xlat, nt->mw_vec[num_mw].phys_size);
+	if (ret) {
+		dev_err(&ndev->dev, "Failed to set outbuf translation\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void ntb_qp_link_work(struct work_struct *work)
+{
+	struct ntb_transport_qp *qp = container_of(work,
+						   struct ntb_transport_qp,
+						   link_work.work);
+	struct pci_dev *pdev = qp->ndev->pdev;
+	struct ntb_transport_ctx *nt = qp->transport;
+	u64 qp_bitmap_alloc;
+	int val = -1;
+
+	WARN_ON(!nt->link_is_up);
+
+	qp_bitmap_alloc = (nt->qp_bitmap & ~nt->qp_bitmap_free);
+	ntb_transport_cmd_send(nt, NT_QP_LINKS, 0, qp_bitmap_alloc);
+	if (nt->peer_qp_links)
+		val = nt->peer_qp_links;
+
+	/* See if the remote side is up */
+	if (val & BIT(qp->qp_num)) {
+		dev_info(&pdev->dev, "qp %d: Link Up\n", qp->qp_num);
+		qp->link_is_up = true;
+		qp->active = true;
+
+		if (qp->event_handler)
+			qp->event_handler(qp->cb_data, qp->link_is_up);
+
+		if (qp->active)
+			tasklet_schedule(&qp->rxc_db_work);
+	} else if (nt->link_is_up)
+		schedule_delayed_work(&qp->link_work,
+				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
+}
+
+static void ntb_transport_service_work(struct work_struct *work)
+{
+	struct ntb_transport_ctx *nt = to_ntb_transport_service(work);
+	resource_size_t size;
+	int i;
+
+	if (test_and_clear_bit(NT_CMD_SSIZE, &nt->sts)) {
+		for (i = 0; i < nt->mw_count; i++) {
+			size = nt->mw_vec[i].phys_size;
+			if (max_mw_size && size > max_mw_size)
+				size = max_mw_size;
+			ntb_transport_cmd_send(nt, NT_CMD_SSIZE, i, size);
+		}
+	}
+
+	if (test_and_clear_bit(NT_CMD_RSIZE, &nt->sts))
+		for (i = 0; i < nt->mw_count; i++)
+			ntb_set_mw(nt, i);
+
+	if (test_and_clear_bit(NT_CMD_SXLAT, &nt->sts))
+		for (i = 0; i < nt->mw_count; i++)
+			ntb_transport_cmd_send(nt, NT_CMD_SXLAT, i,
+				nt->mw_vec[i].dma_addr);
+
+	if (test_and_clear_bit(NT_CMD_RXLAT, &nt->sts))
+		for (i = 0; i < nt->mw_count; i++)
+			ntb_transport_setup_outbuf(nt, i);
+
+	if (test_and_clear_bit(NT_CMD_NUM_MWS, &nt->sts))
+		ntb_transport_cmd_send(nt, NT_CMD_NUM_MWS, 0, nt->mw_count);
+
+	if (test_and_clear_bit(NT_CMD_NUM_QPS, &nt->sts))
+		ntb_transport_cmd_send(nt, NT_CMD_NUM_QPS, 0,  nt->qp_count);
+
+	if (test_and_clear_bit(NT_CMD_NTB_VERSION, &nt->sts))
+		ntb_transport_cmd_send(nt, NT_CMD_NTB_VERSION, 0,
+			NTB_TRANSPORT_VERSION);
+}
+
 static void ntb_transport_event_callback(void *data)
 {
 	struct ntb_transport_ctx *nt = data;
@@ -855,72 +1050,43 @@ static void ntb_transport_link_work(struct work_struct *work)
 		container_of(work, struct ntb_transport_ctx, link_work.work);
 	struct ntb_dev *ndev = nt->ndev;
 	struct pci_dev *pdev = ndev->pdev;
-	resource_size_t size;
-	u32 val;
-	int rc = 0, i, spad;
+	int rc = 0, i;
 
 	/* send the local info, in the opposite order of the way we read it */
-	for (i = 0; i < nt->mw_count; i++) {
-		size = nt->mw_vec[i].phys_size;
-
-		if (max_mw_size && size > max_mw_size)
-			size = max_mw_size;
-
-		spad = MW0_SZ_HIGH + (i * 2);
-		ntb_peer_spad_write(ndev, PIDX, spad, upper_32_bits(size));
-
-		spad = MW0_SZ_LOW + (i * 2);
-		ntb_peer_spad_write(ndev, PIDX, spad, lower_32_bits(size));
-	}
-
-	ntb_peer_spad_write(ndev, PIDX, NUM_MWS, nt->mw_count);
-
-	ntb_peer_spad_write(ndev, PIDX, NUM_QPS, nt->qp_count);
-
-	ntb_peer_spad_write(ndev, PIDX, VERSION, NTB_TRANSPORT_VERSION);
+	ntb_transport_cmd_exec(nt, NT_CMD_SSIZE);
+	ntb_transport_cmd_exec(nt, NT_CMD_NUM_MWS);
+	ntb_transport_cmd_exec(nt, NT_CMD_NUM_QPS);
+	ntb_transport_cmd_exec(nt, NT_CMD_NTB_VERSION);
 
 	/* Query the remote side for its info */
-	val = ntb_spad_read(ndev, VERSION);
-	dev_dbg(&pdev->dev, "Remote version = %d\n", val);
-	if (val != NTB_TRANSPORT_VERSION)
+	dev_dbg(&pdev->dev, "Remote version = %d\n", nt->peer_ntb_version);
+	if (nt->peer_ntb_version != NTB_TRANSPORT_VERSION)
 		goto out;
 
-	val = ntb_spad_read(ndev, NUM_QPS);
-	dev_dbg(&pdev->dev, "Remote max number of qps = %d\n", val);
-	if (val != nt->qp_count)
+	dev_dbg(&pdev->dev, "Remote max number of qps = %d\n",
+		nt->peer_qp_count);
+	if (nt->peer_qp_count != nt->qp_count)
 		goto out;
 
-	val = ntb_spad_read(ndev, NUM_MWS);
-	dev_dbg(&pdev->dev, "Remote number of mws = %d\n", val);
-	if (val != nt->mw_count)
+	dev_dbg(&pdev->dev, "Remote number of mws = %d\n", nt->peer_mw_count);
+	if (nt->peer_mw_count != nt->mw_count)
 		goto out;
 
-	for (i = 0; i < nt->mw_count; i++) {
-		u64 val64;
+	if (test_and_clear_bit(NT_STS_DONE, &nt->sts)) {
+		nt->link_is_up = true;
 
-		val = ntb_spad_read(ndev, MW0_SZ_HIGH + (i * 2));
-		val64 = (u64)val << 32;
+		for (i = 0; i < nt->qp_count; i++) {
+			struct ntb_transport_qp *qp = &nt->qp_vec[i];
 
-		val = ntb_spad_read(ndev, MW0_SZ_LOW + (i * 2));
-		val64 |= val;
-
-		dev_dbg(&pdev->dev, "Remote MW%d size = %#llx\n", i, val64);
-
-		rc = ntb_set_mw(nt, i, val64);
+		rc = ntb_transport_setup_qp_mw(nt, i);
 		if (rc)
 			goto out1;
-	}
-
-	nt->link_is_up = true;
-
-	for (i = 0; i < nt->qp_count; i++) {
-		struct ntb_transport_qp *qp = &nt->qp_vec[i];
-
-		ntb_transport_setup_qp_mw(nt, i);
 
 		if (qp->client_ready)
 			schedule_delayed_work(&qp->link_work, 0);
-	}
+		}
+	} else
+		goto out;
 
 	return;
 
@@ -938,40 +1104,6 @@ static void ntb_transport_link_work(struct work_struct *work)
 				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
 }
 
-static void ntb_qp_link_work(struct work_struct *work)
-{
-	struct ntb_transport_qp *qp = container_of(work,
-						   struct ntb_transport_qp,
-						   link_work.work);
-	struct pci_dev *pdev = qp->ndev->pdev;
-	struct ntb_transport_ctx *nt = qp->transport;
-	int val;
-
-	WARN_ON(!nt->link_is_up);
-
-	val = ntb_spad_read(nt->ndev, QP_LINKS);
-
-	ntb_peer_spad_write(nt->ndev, PIDX, QP_LINKS, val | BIT(qp->qp_num));
-
-	/* query remote spad for qp ready bits */
-	dev_dbg_ratelimited(&pdev->dev, "Remote QP link status = %x\n", val);
-
-	/* See if the remote side is up */
-	if (val & BIT(qp->qp_num)) {
-		dev_info(&pdev->dev, "qp %d: Link Up\n", qp->qp_num);
-		qp->link_is_up = true;
-		qp->active = true;
-
-		if (qp->event_handler)
-			qp->event_handler(qp->cb_data, qp->link_is_up);
-
-		if (qp->active)
-			tasklet_schedule(&qp->rxc_db_work);
-	} else if (nt->link_is_up)
-		schedule_delayed_work(&qp->link_work,
-				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
-}
-
 static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
 				    unsigned int qp_num)
 {
@@ -1060,14 +1192,14 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 {
 	struct ntb_transport_ctx *nt;
 	struct ntb_transport_mw *mw;
-	unsigned int mw_count, qp_count, spad_count, max_mw_count_for_spads;
+	unsigned int mw_count, qp_count, msg_count, max_mw_count_for_spads;
 	u64 qp_bitmap;
 	int node;
 	int rc, i;
 
 	mw_count = ntb_peer_mw_count(ndev);
 
-	if (!ndev->ops->mw_set_trans) {
+	if (!ndev->ops->mw_set_trans && !ndev->ops->peer_mw_set_trans) {
 		dev_err(&ndev->dev, "Inbound MW based NTB API is required\n");
 		return -EINVAL;
 	}
@@ -1089,18 +1221,25 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 		return -ENOMEM;
 
 	nt->ndev = ndev;
-	spad_count = ntb_spad_count(ndev);
+	if (ntb_msg_count(ndev) >= NT_MSG_CNT)
+		msg_count = ntb_msg_count(ndev);
+	else
+		msg_count = ntb_spad_count(ndev);
 
 	/* Limit the MW's based on the availability of scratchpads */
 
-	if (spad_count < NTB_TRANSPORT_MIN_SPADS) {
+	if (msg_count < NTB_TRANSPORT_MIN_SPADS && msg_count < NT_MSG_CNT) {
 		nt->mw_count = 0;
 		rc = -EINVAL;
 		goto err;
 	}
 
-	max_mw_count_for_spads = (spad_count - MW0_SZ_HIGH) / 2;
-	nt->mw_count = min(mw_count, max_mw_count_for_spads);
+	if (ntb_msg_count(ndev)) {
+		nt->mw_count = msg_count;
+	} else {
+		max_mw_count_for_spads = (msg_count - MW0_SZ_HIGH) / 2;
+		nt->mw_count = min(mw_count, max_mw_count_for_spads);
+	}
 
 	nt->mw_vec = kzalloc_node(mw_count * sizeof(*nt->mw_vec),
 				  GFP_KERNEL, node);
@@ -1128,6 +1267,7 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 		mw->virt_addr = NULL;
 		mw->dma_addr = 0;
 	}
+	INIT_WORK(&nt->service, ntb_transport_service_work);
 
 	qp_bitmap = ntb_db_valid_mask(ndev);
 
@@ -1142,6 +1282,7 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 	nt->qp_count = qp_count;
 	nt->qp_bitmap = qp_bitmap;
 	nt->qp_bitmap_free = qp_bitmap;
+	nt->peer_qp_links = -1;
 
 	nt->qp_vec = kzalloc_node(qp_count * sizeof(*nt->qp_vec),
 				  GFP_KERNEL, node);
@@ -1169,6 +1310,15 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 	if (rc)
 		goto err2;
 
+	/* Enable Messaging */
+	rc = nt_init_messaging(ndev, &nt->handle);
+	if (rc)
+		goto err2;
+
+	rc = nt_enable_messaging(ndev, ntb_port_number(ndev));
+	if (rc)
+		goto err2;
+
 	INIT_LIST_HEAD(&nt->client_devs);
 	rc = ntb_bus_init(nt);
 	if (rc)
@@ -1217,6 +1367,7 @@ static void ntb_transport_free(struct ntb_client *self, struct ntb_dev *ndev)
 	}
 
 	ntb_link_disable(ndev);
+	nt_disable_messaging(ndev, ntb_port_number(ndev));
 	ntb_clear_ctx(ndev);
 
 	ntb_bus_remove(nt);
@@ -2100,16 +2251,16 @@ void ntb_transport_link_up(struct ntb_transport_qp *qp)
  */
 void ntb_transport_link_down(struct ntb_transport_qp *qp)
 {
-	int val;
+	u64 qp_bitmap_alloc;
 
 	if (!qp)
 		return;
+	struct ntb_transport_ctx *nt = qp->transport;
 
 	qp->client_ready = false;
 
-	val = ntb_spad_read(qp->ndev, QP_LINKS);
-
-	ntb_peer_spad_write(qp->ndev, PIDX, QP_LINKS, val & ~BIT(qp->qp_num));
+	qp_bitmap_alloc = (nt->qp_bitmap & ~nt->qp_bitmap_free);
+	ntb_transport_cmd_send(nt, NT_QP_LINKS, 0, qp_bitmap_alloc);
 
 	if (qp->link_is_up)
 		ntb_send_link_down(qp);
@@ -2213,9 +2364,21 @@ static void ntb_transport_doorbell_callback(void *data, int vector)
 	}
 }
 
+static void ntb_transport_msg_event_callback(void *data)
+{
+	struct ntb_transport_ctx *nt = data;
+
+	dev_dbg(&nt->ndev->dev, "Msg status bits %#llx\n",
+		ntb_msg_read_sts(nt->ndev));
+
+	/* Messages are only sent one-by-one */
+	(void)ntb_transport_cmd_recv(nt);
+}
+
 static const struct ntb_ctx_ops ntb_transport_ops = {
 	.link_event = ntb_transport_event_callback,
 	.db_event = ntb_transport_doorbell_callback,
+	.msg_event = ntb_transport_msg_event_callback,
 };
 
 static struct ntb_client ntb_transport_client = {
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



* Re: [PATCH v2 1/4] NTB : Introduce message library
  2018-05-06 19:20 ` [PATCH v2 1/4] " Atul Raut
@ 2018-05-07  5:29   ` Logan Gunthorpe
  2018-05-10  2:10     ` Atul Raut
  2018-05-11 22:40   ` Serge Semin
  1 sibling, 1 reply; 25+ messages in thread
From: Logan Gunthorpe @ 2018-05-07  5:29 UTC (permalink / raw)
  To: Atul Raut, linux-ntb

Hey Atul,

Thanks for this work. I had a couple thoughts:

On 06/05/18 01:20 PM, Atul Raut wrote:
> +enum nt_cmd {
> +	NT_CMD_INVAL = -1,/* invalid spad command */
> +	NT_CMD_SSIZE = 0, /* send out buffer size */
> +	NT_CMD_RSIZE = 1, /* recv in  buffer size */
> +	NT_CMD_SXLAT = 2, /* send in  buffer xlat */
> +	NT_CMD_RXLAT = 3, /* recv out buffer xlat */
> +	NT_CMD_CLEAR = 4, /* clear allocated memory */
> +	NT_STS_DONE  = 5, /* init is done */
> +	NT_STS_LNKUP = 6, /* link up state flag */
> +	NT_QP_LINKS        = 7, /* available QP link */
> +	NT_CMD_NUM_MWS        = 8, /* number of memory windows */
> +	NT_CMD_NUM_QPS        = 9, /* number of QP */
> +	NT_CMD_NTB_VERSION    = 10, /* ntb version */
> +};
>

Should we really encode the type of data sent in the library? I feel
like a good API should be agnostic to what's being sent and allow for
generic data, so future users can do different things and this library
doesn't have to list all types of values for all future clients.

> +/**
> + * Messages-base commands interface
> + */
> +#define NT_MSG_CMD		0
> +#define NT_MSG_CMD_WID	        1
> +#define NT_MSG_LDATA		2
> +#define NT_MSG_HDATA		3
> +#define NT_MSG_CNT		4

The current Switchtec hardware has 4 messages but the driver reserves
one for link management. So I'd rather not increase the number of
messages required from 3 to 4. Do we really need a cmd_wid message? Can
this not be encoded in the cmd message?
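
For example - purely a sketch, with field widths that are assumptions
rather than anything this series defines - the two values could be packed
into a single 32-bit message word:

#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/bitfield.h>

/*
 * Hypothetical packing: low 16 bits carry the command, high 16 bits the
 * window/global index, so a single message register covers both values.
 * A real scheme would also need to reserve a value (e.g. 0xffff) for
 * "invalid", since NT_CMD_INVAL (-1) does not survive the round trip.
 */
#define NT_MSG_CMD_FIELD	GENMASK(15, 0)
#define NT_MSG_WID_FIELD	GENMASK(31, 16)

static inline u32 nt_msg_pack_cmd(int cmd, int wid)
{
	return FIELD_PREP(NT_MSG_CMD_FIELD, cmd) |
	       FIELD_PREP(NT_MSG_WID_FIELD, wid);
}

static inline void nt_msg_unpack_cmd(u32 val, int *cmd, int *wid)
{
	*cmd = FIELD_GET(NT_MSG_CMD_FIELD, val);
	*wid = FIELD_GET(NT_MSG_WID_FIELD, val);
}

The scratchpad path could keep using separate registers; only the message
path would need a packing like this.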

Logan


* Re: [PATCH v2 1/4] NTB : Introduce message library
  2018-05-07  5:29   ` Logan Gunthorpe
@ 2018-05-10  2:10     ` Atul Raut
  2018-05-10  4:35       ` Logan Gunthorpe
  0 siblings, 1 reply; 25+ messages in thread
From: Atul Raut @ 2018-05-10  2:10 UTC (permalink / raw)
  To: Logan Gunthorpe, linux-ntb

Hi Logan,

Thanks for reviewing the patches.

>> Should we really encode the type of data sent in the library?
Agreed. A few points I would like to bring up here to see if they make
sense:
1. The library brings in some useful pieces that two clients currently
use, with the assumption that future clients may need them as well.
2. The main objective of the library is to solve a common problem that
any client driver has: configuring the inbound/outbound memory windows
by sharing some information through these registers in a certain way.
One way of doing that is my patch, which is derived from Sergey's
ntb_perf module.
The idea is to have basic infrastructure in place which solves the
basic configuration problem for any client driver.
Not sure this answers your question, but if anyone has other
suggestions on the library implementation, I will give them a try.
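
For what it's worth, a rough, untested sketch of how a client would
consume the library as posted in this series (the peer index, window
index and size are arbitrary here):

#include <linux/ntb.h>
#include <linux/sizes.h>

/*
 * Pick the scratchpad or message backend, unmask the command interrupt
 * and send the inbound buffer size to peer 0.  "handle" would normally
 * live in the client context, as perf->handle and nt->handle do in
 * patches 3 and 4 of this series.
 */
static int client_cmd_service_init(struct ntb_dev *ntb,
				   struct msg_type *handle)
{
	int ret;

	ret = nt_init_messaging(ntb, handle);
	if (ret)
		return ret;

	ret = nt_enable_messaging(ntb, ntb_port_number(ntb));
	if (ret)
		return ret;

	/* peer index 0, window/global index 0, 1 MiB payload - all arbitrary */
	return handle->cmd_send(ntb, 0, NT_CMD_SSIZE, 0, SZ_1M);
}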

>> Do we really need a cmd_wid message?
It acts as a dummy for scratchpad-based registers; it was added just to
have common APIs for the scratchpad and message registers.
 
Regards,
Atul

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project


* Re: [PATCH v2 1/4] NTB : Introduce message library
  2018-05-10  2:10     ` Atul Raut
@ 2018-05-10  4:35       ` Logan Gunthorpe
  2018-05-10 18:13         ` Atul Raut
  0 siblings, 1 reply; 25+ messages in thread
From: Logan Gunthorpe @ 2018-05-10  4:35 UTC (permalink / raw)
  To: Atul Raut, linux-ntb



On 09/05/18 08:10 PM, Atul Raut wrote:
> Agreed. A few points I would like to bring up here to see if they make
> sense:
> 1. The library brings in some useful pieces that two clients currently
> use, with the assumption that future clients may need them as well.
> 2. The main objective of the library is to solve a common problem that
> any client driver has: configuring the inbound/outbound memory windows
> by sharing some information through these registers in a certain way.
> One way of doing that is my patch, which is derived from Sergey's
> ntb_perf module.
> The idea is to have basic infrastructure in place which solves the
> basic configuration problem for any client driver.
> Not sure this answers your question, but if anyone has other
> suggestions on the library implementation, I will give them a try.

Well, I'd much rather try for something a little better and actually see
the code get cleaner instead of just copying what was in one client into
a library. Case in point: you're adding way more lines to
ntb_transport.c despite creating a library that's supposed to make its
code simpler.

I'd suggest an API that accepts a small block of memory and sends it
over either spads or msgs (whichever is available). The protocol for
this can be defined by the library, and the clients won't care how the
data is sent. Clients then only need to create and populate a small
structure with the data they require to initialize themselves. The
structure can then be sent and received with minimal code by the
clients. This will clean up the clients and avoid having them parse and
send all these CMD IDs. It also lets clients define the data they want
to send themselves instead of encoding all possible commands in the
library. Such an API would also be much more future proof, as future
clients will be able to use it for any kind of initialization data.
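
To illustrate (the names, the struct layout and the bound below are
invented for this sketch, not an agreed interface), a client could
describe everything it needs in one packed structure and hand it to a
single backend-agnostic call:

#include <linux/ntb.h>
#include <linux/types.h>

#define NTB_TRANSPORT_MAX_MWS	4	/* arbitrary bound for the sketch */

/* hypothetical library call: opaque block in, backend chosen internally */
int ntb_exch_send(struct ntb_dev *ntb, int pidx, const void *data, size_t len);

/* everything ntb_transport needs to announce, packed into one block */
struct ntb_transport_hello {
	u32 version;
	u32 qp_count;
	u32 mw_count;
	u64 mw_size[NTB_TRANSPORT_MAX_MWS];
} __packed;

static int send_init_block(struct ntb_dev *ntb, int pidx,
			   const struct ntb_transport_hello *hello)
{
	return ntb_exch_send(ntb, pidx, hello, sizeof(*hello));
}

The library would then be responsible for fragmenting the block over
scratchpads or message registers and reassembling it on the peer.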


>>> Do we really need a cmd_wid message? 
> It acts as a dummy for scratchpad-based registers; it was added just to
> have common APIs for the scratchpad and message registers.

I don't understand this. But the fact is, it will not work with
Switchtec, as there are only 3 messages available. So you will have to
think of something else.

Logan


* Re: [PATCH v2 1/4] NTB : Introduce message library
  2018-05-10  4:35       ` Logan Gunthorpe
@ 2018-05-10 18:13         ` Atul Raut
  0 siblings, 0 replies; 25+ messages in thread
From: Atul Raut @ 2018-05-10 18:13 UTC (permalink / raw)
  To: Logan Gunthorpe, linux-ntb

Hi Logan,

Thanks for your feedback on this.

>> the code get cleaner instead of just copying what was in one client into
>> a library.
The reason I did it this way is that I got a suggestion over IRC to reuse the
existing code as a library, but as you explain, it seems that didn't scale well.

>> I'd suggest an API that accepts a small block of memory and sends it
>> over either spads or msgs (whichever is available). 
I will revisit my changes and try out your suggestion.

Regards,
Atul Raut
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project


* Re: [PATCH v2 0/4] NTB : Introduce message library
  2018-05-06 19:20 [PATCH v2 0/4] NTB : Introduce message library Atul Raut
                   ` (3 preceding siblings ...)
  2018-05-06 19:20 ` [PATCH v2 4/4] NTB : Add support to message registers based devices Atul Raut
@ 2018-05-11 22:39 ` Serge Semin
  2018-05-11 23:00   ` Logan Gunthorpe
  4 siblings, 1 reply; 25+ messages in thread
From: Serge Semin @ 2018-05-11 22:39 UTC (permalink / raw)
  To: Atul Raut; +Cc: linux-ntb

Hello Atul,
Thanks for the patchset and the work you've done as part of it.
We really appreciate this.

First of all, I'll give you a general review of the overall library design.
The specific comments will be added inline with the particular code.

As I already told you in the chat, the ntb_perf driver can be used as a basis
or as a reference for the whole library:

>> [2018-03-14 04:54:10] <fancer> rauji: regarding the library. You can find the
>> reference code in the new ntb_perf driver in the section named service. At least
>> it is the basic functionality which implements the cross-interface communications.
>> [2018-03-14 04:58:00] <fancer> Regarding the whole library interface and ntb.h
>> alterations I'd strongly recommend to discuss the architecture here so to avoid
>> an excess patchets iterations and make all of us happy about the change)
>> [2018-03-14 20:34:02] <rauji> hi fancer : yes
>> [2018-03-14 20:34:10] <rauji> am using ntb_perf as reference and already implemented
>> it, right now just refactoring the code
>> [2018-03-14 20:40:11] <rauji> fancer: I am following your suggestions so there wont
>> be any issues with architectural changes,

I definitely didn't tell you just to copy-and-paste the whole set of service
functions from ntb_perf. One of the ntb_perf driver's purposes was to create an
example of possible cross-interface communication code, which turned out to be
the "service" subsystem of the driver. My idea was to have it used as a proof of
concept and as stably working code for the future cross-iface library. The
library is going to be a part of the NTB API itself - a part of the generic
interface for the whole NTB subsystem - so I recommended having the design
discussed first, otherwise you'd risk facing review requests that would cause
serious code refactoring. As far as I can see, that turned out to be true.

The whole idea of the cross-iface library was to encapsulate the
scratchpad/message register interface, so as to have a generic way of exchanging
data between NTB peers before the NTB link is actually initialized and up for
MW usage. The main purpose, of course, was to send/receive memory-window
information across, but in general the hardware doesn't prevent it from being
used to send any type of data. A lot of scenarios come to mind where arbitrary
data exchange could be utilized before actual MW initialization. For instance,
imagine if you first needed to identify a peer (by using an RSA key or
something else) before you sent a raw memory region base address.

Even though this more like a "philosophic" part, but this time Logan's opinion
happens to agree with the design we discussed a while ago with the rest of the
NTB maintainers:

1) Send/receive arbitrary data by means of the library.
2) Use either Scratchpads or Message registers for data exchange. If both
types of registers are available (as in the case of the IDT 89HPES8NT2), it's
better to use the Message ones, since they are better suited to the library concept.
3) Use something like the following methods as the library interface (a rough
prototype sketch of these is included after the note below):
- ntb_lib_exch_capability(ntb_dev) - return the maximum length of the data
array which can be sent over the NTB device (or an error),
- ntb_lib_exch_init(ntb_dev, ntb_data_recv) - initialize the Cross-Iface library
for the specified NTB device and set the callback method used to notify the
client driver when incoming data is available,
- ntb_lib_exch_clear(ntb_dev) - clear the library: discard the library descriptor,
make sure there is no data pending to send/retrieve, after which the callback
handler can be safely discarded,
- ntb_lib_exch_send_sync(ntb_dev, pidx, len, data) - send a data array of the passed
length over the NTB device using the Cross-Iface library. The method can sleep, so
it can't be used in atomic context,
- ntb_lib_exch_send_async(ntb_dev, pidx, len, data, callback) - the same function
as before, but it can be executed with IRQs disabled. When the data has been sent,
the callback method is executed, for instance to free the memory allocated for it,
- ntb_data_recv(ntb_dev, pidx, len, data) - callback method passed to the
ntb_lib_exch_init() function and called when new data arrives from the peer with
index pidx.

* the naming is of course up for discussion, as is the whole design, since we
were talking about it nearly 10 months ago.
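
Just to illustrate, a rough header sketch of the above (every name and
signature here is only a suggestion, nothing final):

/* called when new data has arrived from the peer with index pidx */
typedef void (*ntb_lib_exch_recv_fn)(struct ntb_dev *ntb, int pidx,
				     size_t len, const u8 *data);

/* maximum data array length the device can carry, or a negative errno */
ssize_t ntb_lib_exch_capability(struct ntb_dev *ntb);

/* allocate the library descriptor and register the receive callback */
int ntb_lib_exch_init(struct ntb_dev *ntb, ntb_lib_exch_recv_fn recv);

/* discard the descriptor once no data is pending to send/retrieve */
void ntb_lib_exch_clear(struct ntb_dev *ntb);

/* sleeping send - must not be called from atomic context */
int ntb_lib_exch_send_sync(struct ntb_dev *ntb, int pidx, size_t len,
			   const u8 *data);

/* non-sleeping send - cb(ctx, rc) is invoked once the data has gone out */
int ntb_lib_exch_send_async(struct ntb_dev *ntb, int pidx, size_t len,
			    const u8 *data,
			    void (*cb)(void *ctx, int rc), void *ctx);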

Here are some personal notes with suggestions on the design specifics:
- I'd add a race-protected pointer to the Cross-Iface library descriptor
(something like struct ntb_lib_exch) to the ntb_dev structure and have it
dynamically allocated in ntb_lib_exch_init(); see the layout sketch after
these notes. This way we wouldn't waste memory in case the client driver
doesn't use the library.
- I'd declare the data type as u8 instead of dword (definitely not u64).
- Despite the ntb_perf service subsystem design, I would rather use pure
Scratchpads for the library functions implementation without Doorbell
utilization. The only drawback of this is having one byte of some
Scratchpad register reserved for the full/empty status. Additionally
there must be a status-checking work-thread running to poll the status.
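
As an illustration of the first note (field names are invented, the callback
type is taken from the header sketch above), the lazily allocated descriptor
could look roughly like this:

/* Allocated in ntb_lib_exch_init() only, so no memory is spent when a
 * client never uses the library; struct ntb_dev would carry just a
 * race-protected pointer to it.
 */
struct ntb_lib_exch {
	ntb_lib_exch_recv_fn	recv;		/* client receive callback */
	struct delayed_work	poll_work;	/* full/empty status poller */
	struct list_head	tx_list;	/* pending outbound u8 arrays */
	spinlock_t		lock;		/* protects the list and state */
};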

There are also additional notes from me about the library code in general:
1) Use the ntb_ prefix for all functions/types within the NTB API, including the
library. It is something of an unspoken API convention which, as you can see, is
followed all over the NTB API code.
2) I'd suggest moving the library code into a separate directory:
drivers/ntb/lib/exchange.c
and creating a separate header file in:
include/linux/ntb_lib_exchange.h
This way we'd have a dedicated place to put abstract libraries like this
one and future ones, for instance a possible scratchpads emulation
library and so on.
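
For example, the kbuild wiring could be as simple as this (hypothetical,
just to illustrate the layout):

# drivers/ntb/Makefile
obj-$(CONFIG_NTB) += lib/

# drivers/ntb/lib/Makefile
obj-$(CONFIG_NTB) += exchange.o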

Atul, if you refactored your code so that it fit something similar to
the design presented here, it would definitely be accepted (with possible minor
code alterations=)).

Regards,
-Sergey

On Sun, May 06, 2018 at 12:20:16PM -0700, Atul Raut <araut@codeaurora.org> wrote:
> Hi All,
> 
> This is v2 of cleanup series, where have enabled support to ntb_transport layer
> for message register based devices. Refactor ntb_perf module to get library
> out of it, so that the other client module can make use of it.
> 
> This cleanup addresses some of comments by Allen and Dave.
> 
> Thanks & Regards,
> Atul Raut
> 
> Atul Raut (4):
>   NTB : Introduce message library
>   NTB :  Add message library NTB API
>   NTB : Modification to ntb_perf module
>   NTB : Add support to message registers based devices
> 
>  drivers/ntb/ntb.c           | 222 +++++++++++++++++++++++++++
>  drivers/ntb/ntb_transport.c | 357 ++++++++++++++++++++++++++++++++------------
>  drivers/ntb/test/ntb_perf.c | 347 +++++-------------------------------------
>  include/linux/ntb.h         | 163 ++++++++++++++++++++
>  4 files changed, 685 insertions(+), 404 deletions(-)
> 
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 1/4] NTB : Introduce message library
  2018-05-06 19:20 ` [PATCH v2 1/4] " Atul Raut
  2018-05-07  5:29   ` Logan Gunthorpe
@ 2018-05-11 22:40   ` Serge Semin
  1 sibling, 0 replies; 25+ messages in thread
From: Serge Semin @ 2018-05-11 22:40 UTC (permalink / raw)
  To: Atul Raut; +Cc: linux-ntb

On Sun, May 06, 2018 at 12:20:17PM -0700, Atul Raut <araut@codeaurora.org> wrote:
> Library created by refactoring common code from
> ntb_perf module so that all client can make use
> of it.
> The library is based on scratchpad and message registers
> based apis.
> 

See the comment in the cover letter. I suggest having a separate header file
for the library interface, somewhere like:
include/linux/ntb_lib_exch.h

> Signed-off-by: Atul Raut <araut@codeaurora.org>
> ---
>  include/linux/ntb.h | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 163 insertions(+)
> 
> diff --git a/include/linux/ntb.h b/include/linux/ntb.h
> index 181d166..19fe973 100644
> --- a/include/linux/ntb.h
> +++ b/include/linux/ntb.h
> @@ -58,6 +58,8 @@
>  
>  #include <linux/completion.h>
>  #include <linux/device.h>
> +#include <linux/delay.h>
> +#include <linux/io.h>
>  
>  struct ntb_client;
>  struct ntb_dev;
> @@ -163,6 +165,56 @@ enum ntb_default_port {
>  #define NTB_DEF_PEER_CNT	(1)
>  #define NTB_DEF_PEER_IDX	(0)
>  
> +enum nt_cmd {
> +	NT_CMD_INVAL = -1,/* invalid spad command */
> +	NT_CMD_SSIZE = 0, /* send out buffer size */
> +	NT_CMD_RSIZE = 1, /* recv in  buffer size */
> +	NT_CMD_SXLAT = 2, /* send in  buffer xlat */
> +	NT_CMD_RXLAT = 3, /* recv out buffer xlat */
> +	NT_CMD_CLEAR = 4, /* clear allocated memory */
> +	NT_STS_DONE  = 5, /* init is done */
> +	NT_STS_LNKUP = 6, /* link up state flag */
> +	NT_QP_LINKS        = 7, /* available QP link */
> +	NT_CMD_NUM_MWS        = 8, /* number of memory windows */
> +	NT_CMD_NUM_QPS        = 9, /* number of QP */
> +	NT_CMD_NTB_VERSION    = 10, /* ntb version */
> +};
> +
> +struct msg_type {
> +/* Scratchpad/Message IO operations */
> +	int (*cmd_send)(struct ntb_dev *nt, int pidx, enum nt_cmd cmd,
> +			int cmd_wid, u64 data);
> +	int (*cmd_recv)(struct ntb_dev *nt, int *pidx, enum nt_cmd *cmd,
> +			int *cmd_wid, u64 *data);
> +};
> +
> +#define MSG_TRIES		50
> +#define MSG_UDELAY_LOW		1000
> +#define MSG_UDELAY_HIGH		2000
> +

>> Allen Hubbe
>> 500 * 2000us = 1s is a long delay for a caller that might not have the
>> luxury of being able to wait.

Since this is a driver-configurable parameter, we could make it available
in the kernel config menu. Allen?

> +/**
> + * Scratchpads-base commands interface
> + */
> +#define NT_SPAD_CNT(_pcnt) \
> +	(3*((_pcnt) + 1))
> +#define NT_SPAD_CMD(_gidx) \
> +	(3*(_gidx))
> +#define NT_SPAD_LDATA(_gidx) \
> +	(3*(_gidx) + 1)
> +#define NT_SPAD_HDATA(_gidx) \
> +	(3*(_gidx) + 2)
> +#define NT_SPAD_NOTIFY(_gidx) \
> +	(BIT_ULL(_gidx))
> +
> +/**
> + * Messages-base commands interface
> + */
> +#define NT_MSG_CMD		0
> +#define NT_MSG_CMD_WID	        1
> +#define NT_MSG_LDATA		2
> +#define NT_MSG_HDATA		3
> +#define NT_MSG_CNT		4
> +

These macros are specific to the client drivers, particularly to the
ntb_perf driver, so they need to stay there and be used on top
of the Cross-Interface library.

>  /**
>   * struct ntb_client_ops - ntb client operations
>   * @probe:		Notify client of a new device.
> @@ -1502,4 +1554,115 @@ static inline int ntb_peer_msg_write(struct ntb_dev *ntb, int pidx, int midx,
>  	return ntb->ops->peer_msg_write(ntb, pidx, midx, msg);
>  }
>  
> +/**
> + * nt_spad_cmd_send() - send messages to peer using spad register.
> + * @ntb:	NTB device context.
> + * @pidx:	Port index of peer device.
> + * @cmd:	ntb commands.
> + * @cmd_gidx:	Global device index.
> + * @data:	message data.
> + *
> + * Send data to the port specific scratchpad
> + *
> + * Perform predefined number of attempts before give up.
> + * We are sending the data to the port specific scratchpad, so
> + * to prevent a multi-port access race-condition. Additionally
> + * there is no need in local locking since only thread-safe
> + * service work is using this method.
> + *
> + * Set peer db to inform data is ready.
> + *
> + * Return: Zero on success, otherwise an error number.
> + */
> +int nt_spad_cmd_send(struct ntb_dev *ntb, int pidx, enum nt_cmd cmd,
> +		     int cmd_gidx, u64 data);
> +
> +/**
> + * nt_spad_cmd_recv() - Receive the messages using spad register.
> + * @ntb:	NTB device context.
> + * @pidx:	Port index of peer device a message being receive
> + * @cmd:	NTB command
> + * @cmd_wid:	Gloable device Index
> + * @data:	Received data
> + *
> + * Clear bits in the peer doorbell register, arming the bits for the next
> + * doorbell.
> + *
> + * We start scanning all over, since cleared DB may have been set
> + * by any peer. Yes, it makes peer with smaller index being
> + * serviced with greater priority, but it's convenient for spad
> + * and message code unification and simplicity.
> + *
> + * Return: Zero on success, otherwise an error number.
> + */
> +int nt_spad_cmd_recv(struct ntb_dev *ntb, int *pidx,
> +		     enum nt_cmd *cmd, int *cmd_wid, u64 *data);
> +
> +/**
> + * nt_msg_cmd_send() - send messages to peer using message register.
> + * @ntb:	NTB device context.
> + * @pidx:	Port index of peer device.
> + * @cmd:	ntb commands.
> + * @cmd_gidx:	Memory window index.
> + * @data:	message data.
> + *
> + * Perform predefined number of attempts before give up. Message
> + * registers are free of race-condition problem when accessed
> + * from different ports, so we don't need splitting registers
> + * by global device index. We also won't have local locking,
> + * since the method is used from service work only.
> + *
> + * Return: Zero on success, otherwise an error number.
> + */
> +int nt_msg_cmd_send(struct ntb_dev *nt, int pidx, enum nt_cmd cmd,
> +		    int cmd_wid, u64 data);
> +
> +/**
> + * nt_msg_cmd_recv() - Receive the messages using message register.
> + * @ntb:	NTB device context.
> + * @pidx:	Port index of peer device a message being receive
> + * @cmd:	NT command
> + * @cmd_wid:	Memory window Index
> + * @data:	Received data
> + *
> + * Get memory window index and data.
> + *
> + * Return: Zero on success, otherwise an error number.
> + */
> +int nt_msg_cmd_recv(struct ntb_dev *nt, int *pidx,
> +		    enum nt_cmd *cmd, int *cmd_wid, u64 *data);
> +
> +/**
> + * nt_enable_messaging() - Enable messaging support.
> + * @ntb:	NTB device context.
> + * @gitx:	Global device Index.
> + *
> + * Check which messaging support to enable
> + *
> + * Return: Zero on success, otherwise an error number.
> + */
> +int nt_enable_messaging(struct ntb_dev *ndev, int gidx);
> +
> +/**
> + * nt_disable_messaging() - Disable messaging support.
> + * @ntb:	NTB device context.
> + * @gidx:	Global device Index
> + *
> + * Check message type(spad/message) and disable messaging support.
> + *
> + */
> +void nt_disable_messaging(struct ntb_dev *ndev, int gidx);
> +
> +/**
> + * nt_init_messaging() - Enable Messaging
> + * @ntb:	NTB device context.
> + * @msg_ptr:	Handle to function pointers Scratchpad or Message.
> + *
> + *
> + * Enable Scratchpad/Message IO operations.
> + *
> + * Return: Zero on success, otherwise an error number.
> + */
> +int nt_init_messaging(struct ntb_dev *ndev, struct msg_type *msg_ptr);
> +

See the cover letter comment with the design considerations.

Regards,
-Sergey

>  #endif
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB :  Add message library NTB API
  2018-05-06 19:20 ` [PATCH v2 2/4] NTB : Add message library NTB API Atul Raut
@ 2018-05-11 22:44   ` Serge Semin
  2018-05-11 23:11     ` Logan Gunthorpe
  2018-05-13  0:25     ` Allen Hubbe
  0 siblings, 2 replies; 25+ messages in thread
From: Serge Semin @ 2018-05-11 22:44 UTC (permalink / raw)
  To: Atul Raut; +Cc: linux-ntb

On Sun, May 06, 2018 at 12:20:18PM -0700, Atul Raut <araut@codeaurora.org> wrote:
> This patch brings in function definations for
> the NTB library API.
> 
> Signed-off-by: Atul Raut <araut@codeaurora.org>
> ---
>  drivers/ntb/ntb.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 222 insertions(+)
> 
> diff --git a/drivers/ntb/ntb.c b/drivers/ntb/ntb.c
> index 2581ab7..c025e03 100644
> --- a/drivers/ntb/ntb.c
> +++ b/drivers/ntb/ntb.c
> @@ -258,6 +258,228 @@ int ntb_default_peer_port_idx(struct ntb_dev *ntb, int port)
>  }
>  EXPORT_SYMBOL(ntb_default_peer_port_idx);
>  
> +int nt_spad_cmd_send(struct ntb_dev *ntb, int pidx, enum nt_cmd cmd,
> +			    int cmd_gidx, u64 data)
> +{

These functions aren't called from a work-thread anymore (as they were in the
ntb_perf driver). It means there might be a race condition when the Scratchpad/Message
registers are accessed from different threads. You must make sure that simultaneous
access is serialized.
See Allen Hubbe's comment on patchset v1.

> +	int try;
> +	u32 sts;
> +	int gidx = ntb_port_number(ntb);
> +
> +	for (try = 0; try < MSG_TRIES; try++) {

>> Allen Hubbe
>> Maybe one try, or msg_tries is a parameter instead of constant.

Allen, there can't be just one try. Imagine you've sent some data to a peer and
notified it of incoming data pending in the scratchpad/message registers. Then
the peer handler starts fetching the data from the registers while you or some
other peer try to send data there before the peer has finished the retrieval
operation. In this case you'd need to perform several retries before giving up
and returning something like -EAGAIN. As I suggested before, it might be better
to move the number-of-retries parameter definition to the kernel menuconfig.

> +		if (!ntb_link_is_up(ntb, NULL, NULL))
> +			return -ENOLINK;
> +
> +		sts = ntb_peer_spad_read(ntb, pidx,
> +					 NT_SPAD_CMD(gidx));

>> Allen Hubbe
>> Can it be done without reading peer spads?

Alas, it can't within the current communication layer design. We need to check
the status of the sent data. When the peer marked it as read, only then we can
send a next portion of pending outbound data.

> +		if (sts != NT_CMD_INVAL) {
> +			usleep_range(MSG_UDELAY_LOW, MSG_UDELAY_HIGH);

>> Allen Hubbe
>> Even though it doesn't sleep, the delay limits what context this can
>> reasonably be called in.  Imagine if any kind of interrupts were
>> disabled in the caller (eg rcu_read_lock or spin_lock_bh), and then
>> the call in the ntb api delays the caller for 1s, that's not very
>> nice.
>> How bad would it be to take out the retries, or would it just stop working.
>> If taking out the retries breaks it, is there a better way to design
>> the whole thing to be less fragile with respect to timing?  Or maybe
>> schedule_timeout_interruptible would be ok, and just require the
>> caller to be in a context where that is allowed.

Allen, since we have just one channel of communication and have to wait while
a peer reads the data from inbound registers, I don't see other choice but
either to use the status polling design or specifically allocate some Doorbell
bits for notifications (Message registers case doesn't need it). I'd rather
stay with polling case, since it is simpler, consumes less hardware resources,
and well fits both Scratchpad and Message registers hardware. See the comment
in the cover letter with the design suggestion.

> +			continue;
> +		}
> +
> +		ntb_peer_spad_write(ntb, pidx,
> +				    NT_SPAD_LDATA(gidx),
> +				    lower_32_bits(data));
> +		ntb_peer_spad_write(ntb, pidx,
> +				    NT_SPAD_HDATA(gidx),
> +				    upper_32_bits(data));
> +		mmiowb();

>> Allen Hubbe
>> spad_write is like iowrite32, so writes are ordered without extra mmiowb.

Allen, I may misunderstand something, but according to the kernel Documentation in
general they aren't (specifically for prefetchable MWs - hello to the emulated
scratchpads) so we need the MMIO barrier here. See the doc about mmiowb():
https://www.kernel.org/doc/Documentation/memory-barriers.txt
We must make sure, that the data is sent before the status change and
before the peer is notified by means of Doorbell setting.

> +		ntb_peer_spad_write(ntb, pidx,
> +				    NT_SPAD_CMD(gidx),
> +				    cmd);
> +

See the comment above. mmiowb() should be here.

Regards,
-Sergey

> +		ntb_peer_db_set(ntb, NT_SPAD_NOTIFY(cmd_gidx));
> +
> +		break;
> +	}
> +
> +	return try < MSG_TRIES ? 0 : -EAGAIN;
> +}
> +EXPORT_SYMBOL(nt_spad_cmd_send);
> +
> +int nt_spad_cmd_recv(struct ntb_dev *ntb, int *pidx,
> +			enum nt_cmd *cmd, int *cmd_wid, u64 *data)
> +{
> +	u32 val;
> +	int gidx = 0;
> +	int key = ntb_port_number(ntb);
> +
> +	ntb_db_clear(ntb, NT_SPAD_NOTIFY(key));
> +
> +	for (*pidx = 0; *pidx < ntb_peer_port_count(ntb); (*pidx)++, gidx++) {
> +		if ((*pidx) == key)
> +			++gidx;
> +
> +		if (!ntb_link_is_up(ntb, NULL, NULL))
> +			continue;
> +
> +		val = ntb_spad_read(ntb, NT_SPAD_CMD(gidx));
> +		if (val == NT_CMD_INVAL)
> +			continue;
> +
> +		*cmd = val;
> +
> +		val = ntb_spad_read(ntb, NT_SPAD_LDATA(gidx));
> +		*data = val;
> +
> +		val = ntb_spad_read(ntb, NT_SPAD_HDATA(gidx));
> +		*data |= (u64)val << 32;
> +
> +		/* Next command can be retrieved from now */
> +		ntb_spad_write(ntb, NT_SPAD_CMD(gidx),
> +			NT_CMD_INVAL);
> +
> +		return 0;
> +	}
> +
> +	return -ENODATA;
> +}
> +EXPORT_SYMBOL(nt_spad_cmd_recv);
> +
> +int nt_msg_cmd_send(struct ntb_dev *nt, int pidx, enum nt_cmd cmd,
> +int cmd_wid, u64 data)
> +{
> +	int try, ret;
> +	u64 outbits;
> +
> +	outbits = ntb_msg_outbits(nt);
> +	for (try = 0; try < MSG_TRIES; try++) {
> +		if (!ntb_link_is_up(nt, NULL, NULL))
> +			return -ENOLINK;
> +
> +		ret = ntb_msg_clear_sts(nt, outbits);
> +		if (ret)
> +			return ret;
> +
> +		ntb_peer_msg_write(nt, pidx, NT_MSG_LDATA,
> +			cpu_to_le32(lower_32_bits(data)));
> +
> +		if (ntb_msg_read_sts(nt) & outbits) {
> +			usleep_range(MSG_UDELAY_LOW, MSG_UDELAY_HIGH);
> +			continue;
> +		}
> +
> +		ntb_peer_msg_write(nt, pidx, NT_MSG_HDATA,
> +			cpu_to_le32(upper_32_bits(data)));
> +		mmiowb();
> +
> +		ntb_peer_msg_write(nt, pidx, NT_MSG_CMD_WID,
> +			cpu_to_le32(cmd_wid));
> +
> +		/* This call shall trigger peer message event */
> +		ntb_peer_msg_write(nt, pidx, NT_MSG_CMD,
> +			cpu_to_le32(cmd));
> +
> +		break;
> +	}
> +
> +	return try < MSG_TRIES ? 0 : -EAGAIN;
> +}
> +EXPORT_SYMBOL(nt_msg_cmd_send);
> +
> +int nt_msg_cmd_recv(struct ntb_dev *nt, int *pidx,
> +			enum nt_cmd *cmd, int *cmd_wid, u64 *data)
> +{
> +	u64 inbits;
> +	u32 val;
> +
> +	inbits = ntb_msg_inbits(nt);
> +
> +	if (hweight64(ntb_msg_read_sts(nt) & inbits) < 4)
> +		return -ENODATA;
> +
> +	val = ntb_msg_read(nt, pidx, NT_MSG_CMD);
> +	*cmd = le32_to_cpu(val);
> +
> +	val = ntb_msg_read(nt, pidx, NT_MSG_CMD_WID);
> +	*cmd_wid = le32_to_cpu(val);
> +
> +	val = ntb_msg_read(nt, pidx, NT_MSG_LDATA);
> +	*data = le32_to_cpu(val);
> +
> +	val = ntb_msg_read(nt, pidx, NT_MSG_HDATA);
> +	*data |= (u64)le32_to_cpu(val) << 32;
> +
> +	/* Next command can be retrieved from now */
> +	ntb_msg_clear_sts(nt, inbits);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(nt_msg_cmd_recv);
> +
> +int nt_enable_messaging(struct ntb_dev *ndev, int gidx)
> +{
> +	u64 mask, incmd_bit;
> +	int ret, sidx, scnt;
> +
> +	mask = ntb_db_valid_mask(ndev);
> +	(void)ntb_db_set_mask(ndev, mask);
> +
> +	if (ntb_msg_count(ndev) >= NT_MSG_CNT) {
> +		u64 inbits, outbits;
> +
> +		inbits = ntb_msg_inbits(ndev);
> +		outbits = ntb_msg_outbits(ndev);
> +		(void)ntb_msg_set_mask(ndev, inbits | outbits);
> +
> +		incmd_bit = BIT_ULL(__ffs64(inbits));
> +		ret = ntb_msg_clear_mask(ndev, incmd_bit);
> +	} else {
> +		scnt = ntb_spad_count(ndev);
> +		for (sidx = 0; sidx < scnt; sidx++)
> +			ntb_spad_write(ndev, sidx, NT_CMD_INVAL);
> +		incmd_bit = NT_SPAD_NOTIFY(gidx);
> +		ret = ntb_db_clear_mask(ndev, incmd_bit);
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(nt_enable_messaging);
> +
> +void nt_disable_messaging(struct ntb_dev *ndev, int gidx)
> +{
> +	if (ntb_msg_count(ndev) >= NT_MSG_CNT) {
> +		u64 inbits;
> +
> +		inbits = ntb_msg_inbits(ndev);
> +		(void)ntb_msg_set_mask(ndev, inbits);
> +	} else {
> +		(void)ntb_db_set_mask(ndev, NT_SPAD_NOTIFY(gidx));
> +	}
> +
> +}
> +EXPORT_SYMBOL(nt_disable_messaging);
> +
> +int nt_init_messaging(struct ntb_dev *ndev, struct msg_type *msg_ptr)
> +{
> +	u64 mask;
> +	int pcnt = ntb_peer_port_count(ndev);
> +
> +	if (ntb_peer_mw_count(ndev) < (pcnt + 1)) {
> +		dev_err(&ndev->dev, "Not enough memory windows\n");
> +		return -EINVAL;
> +	}
> +
> +	if (ntb_msg_count(ndev) >= NT_MSG_CNT) {
> +		msg_ptr->cmd_send = nt_msg_cmd_send;
> +		msg_ptr->cmd_recv = nt_msg_cmd_recv;
> +
> +		return 0;
> +	}
> +
> +	mask = GENMASK_ULL(pcnt, 0);
> +	if (ntb_spad_count(ndev) >= NT_SPAD_CNT(pcnt) &&
> +	    (ntb_db_valid_mask(ndev) & mask) == mask) {
> +		msg_ptr->cmd_send = nt_spad_cmd_send;
> +		msg_ptr->cmd_recv = nt_spad_cmd_recv;
> +
> +		return 0;
> +	}
> +	dev_err(&ndev->dev, "Command services unsupported\n");
> +
> +	return -EINVAL;
> +}
> +EXPORT_SYMBOL(nt_init_messaging);
> +
>  static int ntb_probe(struct device *dev)
>  {
>  	struct ntb_dev *ntb;
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 0/4] NTB : Introduce message library
  2018-05-11 22:39 ` [PATCH v2 0/4] NTB : Introduce message library Serge Semin
@ 2018-05-11 23:00   ` Logan Gunthorpe
  2018-05-14 20:40     ` Serge Semin
  0 siblings, 1 reply; 25+ messages in thread
From: Logan Gunthorpe @ 2018-05-11 23:00 UTC (permalink / raw)
  To: Serge Semin, Atul Raut; +Cc: linux-ntb

On 5/11/2018 4:39 PM, Serge Semin wrote:
> Definitely I didn't tell you just to copy-and-paste the whole service functions
> from ntb_perf. One of the ntb_perf driver's purposes was to create an example of
> possible cross-interface communication code, which turned out to be the "service"
> subsystem of the driver. My idea was to have it used as a proof of concept and
> as stably working code for the future cross-iface library. The library is going
> to be a part of the NTB API itself - a part of the generic interface for the
> whole NTB subsystem - so I recommended having the design discussed first, otherwise
> you'd risk facing review requests which would cause serious code refactoring.
> As far as I can see, that turned out to be true.

IMO, and in my experience, when working on APIs that have multiple users 
in the kernel you should *expect* to refactor and rewrite the code a 
couple times before we land on something that is clean enough to merge. 
I have patch sets that have gone through many *significant* changes and 
they are still works in progress. This is normal and don't expect 
discussion ahead of time to converge on the right answer right away.
> Even though this is more of a "philosophic" part, this time Logan's opinion
> happens to agree with the design we discussed a while ago with the rest of the
> NTB maintainers:

Glad we agree for a change ;)

Logan


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-11 22:44   ` Serge Semin
@ 2018-05-11 23:11     ` Logan Gunthorpe
  2018-05-14 20:25       ` Serge Semin
  2018-05-13  0:25     ` Allen Hubbe
  1 sibling, 1 reply; 25+ messages in thread
From: Logan Gunthorpe @ 2018-05-11 23:11 UTC (permalink / raw)
  To: Serge Semin, Atul Raut; +Cc: linux-ntb

>> +		mmiowb();
> 
>>> Allen Hubbe
>>> spad_write is like iowrite32, so writes are ordered without extra mmiowb.
> 
> Allen, I may misunderstand something, but according to the kernel Documentation in
> general they aren't (specifically for prefetchable MWs - hello to the emulated
> scratchpads) so we need the MMIO barrier here. See the doc about mmiowb():
> https://www.kernel.org/doc/Documentation/memory-barriers.txt
> We must make sure, that the data is sent before the status change and
> before the peer is notified by means of Doorbell setting.

Actually, I'm pretty sure Allen is right and I had meant to comment on 
this as well. I've also gotten this wrong before and have been corrected 
on it.

Read the "Acquires vs I/O accesses" section in the same document you 
quote as well as [1].

ioread and iowrites are guaranteed to be strongly ordered with each 
other as long as they happen from the same CPU. However, seeing the PCI 
bus does not typically interact with the cache coherency protocol, io 
requests are not guaranteed, on all arches, to be in order with accesses 
to RAM. This is what mmiowb() is for. So if you have an IO access that 
is protected with a spin lock you must make sure to call mmiowb() before 
releasing the spin lock otherwise IO writes might happen after the lock 
is released. As far as I know, this is the only common use for mmiowb().

Logan

[1] https://www.kernel.org/doc/Documentation/driver-api/device-io.rst

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-11 22:44   ` Serge Semin
  2018-05-11 23:11     ` Logan Gunthorpe
@ 2018-05-13  0:25     ` Allen Hubbe
  2018-05-13  0:31       ` Allen Hubbe
  2018-05-14 23:16       ` Serge Semin
  1 sibling, 2 replies; 25+ messages in thread
From: Allen Hubbe @ 2018-05-13  0:25 UTC (permalink / raw)
  To: Serge Semin; +Cc: Atul Raut, linux-ntb

On Fri, May 11, 2018 at 6:44 PM, Serge Semin <fancer.lancer@gmail.com> wrote:
> On Sun, May 06, 2018 at 12:20:18PM -0700, Atul Raut <araut@codeaurora.org> wrote:
>>> Allen Hubbe
>>> Maybe one try, or msg_tries is a parameter instead of constant.
>
> Allen, there can't be just one try. Imagine you've sent some data to a peer and
> notified it of incoming data pending in the scratchpad/message registers. Then
> the peer handler starts fetching the data from the registers while you or some
> other peer try to send data there before the peer has finished the retrieval
> operation. In this case you'd need to perform several retries before giving up
> and returning something like -EAGAIN. As I suggested before, it might be better
> to move the number-of-retries parameter definition to the kernel menuconfig.

Making it configurable just seems like a convenient way to blame the
user if there is a problem.  It returned -EAGAIN?  You didn't
configure the wait long enough.  It hung your os and spewed warnings
to dmesg?  Configured the wait to be too long.

>
>> +             if (!ntb_link_is_up(ntb, NULL, NULL))
>> +                     return -ENOLINK;
>> +
>> +             sts = ntb_peer_spad_read(ntb, pidx,
>> +                                      NT_SPAD_CMD(gidx));
>
>>> Allen Hubbe
>>> Can it be done without reading peer spads?
>
> Alas, it can't within the current communication layer design. We need to check
> the status of the sent data. When the peer marked it as read, only then we can
> send a next portion of pending outbound data.

Just trying to avoid a read across the bus between two NTBs.  I get
that the peer needs to ack the message before the spads can be
overwritten.

Focusing on the command spad, will any use case really have 2^32
commands?  Can we spare two bits?  Let one bit be the command sequence
number (zero or one), and the other bit be the remote command ack
number (zero or one).

Sending a command: flip the sequence number bit from the previous
command, leave the ack bit alone, and write the command number in the
rest of the bits.

Receiving a command: did the sequence number bit change?  If so,
service the command, and then acknowledge.  Acknowledge is like
sending a command because the write destination is the command spad,
but flip the ack bit only: leave the current sequence bit and command
number alone.

Following is an imagined implementation for spads.  For msg registers,
instead of seq/ack bits, it would check the status register after
trying to send the first data.  For send_ack(), msg registers would
clear the status.  To make it work for both msg and spads, just
require that cmd_num never have seq/ack bits set.

spinlock_t cmd_lock;
u32 cmd_val;

enum {
        CMD_SEQ_BIT = BIT(30),
        CMD_ACK_BIT = BIT(31),
        CMD_NUM_MASK = ~CMD_SEQ_BIT & ~CMD_ACK_BIT
};

int send_cmd(u32 cmd_num, void *data, size_t size)
{
        u32 ack_val;
        int rc;

        if (WARN_ON(cmd_num & ~CMD_NUM_MASK))
                return -EINVAL;

        spin_lock(cmd_lock);

        ack_val = spad_read(cmd_spad);

        rc = -EAGAIN;
        if (!(ack_val & CMD_ACK_BIT) == !(cmd_val & CMD_SEQ_BIT))
                goto out;

        cmd_val ^= CMD_SEQ_BIT;
        cmd_val &= ~CMD_NUM_MASK;
        cmd_val |= CMD_NUM_MASK & cmd_num;

        peer_spad_write(data_spad_0, cmd_data);
        peer_spad_write(data_spad_1, cmd_data + 4);
        ... and so on ...

        peer_spad_write(cmd_spad, cmd_val);
        peer_db_set(cmd_doorbell_bit);

        rc = 0;

out:
        spin_unlock(cmd_lock);

        return rc;
}

void send_ack(void)
{
        spin_lock(cmd_lock);
        cmd_val ^= CMD_ACK_BIT;
        spad_write(cmd_spad, cmd_val);
        spin_unlock(cmd_lock);
}

>
>> +             if (sts != NT_CMD_INVAL) {
>> +                     usleep_range(MSG_UDELAY_LOW, MSG_UDELAY_HIGH);
>
>>> Allen Hubbe
>>> Even though it doesn't sleep, the delay limits what context this can
>>> reasonably be called in.  Imagine if any kind of interrupts were
>>> disabled in the caller (eg rcu_read_lock or spin_lock_bh), and then
>>> the call in the ntb api delays the caller for 1s, that's not very
>>> nice.
>>> How bad would it be to take out the retries, or would it just stop working.
>>> If taking out the retries breaks it, is there a better way to design
>>> the whole thing to be less fragile with respect to timing?  Or maybe
>>> schedule_timeout_interruptible would be ok, and just require the
>>> caller to be in a context where that is allowed.
>
> Allen, since we have just one channel of communication and have to wait while
> a peer reads the data from inbound registers, I don't see other choice but
> either to use the status polling design or specifically allocate some Doorbell
> bits for notifications (Message registers case doesn't need it). I'd rather
> stay with polling case, since it is simpler, consumes less hardware resources,
> and well fits both Scratchpad and Message registers hardware. See the comment
> in the cover letter with the design suggestion.

Since my main issue with this is the retry, it's rightly on me to
suggest an alternative approach.

Instead, write the message to a ring buffer of messages to send, then
schedule some delayed work struct to actually send the message.  Let
the work struct be rescheduled with some delay, as long as the buffer
is not empty.  Each retry may or may not send the message, depending
on the ack or availability of the msg register.  If it doesn't send,
just reschedule the work until it does.

size_t msg_max_size()
{
        // depends on how many spads / msg registers the device has
that are not in use for some other purpose like link management.
}

int post_msg(u32 cmd_num, void *data, size_t size, bool can_wait)
{
        int rc;

        if (cmd_num & ~CMD_NUM_MASK)
                return -EINVAL;

        if (size > msg_max_size())
                return -EINVAL;

        spin_lock(waitq.lock);

        rc = -EAGAIN;
        if (can_wait)
                wait_event_interruptible(waitq, queue is not full);
        else if (queue is full)
                goto out;

        enqueue(the command and data);
        schedule_delayed_work(cmd_ws, 0);

        rc = 0;

out:
        spin_unlock(waitq.lock);

        return rc;
}

void cmd_work(ws)
{
        int rc;

        spin_lock(waitq.lock);

        if (queue is empty)
                goto out;

        rc = send_cmd(next in the queue);
        if (!rc) {
                advance(the queue);
                wake_up(waitq);
        } else if (rc != -EAGAIN) {
                goto out;
        }

        if (! queue is empty)
                schedule_delayed_work(cmd_ws, CMD_DELAY);

out:
        spin_unlock(waitq.lock);
}

No need to make the delay configurable, just pick something
reasonable.  If anything is configurable, let it be the size of the
ring buffer.  Or let the caller provide some msg structure that we can
add to a list, instead of a ring buffer.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-13  0:25     ` Allen Hubbe
@ 2018-05-13  0:31       ` Allen Hubbe
  2018-05-14 23:16       ` Serge Semin
  1 sibling, 0 replies; 25+ messages in thread
From: Allen Hubbe @ 2018-05-13  0:31 UTC (permalink / raw)
  To: Serge Semin; +Cc: Atul Raut, linux-ntb

On Sat, May 12, 2018 at 8:25 PM, Allen Hubbe <allenbh@gmail.com> wrote:
> int send_cmd(u32 cmd_num, void *data, size_t size)
...
>         if (!(ack_val & CMD_ACK_BIT) == !(cmd_val & CMD_SEQ_BIT))
>                 goto out;

Let that be !=.

If the ack bit received from the peer is the same as the seq bit sent
to the peer, let that indicate acknowledgement of the command sent to
the peer.  In other words: if ==, it is OK to send the next message; if
!=, we need to wait for the peer to ack the current message.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-11 23:11     ` Logan Gunthorpe
@ 2018-05-14 20:25       ` Serge Semin
  2018-05-14 20:59         ` Logan Gunthorpe
  0 siblings, 1 reply; 25+ messages in thread
From: Serge Semin @ 2018-05-14 20:25 UTC (permalink / raw)
  To: Logan Gunthorpe; +Cc: Atul Raut, linux-ntb

On Fri, May 11, 2018 at 05:11:28PM -0600, Logan Gunthorpe <logang@deltatee.com> wrote:
> >>+		mmiowb();
> >
> >>>Allen Hubbe
> >>>spad_write is like iowrite32, so writes are ordered without extra mmiowb.
> >
> >Allen, I may misunderstand something, but according to the kernel Documentation in
> >general they aren't (specifically for prefetchable MWs - hello to the emulated
> >scratchpads) so we need the MMIO barrier here. See the doc about mmiowb():
> >https://www.kernel.org/doc/Documentation/memory-barriers.txt
> >We must make sure, that the data is sent before the status change and
> >before the peer is notified by means of Doorbell setting.
> 
> Actually, I'm pretty sure Allen is right and I had meant to comment on this
> as well. I've also gotten this wrong before and have been corrected on it.
> 
> Read the "Acquires vs I/O accesses" section in the same document you quote
> as well as [1].
> 
> ioread and iowrites are guaranteed to be strongly ordered with each other as
> long as they happen from the same CPU. However, seeing the PCI bus does not
> typically interact with the cache coherency protocol, io requests are not
> guaranteed, on all arches, to be in order with accesses to RAM. This is what
> mmiowb() is for. So if you have an IO access that is protected with a spin
> lock you must make sure to call mmiowb() before releasing the spin lock
> otherwise IO writes might happen after the lock is released. As far as I
> know, this is the only common use for mmiowb().
> 
> Logan
> 
> [1] https://www.kernel.org/doc/Documentation/driver-api/device-io.rst

Hello Logan,
I've read that manual many times and do know about the case you've cited. First
of all, if Atul left the same design of the code, he would have needed to add
spin-locks inside the send/recv methods while accessing the scratchpads/message
anyway, which means he must have added 'mmiowb' as well before releasing the lock.*

* Getting ahead a bit: if I've taken the following case incorrectly, Allen's
design of a data ring-buffer will set us free from mmiowb usage altogether.

Still there is another place in the document [2], which concerns mmiowb usage.
I may still misunderstand something, but according to the section
'KERNEL I/O BARRIER EFFECTS' (see "(*) readX(), writeX():" description):
"Whether these are guaranteed to be fully ordered and uncombined with
 respect to each other on the issuing CPU depends on the characteristics
 defined for the memory window through which they're accessing."
 ...
"Ordinarily, these will be guaranteed to be fully ordered and uncombined,
 provided they're not accessing a prefetchable device.

 However, intermediary hardware (such as a PCI bridge) may indulge in
 deferral if it so wishes; to flush a store, a load from the same location
 is preferred."
 ...
"Used with prefetchable I/O memory, an mmiowb() barrier may be required to
 force stores to be ordered."

So to speak, the readX()/writeX() ordering rules depend on the
characteristics of the memory window they go through. In particular the doc
mentions prefetchable I/O memory regions, which might be our case if the
Scratchpads are emulated using memory windows. If I am not mistaken, a memory
region mapped by a 64-bit BAR can only be prefetchable. It means that at least
the pure writeX() instructions can be reordered by an intermediate bridge's
internal logic, so mmiowb() should be used to flush the stores.
Correct me if I'm wrong.

[2] https://www.kernel.org/doc/Documentation/memory-barriers.txt

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 0/4] NTB : Introduce message library
  2018-05-11 23:00   ` Logan Gunthorpe
@ 2018-05-14 20:40     ` Serge Semin
  2018-05-14 21:04       ` Logan Gunthorpe
  0 siblings, 1 reply; 25+ messages in thread
From: Serge Semin @ 2018-05-14 20:40 UTC (permalink / raw)
  To: Logan Gunthorpe; +Cc: Atul Raut, linux-ntb

On Fri, May 11, 2018 at 05:00:59PM -0600, Logan Gunthorpe <logang@deltatee.com> wrote:
> On 5/11/2018 4:39 PM, Serge Semin wrote:
> >Definitely I didn't tell you just to copy-and-paste the whole service functions
> >from ntb_perf. One of the ntb_perf driver's purposes was to create an example of
> >possible cross-interface communication code, which turned out to be the "service"
> >subsystem of the driver. My idea was to have it used as a proof of concept and
> >as stably working code for the future cross-iface library. The library is going
> >to be a part of the NTB API itself - a part of the generic interface for the
> >whole NTB subsystem - so I recommended having the design discussed first, otherwise
> >you'd risk facing review requests which would cause serious code refactoring.
> >As far as I can see, that turned out to be true.
> 
> IMO, and in my experience, when working on APIs that have multiple users in
> the kernel you should *expect* to refactor and rewrite the code a couple
> times before we land on something that is clean enough to merge. I have
> patch sets that have gone through many *significant* changes and they are
> still works in progress. This is normal and don't expect discussion ahead of
> time to converge on the right answer right away.

I've got the same experience. But one thing I also got from it. If I knew a
maintainers notion about a change, I would have developed a code in accordance
with it. At least I'd try to understand the maintainer ideas and after discussions
we'd come up to some agreement. So if I had a way to chat with subsystem support
team, I would have used it without a doubt. Of course this doesn't mean that a
patchset would be accepted right after a first submit, but you must admit it would
lead to a code closer to acceptance than if it was created without the consultation.

-Sergey

> >Even though this is more of a "philosophic" part, this time Logan's opinion
> >happens to agree with the design we discussed a while ago with the rest of the
> >NTB maintainers:
> 
> Glad we agree for a change ;)
> 
> Logan
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-14 20:25       ` Serge Semin
@ 2018-05-14 20:59         ` Logan Gunthorpe
  2018-05-14 21:39           ` Serge Semin
  0 siblings, 1 reply; 25+ messages in thread
From: Logan Gunthorpe @ 2018-05-14 20:59 UTC (permalink / raw)
  To: Serge Semin; +Cc: Atul Raut, linux-ntb

On 5/14/2018 2:25 PM, Serge Semin wrote:
> Hello Logan,
> I've read that manual many times and do know about the case you've cited. First
> of all, if Atul left the same design of the code, he would have needed to add
> spin-locks inside the send/recv methods while accessing the scratchpads/message
> anyway, which means he must have added 'mmiowb' as well before releasing the lock.*

Well, we shouldn't be using any locking in the library but, even if we
did, a spin lock would probably not be appropriate. Locking would only
be necessary if multiple CPUs could be accessing the interface at the
same time and seeing current users only use it for initialization this
should not be necessary. If future users want to use this API in a
concurrent manner I think they should have to do the locking themselves.


>  However, intermediary hardware (such as a PCI bridge) may indulge in
>  deferral if it so wishes; to flush a store, a load from the same location
>  is preferred."

Read this more carefully. It says a load from the same device is the
preferred way to order stores (when necessary). Not a call to mmiowb()
which is much more expensive.

> "Used with prefetchable I/O memory, an mmiowb() barrier may be required to
>  force stores to be ordered."

NTB drivers should not be using prefetchable memory for registers. If
one does, it's probably very broken.

See also [1], especially quiz #4.

Logan

[1] https://lwn.net/Articles/698014/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 0/4] NTB : Introduce message library
  2018-05-14 20:40     ` Serge Semin
@ 2018-05-14 21:04       ` Logan Gunthorpe
  0 siblings, 0 replies; 25+ messages in thread
From: Logan Gunthorpe @ 2018-05-14 21:04 UTC (permalink / raw)
  To: Serge Semin; +Cc: Atul Raut, linux-ntb

On 5/14/2018 2:40 PM, Serge Semin wrote:
> I've got the same experience. But one thing I also got from it. If I knew a
> maintainers notion about a change, I would have developed a code in accordance
> with it. At least I'd try to understand the maintainer ideas and after discussions
> we'd come up to some agreement. So if I had a way to chat with subsystem support
> team, I would have used it without a doubt. Of course this doesn't mean that a
> patchset would be accepted right after a first submit, but you must admit it would
> lead to a code closer to acceptance than if it was created without the consultation.

I'm obviously not saying you shouldn't discuss design decisions with
stakeholders at all points in the process. The point is you shouldn't
expect that just having the discussion ahead of time will mean everyone
will get it right the first time and your patchset will be immediately
merged. Oftentimes the end of a discussion is to wait and see the
code/implementation and then it can be evaluated further. My main point
is: always expect you will do re-writes at least until everyone has seen
it and a consensus is forming.

Logan

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-14 20:59         ` Logan Gunthorpe
@ 2018-05-14 21:39           ` Serge Semin
  2018-05-14 22:04             ` Logan Gunthorpe
  0 siblings, 1 reply; 25+ messages in thread
From: Serge Semin @ 2018-05-14 21:39 UTC (permalink / raw)
  To: Logan Gunthorpe; +Cc: Atul Raut, linux-ntb

On Mon, May 14, 2018 at 02:59:23PM -0600, Logan Gunthorpe <logang@deltatee.com> wrote:
> On 5/14/2018 2:25 PM, Serge Semin wrote:
> > Hello Logan,
> > I've read that manual many times and do know about the case you've cited. First
> > of all, if Atul left the same design of the code, he would have needed to add
> > spin-locks inside the send/recv methods while accessing the scratchpads/message
> > anyway, which means he must have added 'mmiowb' as well before releasing the lock.*
> 
> Well, we shouldn't be using any locking in the library but, even if we
> did, a spin lock would probably not be appropriate. Locking would only
> be necessary if multiple CPUs could be accessing the interface at the
> same time and seeing current users only use it for initialization this
> should not be necessary. If future users want to use this API in a
> concurrent manner I think they should have to do the locking themselves.
> 

Even if current users do it for initialization only, it doesn't mean it
should not be thread- and IRQ-safe (see Allen's and my comments sent within this patch).
It is especially true if the library is going to be created for generic data
transfer, as you also suggested implementing.

> 
> >  However, intermediary hardware (such as a PCI bridge) may indulge in
> >  deferral if it so wishes; to flush a store, a load from the same location
> >  is preferred."
> 
> Read this more carefully. It says a load from the same device is the
> preferred way to order stores (when necessary). Not a call to mmiowb()
> which is much more expensive.
> 

Yes, it does. But we don't use a load instruction here either. That was my
concern. (see the next comment as a continuation of this one)

> > "Used with prefetchable I/O memory, an mmiowb() barrier may be required to
> >  force stores to be ordered."
> 
> NTB drivers should not be using prefetchable memory for registers. If
> one does, it's probably very broken.
> 

Of course the configuration space should be memory-mapped to a non-prefetchable
region. But as far as I remember the Switchtec driver uses one of the memory
windows to emulate the Scratchpad registers. So if that memory window is mapped
to a prefetchable region (which is always true for 64-bit BARs), then we
either need to have a fake load instruction or mmiowb() to flush all the
stores. I can judge by IDT devices, whose MW BARs can be set up either as
prefetchable or non-prefetchable. If you are sure that the Switchtec
Scratchpad MW is always mapped to non-prefetchable I/O memory (it should
be accessed through a 32-bit BAR with the non-prefetchable bit set), then we can
close the discussion.

-Sergey

> See also [1], especially quiz #4.
> 
> Logan
> 
> [1] https://lwn.net/Articles/698014/
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-14 21:39           ` Serge Semin
@ 2018-05-14 22:04             ` Logan Gunthorpe
  0 siblings, 0 replies; 25+ messages in thread
From: Logan Gunthorpe @ 2018-05-14 22:04 UTC (permalink / raw)
  To: Serge Semin; +Cc: Atul Raut, linux-ntb

On 5/14/2018 3:39 PM, Serge Semin wrote:
> Even if current users do it for initialization only, it doesn't mean it
> should not be thread- and IRQ-safe (see Allen's and my comments sent within this patch).
> It is especially true if the library is going to be created for generic data
> transfer, as you also suggested implementing.

IMO this should be up to the caller to do the locking at least until we
have a _lot_ of example clients that need it. Generally, in the kernel
we wait for users to need instead of trying to predict the future.

IRQ safe is probably more than overkill...

> Yes, it does. But we don't use a load instruction here either. That was my
> concern. (see the next comment as a continuation of this one)

Then add a load instruction and a comment. mmiowb() raises eyebrows
these days.
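
E.g. roughly like this in the spad send path (just a sketch, reusing the
accessors and macros from the patch):

ntb_peer_spad_write(ntb, pidx, NT_SPAD_LDATA(gidx), lower_32_bits(data));
ntb_peer_spad_write(ntb, pidx, NT_SPAD_HDATA(gidx), upper_32_bits(data));
/* the read-back flushes the posted writes before the peer is notified */
ntb_peer_spad_read(ntb, pidx, NT_SPAD_HDATA(gidx));
ntb_peer_spad_write(ntb, pidx, NT_SPAD_CMD(gidx), cmd);
ntb_peer_db_set(ntb, NT_SPAD_NOTIFY(cmd_gidx));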

> Of course the configuration space should be memory-mapped to a non-prefetchable
> region. But as far as I remember the Switchtec driver uses one of the memory
> windows to emulate the Scratchpad registers. So if that memory window is mapped
> to a prefetchable region (which is always true for 64-bit BARs), then we
> either need to have a fake load instruction or mmiowb() to flush all the
> stores. I can judge by IDT devices, whose MW BARs can be set up either as
> prefetchable or non-prefetchable. If you are sure that the Switchtec
> Scratchpad MW is always mapped to non-prefetchable I/O memory (it should
> be accessed through a 32-bit BAR with the non-prefetchable bit set), then we can
> close the discussion.

Switchtec uses pci_iomap(), which does not map it prefetchable in any way.
If it were prefetchable, the scratchpads would probably be very buggy,
even with current clients.

Logan

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-13  0:25     ` Allen Hubbe
  2018-05-13  0:31       ` Allen Hubbe
@ 2018-05-14 23:16       ` Serge Semin
  2018-05-15 14:21         ` Allen Hubbe
  1 sibling, 1 reply; 25+ messages in thread
From: Serge Semin @ 2018-05-14 23:16 UTC (permalink / raw)
  To: Allen Hubbe; +Cc: Atul Raut, linux-ntb

On Sat, May 12, 2018 at 08:25:50PM -0400, Allen Hubbe <allenbh@gmail.com> wrote:
> On Fri, May 11, 2018 at 6:44 PM, Serge Semin <fancer.lancer@gmail.com> wrote:
> > On Sun, May 06, 2018 at 12:20:18PM -0700, Atul Raut <araut@codeaurora.org> wrote:
> >>> Allen Hubbe
> >>> Maybe one try, or msg_tries is a parameter instead of constant.
> >
> > Allen, there can't be just one try. Imagine you've sent some data to a peer and
> > notified it of incoming data pending in the scratchpad/message registers. Then
> > the peer handler starts fetching the data from the registers while you or some
> > other peer try to send data there before the peer has finished the retrieval
> > operation. In this case you'd need to perform several retries before giving up
> > and returning something like -EAGAIN. As I suggested before, it might be better
> > to move the number-of-retries parameter definition to the kernel menuconfig.
> 
> Making it configurable just seems like a convenient way to blame the
> user if there is a problem.  It returned -EAGAIN?  You didn't
> configure the wait long enough.  It hung your os and spewed warnings
> to dmesg?  Configured the wait to be too long.
> 

Yes, it is) Your question was about making just one try, which, due to the
asynchronous nature of the communications, would be a bad idea most likely causing
an error to be returned. Anyway, your ring-buffer design will eliminate the need
to worry about the number of retries by setting it reasonably big.

> >
> >> +             if (!ntb_link_is_up(ntb, NULL, NULL))
> >> +                     return -ENOLINK;
> >> +
> >> +             sts = ntb_peer_spad_read(ntb, pidx,
> >> +                                      NT_SPAD_CMD(gidx));
> >
> >>> Allen Hubbe
> >>> Can it be done without reading peer spads?
> >
> > Alas, it can't within the current communication layer design. We need to check
> > the status of the sent data. When the peer marked it as read, only then we can
> > send a next portion of pending outbound data.
> 
> Just trying to avoid a read across the bus between two NTBs.  I get
> that the peer needs to ack the message before the spads can be
> overwritten.
> 

Yes, that's right.

> Focusing on the command spad, will any use case really have 2^32
> commands?  Can we spare two bits?  Let one bit be the command sequence
> number (zero or one), and the other bit be the remote command ack
> number (zero or one).
> 

Of course we can. That's what I meant when I suggested sending data of type u8.
In this case we could reserve one of the Scratchpad bytes for status bits. I would
also suggest not having cmd_num at all, since it is a client driver abstraction
whether to define any commands (see the library prototypes I cited from our old
discussions in the v2 cover letter comment). It is better just to send an array
of data. Client drivers shall decide whether to have a cmd. This would give us a
more generic interface with at least 6 bits reserved for possible future use.

> Sending a command: flip the sequence number bit from the previous
> command, leave the ack bit alone, and write the command number in the
> rest of the bits.
> 
> Receiving a command: did the sequence number bit change?  If so,
> service the command, and then acknowledge.  Acknowledge is like
> sending a command because the write destination is the command spad,
> but flip the ack bit only: leave the current sequence bit and command
> number alone.

Emmm, why is it so complicated? As for me, it is easier to have just one status
bit: 0 - the buffer is free to send data to a peer (can be cleared by the peer only,
when it is done reading the data), 1 - the buffer contains new data for the peer
(can be set by the local side only when it's done writing the data).
So to speak, a 0->1 event would mean new data is available and can be read, and
a 1->0 event would mean the data has been read and the next portion can be sent.
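
A minimal sketch of that single-bit handshake (STS_SPAD, DATA_SPAD and
STS_FULL are invented for the example):

/* sender side: only flips 0 -> 1, and only after the data is written */
if (ntb_peer_spad_read(ntb, pidx, STS_SPAD) & STS_FULL)
	return -EAGAIN;	/* the peer hasn't read the previous portion yet */
ntb_peer_spad_write(ntb, pidx, DATA_SPAD, data);
ntb_peer_spad_write(ntb, pidx, STS_SPAD, STS_FULL);

/* receiver side (polling work-thread): only flips 1 -> 0 after reading */
if (ntb_spad_read(ntb, STS_SPAD) & STS_FULL) {
	val = ntb_spad_read(ntb, DATA_SPAD);
	ntb_spad_write(ntb, STS_SPAD, 0);	/* the buffer is free again */
}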

> 
> Following is an imagined implementation for spads.  For msg registers,
> instead of seq/ack bits, it would check the status register after
> trying to send the first data.  For send_ack(), msg registers would
> clear the status.  To make it work for both msg and spads, just
> require that cmd_num never have seq/ack bits set.
> 
> spinlock_t cmd_lock;
> u32 cmd_val;
> 
> enum {
>         CMD_SEQ_BIT = BIT(30),
>         CMD_ACK_BIT = BIT(31),
>         CMD_NUM_MASK = ~CMD_SEQ_BIT & ~CMD_ACK_BIT
> };
> 
> int send_cmd(u32 cmd_num, void *data, size_t size)
> {
>         u32 ack_val;
>         int rc;
> 
>         if (WARN_ON(cmd_num & ~CMD_NUM_MASK))
>                 return -EINVAL;
> 
>         spin_lock(cmd_lock);
> 

If the send_cmd() method is used by a dedicated kernel work-thread (according to
the design it will be, since you've used the schedule_delayed_work() method to
submit the handler), then we don't need to have the lock here, since the
system work-queues aren't reentrant by default. It means this method will be
executed by only one CPU at a time, so we won't have the race condition here
as we do in Atul's current code (my code in ntb_perf also used a dedicated
non-reentrant kernel work-thread for the service work).

>         ack_val = spad_read(cmd_spad);
> 
>         rc = -EAGAIN;
>         if (!(ack_val & CMD_ACK_BIT) == !(cmd_val & CMD_SEQ_BIT))
>                 goto out;
> 
>         cmd_val ^= CMD_SEQ_BIT;
>         cmd_val &= ~CMD_NUM_MASK;
>         cmd_val |= CMD_NUM_MASK & cmd_num;
> 
>         peer_spad_write(data_spad_0, cmd_data);
>         peer_spad_write(data_spad_1, cmd_data + 4);
>         ... and so on ...
> 
>         peer_spad_write(cmd_spad, cmd_val);
>         peer_db_set(cmd_doorbell_bit);
> 

I would also have discarded the peer_db_set() method usage here and created
a dedicated kernel thread, which would have polled the status bit of the
cmd_spad looking for the status bit change from zero to one. It would increase the
latency of the handler reaction, but it would also free the Doorbell resource
from being used by the library. This is what I meant when I talked about the polling design.

>         rc = 0;
> 
> out:
>         spin_unlock(cmd_lock);
> 
>         return rc;
> }
> 
> void send_ack(void)
> {
>         spin_lock(cmd_lock);
>         cmd_val ^= CMD_ACK_BIT;
>         spad_write(cmd_spad, cmd_val);
>         spin_unlock(cmd_lock);
> }
> 
> >
> >> +             if (sts != NT_CMD_INVAL) {
> >> +                     usleep_range(MSG_UDELAY_LOW, MSG_UDELAY_HIGH);
> >
> >>> Allen Hubbe
> >>> Even though it doesn't sleep, the delay limits what context this can
> >>> reasonably be called in.  Imagine if any kind of interrupts were
> >>> disabled in the caller (eg rcu_read_lock or spin_lock_bh), and then
> >>> the call in the ntb api delays the caller for 1s, that's not very
> >>> nice.
> >>> How bad would it be to take out the retries, or would it just stop working.
> >>> If taking out the retries breaks it, is there a better way to design
> >>> the whole thing to be less fragile with respect to timing?  Or maybe
> >>> schedule_timeout_interruptible would be ok, and just require the
> >>> caller to be in a context where that is allowed.
> >
> > Allen, since we have just one channel of communication and have to wait while
> > a peer reads the data from inbound registers, I don't see other choice but
> > either to use the status polling design or specifically allocate some Doorbell
> > bits for notifications (Message registers case doesn't need it). I'd rather
> > stay with polling case, since it is simpler, consumes less hardware resources,
> > and well fits both Scratchpad and Message registers hardware. See the comment
> > in the cover letter with the design suggestion.
> 
> Since my main issue with this is the retry, it's rightly on me to
> suggest an alternative approach.
> 
> Instead, write the message to a ring buffer of messages to send, then
> schedule some delayed work struct to actually send the message.  Let
> the work struct be rescheduled with some delay, as long as the buffer
> is not empty.  Each retry may or may not send the message, depending
> on the ack or availability of the msg register.  If it doesn't send,
> just reschedule the work until it does.
> 

Yes, that's what I also intended by suggesting two types of send methods
in the comment of the cover letter:
  ntb_lib_exch_send_sync(ntb_dev, pidx, len, data),
  ntb_lib_exch_send_async(ntb_dev, pidx, len, data, callback).

They can only be implemented either by using the ring-buffer or the
messages-list design. The synchronous method can sleep waiting for the
transfer to complete. The asynchronous method just adds the data to the
list of data pending to be sent and returns straight away; the
corresponding callback is then called when the operation either completes
successfully or fails.

IMHO I'd prefer to have the messages list, since we'd need to save
the outbound data/length, the task_struct pointer/completion
variable and the return value of the operation, so as to wake up
the corresponding thread when the transfer completes or fails.
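
For illustration, the list element and the sync path could look roughly like
this (the struct, limit and helper names are invented, not part of the patches):

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/string.h>

#define NTB_EXCH_MAX_DATA       64      /* arbitrary payload limit for the sketch */

struct ntb_exch_msg {
        struct list_head entry;         /* linked into a per-peer list */
        u8 data[NTB_EXCH_MAX_DATA];
        size_t len;
        int retries;                    /* used by the delivery work */
        int status;                     /* filled in by the delivery work */
        struct completion done;         /* the sync sender sleeps on this */
};

struct ntb_exch;                        /* opaque library context in this sketch */

/* placeholder: queues the message for the delivery work of peer 'pidx' */
void ntb_exch_enqueue(struct ntb_exch *ex, int pidx, struct ntb_exch_msg *msg);

/* sync send: queue the message and sleep until the delivery work finishes it */
static int ntb_exch_send_sync(struct ntb_exch *ex, int pidx,
                              const void *data, size_t len)
{
        struct ntb_exch_msg *msg;
        int rc;

        if (len > NTB_EXCH_MAX_DATA)
                return -EINVAL;

        msg = kzalloc(sizeof(*msg), GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        memcpy(msg->data, data, len);
        msg->len = len;
        init_completion(&msg->done);

        ntb_exch_enqueue(ex, pidx, msg);
        wait_for_completion(&msg->done);

        rc = msg->status;
        kfree(msg);

        return rc;
}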

> size_t msg_max_size()
> {
>         // depends on how many spads / msg registers the device has
> that are not in use for some other purpose like link management.
> }
> 
> int post_msg(u32 cmd_num, void *data, size_t size, bool can_wait)
> {
>         int rc;
> 
>         if (cmd_num & ~CMD_NUM_MASK)
>                 return -EINVAL;
> 
>         if (size > msg_max_size())
>                 return -EINVAL;
> 
>         spin_lock(waitq.lock);
> 

Using spin_lock() and then a wait_event()-like method doesn't seem right...
Anyway, I'd create a completion variable for each message being sent.

>         rc = -EAGAIN;
>         if (can_wait)
>                 wait_event_interruptible(waitq, queue is not full);
>         else if (queue is full)
>                 goto out;
> 
>         enqueue(the command and data);
>         schedule_delayed_work(cmd_ws, 0);
> 
>         rc = 0;
> 
> out:
>         spin_unlock(waitq.lock);
> 
>         return rc;
> }
> 
> void cmd_work(ws)
> {
>         int rc;
> 
>         spin_lock(waitq.lock);
> 
>         if (queue is empty)
>                 goto out;
> 
>         rc = send_cmd(next in the queue);
>         if (!rc) {
>                 advance(the queue);
>                 wake_up(waitq);
>         } else if (rc != -EAGAIN) {
>                 goto out;
>         }
> 

I'd also either have distributed the messages into different
queues or created a separate work-thread for each port, since in
general each port can be connected to an independent system, which
handles its incoming data asynchronously.

>         if (! queue is empty)
>                 schedule_delayed_work(cmd_ws, CMD_DELAY);
> 
> out:
>         spin_unlock(waitq.lock);
> }
> 
> No need to make the delay configurable, just pick something
> reasonable.  If anything is configurable, let it be the size of the
> ring buffer.  Or let the caller provide some msg stucture that we can
> add to a list, instead of a ring buffer.

As I said, I'd prefer to have the messages list. It seems much simpler
and more convenient for implementing the sync and async send methods.
As for me, the design doesn't eliminate the need for a retry counter.
I think we need to have a stop condition at which cmd_work() gives up
sending a message, sets the message's return status to -EBUSY and
wakes the corresponding task up. It might be a pretty big value,
amounting to 5 or even 10 seconds, but it would still prevent the
work-thread from being infinitely rescheduled.
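
With the illustrative ntb_exch_msg from the sketch above, the stop condition
could look like this (the retry budget, CMD_DELAY and the queue/transfer
helpers are placeholders, not the real plumbing):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

/* placeholders standing in for the real queue/transfer plumbing */
struct ntb_exch_msg *outq_peek(void);
void outq_dequeue(struct ntb_exch_msg *msg);
int send_cmd(struct ntb_exch_msg *msg);         /* -EAGAIN: peer not ready yet */
extern struct delayed_work cmd_dwork;

#define CMD_DELAY       msecs_to_jiffies(2)
/* with a CMD_DELAY of 1-2 ms this amounts to the 5-10 seconds mentioned above */
#define CMD_MAX_RETRIES 5000

static void cmd_work(struct work_struct *work)
{
        struct ntb_exch_msg *msg = outq_peek();
        int rc;

        if (!msg)
                return;

        rc = send_cmd(msg);
        if (rc == -EAGAIN && ++msg->retries < CMD_MAX_RETRIES) {
                /* the peer hasn't consumed the previous data yet, retry later */
                schedule_delayed_work(&cmd_dwork, CMD_DELAY);
                return;
        }

        /* sent, hard error, or retry budget exhausted: finish the message */
        msg->status = (rc == -EAGAIN) ? -EBUSY : rc;
        outq_dequeue(msg);
        complete(&msg->done);           /* wakes the sync sender up */

        if (outq_peek())
                schedule_delayed_work(&cmd_dwork, CMD_DELAY);
}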

-Sergey

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-14 23:16       ` Serge Semin
@ 2018-05-15 14:21         ` Allen Hubbe
  2018-05-31 22:27           ` Serge Semin
  0 siblings, 1 reply; 25+ messages in thread
From: Allen Hubbe @ 2018-05-15 14:21 UTC (permalink / raw)
  To: Serge Semin; +Cc: Atul Raut, linux-ntb

On Mon, May 14, 2018 at 4:16 PM, Serge Semin <fancer.lancer@gmail.com> wrote:
> Emmm, why is it so complicated? As for me, it is easier to have just one status
> bit: 0 - buffer is free to send a data to a peer (can be cleared by the peer only,
> when it is done with reading the data), 1 - buffer contains new data for peer
> (can be set by the local side only when it's done with writing the data).
> So to speak 0->1 event would mean a new data is available and can be read,
> 1->0 event would mean the data has been read and the next portion can be sent.

That can work with shared memory, like a single shared spad.
But using a single spad will require a read across the bus when there
are two NTBs.

Would you explain how you intend it to work when two spads are used,
one in each direction?

Node A wants to send the next message, so A writes the message and "1"
to B's spads.
A immediately reads B's response in A's spads, and what if the current
value there is "0"?

Did B read the message and respond "0", or does "0" still mean the
/previous/ message was read?
A can have an incorrect model of B's momentary state (and vice versa).


With seq/ack bits:

A wants to send the next message, so A writes the message and flips
the seq bit in B's spads.
A immediately reads B's response in A's spads, but the ack bit is the
same as the previous ack.

Did B read the message: Maybe!
Because the state is distributed, A can't know the momentary state of B.

Eventually, the message will leave node A, and eventually the seq bit
will change in node B's spad, and eventually node B will observe that
change.  The moment that node B actually observes the change is the
moment the message is received. Node A will not directly observe the
moment that the message is received by B.

Did A receive acknowledgement from B: No.
A can directly observe its own state, and its state is that it did not
observe the ack bit flip.

Eventually, node B will receive the message, and eventually node B
will acknowledge it, and eventually the ack bit will change in node
A's spad, and finally node A will observe that change.  The moment
node A actually observes that change is the moment when the ack is
received.

> If send_cmd() method is used by a dedicated kernel work-thread (according to
> the design it will be since you've used schedule_delayed_work() method to
> submit the handler), then we don't need to have the lock here, since the
> system work-queues aren't reentrant by default. It means this method will be
> executed by only one CPU at a time, so we won't have the race-condition here
> as we do in current Atul code (my code in the ntb_perf also used a dedicated
> non-reentrant kernel work-thread for service work).

The lock is mainly for send_cmd() and send_ack().

The cmd_val is updated in a nonatomic way, and the updates need to be
written to the peer spad in order.

> I would also have discarded the peer_db_set() method usage here and created
> a dedicated kernel-thread, which would have polled the status bit of the
> cmd_spad looking for the status bit change from zero to one. It would increase the
> latency of the handler reaction, but would also set the Doorbell resource free
> from using by the library. This is what I meant when said about the polling design.

The msg register behavior is to interrupt the peer.

>> void send_ack(void)
>> {
>>         spin_lock(cmd_lock);
>>         cmd_val ^= CMD_ACK_BIT;
>>         spad_write(cmd_spad, cmd_val);

This is where send_ack() would race with post_msg(), if no lock.

>   ntb_lib_exch_send_sync(ntb_dev, pidx, len, data),
>   ntb_lib_exch_send_async(ntb_dev, pidx, len, data, callback).

Requiring a callback for completion complicates the interface if the
client would rather simply post a message and be done with it.

> They can only be implemented either by using the ring-buffer or the
> messages list design. Synchronous method can sleep waiting for the
> transfer completed.

The ack is so that one message is sent at a time, not to clobber a
previously sent message.

If a response to a message is required for some application, that
could be accomplished by sending a message in the other direction.

> IMHO I'd prefer to have the messages list, since we'd need to save
> the outbound data/length, the "task_struct pointer/completion
> variable and return value of the operation" so to wake up
> the corresponding thread when the transfer completed/failed.

If there is a failure in communication with the peer, that should be a
link event or port/peer event.  It's impossible to know if the message
was delivered or received by the peer after a failure.  Maybe the
message was received, but the ack was lost.

Failure of post_msg() should mean that the message really was not sent
and will not be sent.  E.g., failure to allocate memory for a list
element, or an invalid parameter.  If the message was sent, or could be
sent later, then the result of post_msg() is success, although the
fate of the message is not determined.

> Using spin_lock and then wait_event()-like method doesn't seem right...

wait_event_interruptible_locked()
wake_up_locked()

this is just pseudocode, not a patch.  in an actual patch, I would hope that:
- cmd_val is not just some global var
- api needs to specify ntb device, peer, etc...
- check error conditions (rc == -EINTR for example)
- actually tested, etc
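
For what it's worth, the locked variants pair up roughly like this (the queue
type and its helpers are placeholders for whatever the library ends up using):

#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

struct cmd_queue;                       /* placeholder queue type */
bool cmd_queue_full(struct cmd_queue *q);
void cmd_enqueue(struct cmd_queue *q, u32 cmd_num);

static DECLARE_WAIT_QUEUE_HEAD(cmd_waitq);

/* producer: sleep, holding cmd_waitq.lock, until there is room in the queue */
static int queue_cmd(struct cmd_queue *q, u32 cmd_num, bool can_wait)
{
        int rc = 0;

        spin_lock(&cmd_waitq.lock);

        if (can_wait)
                /* drops and re-takes cmd_waitq.lock while sleeping */
                rc = wait_event_interruptible_locked(cmd_waitq,
                                                     !cmd_queue_full(q));
        else if (cmd_queue_full(q))
                rc = -EAGAIN;

        if (!rc)
                cmd_enqueue(q, cmd_num);

        spin_unlock(&cmd_waitq.lock);

        return rc;
}

/* consumer: wake a waiting producer; cmd_waitq.lock must already be held */
static void queue_cmd_done(void)
{
        wake_up_locked(&cmd_waitq);
}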

> As I said I'd prefer to have the messages list. It seems much simpler
> and more convenient to implement the sync and async send-methods.
> As for me the design doesn't eliminate the need of the retry counter.
> I think we need to have a stop-condition when the cmd_work() gives up
> sending a message, sets the -EBUSY return status of the message and
> wakes the corresponding task up. It might be pretty big value occupying
> 5 or even 10 seconds of CPU, but still it would prevent the work-thread
> from being infinitely rescheduled.

Eventually, it should either send the message, or a peer/link event
will indicate a failure in communication with the peer.  The peer/link
event should be sufficient to indicate the failure.  No need for
individual status for each message after it is posted.

I don't think infinite retries are likely.  I'm more concerned with the
retries for up to 1 sec in a context that can't afford to wait that long.
In any case, infinite retries with a delayed work struct won't hang
the OS.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] NTB : Add message library NTB API
  2018-05-15 14:21         ` Allen Hubbe
@ 2018-05-31 22:27           ` Serge Semin
  0 siblings, 0 replies; 25+ messages in thread
From: Serge Semin @ 2018-05-31 22:27 UTC (permalink / raw)
  To: Allen Hubbe; +Cc: Atul Raut, linux-ntb

On Tue, May 15, 2018 at 07:21:08AM -0700, Allen Hubbe <allenbh@gmail.com> wrote:
> On Mon, May 14, 2018 at 4:16 PM, Serge Semin <fancer.lancer@gmail.com> wrote:
> > Emmm, why is it so complicated? As for me, it is easier to have just one status
> > bit: 0 - buffer is free to send a data to a peer (can be cleared by the peer only,
> > when it is done with reading the data), 1 - buffer contains new data for peer
> > (can be set by the local side only when it's done with writing the data).
> > So to speak 0->1 event would mean a new data is available and can be read,
> > 1->0 event would mean the data has been read and the next portion can be sent.
> 
> That can work with shared memory, like a single shared spad.
> But using single spad will require a read across the bus when there
> are two ntbs.
> 
> Would you explain how you intend it to work when two spads are used,
> one in each direction?
> 
> Node A wants to send the next message, so A writes the message and "1"
> to B's spads.
> A immediately reads B's response in A's spads, and what if the current
> value there is "0"?
> 
> Did B read the message and respond "0", or does "0" still mean the
> /previous/ message was read?
> A can have an incorrect model of B's momentary state (and vice versa).
> 
> 
> With seq/ack bits:
> 
> A wants to send the next message, so A writes the message and flips
> the seq bit in B's spads.
> A immediately reads B's reponse in A's spads, but the ack bit is the
> same as the previous ack.
> 
> Did B read the message: Maybe!
> Because the state is distributed, A can't know the momentary state of B.
> 
> Eventually, the message will leave node A, and eventually the seq bit
> will change in node B's spad, and eventually node B will observe that
> change.  The moment that node B acutally observes the change is the
> moment the message is received. Node A will not directly observe the
> moment that the message is received by B.
> 
> Did A receive acknowledgement from B: No.
> A can directly observe its own state, and its state is that it did not
> observe the ack bit flip.
> 
> Eventually, node B will receive the message, and eventually node B
> will acknowledge it, and eventually the ack bit will change in node
> A's spad, and finally node A will observe that change.  The moment
> node A actually observes that change is the moment when the ack is
> received.
> 

Alright, I see your point now that you've explained why the read across the link
needs to be avoided:
>> allen: read on older platforms, if the remote side crashed and didn't respond
>> to the read, resulted in nmi bringing down the second node too.
We'd better stick with your version of the sync algo.

> > If send_cmd() method is used by a dedicated kernel work-thread (according to
> > the design it will be since you've used schedule_delayed_work() method to
> > submit the handler), then we don't need to have the lock here, since the
> > system work-queues aren't reentrant by default. It means this method will be
> > executed by only one CPU at a time, so we won't have the race-condition here
> > as we do in current Atul code (my code in the ntb_perf also used a dedicated
> > non-reentrant kernel work-thread for service work).
> 
> The lock is mainly for send_cmd() and send_ack().
> 
> The cmd_val is updated in a nonatmoic way, and the updates need to be
> written to the peer spad in-order.
> 

Yep, the spin-lock is necessary since the command scratchpad register is used
for both the sequence bit and the data-retrieval acknowledgement. But we don't
need to take it for the whole set of scratchpads.

> > I would also have discarded the peer_db_set() method usage here and created
> > a dedicated kernel-thread, which would have polled the status bit of the
> > cmd_spad looking for the status bit change from zero to one. It would increase the
> > latency of the handler reaction, but would also set the Doorbell resource free
> > from using by the library. This is what I meant when said about the polling design.
> 
> The msg register behavior is to interrupt the peer.
> 

Yes it is, but the interrupt is generated by the Message register hardware
itself, without any Doorbell usage. If we want the library to be more portable,
without imposing any limitation on Doorbell usage while the lib is active, then
in the Scratchpads case it's better to have just a thread which would check the
state of the inbound status byte.

> >> void send_ack(void)
> >> {
> >>         spin_lock(cmd_lock);
> >>         cmd_val ^= CMD_ACK_BIT;
> >>         spad_write(cmd_spad, cmd_val);
> 
> This is where send_ack() would race with post_msg(), if no lock.
> 

You meant peer_spad_write() right? Otherwise we'd get a race condition.

> >   ntb_lib_exch_send_sync(ntb_dev, pidx, len, data),
> >   ntb_lib_exch_send_async(ntb_dev, pidx, len, data, callback).
> 
> Requiring a callback for completion complicates the interface if the
> client would rather simply post a message and be done with it.
> 
> > They can only be implemented either by using the ring-buffer or the
> > messages list design. Synchronous method can sleep waiting for the
> > transfer completed.
> 
> The ack is so that one message is sent at a time, not to clobber a
> previously sent message.
> 
> If a response to a message is required for some application, that
> could be accomplished by sending a message in the other direction.
> 
> > IMHO I'd prefer to have the messages list, since we'd need to save
> > the outbound data/length, the "task_struct pointer/completion
> > variable and return value of the operation" so to wake up
> > the corresponding thread when the transfer completed/failed.
> 
> If there is a failure in communication with the peer, that should be a
> link event or port/peer event.  It's impossible to know if the message
> was delivered or received by the peer after a failure.  Maybe the
> message was received, but the ack was lost.
> 
> Failure of post_msg() should mean that the message really was not sent
> and will not be sent.  Eg, failure to allocate memory for a list
> element, or invalid parameter.  If the message was sent, or could be
> sent later, then the result of post_send() is success, although the
> fate of the message is not determined.
>

I meant to use a completion variable to notify the caller when the message
has been successfully written to the peer inbound scratchpads/message registers.
I didn't want to create a whole communication protocol. Anyway, let's make
the library as simple as possible in this matter for now. The knowledge that
the data has just been written to the peer registers doesn't give much useful
info. A completion notification of the ack retrieval would be much more useful,
but we can leave that for future development.

I think such a long discussion is caused just by a misunderstanding. My idea
doesn't differ much from yours, only in some details of the implementation.
For instance, I wouldn't recommend flipping the acknowledge bit, but rather
copying the retrieved sequence bit back into the place of the ack-bit as the
acknowledgement. Otherwise, on start we'd need to know the initial value of
the peer scratchpad ack-bit, which means reading it across the link.
See the pseudo code at the end of this e-mail.
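
To make the helper names in that pseudo code unambiguous, here is one possible
reading of them (the bit layout is only an example; the low three bytes then
carry the payload):

#include <linux/bitops.h>
#include <linux/types.h>

#define CMD_SEQ_BIT     BIT(31)                 /* example layout only */
#define CMD_ACK_BIT     BIT(30)
#define CMD_DATA_MASK   GENMASK(23, 0)          /* three payload bytes */

static inline int get_seq(u32 v) { return !!(v & CMD_SEQ_BIT); }
static inline int get_ack(u32 v) { return !!(v & CMD_ACK_BIT); }

/* init: adopt the ack currently seen in the inbound spad as our own seq,
 * so the first send_msg() sees the channel as free without a peer read */
static inline u32 ack_to_seq(u32 v)
{
        return (v & CMD_ACK_BIT) ? CMD_SEQ_BIT : 0;
}

/* ack: copy the just-received seq bit into the ack position */
static inline u32 seq_to_ack(u32 v)
{
        return (v & CMD_SEQ_BIT) ? CMD_ACK_BIT : 0;
}

static inline u32 clear_ack(u32 v)
{
        return v & ~CMD_ACK_BIT;
}

/* new message: flip our seq, keep our ack, drop the previous payload */
static inline u32 seq_invert_clear_data(u32 v)
{
        return (v ^ CMD_SEQ_BIT) & ~CMD_DATA_MASK;
}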

In addition, I'd recommend having an individual list of outbound messages
for each peer port index, since the peers are asynchronous domains: they
handle the messages independently of each other and have non-intersecting
communication hardware.
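
That is, roughly this shape of per-peer state (field names are invented;
the peer count would come from ntb_peer_port_count()):

#include <linux/list.h>
#include <linux/ntb.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* per-peer outbound state: each peer port index gets its own queue and work */
struct ntb_exch_peer {
        struct list_head outq;          /* pending outbound messages */
        spinlock_t lock;
        struct delayed_work deliver;    /* delivery work for this peer only */
};

struct ntb_exch {
        struct ntb_dev *ntb;
        int peer_cnt;                   /* ntb_peer_port_count(ntb) */
        struct ntb_exch_peer peer[];    /* one element per peer port index */
};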

> > Using spin_lock and then wait_event()-like method doesn't seem right...
> 
> wait_event_interruptible_locked()
> wake_up_locked()
> 
> this is just pseudocode, not a patch.  in an actual patch, I would hope that:
> - cmd_val is not just some global var
> - api needs to specify ntb device, peer, etc...
> - check error conditions (rc == -EINTR for example)
> - actually tested, etc
> 
> > As I said I'd prefer to have the messages list. It seems much simpler
> > and more convenient to implement the sync and async send-methods.
> > As for me the design doesn't eliminate the need of the retry counter.
> > I think we need to have a stop-condition when the cmd_work() gives up
> > sending a message, sets the -EBUSY return status of the message and
> > wakes the corresponding task up. It might be pretty big value occupying
> > 5 or even 10 seconds of CPU, but still it would prevent the work-thread
> > from being infinitely rescheduled.
> 
> Eventually, it should either send the message, or a peer/link event
> will indicate a failure in communication with the peer.  The peer/link
> event should be sufficient to indicate the failure.  No need for
> individual status for each message after it is posted.
> 
> I don't think infinite retries is likely.  I'm more concerned with the
> retries up to 1sec in a context that can't afford to wait that long.
> In any case, infinite retries with a delayed work struct won't hang
> the os.

Do you suggest making the library usable only when the link is up? If
so, we'd need to empty the queues on the link-down event and at the
time the library gets disabled. Additionally, we'd need to allow the
client to send/recv data only when the link is up.

Here is my pseudo code of a possible library interface implementation
(without the modifications concerning the link-event limitations and
cleanups).

struct msg {
    data;
    list;
}

rwlock_t cmd_lock;
cmd_cache;

init() {
    rwlock_init(cmd_lock);

    cmd = spad_read(CMD_SPAD = 0);

    cmd_cache = ack_to_seq(cmd);
}

/* Use in non-reentrant context (like kernel work-threads) */
send_msg(ntb, pidx, msg)
{
    status = 0;

    cmd = spad_read(ntb, pidx, CMD_SPAD = 0);

    read_lock(cmd_lock);

    if (get_seq(cmd_cache) != get_ack(cmd))
        status = -EAGAIN;

    read_unlock(cmd_lock);

    if (status)
        return status;

    peer_spad_write(ntb, pidx, 1, to_u32(msg->data[3]));
    peer_spad_write(ntb, pidx, 2, to_u32(msg->data[3 + 4]));
    ...

    write_lock(cmd_lock);
    cmd_cache = seq_invert_clear_data(cmd_cache) | three_bytes_to_u32(msg->data);
    peer_spad_write(ntb, pidx, CMD_SPAD, cmd_cache);
    write_unlock(cmd_lock);

    /* If you wanna use Doorbells for peer notification otherwise sequence
     * number inversion can be used as an event for polling thread */
    peer_db_set(CMD_DB);

    return 0;
}

/* Use in non-reentrant context (like DB/Message event handler) */
recv_msg(ntb, pidx, msg)
{
    msg->data[3] = spad_read(ntb, pidx, 1);
    msg->data[3 + 4] = spad_read(ntb, pidx, 2);
    ...
    cmd = spad_read(ntb, pidx, CMD_SPAD);
    msg->data[0] = to_array(cmd);

    /* Following is like Allen's send_ack */
    write_lock(cmd_lock);
    cmd_cache = seq_to_ack(cmd) | clear_ack(cmd_cache);
    peer_spad_write(ntb, pidx, CMD_SPAD, cmd_cache);
    write_unlock(cmd_lock);
}

post_msg(ntb, pidx, data, size, can_wait, callback)
{

    if (size > msg_max_size() || pidx > max_pidx)
        return -EINVAL;

    msg = alloc_and_init_msg(data, size);

    spin_lock(queue(ntb, pidx).lock);
    enqueue(ntb, pidx, msg);
    spin_unlock(queue(ntb, pidx).lock);

    schedule_delayed_work(deliver_work, 0);

    return 0;
}

deliver_work()
{
    for_each_port_index(pidx) {
        for_each_msg_safe(ntb, pidx, msg) {
            spin_lock(queue(ntb, pidx).lock);
            dequeue(ntb, pidx, msg);
            spin_unlock(queue(ntb, pidx).lock);

            rc = send_msg(ntb, pidx, msg);
            if (!rc) {
                free(msg);
            } else {
                spin_lock(queue(ntb, pidx).lock);
                enqueue(ntb, pidx, msg);
                spin_unlock(queue(ntb, pidx).lock);
            }
        }
    }

    if (queues aren't empty)
        schedule_delayed_work(deliver_work, CMD_DELAY);
}

-Sergey

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2018-05-31 22:27 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-06 19:20 [PATCH v2 0/4] NTB : Introduce message library Atul Raut
2018-05-06 19:20 ` [PATCH v2 1/4] " Atul Raut
2018-05-07  5:29   ` Logan Gunthorpe
2018-05-10  2:10     ` Atul Raut
2018-05-10  4:35       ` Logan Gunthorpe
2018-05-10 18:13         ` Atul Raut
2018-05-11 22:40   ` Serge Semin
2018-05-06 19:20 ` [PATCH v2 2/4] NTB : Add message library NTB API Atul Raut
2018-05-11 22:44   ` Serge Semin
2018-05-11 23:11     ` Logan Gunthorpe
2018-05-14 20:25       ` Serge Semin
2018-05-14 20:59         ` Logan Gunthorpe
2018-05-14 21:39           ` Serge Semin
2018-05-14 22:04             ` Logan Gunthorpe
2018-05-13  0:25     ` Allen Hubbe
2018-05-13  0:31       ` Allen Hubbe
2018-05-14 23:16       ` Serge Semin
2018-05-15 14:21         ` Allen Hubbe
2018-05-31 22:27           ` Serge Semin
2018-05-06 19:20 ` [PATCH v2 3/4] NTB : Modification to ntb_perf module Atul Raut
2018-05-06 19:20 ` [PATCH v2 4/4] NTB : Add support to message registers based devices Atul Raut
2018-05-11 22:39 ` [PATCH v2 0/4] NTB : Introduce message library Serge Semin
2018-05-11 23:00   ` Logan Gunthorpe
2018-05-14 20:40     ` Serge Semin
2018-05-14 21:04       ` Logan Gunthorpe
