* [PATCH 1/3] habanalabs: sync stream collective infrastructure
@ 2020-11-03 15:00 Oded Gabbay
  2020-11-03 15:00 ` [PATCH] habanalabs: use enum for CB allocation options Oded Gabbay
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Oded Gabbay @ 2020-11-03 15:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Define a new API for collective wait support and modify the common sync
stream flow. In addition, add kernel CB allocation support for internal
queues.
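
For illustration, here is a minimal user-space sketch of a collective-wait
submission using the uapi additions in this patch (HL_CS_FLAGS_COLLECTIVE_WAIT,
collective_engine_id, signal_seq_arr/num_signal_seq_arr). The ioctl plumbing
(the HL_IOCTL_CS request, the hl_cs_args in.* field names and the header path)
and the helper itself are assumptions based on the driver's existing uapi and
are not part of this patch.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <misc/habanalabs.h>	/* installed uapi header (assumed path) */

/* submit a collective-wait CS on a master queue, waiting on one signal CS */
static int submit_collective_wait(int fd, uint32_t master_queue_idx,
				  uint32_t reduction_engine_id,
				  uint64_t signal_cs_seq)
{
	uint64_t seq_arr[1] = { signal_cs_seq };
	struct hl_cs_chunk chunk;
	union hl_cs_args args;

	memset(&chunk, 0, sizeof(chunk));
	chunk.queue_index = master_queue_idx;	/* collective master queue */
	chunk.signal_seq_arr = (uint64_t) (uintptr_t) seq_arr;
	chunk.num_signal_seq_arr = 1;
	chunk.collective_engine_id = reduction_engine_id;

	memset(&args, 0, sizeof(args));
	args.in.chunks_execute = (uint64_t) (uintptr_t) &chunk;
	args.in.num_chunks_execute = 1;
	args.in.cs_flags = HL_CS_FLAGS_COLLECTIVE_WAIT;

	return ioctl(fd, HL_IOCTL_CS, &args);
}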

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 38 ++++++++++---
 drivers/misc/habanalabs/common/habanalabs.h   | 55 +++++++++++++++++--
 drivers/misc/habanalabs/common/hw_queue.c     | 44 ++++++++++++++-
 drivers/misc/habanalabs/gaudi/gaudi.c         | 22 +++++++-
 drivers/misc/habanalabs/goya/goya.c           | 22 +++++++-
 include/uapi/misc/habanalabs.h                | 17 ++++--
 6 files changed, 176 insertions(+), 22 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index ea480b14703f..e3fb8f51fefa 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -85,7 +85,8 @@ static void hl_fence_release(struct kref *kref)
 		goto free;
 
 	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
-			(hl_cs_cmpl->type == CS_TYPE_WAIT)) {
+		(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
+		(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) {
 
 		dev_dbg(hdev->dev,
 			"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
@@ -112,6 +113,10 @@ static void hl_fence_release(struct kref *kref)
 		 * hence the above scenario is avoided.
 		 */
 		kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
+
+		if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
+			hdev->asic_funcs->reset_sob_group(hdev,
+					hl_cs_cmpl->sob_group);
 	}
 
 free:
@@ -247,9 +252,11 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 	/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
 	 * enabled, the user CB isn't released in cs_parser() and thus should be
 	 * released here.
+	 * This is also true for INT queues jobs which were allocated by driver
 	 */
-	if (job->queue_type == QUEUE_TYPE_HW &&
-			job->is_kernel_allocated_cb && hdev->mmu_enable) {
+	if (job->is_kernel_allocated_cb &&
+		((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
+				job->queue_type == QUEUE_TYPE_INT)) {
 		spin_lock(&job->user_cb->lock);
 		job->user_cb->cs_cnt--;
 		spin_unlock(&job->user_cb->lock);
@@ -932,7 +939,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	struct hl_cs_compl *sig_waitcs_cmpl;
 	struct hl_cs *cs;
 	enum hl_queue_type q_type;
-	u32 size_to_copy, q_idx;
+	u32 size_to_copy, q_idx, collective_engine_id;
 	u64 signal_seq;
 	int rc;
 
@@ -983,7 +990,18 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 		goto free_cs_chunk_array;
 	}
 
-	if (cs_type == CS_TYPE_WAIT) {
+	if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
+		if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
+			dev_err(hdev->dev,
+				"Queue index %d is invalid\n", q_idx);
+			rc = -EINVAL;
+			goto free_cs_chunk_array;
+		}
+
+		collective_engine_id = chunk->collective_engine_id;
+	}
+
+	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) {
 		rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq);
 		if (rc)
 			goto free_cs_chunk_array;
@@ -1028,7 +1046,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
 	rc = allocate_cs(hdev, ctx, cs_type, &cs);
 	if (rc) {
-		if (cs_type == CS_TYPE_WAIT)
+		if (cs_type == CS_TYPE_WAIT ||
+			cs_type == CS_TYPE_COLLECTIVE_WAIT)
 			hl_fence_put(sig_fence);
 		hl_ctx_put(ctx);
 		goto free_cs_chunk_array;
@@ -1038,7 +1057,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	 * Save the signal CS fence for later initialization right before
 	 * hanging the wait CS on the queue.
 	 */
-	if (cs_type == CS_TYPE_WAIT)
+	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT)
 		cs->signal_fence = sig_fence;
 
 	hl_debugfs_add_cs(cs);
@@ -1048,6 +1067,9 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
 		rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
 				q_idx);
+	else
+		rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
+				cs, q_idx, collective_engine_id);
 
 	if (rc)
 		goto put_cs;
@@ -1122,6 +1144,8 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		cs_type = CS_TYPE_SIGNAL;
 	else if (args->in.cs_flags & HL_CS_FLAGS_WAIT)
 		cs_type = CS_TYPE_WAIT;
+	else if (args->in.cs_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
+		cs_type = CS_TYPE_COLLECTIVE_WAIT;
 	else
 		cs_type = CS_TYPE_DEFAULT;
 
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 5983850d3f8f..91a5de1c8805 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -68,6 +68,11 @@
 #define HL_RSVD_SOBS			4
 #define HL_RSVD_MONS			2
 
+/*
+ * HL_COLLECTIVE_RSVD_MSTR_MONS 'collective' reserved monitors per QMAN stream
+ */
+#define HL_COLLECTIVE_RSVD_MSTR_MONS	2
+
 #define HL_MAX_SOB_VAL			(1 << 15)
 
 #define IS_POWER_OF_2(n)		(n != 0 && ((n & (n - 1)) == 0))
@@ -177,7 +182,8 @@ enum hl_queue_type {
 enum hl_cs_type {
 	CS_TYPE_DEFAULT,
 	CS_TYPE_SIGNAL,
-	CS_TYPE_WAIT
+	CS_TYPE_WAIT,
+	CS_TYPE_COLLECTIVE_WAIT
 };
 
 /*
@@ -231,6 +237,12 @@ struct hl_hw_sob {
 	u32			q_idx;
 };
 
+enum hl_collective_mode {
+	HL_COLLECTIVE_NOT_SUPPORTED = 0x0,
+	HL_COLLECTIVE_MASTER = 0x1,
+	HL_COLLECTIVE_SLAVE = 0x2
+};
+
 /**
  * struct hw_queue_properties - queue information.
  * @type: queue type.
@@ -238,6 +250,7 @@ struct hl_hw_sob {
  *                        that allocated by the Kernel driver and therefore,
  *                        a CB handle can be provided for jobs on this queue.
  *                        Otherwise, a CB address must be provided.
+ * @collective_mode: collective mode of current queue
  * @driver_only: true if only the driver is allowed to send a job to this queue,
  *               false otherwise.
  * @supports_sync_stream: True if queue supports sync stream
@@ -245,6 +258,7 @@ struct hl_hw_sob {
 struct hw_queue_properties {
 	enum hl_queue_type	type;
 	enum queue_cb_alloc_flags cb_alloc_flags;
+	enum hl_collective_mode	collective_mode;
 	u8			driver_only;
 	u8			supports_sync_stream;
 };
@@ -358,6 +372,8 @@ struct hl_mmu_properties {
  * @cb_pool_cb_size: size of each CB in the CB pool.
  * @max_pending_cs: maximum of concurrent pending command submissions
  * @max_queues: maximum amount of queues in the system
+ * @collective_first_sob: first sync object available for collective use
+ * @collective_first_mon: first monitor available for collective use
  * @sync_stream_first_sob: first sync object available for sync stream use
  * @sync_stream_first_mon: first monitor available for sync stream use
  * @first_available_user_sob: first sob available for the user
@@ -410,6 +426,8 @@ struct asic_fixed_properties {
 	u32				cb_pool_cb_size;
 	u32				max_pending_cs;
 	u32				max_queues;
+	u16				collective_first_sob;
+	u16				collective_first_mon;
 	u16				sync_stream_first_sob;
 	u16				sync_stream_first_mon;
 	u16				first_available_user_sob[HL_MAX_DCORES];
@@ -441,6 +459,7 @@ struct hl_fence {
  * @cs_seq: command submission sequence number.
  * @type: type of the CS - signal/wait.
  * @sob_val: the SOB value that is used in this signal/wait CS.
+ * @sob_group: the SOB group that is used in this collective wait CS.
  */
 struct hl_cs_compl {
 	struct hl_fence		base_fence;
@@ -450,6 +469,7 @@ struct hl_cs_compl {
 	u64			cs_seq;
 	enum hl_cs_type		type;
 	u16			sob_val;
+	u16			sob_group;
 };
 
 /*
@@ -512,6 +532,7 @@ struct hl_cb {
  * QUEUES
  */
 
+struct hl_cs;
 struct hl_cs_job;
 
 /* Queue length of external and HW queues */
@@ -540,15 +561,24 @@ struct hl_cs_job;
  * @next_sob_val: the next value to use for the currently used SOB.
  * @base_sob_id: the base SOB id of the SOBs used by this queue.
  * @base_mon_id: the base MON id of the MONs used by this queue.
+ * @collective_mstr_mon_id: the MON ids of the MONs used by this master queue
+ *                          in order to sync with all slave queues.
+ * @collective_slave_mon_id: the MON id used by this slave queue in order to
+ *                           sync with its master queue.
+ * @collective_sob_id: current SOB id used by this collective slave queue
+ *                     to signal its collective master queue upon completion.
  * @curr_sob_offset: the id offset to the currently used SOB from the
  *                   HL_RSVD_SOBS that are being used by this queue.
  */
 struct hl_sync_stream_properties {
-	struct hl_hw_sob	hw_sob[HL_RSVD_SOBS];
-	u16			next_sob_val;
-	u16			base_sob_id;
-	u16			base_mon_id;
-	u8			curr_sob_offset;
+	struct hl_hw_sob hw_sob[HL_RSVD_SOBS];
+	u16		next_sob_val;
+	u16		base_sob_id;
+	u16		base_mon_id;
+	u16		collective_mstr_mon_id[HL_COLLECTIVE_RSVD_MSTR_MONS];
+	u16		collective_slave_mon_id;
+	u16		collective_sob_id;
+	u8		curr_sob_offset;
 };
 
 /**
@@ -556,6 +586,7 @@ struct hl_sync_stream_properties {
  * @shadow_queue: pointer to a shadow queue that holds pointers to jobs.
  * @sync_stream_prop: sync stream queue properties
  * @queue_type: type of queue.
+ * @collective_mode: collective mode of current queue
  * @kernel_address: holds the queue's kernel virtual address.
  * @bus_address: holds the queue's DMA address.
  * @pi: holds the queue's pi value.
@@ -572,6 +603,7 @@ struct hl_hw_queue {
 	struct hl_cs_job			**shadow_queue;
 	struct hl_sync_stream_properties	sync_stream_prop;
 	enum hl_queue_type			queue_type;
+	enum hl_collective_mode			collective_mode;
 	u64					kernel_address;
 	dma_addr_t				bus_address;
 	u32					pi;
@@ -764,9 +796,13 @@ enum div_select_defs {
  * @gen_signal_cb: Generate a signal CB.
  * @gen_wait_cb: Generate a wait CB.
  * @reset_sob: Reset a SOB.
+ * @reset_sob_group: Reset SOB group
  * @set_dma_mask_from_fw: set the DMA mask in the driver according to the
  *                        firmware configuration
  * @get_device_time: Get the device time.
+ * @collective_wait_init_cs: Generate collective master/slave packets
+ *                           and place them in the relevant cs jobs
+ * @collective_wait_create_jobs: allocate collective wait cs jobs
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -868,8 +904,13 @@ struct hl_asic_funcs {
 	u32 (*gen_wait_cb)(struct hl_device *hdev,
 			struct hl_gen_wait_properties *prop);
 	void (*reset_sob)(struct hl_device *hdev, void *data);
+	void (*reset_sob_group)(struct hl_device *hdev, u16 sob_group);
 	void (*set_dma_mask_from_fw)(struct hl_device *hdev);
 	u64 (*get_device_time)(struct hl_device *hdev);
+	void (*collective_wait_init_cs)(struct hl_cs *cs);
+	int (*collective_wait_create_jobs)(struct hl_device *hdev,
+			struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
+			u32 collective_engine_id);
 };
 
 
@@ -1656,6 +1697,7 @@ struct hl_mmu_funcs {
  * @stop_on_err: true if engines should stop on error.
  * @supports_sync_stream: is sync stream supported.
  * @sync_stream_queue_idx: helper index for sync stream queues initialization.
+ * @collective_mon_idx: helper index for collective initialization
  * @supports_coresight: is CoreSight supported.
  * @supports_soft_reset: is soft reset supported.
  * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
@@ -1756,6 +1798,7 @@ struct hl_device {
 	u8				stop_on_err;
 	u8				supports_sync_stream;
 	u8				sync_stream_queue_idx;
+	u8				collective_mon_idx;
 	u8				supports_coresight;
 	u8				supports_soft_reset;
 	u8				supports_cb_mapping;
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index ca625789d78d..8a15db724116 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -333,7 +333,14 @@ static void int_queue_schedule_job(struct hl_cs_job *job)
 
 	bd.ctl = 0;
 	bd.len = cpu_to_le32(job->job_cb_size);
-	bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);
+
+	if (job->is_kernel_allocated_cb)
+		/* bus_address is actually a mmu mapped address
+		 * allocated from an internal pool
+		 */
+		bd.ptr = cpu_to_le64(job->user_cb->bus_address);
+	else
+		bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);
 
 	pi = (__le64 *) (uintptr_t) (q->kernel_address +
 		((q->pi & (q->int_queue_len - 1)) * sizeof(bd)));
@@ -563,6 +570,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 
 	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT))
 		init_signal_wait_cs(cs);
+	else if (cs->type == CS_TYPE_COLLECTIVE_WAIT)
+		hdev->asic_funcs->collective_wait_init_cs(cs);
 
 	spin_lock(&hdev->hw_queues_mirror_lock);
 	list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list);
@@ -742,12 +751,40 @@ static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
 	struct hl_sync_stream_properties *sync_stream_prop;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hl_hw_sob *hw_sob;
-	int sob, queue_idx;
+	int sob, reserved_mon_idx, queue_idx;
+
+	sync_stream_prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+
+	/* We use 'collective_mon_idx' as a running index in order to reserve
+	 * monitors for collective master/slave queues.
+	 * collective master queue gets 2 reserved monitors
+	 * collective slave queue gets 1 reserved monitor
+	 */
+	if (hdev->kernel_queues[q_idx].collective_mode ==
+			HL_COLLECTIVE_MASTER) {
+		reserved_mon_idx = hdev->collective_mon_idx;
+
+		/* reserve the first monitor for collective master queue */
+		sync_stream_prop->collective_mstr_mon_id[0] =
+			prop->collective_first_mon + reserved_mon_idx;
+
+		/* reserve the second monitor for collective master queue */
+		sync_stream_prop->collective_mstr_mon_id[1] =
+			prop->collective_first_mon + reserved_mon_idx + 1;
+
+		hdev->collective_mon_idx += HL_COLLECTIVE_RSVD_MSTR_MONS;
+	} else if (hdev->kernel_queues[q_idx].collective_mode ==
+			HL_COLLECTIVE_SLAVE) {
+		reserved_mon_idx = hdev->collective_mon_idx++;
+
+		/* reserve a monitor for collective slave queue */
+		sync_stream_prop->collective_slave_mon_id =
+			prop->collective_first_mon + reserved_mon_idx;
+	}
 
 	if (!hdev->kernel_queues[q_idx].supports_sync_stream)
 		return;
 
-	sync_stream_prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
 	queue_idx = hdev->sync_stream_queue_idx++;
 
 	sync_stream_prop->base_sob_id = prop->sync_stream_first_sob +
@@ -898,6 +935,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
 		q->queue_type = asic->hw_queues_props[i].type;
 		q->supports_sync_stream =
 				asic->hw_queues_props[i].supports_sync_stream;
+		q->collective_mode = asic->hw_queues_props[i].collective_mode;
 		rc = queue_init(hdev, q, i);
 		if (rc) {
 			dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 559e22a5696c..91916040faac 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -794,6 +794,23 @@ static int gaudi_init_tpc_mem(struct hl_device *hdev)
 	return rc;
 }
 
+static void gaudi_reset_sob_group(struct hl_device *hdev, u16 sob_groupt)
+{
+
+}
+
+static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
+{
+
+}
+
+static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
+		struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
+		u32 collective_engine_id)
+{
+	return -EINVAL;
+}
+
 static int gaudi_late_init(struct hl_device *hdev)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
@@ -7375,8 +7392,11 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.gen_signal_cb = gaudi_gen_signal_cb,
 	.gen_wait_cb = gaudi_gen_wait_cb,
 	.reset_sob = gaudi_reset_sob,
+	.reset_sob_group = gaudi_reset_sob_group,
 	.set_dma_mask_from_fw = gaudi_set_dma_mask_from_fw,
-	.get_device_time = gaudi_get_device_time
+	.get_device_time = gaudi_get_device_time,
+	.collective_wait_init_cs = gaudi_collective_wait_init_cs,
+	.collective_wait_create_jobs = gaudi_collective_wait_create_jobs
 };
 
 /**
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 7012fcdab837..757a55215b0f 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5309,6 +5309,11 @@ static void goya_reset_sob(struct hl_device *hdev, void *data)
 
 }
 
+void goya_reset_sob_group(struct hl_device *hdev, u16 sob_group)
+{
+
+}
+
 static void goya_set_dma_mask_from_fw(struct hl_device *hdev)
 {
 	if (RREG32(mmPSOC_GLOBAL_CONF_NON_RST_FLOPS_0) ==
@@ -5330,6 +5335,18 @@ u64 goya_get_device_time(struct hl_device *hdev)
 	return device_time | RREG32(mmPSOC_TIMESTAMP_CNTCVL);
 }
 
+void goya_collective_wait_init_cs(struct hl_cs *cs)
+{
+
+}
+
+int goya_collective_wait_create_jobs(struct hl_device *hdev,
+		struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
+		u32 collective_engine_id)
+{
+	return -EINVAL;
+}
+
 static const struct hl_asic_funcs goya_funcs = {
 	.early_init = goya_early_init,
 	.early_fini = goya_early_fini,
@@ -5401,8 +5418,11 @@ static const struct hl_asic_funcs goya_funcs = {
 	.gen_signal_cb = goya_gen_signal_cb,
 	.gen_wait_cb = goya_gen_wait_cb,
 	.reset_sob = goya_reset_sob,
+	.reset_sob_group = goya_reset_sob_group,
 	.set_dma_mask_from_fw = goya_set_dma_mask_from_fw,
-	.get_device_time = goya_get_device_time
+	.get_device_time = goya_get_device_time,
+	.collective_wait_init_cs = goya_collective_wait_init_cs,
+	.collective_wait_create_jobs = goya_collective_wait_create_jobs
 };
 
 /*
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 5753157e71b3..2b244d0bdc26 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -523,7 +523,8 @@ struct hl_cs_chunk {
 		 */
 		__u64 cb_handle;
 
-		/* Relevant only when HL_CS_FLAGS_WAIT is set.
+		/* Relevant only when HL_CS_FLAGS_WAIT or
+		 * HL_CS_FLAGS_COLLECTIVE_WAIT is set.
 		 * This holds address of array of u64 values that contain
 		 * signal CS sequence numbers. The wait described by this job
 		 * will listen on all those signals (wait event per signal)
@@ -541,7 +542,8 @@ struct hl_cs_chunk {
 		 */
 		__u32 cb_size;
 
-		/* Relevant only when HL_CS_FLAGS_WAIT is set.
+		/* Relevant only when HL_CS_FLAGS_WAIT or
+		 * HL_CS_FLAGS_COLLECTIVE_WAIT is set.
 		 * Number of entries in signal_seq_arr
 		 */
 		__u32 num_signal_seq_arr;
@@ -550,14 +552,21 @@ struct hl_cs_chunk {
 	/* HL_CS_CHUNK_FLAGS_* */
 	__u32 cs_chunk_flags;
 
+	/* Relevant only when HL_CS_FLAGS_COLLECTIVE_WAIT is set.
+	 * This holds the collective engine ID. The wait described by this job
+	 * will sync with this engine and with all NICs before completion.
+	 */
+	__u32 collective_engine_id;
+
 	/* Align structure to 64 bytes */
-	__u32 pad[11];
+	__u32 pad[10];
 };
 
-/* SIGNAL and WAIT flags are mutually exclusive */
+/* SIGNAL and WAIT/COLLECTIVE_WAIT flags are mutually exclusive */
 #define HL_CS_FLAGS_FORCE_RESTORE	0x1
 #define HL_CS_FLAGS_SIGNAL		0x2
 #define HL_CS_FLAGS_WAIT		0x4
+#define HL_CS_FLAGS_COLLECTIVE_WAIT	0x8
 
 #define HL_CS_STATUS_SUCCESS		0
 
-- 
2.17.1



* [PATCH] habanalabs: use enum for CB allocation options
  2020-11-03 15:00 [PATCH 1/3] habanalabs: sync stream collective infrastructure Oded Gabbay
@ 2020-11-03 15:00 ` Oded Gabbay
  2020-11-03 15:00 ` [PATCH 2/3] habanalabs/gaudi: Set DMA5 QMAN internal Oded Gabbay
  2020-11-03 15:00 ` [PATCH 3/3] habanalabs: sync stream collective support Oded Gabbay
  2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2020-11-03 15:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

In the future there will be situations where queues can accept either
kernel-allocated CBs or user-allocated CBs, depending on their state.

Therefore, instead of using a boolean variable to indicate a kernel/user
allocated CB, use a bitmask, which allows the two options to be combined.

Add a flag to the uapi so the user can indicate whether the CB was
allocated by the kernel or by the user. The driver validates the flag in
any case.
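
As a stand-alone illustration of why a bitmask helps here, below is a minimal
sketch of the selection logic that validate_queue_index() applies for
QUEUE_TYPE_HW queues, reduced to the flag checks only. The helper and its
error handling are made up for the example; only the enum values and the
chunk flag come from this patch. A queue that one day advertises
CB_ALLOC_KERNEL | CB_ALLOC_USER can serve both kinds of jobs, with the chunk
flag choosing per submission.

#include <stdbool.h>
#include <stdio.h>

enum queue_cb_alloc_flags {
	CB_ALLOC_KERNEL = 0x1,
	CB_ALLOC_USER   = 0x2,
};

#define HL_CS_CHUNK_FLAGS_USER_ALLOC_CB 0x1

/* pick the CB source for one job, or fail if the queue doesn't allow it */
static int pick_cb_source(unsigned int cb_alloc_flags,
			  unsigned int cs_chunk_flags,
			  bool *is_kernel_allocated_cb)
{
	bool want_user = cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB;

	if (want_user && !(cb_alloc_flags & CB_ALLOC_USER))
		return -1;	/* queue doesn't support user-allocated CBs */
	if (!want_user && !(cb_alloc_flags & CB_ALLOC_KERNEL))
		return -1;	/* queue doesn't support kernel-allocated CBs */

	*is_kernel_allocated_cb = !want_user;
	return 0;
}

int main(void)
{
	bool kernel_cb;

	/* a queue accepting both kinds, job asks for a user-allocated CB */
	if (!pick_cb_source(CB_ALLOC_KERNEL | CB_ALLOC_USER,
			    HL_CS_CHUNK_FLAGS_USER_ALLOC_CB, &kernel_cb))
		printf("kernel allocated CB: %d\n", kernel_cb);

	return 0;
}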

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 31 +++++++++++++++++--
 drivers/misc/habanalabs/common/habanalabs.h   | 19 ++++++++++--
 drivers/misc/habanalabs/gaudi/gaudi.c         | 13 +++++---
 drivers/misc/habanalabs/goya/goya.c           |  6 ++--
 include/uapi/misc/habanalabs.h                | 16 ++++++++++
 5 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 1f8b53d42e3a..ea480b14703f 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -568,9 +568,36 @@ static int validate_queue_index(struct hl_device *hdev,
 		return -EINVAL;
 	}
 
-	*queue_type = hw_queue_prop->type;
-	*is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
+	/* When hw queue type isn't QUEUE_TYPE_HW,
+	 * USER_ALLOC_CB flag shall be referred as "don't care".
+	 */
+	if (hw_queue_prop->type == QUEUE_TYPE_HW) {
+		if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) {
+			if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) {
+				dev_err(hdev->dev,
+					"Queue index %d doesn't support user CB\n",
+					chunk->queue_index);
+				return -EINVAL;
+			}
 
+			*is_kernel_allocated_cb = false;
+		} else {
+			if (!(hw_queue_prop->cb_alloc_flags &
+					CB_ALLOC_KERNEL)) {
+				dev_err(hdev->dev,
+					"Queue index %d doesn't support kernel CB\n",
+					chunk->queue_index);
+				return -EINVAL;
+			}
+
+			*is_kernel_allocated_cb = true;
+		}
+	} else {
+		*is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags
+						& CB_ALLOC_KERNEL);
+	}
+
+	*queue_type = hw_queue_prop->type;
 	return 0;
 }
 
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 968431c7ce20..5983850d3f8f 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -206,6 +206,17 @@ struct hl_outbound_pci_region {
 	u64	size;
 };
 
+/*
+ * enum queue_cb_alloc_flags - Indicates queue support for CBs that
+ * allocated by Kernel or by User
+ * @CB_ALLOC_KERNEL: support only CBs that allocated by Kernel
+ * @CB_ALLOC_USER: support only CBs that allocated by User
+ */
+enum queue_cb_alloc_flags {
+	CB_ALLOC_KERNEL = 0x1,
+	CB_ALLOC_USER   = 0x2
+};
+
 /*
  * struct hl_hw_sob - H/W SOB info.
  * @hdev: habanalabs device structure.
@@ -223,16 +234,18 @@ struct hl_hw_sob {
 /**
  * struct hw_queue_properties - queue information.
  * @type: queue type.
+ * @queue_cb_alloc_flags: bitmap which indicates if the hw queue supports CB
+ *                        that allocated by the Kernel driver and therefore,
+ *                        a CB handle can be provided for jobs on this queue.
+ *                        Otherwise, a CB address must be provided.
  * @driver_only: true if only the driver is allowed to send a job to this queue,
  *               false otherwise.
- * @requires_kernel_cb: true if a CB handle must be provided for jobs on this
- *                      queue, false otherwise (a CB address must be provided).
  * @supports_sync_stream: True if queue supports sync stream
  */
 struct hw_queue_properties {
 	enum hl_queue_type	type;
+	enum queue_cb_alloc_flags cb_alloc_flags;
 	u8			driver_only;
-	u8			requires_kernel_cb;
 	u8			supports_sync_stream;
 };
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 065c2377c1fa..559e22a5696c 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -381,23 +381,28 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
 		if (gaudi_queue_type[i] == QUEUE_TYPE_EXT) {
 			prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
 			prop->hw_queues_props[i].driver_only = 0;
-			prop->hw_queues_props[i].requires_kernel_cb = 1;
 			prop->hw_queues_props[i].supports_sync_stream = 1;
+			prop->hw_queues_props[i].cb_alloc_flags =
+				CB_ALLOC_KERNEL;
 			num_sync_stream_queues++;
 		} else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) {
 			prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
 			prop->hw_queues_props[i].driver_only = 1;
-			prop->hw_queues_props[i].requires_kernel_cb = 0;
 			prop->hw_queues_props[i].supports_sync_stream = 0;
+			prop->hw_queues_props[i].cb_alloc_flags =
+				CB_ALLOC_KERNEL;
 		} else if (gaudi_queue_type[i] == QUEUE_TYPE_INT) {
 			prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
 			prop->hw_queues_props[i].driver_only = 0;
-			prop->hw_queues_props[i].requires_kernel_cb = 0;
+			prop->hw_queues_props[i].supports_sync_stream = 0;
+			prop->hw_queues_props[i].cb_alloc_flags =
+				CB_ALLOC_USER;
 		} else if (gaudi_queue_type[i] == QUEUE_TYPE_NA) {
 			prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
 			prop->hw_queues_props[i].driver_only = 0;
-			prop->hw_queues_props[i].requires_kernel_cb = 0;
 			prop->hw_queues_props[i].supports_sync_stream = 0;
+			prop->hw_queues_props[i].cb_alloc_flags =
+				CB_ALLOC_USER;
 		}
 	}
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index e8bf0b79cd67..7012fcdab837 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -373,20 +373,20 @@ int goya_get_fixed_properties(struct hl_device *hdev)
 	for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
 		prop->hw_queues_props[i].driver_only = 0;
-		prop->hw_queues_props[i].requires_kernel_cb = 1;
+		prop->hw_queues_props[i].cb_alloc_flags = CB_ALLOC_KERNEL;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
 		prop->hw_queues_props[i].driver_only = 1;
-		prop->hw_queues_props[i].requires_kernel_cb = 0;
+		prop->hw_queues_props[i].cb_alloc_flags = CB_ALLOC_KERNEL;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
 			NUMBER_OF_INT_HW_QUEUES; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
 		prop->hw_queues_props[i].driver_only = 0;
-		prop->hw_queues_props[i].requires_kernel_cb = 0;
+		prop->hw_queues_props[i].cb_alloc_flags = CB_ALLOC_USER;
 	}
 
 	prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 9705b8adb60c..5753157e71b3 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -490,6 +490,22 @@ union hl_cb_args {
 	struct hl_cb_out out;
 };
 
+/* HL_CS_CHUNK_FLAGS_ values
+ *
+ * HL_CS_CHUNK_FLAGS_USER_ALLOC_CB:
+ *      Indicates if the CB was allocated and mapped by userspace.
+ *      User allocated CB is a command buffer allocated by the user, via malloc
+ *      (or similar). After allocating the CB, the user invokes “memory ioctl”
+ *      to map the user memory into a device virtual address. The user provides
+ *      this address via the cb_handle field. The interface provides the
+ *      ability to create large CBs, which aren't limited to
+ *      "HL_MAX_CB_SIZE". Therefore, it increases the PCI-DMA queues'
+ *      throughput. This CB allocation method also reduces the use of the
+ *      Linux DMA-able memory pool, which is limited and used by other
+ *      Linux sub-systems.
+ */
+#define HL_CS_CHUNK_FLAGS_USER_ALLOC_CB 0x1
+
 /*
  * This structure size must always be fixed to 64-bytes for backward
  * compatibility
-- 
2.17.1



* [PATCH 2/3] habanalabs/gaudi: Set DMA5 QMAN internal
  2020-11-03 15:00 [PATCH 1/3] habanalabs: sync stream collective infrastructure Oded Gabbay
  2020-11-03 15:00 ` [PATCH] habanalabs: use enum for CB allocation options Oded Gabbay
@ 2020-11-03 15:00 ` Oded Gabbay
  2020-11-03 15:00 ` [PATCH 3/3] habanalabs: sync stream collective support Oded Gabbay
  2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2020-11-03 15:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

DMA5 QMAN is designated to be used for the reduction process, hence it
will no longer be configured as an external queue.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c  | 31 +++++++++++---------------
 drivers/misc/habanalabs/gaudi/gaudiP.h |  8 +++----
 include/uapi/misc/habanalabs.h         | 12 +++++-----
 3 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 91916040faac..2ba15d3470dd 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -38,7 +38,7 @@
  *
  * MMU is always enabled.
  *
- * QMAN DMA channels 0,1,5 (PCI DMAN):
+ * QMAN DMA channels 0,1 (PCI DMAN):
  *     - DMA is not secured.
  *     - PQ and CQ are secured.
  *     - CP is secured: The driver needs to parse CB but WREG should be allowed
@@ -55,7 +55,7 @@
  *       idle)
  *     - MMU page tables area clear (happens on init)
  *
- * QMAN DMA 2-4,6,7, TPC, MME, NIC:
+ * QMAN DMA 2-7, TPC, MME, NIC:
  * PQ is secured and is located on the Host (HBM CON TPC3 bug)
  * CQ, CP and the engine are not secured
  *
@@ -113,12 +113,12 @@ static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
 static const u8 gaudi_dma_assignment[GAUDI_DMA_MAX] = {
 	[GAUDI_PCI_DMA_1] = GAUDI_ENGINE_ID_DMA_0,
 	[GAUDI_PCI_DMA_2] = GAUDI_ENGINE_ID_DMA_1,
-	[GAUDI_PCI_DMA_3] = GAUDI_ENGINE_ID_DMA_5,
 	[GAUDI_HBM_DMA_1] = GAUDI_ENGINE_ID_DMA_2,
 	[GAUDI_HBM_DMA_2] = GAUDI_ENGINE_ID_DMA_3,
 	[GAUDI_HBM_DMA_3] = GAUDI_ENGINE_ID_DMA_4,
-	[GAUDI_HBM_DMA_4] = GAUDI_ENGINE_ID_DMA_6,
-	[GAUDI_HBM_DMA_5] = GAUDI_ENGINE_ID_DMA_7
+	[GAUDI_HBM_DMA_4] = GAUDI_ENGINE_ID_DMA_5,
+	[GAUDI_HBM_DMA_5] = GAUDI_ENGINE_ID_DMA_6,
+	[GAUDI_HBM_DMA_6] = GAUDI_ENGINE_ID_DMA_7
 };
 
 static const u8 gaudi_cq_assignment[NUMBER_OF_CMPLT_QUEUES] = {
@@ -130,10 +130,6 @@ static const u8 gaudi_cq_assignment[NUMBER_OF_CMPLT_QUEUES] = {
 	[5] = GAUDI_QUEUE_ID_DMA_1_1,
 	[6] = GAUDI_QUEUE_ID_DMA_1_2,
 	[7] = GAUDI_QUEUE_ID_DMA_1_3,
-	[8] = GAUDI_QUEUE_ID_DMA_5_0,
-	[9] = GAUDI_QUEUE_ID_DMA_5_1,
-	[10] = GAUDI_QUEUE_ID_DMA_5_2,
-	[11] = GAUDI_QUEUE_ID_DMA_5_3
 };
 
 static const u16 gaudi_packet_sizes[MAX_PACKET_ID] = {
@@ -249,10 +245,10 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
 	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_1 */
 	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_2 */
 	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_3 */
-	QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_5_0 */
-	QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_5_1 */
-	QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_5_2 */
-	QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_5_3 */
+	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_0 */
+	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_1 */
+	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_2 */
+	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_3 */
 	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_0 */
 	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_1 */
 	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_2 */
@@ -979,8 +975,7 @@ static int gaudi_alloc_internal_qmans_pq_mem(struct hl_device *hdev)
 		q = &gaudi->internal_qmans[i];
 
 		switch (i) {
-		case GAUDI_QUEUE_ID_DMA_2_0 ... GAUDI_QUEUE_ID_DMA_4_3:
-		case GAUDI_QUEUE_ID_DMA_6_0 ... GAUDI_QUEUE_ID_DMA_7_3:
+		case GAUDI_QUEUE_ID_DMA_2_0 ... GAUDI_QUEUE_ID_DMA_7_3:
 			q->pq_size = HBM_DMA_QMAN_SIZE_IN_BYTES;
 			break;
 		case GAUDI_QUEUE_ID_MME_0_0 ... GAUDI_QUEUE_ID_MME_1_3:
@@ -3433,21 +3428,21 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 		break;
 
 	case GAUDI_QUEUE_ID_DMA_5_0...GAUDI_QUEUE_ID_DMA_5_3:
-		dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_3];
+		dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_4];
 		dma_qm_offset = dma_id * DMA_QMAN_OFFSET;
 		q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4;
 		db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off;
 		break;
 
 	case GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3:
-		dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_4];
+		dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_5];
 		dma_qm_offset = dma_id * DMA_QMAN_OFFSET;
 		q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4;
 		db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off;
 		break;
 
 	case GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3:
-		dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_5];
+		dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_6];
 		dma_qm_offset = dma_id * DMA_QMAN_OFFSET;
 		q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4;
 		db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off;
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 5b99c94ebe36..73171d8b40d8 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -15,7 +15,7 @@
 #include "../include/gaudi/gaudi.h"
 #include "../include/gaudi/gaudi_async_events.h"
 
-#define NUMBER_OF_EXT_HW_QUEUES		12
+#define NUMBER_OF_EXT_HW_QUEUES		8
 #define NUMBER_OF_CMPLT_QUEUES		NUMBER_OF_EXT_HW_QUEUES
 #define NUMBER_OF_CPU_HW_QUEUES		1
 #define NUMBER_OF_INT_HW_QUEUES		100
@@ -62,8 +62,8 @@
 #error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
 #endif
 
-#define PCI_DMA_NUMBER_OF_CHNLS		3
-#define HBM_DMA_NUMBER_OF_CHNLS		5
+#define PCI_DMA_NUMBER_OF_CHNLS		2
+#define HBM_DMA_NUMBER_OF_CHNLS		6
 #define DMA_NUMBER_OF_CHNLS		(PCI_DMA_NUMBER_OF_CHNLS + \
 						HBM_DMA_NUMBER_OF_CHNLS)
 
@@ -205,12 +205,12 @@
 enum gaudi_dma_channels {
 	GAUDI_PCI_DMA_1,
 	GAUDI_PCI_DMA_2,
-	GAUDI_PCI_DMA_3,
 	GAUDI_HBM_DMA_1,
 	GAUDI_HBM_DMA_2,
 	GAUDI_HBM_DMA_3,
 	GAUDI_HBM_DMA_4,
 	GAUDI_HBM_DMA_5,
+	GAUDI_HBM_DMA_6,
 	GAUDI_DMA_MAX
 };
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 2b244d0bdc26..4661a74f0425 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -18,8 +18,8 @@
 #define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START		0x8000	/* 32KB */
 #define GAUDI_DRIVER_SRAM_RESERVED_SIZE_FROM_START	0x80	/* 128 bytes */
 
-#define GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT		48
-#define GAUDI_FIRST_AVAILABLE_W_S_MONITOR		24
+#define GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT		32
+#define GAUDI_FIRST_AVAILABLE_W_S_MONITOR		16
 /*
  * Goya queue Numbering
  *
@@ -76,10 +76,10 @@ enum gaudi_queue_id {
 	GAUDI_QUEUE_ID_DMA_4_1 = 18,	/* internal */
 	GAUDI_QUEUE_ID_DMA_4_2 = 19,	/* internal */
 	GAUDI_QUEUE_ID_DMA_4_3 = 20,	/* internal */
-	GAUDI_QUEUE_ID_DMA_5_0 = 21,	/* external */
-	GAUDI_QUEUE_ID_DMA_5_1 = 22,	/* external */
-	GAUDI_QUEUE_ID_DMA_5_2 = 23,	/* external */
-	GAUDI_QUEUE_ID_DMA_5_3 = 24,	/* external */
+	GAUDI_QUEUE_ID_DMA_5_0 = 21,	/* internal */
+	GAUDI_QUEUE_ID_DMA_5_1 = 22,	/* internal */
+	GAUDI_QUEUE_ID_DMA_5_2 = 23,	/* internal */
+	GAUDI_QUEUE_ID_DMA_5_3 = 24,	/* internal */
 	GAUDI_QUEUE_ID_DMA_6_0 = 25,	/* internal */
 	GAUDI_QUEUE_ID_DMA_6_1 = 26,	/* internal */
 	GAUDI_QUEUE_ID_DMA_6_2 = 27,	/* internal */
-- 
2.17.1



* [PATCH 3/3] habanalabs: sync stream collective support
  2020-11-03 15:00 [PATCH 1/3] habanalabs: sync stream collective infrastructure Oded Gabbay
  2020-11-03 15:00 ` [PATCH] habanalabs: use enum for CB allocation options Oded Gabbay
  2020-11-03 15:00 ` [PATCH 2/3] habanalabs/gaudi: Set DMA5 QMAN internal Oded Gabbay
@ 2020-11-03 15:00 ` Oded Gabbay
  2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2020-11-03 15:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Implement sync stream collective for GAUDI. This requires allocating
additional resources and adding ctx_fini() to clean up those resources.
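
To make the resource lifecycle easier to follow, below is a reduced sketch of
the SOB-group refcounting this patch adds in gaudi.c: every collective-wait
CS takes a reference on the SOB group it uses, the fence-release path drops
it through the new reset_sob_group callback, and the kref release handler
clears the group's SOBs and re-arms the kref so the group can be reused. The
structure and helpers below are trimmed copies for illustration, not the full
implementation.

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/types.h>

/* trimmed copy of the per-group bookkeeping added by this patch */
struct gaudi_hw_sob_group {
	struct kref	kref;
	u32		base_sob_id;
};

/* kref release: zero the group's SOBs in H/W and make it reusable again */
static void sob_group_hw_reset(struct kref *ref)
{
	struct gaudi_hw_sob_group *grp =
		container_of(ref, struct gaudi_hw_sob_group, kref);

	/* the real code WREG32()s SOB_OBJ_0 of every SOB in the group to 0 */
	(void) grp->base_sob_id;

	kref_init(&grp->kref);
}

/* taken in gaudi_collective_wait_init_cs() when a CS picks this group */
static inline void sob_group_get(struct gaudi_hw_sob_group *grp)
{
	kref_get(&grp->kref);
}

/* dropped from the fence-release path via the reset_sob_group callback */
static inline void sob_group_put(struct gaudi_hw_sob_group *grp)
{
	kref_put(&grp->kref, sob_group_hw_reset);
}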

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    |  11 +-
 drivers/misc/habanalabs/common/context.c      |   1 +
 drivers/misc/habanalabs/common/habanalabs.h   |   7 +-
 drivers/misc/habanalabs/gaudi/gaudi.c         | 853 ++++++++++++++++--
 drivers/misc/habanalabs/gaudi/gaudiP.h        |  40 +-
 drivers/misc/habanalabs/goya/goya.c           |   6 +
 include/uapi/misc/habanalabs.h                |  14 +-
 7 files changed, 867 insertions(+), 65 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index e3fb8f51fefa..f5bf8684f8aa 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -142,7 +142,7 @@ static void hl_fence_init(struct hl_fence *fence)
 	init_completion(&fence->completion);
 }
 
-static void cs_get(struct hl_cs *cs)
+void cs_get(struct hl_cs *cs)
 {
 	kref_get(&cs->refcount);
 }
@@ -917,6 +917,9 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
 	job->job_cb_size = job->user_cb_size;
 	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
 
+	/* increment refcount as for external queues we get completion */
+	cs_get(cs);
+
 	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
 
 	list_add_tail(&job->cs_node, &cs->job_list);
@@ -1072,11 +1075,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 				cs, q_idx, collective_engine_id);
 
 	if (rc)
-		goto put_cs;
-
-
-	/* increment refcount as for external queues we get completion */
-	cs_get(cs);
+		goto free_cs_object;
 
 	rc = hl_hw_queue_schedule_cs(cs);
 	if (rc) {
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 7a59dd7c6450..2077bbe3606a 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -40,6 +40,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 		if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
 			hl_device_set_debug_mode(hdev, false);
 
+		hdev->asic_funcs->ctx_fini(ctx);
 		hl_cb_va_pool_fini(ctx);
 		hl_vm_ctx_fini(ctx);
 		hl_asid_free(hdev, ctx->asid);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 91a5de1c8805..70d166e4fe09 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -65,8 +65,8 @@
  * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream
  * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream
  */
-#define HL_RSVD_SOBS			4
-#define HL_RSVD_MONS			2
+#define HL_RSVD_SOBS			2
+#define HL_RSVD_MONS			1
 
 /*
  * HL_COLLECTIVE_RSVD_MSTR_MONS 'collective' reserved monitors per QMAN stream
@@ -785,6 +785,7 @@ enum div_select_defs {
  * @wreg: Write a register. Needed for simulator support.
  * @halt_coresight: stop the ETF and ETR traces.
  * @ctx_init: context dependent initialization.
+ * @ctx_fini: context dependent cleanup.
  * @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz
  * @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index.
  * @read_device_fw_version: read the device's firmware versions that are
@@ -891,6 +892,7 @@ struct hl_asic_funcs {
 	void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
 	void (*halt_coresight)(struct hl_device *hdev);
 	int (*ctx_init)(struct hl_ctx *ctx);
+	void (*ctx_fini)(struct hl_ctx *ctx);
 	int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
 	u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx);
 	void (*read_device_fw_version)(struct hl_device *hdev,
@@ -1992,6 +1994,7 @@ void hl_sob_reset_error(struct kref *ref);
 int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask);
 void hl_fence_put(struct hl_fence *fence);
 void hl_fence_get(struct hl_fence *fence);
+void cs_get(struct hl_cs *cs);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 void gaudi_set_asic_funcs(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 2ba15d3470dd..f73e508cf14c 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -358,6 +358,31 @@ static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev);
 static int gaudi_cpucp_info_get(struct hl_device *hdev);
 static void gaudi_disable_clock_gating(struct hl_device *hdev);
 static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid);
+static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
+				u32 size);
+static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
+				struct hl_gen_wait_properties *prop);
+
+static inline enum hl_collective_mode
+get_collective_mode(struct hl_device *hdev, u32 queue_id)
+{
+	if (gaudi_queue_type[queue_id] == QUEUE_TYPE_EXT)
+		return HL_COLLECTIVE_MASTER;
+
+	if (queue_id >= GAUDI_QUEUE_ID_DMA_5_0 &&
+			queue_id <= GAUDI_QUEUE_ID_DMA_5_3)
+		return HL_COLLECTIVE_SLAVE;
+
+	if (queue_id >= GAUDI_QUEUE_ID_TPC_7_0 &&
+			queue_id <= GAUDI_QUEUE_ID_TPC_7_3)
+		return HL_COLLECTIVE_SLAVE;
+
+	if (queue_id >= GAUDI_QUEUE_ID_NIC_0_0 &&
+			queue_id <= GAUDI_QUEUE_ID_NIC_9_3)
+		return HL_COLLECTIVE_SLAVE;
+
+	return HL_COLLECTIVE_NOT_SUPPORTED;
+}
 
 static int gaudi_get_fixed_properties(struct hl_device *hdev)
 {
@@ -393,18 +418,28 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
 			prop->hw_queues_props[i].supports_sync_stream = 0;
 			prop->hw_queues_props[i].cb_alloc_flags =
 				CB_ALLOC_USER;
-		} else if (gaudi_queue_type[i] == QUEUE_TYPE_NA) {
-			prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
-			prop->hw_queues_props[i].driver_only = 0;
-			prop->hw_queues_props[i].supports_sync_stream = 0;
-			prop->hw_queues_props[i].cb_alloc_flags =
-				CB_ALLOC_USER;
+
 		}
+		prop->hw_queues_props[i].collective_mode =
+						get_collective_mode(hdev, i);
 	}
 
 	prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
-	prop->sync_stream_first_sob = 0;
-	prop->sync_stream_first_mon = 0;
+	prop->collective_first_sob = 0;
+	prop->collective_first_mon = 0;
+
+	/* 2 SOBs per internal queue stream are reserved for collective */
+	prop->sync_stream_first_sob =
+			ALIGN(NUMBER_OF_SOBS_IN_GRP, HL_MAX_SOBS_PER_MONITOR)
+			* QMAN_STREAMS * HL_RSVD_SOBS;
+
+	/* 1 monitor per internal queue stream are reserved for collective
+	 * 2 monitors per external queue stream are reserved for collective
+	 */
+	prop->sync_stream_first_mon =
+			(NUMBER_OF_COLLECTIVE_QUEUES * QMAN_STREAMS) +
+			(NUMBER_OF_EXT_HW_QUEUES * 2);
+
 	prop->dram_base_address = DRAM_PHYS_BASE;
 	prop->dram_size = GAUDI_HBM_SIZE_32GB;
 	prop->dram_end_address = prop->dram_base_address +
@@ -790,21 +825,454 @@ static int gaudi_init_tpc_mem(struct hl_device *hdev)
 	return rc;
 }
 
-static void gaudi_reset_sob_group(struct hl_device *hdev, u16 sob_groupt)
+static void gaudi_collective_map_sobs(struct hl_device *hdev, u32 stream)
+{
+	struct gaudi_device *gaudi = hdev->asic_specific;
+	struct gaudi_collective_properties *prop = &gaudi->collective_props;
+	struct hl_hw_queue *q;
+	u32 i, sob_id, sob_group_id, queue_id;
+
+	/* Iterate through SOB groups and assign a SOB for each slave queue */
+	sob_group_id =
+		stream * HL_RSVD_SOBS + prop->curr_sob_group_idx[stream];
+	sob_id = prop->hw_sob_group[sob_group_id].base_sob_id;
+
+	queue_id = GAUDI_QUEUE_ID_NIC_0_0 + stream;
+	for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++) {
+		q = &hdev->kernel_queues[queue_id + (4 * i)];
+		q->sync_stream_prop.collective_sob_id = sob_id + i;
+	}
+
+	/* Both DMA5 and TPC7 use the same resources since only a single
+	 * engine need to participate in the reduction process
+	 */
+	queue_id = GAUDI_QUEUE_ID_DMA_5_0 + stream;
+	q = &hdev->kernel_queues[queue_id];
+	q->sync_stream_prop.collective_sob_id =
+			sob_id + NIC_NUMBER_OF_ENGINES;
+
+	queue_id = GAUDI_QUEUE_ID_TPC_7_0 + stream;
+	q = &hdev->kernel_queues[queue_id];
+	q->sync_stream_prop.collective_sob_id =
+			sob_id + NIC_NUMBER_OF_ENGINES;
+}
+
+static void gaudi_sob_group_hw_reset(struct kref *ref)
+{
+	struct gaudi_hw_sob_group *hw_sob_group =
+		container_of(ref, struct gaudi_hw_sob_group, kref);
+	struct hl_device *hdev = hw_sob_group->hdev;
+	int i;
+
+	for (i = 0 ; i < NUMBER_OF_SOBS_IN_GRP ; i++)
+		WREG32(mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
+				(hw_sob_group->base_sob_id + i) * 4, 0);
+
+	kref_init(&hw_sob_group->kref);
+}
+
+static void gaudi_sob_group_reset_error(struct kref *ref)
+{
+	struct gaudi_hw_sob_group *hw_sob_group =
+		container_of(ref, struct gaudi_hw_sob_group, kref);
+	struct hl_device *hdev = hw_sob_group->hdev;
+
+	dev_crit(hdev->dev,
+		"SOB release shouldn't be called here, base_sob_id: %d\n",
+		hw_sob_group->base_sob_id);
+}
+
+static int gaudi_collective_init(struct hl_device *hdev)
+{
+	u32 i, master_monitor_sobs, sob_id, reserved_sobs_per_group;
+	struct gaudi_collective_properties *prop;
+	struct gaudi_device *gaudi;
+
+	gaudi = hdev->asic_specific;
+	prop = &gaudi->collective_props;
+	sob_id = hdev->asic_prop.collective_first_sob;
+
+	/* First sob in group must be aligned to HL_MAX_SOBS_PER_MONITOR */
+	reserved_sobs_per_group =
+		ALIGN(NUMBER_OF_SOBS_IN_GRP, HL_MAX_SOBS_PER_MONITOR);
+
+	/* Init SOB groups */
+	for (i = 0 ; i < NUM_SOB_GROUPS; i++) {
+		prop->hw_sob_group[i].hdev = hdev;
+		prop->hw_sob_group[i].base_sob_id = sob_id;
+		sob_id += reserved_sobs_per_group;
+		gaudi_sob_group_hw_reset(&prop->hw_sob_group[i].kref);
+	}
+
+	for (i = 0 ; i < QMAN_STREAMS; i++) {
+		prop->next_sob_group_val[i] = 1;
+		prop->curr_sob_group_idx[i] = 0;
+		gaudi_collective_map_sobs(hdev, i);
+	}
+
+	prop->mstr_sob_mask[0] = 0;
+	master_monitor_sobs = HL_MAX_SOBS_PER_MONITOR;
+	for (i = 0 ; i < master_monitor_sobs ; i++)
+		if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + i))
+			prop->mstr_sob_mask[0] |= BIT(i);
+
+	prop->mstr_sob_mask[1] = 0;
+	master_monitor_sobs =
+		NIC_NUMBER_OF_ENGINES - HL_MAX_SOBS_PER_MONITOR;
+	for (i = 0 ; i < master_monitor_sobs; i++) {
+		if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + i))
+			prop->mstr_sob_mask[1] |= BIT(i);
+	}
+
+	/* Set collective engine bit */
+	prop->mstr_sob_mask[1] |= BIT(i);
+
+	return 0;
+}
+
+static void gaudi_reset_sob_group(struct hl_device *hdev, u16 sob_group)
+{
+	struct gaudi_device *gaudi = hdev->asic_specific;
+	struct gaudi_collective_properties *cprop = &gaudi->collective_props;
+
+	kref_put(&cprop->hw_sob_group[sob_group].kref,
+					gaudi_sob_group_hw_reset);
+}
+
+static void gaudi_collective_master_init_job(struct hl_device *hdev,
+		struct hl_cs_job *job, u32 stream, u32 sob_group_offset)
+{
+	u32 master_sob_base, master_monitor, queue_id, cb_size = 0;
+	struct gaudi_collective_properties *cprop;
+	struct hl_gen_wait_properties wait_prop;
+	struct hl_sync_stream_properties *prop;
+	struct gaudi_device *gaudi;
+
+	gaudi = hdev->asic_specific;
+	cprop = &gaudi->collective_props;
+	queue_id = job->hw_queue_id;
+	prop = &hdev->kernel_queues[queue_id].sync_stream_prop;
+
+	master_sob_base =
+		cprop->hw_sob_group[sob_group_offset].base_sob_id;
+	master_monitor = prop->collective_mstr_mon_id[0];
+
+	dev_dbg(hdev->dev,
+		"Generate master wait CBs, sob %d (mask %#x), val:0x%x, mon %u, q %d\n",
+		master_sob_base, cprop->mstr_sob_mask[0],
+		cprop->next_sob_group_val[stream],
+		master_monitor, queue_id);
+
+	wait_prop.data = (void *) job->patched_cb;
+	wait_prop.sob_base = master_sob_base;
+	wait_prop.sob_mask = cprop->mstr_sob_mask[0];
+	wait_prop.sob_val = cprop->next_sob_group_val[stream];
+	wait_prop.mon_id = master_monitor;
+	wait_prop.q_idx = queue_id;
+	wait_prop.size = cb_size;
+	cb_size += gaudi_gen_wait_cb(hdev, &wait_prop);
+
+	master_sob_base += HL_MAX_SOBS_PER_MONITOR;
+	master_monitor = prop->collective_mstr_mon_id[1];
+
+	dev_dbg(hdev->dev,
+		"Generate master wait CBs, sob %d (mask %#x), val:0x%x, mon %u, q %d\n",
+		master_sob_base, cprop->mstr_sob_mask[1],
+		cprop->next_sob_group_val[stream],
+		master_monitor, queue_id);
+
+	wait_prop.sob_base = master_sob_base;
+	wait_prop.sob_mask = cprop->mstr_sob_mask[1];
+	wait_prop.mon_id = master_monitor;
+	wait_prop.size = cb_size;
+	cb_size += gaudi_gen_wait_cb(hdev, &wait_prop);
+}
+
+static void gaudi_collective_slave_init_job(struct hl_device *hdev,
+		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
 {
+	struct hl_gen_wait_properties wait_prop;
+	struct hl_sync_stream_properties *prop;
+	u32 queue_id, cb_size = 0;
+
+	queue_id = job->hw_queue_id;
+	prop = &hdev->kernel_queues[queue_id].sync_stream_prop;
 
+	/* Add to wait CBs using slave monitor */
+	wait_prop.data = (void *) job->user_cb;
+	wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
+	wait_prop.sob_mask = 0x1;
+	wait_prop.sob_val = cs_cmpl->sob_val;
+	wait_prop.mon_id = prop->collective_slave_mon_id;
+	wait_prop.q_idx = queue_id;
+	wait_prop.size = cb_size;
+
+	dev_dbg(hdev->dev,
+		"Generate slave wait CB, sob %d, val:0x%x, mon %d, q %d\n",
+		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
+		prop->collective_slave_mon_id, queue_id);
+
+	cb_size += gaudi_gen_wait_cb(hdev, &wait_prop);
+
+	dev_dbg(hdev->dev,
+		"generate signal CB, sob_id: %d, sob val: 1, q_idx: %d\n",
+		prop->collective_sob_id, queue_id);
+
+	cb_size += gaudi_gen_signal_cb(hdev, job->user_cb,
+			prop->collective_sob_id, cb_size);
 }
 
 static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
 {
+	struct hl_cs_compl *signal_cs_cmpl =
+		container_of(cs->signal_fence, struct hl_cs_compl, base_fence);
+	struct hl_cs_compl *cs_cmpl =
+		container_of(cs->fence, struct hl_cs_compl, base_fence);
+	struct gaudi_collective_properties *cprop;
+	u32 stream, queue_id, sob_group_offset;
+	struct hl_sync_stream_properties *prop;
+	struct gaudi_device *gaudi;
+	struct hl_device *hdev;
+	struct hl_cs_job *job;
+	struct hl_ctx *ctx;
+
+	ctx = cs->ctx;
+	hdev = ctx->hdev;
+	gaudi = hdev->asic_specific;
+	cprop = &gaudi->collective_props;
+
+	/* copy the SOB id and value of the signal CS */
+	cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+	cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+
+	/* Calculate the stream from collective master queue (1st job) */
+	job = list_first_entry(&cs->job_list, struct hl_cs_job, cs_node);
+	stream = job->hw_queue_id % 4;
+	sob_group_offset =
+		stream * HL_RSVD_SOBS + cprop->curr_sob_group_idx[stream];
+
+	list_for_each_entry(job, &cs->job_list, cs_node) {
+		queue_id = job->hw_queue_id;
+		prop = &hdev->kernel_queues[queue_id].sync_stream_prop;
+
+		if (hdev->kernel_queues[queue_id].collective_mode ==
+				HL_COLLECTIVE_MASTER) {
+			gaudi_collective_master_init_job(hdev, job, stream,
+						sob_group_offset);
+		} else {
+			gaudi_collective_slave_init_job(hdev, job, cs_cmpl);
+		}
+	}
+
+	cs_cmpl->sob_group = sob_group_offset;
+
+	/* Handle sob group kref and wraparound */
+	kref_get(&cprop->hw_sob_group[sob_group_offset].kref);
+	cprop->next_sob_group_val[stream]++;
 
+	if (cprop->next_sob_group_val[stream] == HL_MAX_SOB_VAL) {
+		/*
+		 * Decrement as we reached the max value.
+		 * The release function won't be called here as we've
+		 * just incremented the refcount.
+		 */
+		kref_put(&cprop->hw_sob_group[sob_group_offset].kref,
+				gaudi_sob_group_reset_error);
+		cprop->next_sob_group_val[stream] = 1;
+		/* only two SOBs are currently in use */
+		cprop->curr_sob_group_idx[stream] =
+			(cprop->curr_sob_group_idx[stream] + 1) &
+							(HL_RSVD_SOBS - 1);
+
+		gaudi_collective_map_sobs(hdev, stream);
+
+		dev_dbg(hdev->dev, "switched to SOB group %d, stream: %d\n",
+				cprop->curr_sob_group_idx[stream], stream);
+	}
+
+	/* Increment kref since all slave queues are now waiting on it */
+	kref_get(&cs_cmpl->hw_sob->kref);
+	/*
+	 * Must put the signal fence after the SOB refcnt increment so
+	 * the SOB refcnt won't turn 0 and reset the SOB before the
+	 * wait CS was submitted.
+	 */
+	mb();
+	hl_fence_put(cs->signal_fence);
+	cs->signal_fence = NULL;
+}
+
+static int gaudi_collective_wait_create_job(struct hl_device *hdev,
+		struct hl_ctx *ctx, struct hl_cs *cs,
+		enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id)
+{
+	struct hw_queue_properties *hw_queue_prop;
+	struct hl_cs_counters_atomic *cntr;
+	struct hl_cs_job *job;
+	struct hl_cb *cb;
+	u32 cb_size;
+	bool patched_cb;
+
+	cntr = &hdev->aggregated_cs_counters;
+
+	if (mode == HL_COLLECTIVE_MASTER) {
+		/* CB size of collective master queue contains
+		 * 4 msg short packets for monitor 1 configuration
+		 * 1 fence packet
+		 * 4 msg short packets for monitor 2 configuration
+		 * 1 fence packet
+		 * 2 msg prot packets for completion and MSI-X
+		 */
+		cb_size = sizeof(struct packet_msg_short) * 8 +
+				sizeof(struct packet_fence) * 2 +
+				sizeof(struct packet_msg_prot) * 2;
+		patched_cb = true;
+	} else {
+		/* CB size of collective slave queues contains
+		 * 4 msg short packets for monitor configuration
+		 * 1 fence packet
+		 * 1 additional msg short packet for sob signal
+		 */
+		cb_size = sizeof(struct packet_msg_short) * 5 +
+				sizeof(struct packet_fence);
+		patched_cb = false;
+	}
+
+	hw_queue_prop = &hdev->asic_prop.hw_queues_props[queue_id];
+	job = hl_cs_allocate_job(hdev, hw_queue_prop->type, true);
+	if (!job) {
+		ctx->cs_counters.out_of_mem_drop_cnt++;
+		atomic64_inc(&cntr->out_of_mem_drop_cnt);
+		dev_err(hdev->dev, "Failed to allocate a new job\n");
+		return -ENOMEM;
+	}
+
+	/* Allocate internal mapped CB for non patched CBs */
+	cb = hl_cb_kernel_create(hdev, cb_size,
+			hdev->mmu_enable && !patched_cb);
+	if (!cb) {
+		ctx->cs_counters.out_of_mem_drop_cnt++;
+		atomic64_inc(&cntr->out_of_mem_drop_cnt);
+		kfree(job);
+		return -EFAULT;
+	}
+
+	job->id = 0;
+	job->cs = cs;
+	job->user_cb = cb;
+	job->user_cb->cs_cnt++;
+	job->user_cb_size = cb_size;
+	job->hw_queue_id = queue_id;
+
+	/*
+	 * No need in parsing, user CB is the patched CB.
+	 * We call hl_cb_destroy() out of two reasons - we don't need
+	 * the CB in the CB idr anymore and to decrement its refcount as
+	 * it was incremented inside hl_cb_kernel_create().
+	 */
+	if (patched_cb)
+		job->patched_cb = job->user_cb;
+	else
+		job->patched_cb = NULL;
+
+	job->job_cb_size = job->user_cb_size;
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+
+	/* increment refcount as for external queues we get completion */
+	if (hw_queue_prop->type == QUEUE_TYPE_EXT)
+		cs_get(cs);
+
+	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+	list_add_tail(&job->cs_node, &cs->job_list);
+
+	hl_debugfs_add_job(hdev, job);
+
+	return 0;
 }
 
 static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
 		struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
 		u32 collective_engine_id)
 {
-	return -EINVAL;
+	struct gaudi_device *gaudi = hdev->asic_specific;
+	struct hw_queue_properties *hw_queue_prop;
+	u32 queue_id, collective_queue, num_jobs;
+	u32 stream, nic_queue, nic_idx = 0;
+	bool skip;
+	int i, rc;
+
+	/* Verify wait queue id is configured as master */
+	hw_queue_prop = &hdev->asic_prop.hw_queues_props[wait_queue_id];
+	if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
+		dev_err(hdev->dev,
+			"Queue %d is not configured as collective master\n",
+			wait_queue_id);
+		return -EINVAL;
+	}
+
+	/* Verify engine id is supported */
+	if (collective_engine_id != GAUDI_ENGINE_ID_DMA_5 &&
+			collective_engine_id != GAUDI_ENGINE_ID_TPC_7) {
+		dev_err(hdev->dev,
+			"Collective wait does not support engine %u\n",
+			collective_engine_id);
+		return -EINVAL;
+	}
+
+	stream = wait_queue_id % 4;
+
+	if (collective_engine_id == GAUDI_ENGINE_ID_DMA_5)
+		collective_queue = GAUDI_QUEUE_ID_DMA_5_0 + stream;
+	else if (collective_engine_id == GAUDI_ENGINE_ID_TPC_7)
+		collective_queue = GAUDI_QUEUE_ID_TPC_7_0 + stream;
+	else
+		return -EINVAL;
+
+	num_jobs = NUMBER_OF_SOBS_IN_GRP + 1;
+	nic_queue = GAUDI_QUEUE_ID_NIC_0_0 + stream;
+
+	/* The first job goes to the collective master queue; it will wait
+	 * for the collective slave queues to finish execution.
+	 * The synchronization is done using two monitors:
+	 * the first monitor for NICs 0-7, the second monitor for NICs 8-9
+	 * and the reduction engine (DMA5/TPC7).
+	 *
+	 * The rest of the jobs go to the collective slave queues, which will
+	 * all wait for the user to signal sob 'cs_cmpl->sob_val'.
+	 */
+	for (i = 0 ; i < num_jobs ; i++) {
+		if (i == 0) {
+			queue_id = wait_queue_id;
+			rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
+				HL_COLLECTIVE_MASTER, queue_id, wait_queue_id);
+		} else {
+			if (nic_idx < NIC_NUMBER_OF_ENGINES) {
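+				/* Skip NIC engines that were not initialized */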
+				if (gaudi->hw_cap_initialized &
+					BIT(HW_CAP_NIC_SHIFT + nic_idx))
+					skip = false;
+				else
+					skip = true;
+
+				queue_id = nic_queue;
+				nic_queue += 4;
+				nic_idx++;
+
+				if (skip)
+					continue;
+			} else {
+				queue_id = collective_queue;
+			}
+
+			rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
+				HL_COLLECTIVE_SLAVE, queue_id, wait_queue_id);
+		}
+
+		if (rc)
+			return rc;
+	}
+
+	return rc;
 }
 
 static int gaudi_late_init(struct hl_device *hdev)
@@ -861,6 +1329,12 @@ static int gaudi_late_init(struct hl_device *hdev)
 		goto disable_pci_access;
 	}
 
+	rc = gaudi_collective_init(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to init collective\n");
+		goto disable_pci_access;
+	}
+
 	return 0;
 
 disable_pci_access:
@@ -2042,21 +2516,29 @@ static void gaudi_init_pci_dma_qmans(struct hl_device *hdev)
 static void gaudi_init_hbm_dma_qman(struct hl_device *hdev, int dma_id,
 					int qman_id, u64 qman_base_addr)
 {
-	u32 mtr_base_lo, mtr_base_hi;
-	u32 so_base_lo, so_base_hi;
+	u32 mtr_base_en_lo, mtr_base_en_hi, mtr_base_ws_lo, mtr_base_ws_hi;
+	u32 so_base_en_lo, so_base_en_hi, so_base_ws_lo, so_base_ws_hi;
 	u32 q_off, dma_qm_offset;
 	u32 dma_qm_err_cfg;
 
 	dma_qm_offset = dma_id * DMA_QMAN_OFFSET;
 
-	mtr_base_lo = lower_32_bits(CFG_BASE +
-				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
-	mtr_base_hi = upper_32_bits(CFG_BASE +
+	mtr_base_en_lo = lower_32_bits(CFG_BASE +
+			mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	mtr_base_en_hi = upper_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
-	so_base_lo = lower_32_bits(CFG_BASE +
+	so_base_en_lo = lower_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0);
-	so_base_hi = upper_32_bits(CFG_BASE +
+	so_base_en_hi = upper_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0);
+	mtr_base_ws_lo = lower_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	mtr_base_ws_hi = upper_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	so_base_ws_lo = lower_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0);
+	so_base_ws_hi = upper_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0);
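+	/* "en"/"ws" refer to the East/North and West/South sync managers;
+	 * the West/South bases are used for the sync stream collective
+	 */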
 
 	q_off = dma_qm_offset + qman_id * 4;
 
@@ -2114,10 +2596,22 @@ static void gaudi_init_hbm_dma_qman(struct hl_device *hdev, int dma_id,
 				QMAN_INTERNAL_MAKE_TRUSTED);
 	}
 
-	WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_lo);
-	WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_hi);
-	WREG32(mmDMA0_QM_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_lo);
-	WREG32(mmDMA0_QM_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_hi);
+	WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_en_lo);
+	WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_en_hi);
+	WREG32(mmDMA0_QM_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_en_lo);
+	WREG32(mmDMA0_QM_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_en_hi);
+
+	/* Configure DMA5 CP_MSG_BASE 2/3 for sync stream collective */
+	if (gaudi_dma_assignment[dma_id] == GAUDI_ENGINE_ID_DMA_5) {
+		WREG32(mmDMA0_QM_CP_MSG_BASE2_ADDR_LO_0 + q_off,
+				mtr_base_ws_lo);
+		WREG32(mmDMA0_QM_CP_MSG_BASE2_ADDR_HI_0 + q_off,
+				mtr_base_ws_hi);
+		WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_LO_0 + q_off,
+				so_base_ws_lo);
+		WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_HI_0 + q_off,
+				so_base_ws_hi);
+	}
 }
 
 static void gaudi_init_hbm_dma_qmans(struct hl_device *hdev)
@@ -2280,22 +2774,33 @@ static void gaudi_init_mme_qmans(struct hl_device *hdev)
 static void gaudi_init_tpc_qman(struct hl_device *hdev, u32 tpc_offset,
 				int qman_id, u64 qman_base_addr)
 {
-	u32 mtr_base_lo, mtr_base_hi;
-	u32 so_base_lo, so_base_hi;
+	u32 mtr_base_en_lo, mtr_base_en_hi, mtr_base_ws_lo, mtr_base_ws_hi;
+	u32 so_base_en_lo, so_base_en_hi, so_base_ws_lo, so_base_ws_hi;
 	u32 q_off, tpc_id;
 	u32 tpc_qm_err_cfg;
 
-	mtr_base_lo = lower_32_bits(CFG_BASE +
-				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
-	mtr_base_hi = upper_32_bits(CFG_BASE +
+	mtr_base_en_lo = lower_32_bits(CFG_BASE +
+			mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	mtr_base_en_hi = upper_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
-	so_base_lo = lower_32_bits(CFG_BASE +
+	so_base_en_lo = lower_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0);
-	so_base_hi = upper_32_bits(CFG_BASE +
+	so_base_en_hi = upper_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0);
+	mtr_base_ws_lo = lower_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	mtr_base_ws_hi = upper_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	so_base_ws_lo = lower_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0);
+	so_base_ws_hi = upper_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0);
 
 	q_off = tpc_offset + qman_id * 4;
 
+	tpc_id = tpc_offset /
+			(mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0);
+
 	if (qman_id < 4) {
 		WREG32(mmTPC0_QM_PQ_BASE_LO_0 + q_off,
 					lower_32_bits(qman_base_addr));
@@ -2321,9 +2826,6 @@ static void gaudi_init_tpc_qman(struct hl_device *hdev, u32 tpc_offset,
 							QMAN_LDMA_DST_OFFSET);
 
 		/* Configure RAZWI IRQ */
-		tpc_id = tpc_offset /
-				(mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0);
-
 		tpc_qm_err_cfg = TPC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
 		if (hdev->stop_on_err) {
 			tpc_qm_err_cfg |=
@@ -2353,10 +2855,22 @@ static void gaudi_init_tpc_qman(struct hl_device *hdev, u32 tpc_offset,
 				QMAN_INTERNAL_MAKE_TRUSTED);
 	}
 
-	WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_lo);
-	WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_hi);
-	WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_lo);
-	WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_hi);
+	WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_en_lo);
+	WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_en_hi);
+	WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_en_lo);
+	WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_en_hi);
+
+	/* Configure TPC7 CP_MSG_BASE 2/3 for sync stream collective */
+	if (tpc_id == 6) {
+		WREG32(mmTPC0_QM_CP_MSG_BASE2_ADDR_LO_0 + q_off,
+				mtr_base_ws_lo);
+		WREG32(mmTPC0_QM_CP_MSG_BASE2_ADDR_HI_0 + q_off,
+				mtr_base_ws_hi);
+		WREG32(mmTPC0_QM_CP_MSG_BASE3_ADDR_LO_0 + q_off,
+				so_base_ws_lo);
+		WREG32(mmTPC0_QM_CP_MSG_BASE3_ADDR_HI_0 + q_off,
+				so_base_ws_hi);
+	}
 }
 
 static void gaudi_init_tpc_qmans(struct hl_device *hdev)
@@ -2407,19 +2921,27 @@ static void gaudi_init_tpc_qmans(struct hl_device *hdev)
 static void gaudi_init_nic_qman(struct hl_device *hdev, u32 nic_offset,
 				int qman_id, u64 qman_base_addr, int nic_id)
 {
-	u32 mtr_base_lo, mtr_base_hi;
-	u32 so_base_lo, so_base_hi;
+	u32 mtr_base_en_lo, mtr_base_en_hi, mtr_base_ws_lo, mtr_base_ws_hi;
+	u32 so_base_en_lo, so_base_en_hi, so_base_ws_lo, so_base_ws_hi;
 	u32 q_off;
 	u32 nic_qm_err_cfg;
 
-	mtr_base_lo = lower_32_bits(CFG_BASE +
-				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
-	mtr_base_hi = upper_32_bits(CFG_BASE +
+	mtr_base_en_lo = lower_32_bits(CFG_BASE +
+			mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	mtr_base_en_hi = upper_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
-	so_base_lo = lower_32_bits(CFG_BASE +
+	so_base_en_lo = lower_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0);
-	so_base_hi = upper_32_bits(CFG_BASE +
+	so_base_en_hi = upper_32_bits(CFG_BASE +
 				mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0);
+	mtr_base_ws_lo = lower_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	mtr_base_ws_hi = upper_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0);
+	so_base_ws_lo = lower_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0);
+	so_base_ws_hi = upper_32_bits(CFG_BASE +
+				mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0);
 
 	q_off = nic_offset + qman_id * 4;
 
@@ -2430,14 +2952,23 @@ static void gaudi_init_nic_qman(struct hl_device *hdev, u32 nic_offset,
 	WREG32(mmNIC0_QM0_PQ_PI_0 + q_off, 0);
 	WREG32(mmNIC0_QM0_PQ_CI_0 + q_off, 0);
 
-	WREG32(mmNIC0_QM0_CP_LDMA_TSIZE_OFFSET_0 + q_off, 0x74);
-	WREG32(mmNIC0_QM0_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, 0x14);
-	WREG32(mmNIC0_QM0_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, 0x1C);
+	WREG32(mmNIC0_QM0_CP_LDMA_TSIZE_OFFSET_0 + q_off,
+							QMAN_LDMA_SIZE_OFFSET);
+	WREG32(mmNIC0_QM0_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off,
+							QMAN_LDMA_SRC_OFFSET);
+	WREG32(mmNIC0_QM0_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off,
+							QMAN_LDMA_DST_OFFSET);
+
+	WREG32(mmNIC0_QM0_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_en_lo);
+	WREG32(mmNIC0_QM0_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_en_hi);
+	WREG32(mmNIC0_QM0_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_en_lo);
+	WREG32(mmNIC0_QM0_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_en_hi);
 
-	WREG32(mmNIC0_QM0_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_lo);
-	WREG32(mmNIC0_QM0_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_hi);
-	WREG32(mmNIC0_QM0_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_lo);
-	WREG32(mmNIC0_QM0_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_hi);
+	/* Configure NIC CP_MSG_BASE 2/3 for sync stream collective */
+	WREG32(mmNIC0_QM0_CP_MSG_BASE2_ADDR_LO_0 + q_off, mtr_base_ws_lo);
+	WREG32(mmNIC0_QM0_CP_MSG_BASE2_ADDR_HI_0 + q_off, mtr_base_ws_hi);
+	WREG32(mmNIC0_QM0_CP_MSG_BASE3_ADDR_LO_0 + q_off, so_base_ws_lo);
+	WREG32(mmNIC0_QM0_CP_MSG_BASE3_ADDR_HI_0 + q_off, so_base_ws_hi);
 
 	if (qman_id == 0) {
 		/* Configure RAZWI IRQ */
@@ -7027,11 +7558,152 @@ static enum hl_device_hw_state gaudi_get_hw_state(struct hl_device *hdev)
 	return RREG32(mmHW_STATE);
 }
 
+static int gaudi_internal_cb_pool_init(struct hl_device *hdev,
+		struct hl_ctx *ctx)
+{
+	struct gaudi_device *gaudi = hdev->asic_specific;
+	bool flush_pte;
+	u64 va, pa;
+	s64 off;
+	int min_alloc_order, rc, collective_cb_size;
+
+	if (!(gaudi->hw_cap_initialized & HW_CAP_MMU))
+		return 0;
+
+	hdev->internal_cb_pool_virt_addr =
+			hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
+					HOST_SPACE_INTERNAL_CB_SZ,
+					&hdev->internal_cb_pool_dma_addr,
+					GFP_KERNEL | __GFP_ZERO);
+
+	if (!hdev->internal_cb_pool_virt_addr)
+		return -ENOMEM;
+
+	collective_cb_size = sizeof(struct packet_msg_short) * 5 +
+			sizeof(struct packet_fence);
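+	/* Derive the pool's minimal allocation order from the collective
+	 * slave CB size
+	 */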
+	min_alloc_order = ilog2(collective_cb_size);
+
+	hdev->internal_cb_pool = gen_pool_create(min_alloc_order, -1);
+	if (!hdev->internal_cb_pool) {
+		dev_err(hdev->dev,
+			"Failed to create internal CB pool\n");
+		rc = -ENOMEM;
+		goto free_internal_cb_pool;
+	}
+
+	rc = gen_pool_add(hdev->internal_cb_pool,
+				(uintptr_t) hdev->internal_cb_pool_virt_addr,
+				HOST_SPACE_INTERNAL_CB_SZ, -1);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to add memory to internal CB pool\n");
+		rc = -EFAULT;
+		goto destroy_internal_cb_pool;
+	}
+
+	hdev->internal_cb_va_base = VA_HOST_SPACE_INTERNAL_CB_START;
+
+	mutex_lock(&ctx->mmu_lock);
+
+	/* The mapping is done page by page since we can't guarantee that the
+	 * allocated pointer is aligned to HOST_SPACE_INTERNAL_CB_SZ
+	 */
+	for (off = 0 ; off < HOST_SPACE_INTERNAL_CB_SZ ; off += PAGE_SIZE_4KB) {
+		va = VA_HOST_SPACE_INTERNAL_CB_START + off;
+		pa = hdev->internal_cb_pool_dma_addr + off;
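+		/* Flush the page-table updates only on the last page */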
+		flush_pte = (off + PAGE_SIZE_4KB) >= HOST_SPACE_INTERNAL_CB_SZ;
+		rc = hl_mmu_map(ctx, va, pa, PAGE_SIZE_4KB, flush_pte);
+		if (rc) {
+			dev_err(hdev->dev,
+				"Map failed for va 0x%llx to pa 0x%llx\n",
+				va, pa);
+			goto unmap;
+		}
+	}
+
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+	return 0;
+
+unmap:
+	for (; off >= 0 ; off -= PAGE_SIZE_4KB) {
+		va = VA_HOST_SPACE_INTERNAL_CB_START + off;
+		flush_pte = (off - (s32) PAGE_SIZE_4KB) < 0;
+		if (hl_mmu_unmap(ctx, va, PAGE_SIZE_4KB, flush_pte))
+			dev_warn_ratelimited(hdev->dev,
+					"failed to unmap va 0x%llx\n", va);
+	}
+
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+destroy_internal_cb_pool:
+	gen_pool_destroy(hdev->internal_cb_pool);
+
+free_internal_cb_pool:
+	hdev->asic_funcs->asic_dma_free_coherent(hdev,
+			HOST_SPACE_INTERNAL_CB_SZ,
+			hdev->internal_cb_pool_virt_addr,
+			hdev->internal_cb_pool_dma_addr);
+
+	return rc;
+}
+
+static void gaudi_internal_cb_pool_fini(struct hl_device *hdev,
+		struct hl_ctx *ctx)
+{
+	struct gaudi_device *gaudi = hdev->asic_specific;
+	bool flush_pte = false;
+	u64 va, off;
+
+	if (!(gaudi->hw_cap_initialized & HW_CAP_MMU))
+		return;
+
+	mutex_lock(&ctx->mmu_lock);
+
+	for (off = 0 ; off < HOST_SPACE_INTERNAL_CB_SZ ; off += PAGE_SIZE_4KB) {
+		va = VA_HOST_SPACE_INTERNAL_CB_START + off;
+
+		if (off + PAGE_SIZE_4KB >= HOST_SPACE_INTERNAL_CB_SZ)
+			flush_pte = true;
+
+		if (hl_mmu_unmap(ctx, va, PAGE_SIZE_4KB, flush_pte))
+			dev_warn_ratelimited(hdev->dev,
+					"failed to unmap va 0x%llx\n", va);
+	}
+
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+	gen_pool_destroy(hdev->internal_cb_pool);
+
+	hdev->asic_funcs->asic_dma_free_coherent(hdev,
+			HOST_SPACE_INTERNAL_CB_SZ,
+			hdev->internal_cb_pool_virt_addr,
+			hdev->internal_cb_pool_dma_addr);
+}
+
 static int gaudi_ctx_init(struct hl_ctx *ctx)
 {
 	gaudi_mmu_prepare(ctx->hdev, ctx->asid);
+	return gaudi_internal_cb_pool_init(ctx->hdev, ctx);
+}
 
-	return 0;
+void gaudi_ctx_fini(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+
+	/* Gaudi will NEVER support more than a single compute context.
+	 * Therefore, don't clear anything unless this is the compute context
+	 */
+	if (hdev->compute_ctx != ctx)
+		return;
+
+	gaudi_internal_cb_pool_fini(ctx->hdev, ctx);
 }
 
 static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
@@ -7053,7 +7725,7 @@ static u32 gaudi_get_wait_cb_size(struct hl_device *hdev)
 }
 
 static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
-		u32 size)
+				u32 size)
 {
 	struct hl_cb *cb = (struct hl_cb *) data;
 	struct packet_msg_short *pkt;
@@ -7173,7 +7845,7 @@ static u32 gaudi_add_fence_pkt(struct packet_fence *pkt)
 
 static int gaudi_get_fence_addr(struct hl_device *hdev, u32 queue_id, u64 *addr)
 {
-	u32 offset;
+	u32 offset, nic_index;
 
 	switch (queue_id) {
 	case GAUDI_QUEUE_ID_DMA_0_0:
@@ -7212,6 +7884,78 @@ static int gaudi_get_fence_addr(struct hl_device *hdev, u32 queue_id, u64 *addr)
 	case GAUDI_QUEUE_ID_DMA_5_3:
 		offset = mmDMA5_QM_CP_FENCE2_RDATA_3;
 		break;
+	case GAUDI_QUEUE_ID_TPC_7_0:
+		offset = mmTPC7_QM_CP_FENCE2_RDATA_0;
+		break;
+	case GAUDI_QUEUE_ID_TPC_7_1:
+		offset = mmTPC7_QM_CP_FENCE2_RDATA_1;
+		break;
+	case GAUDI_QUEUE_ID_TPC_7_2:
+		offset = mmTPC7_QM_CP_FENCE2_RDATA_2;
+		break;
+	case GAUDI_QUEUE_ID_TPC_7_3:
+		offset = mmTPC7_QM_CP_FENCE2_RDATA_3;
+		break;
+	case GAUDI_QUEUE_ID_NIC_0_0:
+	case GAUDI_QUEUE_ID_NIC_1_0:
+	case GAUDI_QUEUE_ID_NIC_2_0:
+	case GAUDI_QUEUE_ID_NIC_3_0:
+	case GAUDI_QUEUE_ID_NIC_4_0:
+	case GAUDI_QUEUE_ID_NIC_5_0:
+	case GAUDI_QUEUE_ID_NIC_6_0:
+	case GAUDI_QUEUE_ID_NIC_7_0:
+	case GAUDI_QUEUE_ID_NIC_8_0:
+	case GAUDI_QUEUE_ID_NIC_9_0:
+		nic_index = (queue_id - GAUDI_QUEUE_ID_NIC_0_0) >> 2;
+		offset = mmNIC0_QM0_CP_FENCE2_RDATA_0 +
+				(nic_index >> 1) * NIC_MACRO_QMAN_OFFSET +
+				(nic_index & 0x1) * NIC_ENGINE_QMAN_OFFSET;
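+		/* e.g. GAUDI_QUEUE_ID_NIC_5_0: nic_index is 5, so the fence
+		 * address is taken from NIC macro 2, engine QMAN 1
+		 */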
+		break;
+	case GAUDI_QUEUE_ID_NIC_0_1:
+	case GAUDI_QUEUE_ID_NIC_1_1:
+	case GAUDI_QUEUE_ID_NIC_2_1:
+	case GAUDI_QUEUE_ID_NIC_3_1:
+	case GAUDI_QUEUE_ID_NIC_4_1:
+	case GAUDI_QUEUE_ID_NIC_5_1:
+	case GAUDI_QUEUE_ID_NIC_6_1:
+	case GAUDI_QUEUE_ID_NIC_7_1:
+	case GAUDI_QUEUE_ID_NIC_8_1:
+	case GAUDI_QUEUE_ID_NIC_9_1:
+		nic_index = (queue_id - GAUDI_QUEUE_ID_NIC_0_1) >> 2;
+		offset = mmNIC0_QM0_CP_FENCE2_RDATA_1 +
+				(nic_index >> 1) * NIC_MACRO_QMAN_OFFSET +
+				(nic_index & 0x1) * NIC_ENGINE_QMAN_OFFSET;
+		break;
+	case GAUDI_QUEUE_ID_NIC_0_2:
+	case GAUDI_QUEUE_ID_NIC_1_2:
+	case GAUDI_QUEUE_ID_NIC_2_2:
+	case GAUDI_QUEUE_ID_NIC_3_2:
+	case GAUDI_QUEUE_ID_NIC_4_2:
+	case GAUDI_QUEUE_ID_NIC_5_2:
+	case GAUDI_QUEUE_ID_NIC_6_2:
+	case GAUDI_QUEUE_ID_NIC_7_2:
+	case GAUDI_QUEUE_ID_NIC_8_2:
+	case GAUDI_QUEUE_ID_NIC_9_2:
+		nic_index = (queue_id - GAUDI_QUEUE_ID_NIC_0_2) >> 2;
+		offset = mmNIC0_QM0_CP_FENCE2_RDATA_2 +
+				(nic_index >> 1) * NIC_MACRO_QMAN_OFFSET +
+				(nic_index & 0x1) * NIC_ENGINE_QMAN_OFFSET;
+		break;
+	case GAUDI_QUEUE_ID_NIC_0_3:
+	case GAUDI_QUEUE_ID_NIC_1_3:
+	case GAUDI_QUEUE_ID_NIC_2_3:
+	case GAUDI_QUEUE_ID_NIC_3_3:
+	case GAUDI_QUEUE_ID_NIC_4_3:
+	case GAUDI_QUEUE_ID_NIC_5_3:
+	case GAUDI_QUEUE_ID_NIC_6_3:
+	case GAUDI_QUEUE_ID_NIC_7_3:
+	case GAUDI_QUEUE_ID_NIC_8_3:
+	case GAUDI_QUEUE_ID_NIC_9_3:
+		nic_index = (queue_id - GAUDI_QUEUE_ID_NIC_0_3) >> 2;
+		offset = mmNIC0_QM0_CP_FENCE2_RDATA_3 +
+				(nic_index >> 1) * NIC_MACRO_QMAN_OFFSET +
+				(nic_index & 0x1) * NIC_ENGINE_QMAN_OFFSET;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -7262,8 +8006,8 @@ static u32 gaudi_add_mon_pkts(void *buf, u16 mon_id, u64 fence_addr)
 	return size;
 }
 
-u32 gaudi_gen_wait_cb(struct hl_device *hdev,
-		struct hl_gen_wait_properties *prop)
+static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
+				struct hl_gen_wait_properties *prop)
 {
 	struct hl_cb *cb = (struct hl_cb *) prop->data;
 	void *buf = (void *) (uintptr_t) cb->kernel_address;
@@ -7377,6 +8121,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.wreg = hl_wreg,
 	.halt_coresight = gaudi_halt_coresight,
 	.ctx_init = gaudi_ctx_init,
+	.ctx_fini = gaudi_ctx_fini,
 	.get_clk_rate = gaudi_get_clk_rate,
 	.get_queue_id_for_cq = gaudi_get_queue_id_for_cq,
 	.read_device_fw_version = gaudi_read_device_fw_version,
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 73171d8b40d8..1dcf7c41e698 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -23,6 +23,10 @@
 					NUMBER_OF_CPU_HW_QUEUES + \
 					NUMBER_OF_INT_HW_QUEUES)
 
+/* 10 NIC QMANs, DMA5 QMAN, TPC7 QMAN */
+#define NUMBER_OF_COLLECTIVE_QUEUES	12
+#define NUMBER_OF_SOBS_IN_GRP		11
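+/* One SOB per collective slave: 10 NICs plus a single reduction engine */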
+
 /*
  * Number of MSI interrupts IDS:
  * Each completion queue has 1 ID
@@ -149,10 +153,14 @@
 
 /* Virtual address space */
 #define VA_HOST_SPACE_START	0x1000000000000ull	/* 256TB */
-#define VA_HOST_SPACE_END	0x3FF8000000000ull	/* 1PB - 1TB */
+#define VA_HOST_SPACE_END	0x3FF7FFFE00000ull	/* 1PB - 1TB */
 #define VA_HOST_SPACE_SIZE	(VA_HOST_SPACE_END - \
 					VA_HOST_SPACE_START) /* 767TB */
 
+#define VA_HOST_SPACE_INTERNAL_CB_START	0x3FF7FFFE00000ull /* 1PB - 1TB - 2MB */
+#define VA_HOST_SPACE_INTERNAL_CB_END	0x3FF8000000000ull /* 1PB - 1TB */
+#define HOST_SPACE_INTERNAL_CB_SZ	SZ_2M
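+/* The internal CB window is carved from the top 2MB of the host VA space */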
+
 #define HW_CAP_PLL		BIT(0)
 #define HW_CAP_HBM		BIT(1)
 #define HW_CAP_MMU		BIT(2)
@@ -240,6 +248,34 @@ enum gaudi_nic_mask {
 	GAUDI_NIC_MASK_ALL = 0x3FF
 };
 
+/**
+ * struct gaudi_hw_sob_group - H/W SOB group info.
+ * @hdev: habanalabs device structure.
+ * @kref: refcount of this SOB group. The group will be reset once the
+ *        refcount reaches zero.
+ * @base_sob_id: base sob id of this SOB group.
+ */
+struct gaudi_hw_sob_group {
+	struct hl_device	*hdev;
+	struct kref		kref;
+	u32			base_sob_id;
+};
+
+#define NUM_SOB_GROUPS (HL_RSVD_SOBS * QMAN_STREAMS)
+/**
+ * struct gaudi_collective_properties -
+ *     holds all SOB group and queue info reserved for the collective
+ * @hw_sob_group: H/W SOB groups.
+ * @next_sob_group_val: the next value to use for the currently used SOB group.
+ * @curr_sob_group_idx: the index of the currently used SOB group.
+ * @mstr_sob_mask: pre-defined masks for collective master monitors
+ */
+struct gaudi_collective_properties {
+	struct gaudi_hw_sob_group hw_sob_group[NUM_SOB_GROUPS];
+	u16			next_sob_group_val[QMAN_STREAMS];
+	u8			curr_sob_group_idx[QMAN_STREAMS];
+	u8			mstr_sob_mask[HL_COLLECTIVE_RSVD_MSTR_MONS];
+};
+
 /**
  * struct gaudi_internal_qman_info - Internal QMAN information.
  * @pq_kernel_addr: Kernel address of the PQ memory area in the host.
@@ -285,6 +321,8 @@ struct gaudi_device {
 
 	struct gaudi_internal_qman_info	internal_qmans[GAUDI_QUEUE_ID_SIZE];
 
+	struct gaudi_collective_properties collective_props;
+
 	u64				hbm_bar_cur_addr;
 	u64				max_freq_value;
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 757a55215b0f..0afce8a834af 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5347,6 +5347,11 @@ int goya_collective_wait_create_jobs(struct hl_device *hdev,
 	return -EINVAL;
 }
 
+static void goya_ctx_fini(struct hl_ctx *ctx)
+{
+
+}
+
 static const struct hl_asic_funcs goya_funcs = {
 	.early_init = goya_early_init,
 	.early_fini = goya_early_fini,
@@ -5408,6 +5413,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.wreg = hl_wreg,
 	.halt_coresight = goya_halt_coresight,
 	.ctx_init = goya_ctx_init,
+	.ctx_fini = goya_ctx_fini,
 	.get_clk_rate = goya_get_clk_rate,
 	.get_queue_id_for_cq = goya_get_queue_id_for_cq,
 	.read_device_fw_version = goya_read_device_fw_version,
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 4661a74f0425..0185311b679b 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -18,8 +18,18 @@
 #define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START		0x8000	/* 32KB */
 #define GAUDI_DRIVER_SRAM_RESERVED_SIZE_FROM_START	0x80	/* 128 bytes */
 
-#define GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT		32
-#define GAUDI_FIRST_AVAILABLE_W_S_MONITOR		16
+/*
+ * 128 SOBs reserved for collective wait
+ * 16 SOBs reserved for sync stream
+ */
+#define GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT		144
+
+/*
+ * 64 monitors reserved for collective wait
+ * 8 monitors reserved for sync stream
+ */
+#define GAUDI_FIRST_AVAILABLE_W_S_MONITOR		72
+
 /*
  * Goya queue Numbering
  *
-- 
2.17.1

