linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/4] uapi: habanalabs: add signal/wait operations
@ 2020-05-07 14:36 Oded Gabbay
  2020-05-07 14:36 ` [PATCH 2/4] habanalabs: define ASIC-dependent interface for signal/wait Oded Gabbay
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Oded Gabbay @ 2020-05-07 14:36 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh

From: Omer Shpigelman <oshpigelman@habana.ai>

This is a pre-requisite to upstreaming GAUDI support.

Signal/wait operations are done by the user to perform sync between two
Primary Queues (PQs). The sync is done using the sync manager and it is
usually resolved inside the device, but sometimes it can be resolved in the
host, i.e. the user should be able to wait in the host until a signal has
been completed.

The mechanism to define signal and wait operations is done by the driver
because it needs atomicity and serialization, which is already done in the
driver when submitting work to the different queues.

To implement this feature, the driver "takes" a couple of h/w resources,
and this is reflected by the defines added to the uapi file.

The signal/wait operations are done via the existing CS IOCTL, and they use
the same data structure. There is a difference in the meaning of some of
the parameters, and for that we added unions to make the code more
readable.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/command_submission.c | 13 +++-
 drivers/misc/habanalabs/habanalabs.h         |  2 +
 include/uapi/misc/habanalabs.h               | 67 +++++++++++++++-----
 3 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 6680e183d881..f7d03a35e6a8 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -11,6 +11,8 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 
+#define HL_CS_FLAGS_SIG_WAIT	(HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT)
+
 static void job_wq_completion(struct work_struct *work);
 static long _hl_cs_wait_ioctl(struct hl_device *hdev,
 		struct hl_ctx *ctx, u64 timeout_us, u64 seq);
@@ -659,7 +661,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	union hl_cs_args *args = data;
 	struct hl_ctx *ctx = hpriv->ctx;
 	void __user *chunks_execute, *chunks_restore;
-	u32 num_chunks_execute, num_chunks_restore;
+	u32 num_chunks_execute, num_chunks_restore, sig_wait_flags;
 	u64 cs_seq = ULONG_MAX;
 	int rc, do_ctx_switch;
 	bool need_soft_reset = false;
@@ -672,6 +674,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		goto out;
 	}
 
+	sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT;
+
+	if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) &&
+			(!hdev->supports_sync_stream))) {
+		dev_err(hdev->dev, "Sync stream CS is not supported\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
 	chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute;
 	num_chunks_execute = args->in.num_chunks_execute;
 
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index b1dc6a22ba0d..7cd9a8d72451 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1347,6 +1347,7 @@ struct hl_device_idle_busy_ts {
  *                           only to POWER9 machines.
  * @cdev_sysfs_created: were char devices and sysfs nodes created.
  * @stop_on_err: true if engines should stop on error.
+ * @supports_sync_stream: is sync stream supported.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -1429,6 +1430,7 @@ struct hl_device {
 	u8                              power9_64bit_dma_enable;
 	u8				cdev_sysfs_created;
 	u8				stop_on_err;
+	u8				supports_sync_stream;
 
 	/* Parameters for bring-up */
 	u8				mmu_enable;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 523e511e6cff..70dfccea7038 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
  *
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2020 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -241,52 +241,87 @@ union hl_cb_args {
  * compatibility
  */
 struct hl_cs_chunk {
-	/*
-	 * For external queue, this represents a Handle of CB on the Host
-	 * For internal queue, this represents an SRAM or DRAM address of the
-	 * internal CB
-	 */
-	__u64 cb_handle;
+	union {
+		/* For external queue, this represents a Handle of CB on the
+		 * Host.
+		 * For internal queue in Goya, this represents an SRAM or
+		 * a DRAM address of the internal CB. In Gaudi, this might also
+		 * represent a mapped host address of the CB.
+		 *
+		 * A mapped host address is in the device address space, after
+		 * a host address was mapped by the device MMU.
+		 */
+		__u64 cb_handle;
+
+		/* Relevant only when HL_CS_FLAGS_WAIT is set.
+		 * This holds address of array of u64 values that contain
+		 * signal CS sequence numbers. The wait described by this job
+		 * will listen on all those signals (wait event per signal)
+		 */
+		__u64 signal_seq_arr;
+	};
+
 	/* Index of queue to put the CB on */
 	__u32 queue_index;
-	/*
-	 * Size of command buffer with valid packets
-	 * Can be smaller then actual CB size
-	 */
-	__u32 cb_size;
+
+	union {
+		/*
+		 * Size of command buffer with valid packets
+		 * Can be smaller then actual CB size
+		 */
+		__u32 cb_size;
+
+		/* Relevant only when HL_CS_FLAGS_WAIT is set.
+		 * Number of entries in signal_seq_arr
+		 */
+		__u32 num_signal_seq_arr;
+	};
+
 	/* HL_CS_CHUNK_FLAGS_* */
 	__u32 cs_chunk_flags;
+
 	/* Align structure to 64 bytes */
 	__u32 pad[11];
 };
 
+/* SIGNAL and WAIT flags are mutually exclusive */
 #define HL_CS_FLAGS_FORCE_RESTORE	0x1
+#define HL_CS_FLAGS_SIGNAL		0x2
+#define HL_CS_FLAGS_WAIT		0x4
 
 #define HL_CS_STATUS_SUCCESS		0
 
 #define HL_MAX_JOBS_PER_CS		512
 
 struct hl_cs_in {
+
 	/* this holds address of array of hl_cs_chunk for restore phase */
 	__u64 chunks_restore;
-	/* this holds address of array of hl_cs_chunk for execution phase */
+
+	/* holds address of array of hl_cs_chunk for execution phase */
 	__u64 chunks_execute;
+
 	/* this holds address of array of hl_cs_chunk for store phase -
 	 * Currently not in use
 	 */
 	__u64 chunks_store;
+
 	/* Number of chunks in restore phase array. Maximum number is
 	 * HL_MAX_JOBS_PER_CS
 	 */
 	__u32 num_chunks_restore;
+
 	/* Number of chunks in execution array. Maximum number is
 	 * HL_MAX_JOBS_PER_CS
 	 */
 	__u32 num_chunks_execute;
+
 	/* Number of chunks in restore phase array - Currently not in use */
 	__u32 num_chunks_store;
+
 	/* HL_CS_FLAGS_* */
 	__u32 cs_flags;
+
 	/* Context ID - Currently not in use */
 	__u32 ctx_id;
 };
@@ -597,8 +632,8 @@ struct hl_debug_args {
  * For jobs on external queues, the user needs to create command buffers
  * through the CB ioctl and give the CB's handle to the CS ioctl. For jobs on
  * internal queues, the user needs to prepare a "command buffer" with packets
- * on either the SRAM or DRAM, and give the device address of that buffer to
- * the CS ioctl.
+ * on either the device SRAM/DRAM or the host, and give the device address of
+ * that buffer to the CS ioctl.
  *
  * This IOCTL is asynchronous in regard to the actual execution of the CS. This
  * means it returns immediately after ALL the JOBS were enqueued on their
@@ -610,7 +645,7 @@ struct hl_debug_args {
  * external JOBS have been completed. Note that if the CS has internal JOBS
  * which can execute AFTER the external JOBS have finished, the driver might
  * report that the CS has finished executing BEFORE the internal JOBS have
- * actually finish executing.
+ * actually finished executing.
  *
  * Even though the sequence number increments per CS, the user can NOT
  * automatically assume that if CS with sequence number N finished, then CS
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/4] habanalabs: define ASIC-dependent interface for signal/wait
  2020-05-07 14:36 [PATCH 1/4] uapi: habanalabs: add signal/wait operations Oded Gabbay
@ 2020-05-07 14:36 ` Oded Gabbay
  2020-05-07 14:37 ` [PATCH 3/4] habanalabs: handle the h/w sync object Oded Gabbay
  2020-05-07 14:37 ` [PATCH 4/4] habanalabs: add signal/wait to CS IOCTL operations Oded Gabbay
  2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2020-05-07 14:36 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh

From: Omer Shpigelman <oshpigelman@habana.ai>

This feature requires handling h/w resources which are a bit different from
one ASIC to the other. Therefore, we need to define a set of interfaces the
ASIC code provides to the common code to signal, wait, reset sync object
and to reset and init a queue.

As this feature is not supported in Goya, provide an empty implementation
of those functions.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c  | 43 ++++++++++++++++++++++++++++
 drivers/misc/habanalabs/habanalabs.h | 15 ++++++++++
 2 files changed, 58 insertions(+)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 70fe00264f4a..b3e2354aaca2 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5137,6 +5137,42 @@ u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
 	return cq_idx;
 }
 
+static void goya_ext_queue_init(struct hl_device *hdev, u32 q_idx)
+{
+
+}
+
+static void goya_ext_queue_reset(struct hl_device *hdev, u32 q_idx)
+{
+
+}
+
+static u32 goya_get_signal_cb_size(struct hl_device *hdev)
+{
+	return 0;
+}
+
+static u32 goya_get_wait_cb_size(struct hl_device *hdev)
+{
+	return 0;
+}
+
+static void goya_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id)
+{
+
+}
+
+static void goya_gen_wait_cb(struct hl_device *hdev, void *data, u16 sob_id,
+			u16 sob_val, u16 mon_id, u32 q_idx)
+{
+
+}
+
+static void goya_reset_sob(struct hl_device *hdev, void *data)
+{
+
+}
+
 static void goya_set_dma_mask_from_fw(struct hl_device *hdev)
 {
 	if (RREG32(mmPSOC_GLOBAL_CONF_NON_RST_FLOPS_0) ==
@@ -5222,6 +5258,13 @@ static const struct hl_asic_funcs goya_funcs = {
 	.read_device_fw_version = goya_read_device_fw_version,
 	.load_firmware_to_device = goya_load_firmware_to_device,
 	.load_boot_fit_to_device = goya_load_boot_fit_to_device,
+	.ext_queue_init = goya_ext_queue_init,
+	.ext_queue_reset = goya_ext_queue_reset,
+	.get_signal_cb_size = goya_get_signal_cb_size,
+	.get_wait_cb_size = goya_get_wait_cb_size,
+	.gen_signal_cb = goya_gen_signal_cb,
+	.gen_wait_cb = goya_gen_wait_cb,
+	.reset_sob = goya_reset_sob,
 	.set_dma_mask_from_fw = goya_set_dma_mask_from_fw,
 	.get_device_time = goya_get_device_time
 };
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 7cd9a8d72451..dd93cd903f91 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -553,6 +553,13 @@ enum hl_pll_frequency {
  *                          contained in registers
  * @load_firmware_to_device: load the firmware to the device's memory
  * @load_boot_fit_to_device: load boot fit to device's memory
+ * @ext_queue_init: Initialize the given external queue.
+ * @ext_queue_reset: Reset the given external queue.
+ * @get_signal_cb_size: Get signal CB size.
+ * @get_wait_cb_size: Get wait CB size.
+ * @gen_signal_cb: Generate a signal CB.
+ * @gen_wait_cb: Generate a wait CB.
+ * @reset_sob: Reset a SOB.
  * @set_dma_mask_from_fw: set the DMA mask in the driver according to the
  *                        firmware configuration
  * @get_device_time: Get the device time.
@@ -648,6 +655,14 @@ struct hl_asic_funcs {
 					enum hl_fw_component fwc);
 	int (*load_firmware_to_device)(struct hl_device *hdev);
 	int (*load_boot_fit_to_device)(struct hl_device *hdev);
+	void (*ext_queue_init)(struct hl_device *hdev, u32 hw_queue_id);
+	void (*ext_queue_reset)(struct hl_device *hdev, u32 hw_queue_id);
+	u32 (*get_signal_cb_size)(struct hl_device *hdev);
+	u32 (*get_wait_cb_size)(struct hl_device *hdev);
+	void (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id);
+	void (*gen_wait_cb)(struct hl_device *hdev, void *data, u16 sob_id,
+				u16 sob_val, u16 mon_id, u32 q_idx);
+	void (*reset_sob)(struct hl_device *hdev, void *data);
 	void (*set_dma_mask_from_fw)(struct hl_device *hdev);
 	u64 (*get_device_time)(struct hl_device *hdev);
 };
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/4] habanalabs: handle the h/w sync object
  2020-05-07 14:36 [PATCH 1/4] uapi: habanalabs: add signal/wait operations Oded Gabbay
  2020-05-07 14:36 ` [PATCH 2/4] habanalabs: define ASIC-dependent interface for signal/wait Oded Gabbay
@ 2020-05-07 14:37 ` Oded Gabbay
  2020-05-07 14:37 ` [PATCH 4/4] habanalabs: add signal/wait to CS IOCTL operations Oded Gabbay
  2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2020-05-07 14:37 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh

From: Omer Shpigelman <oshpigelman@habana.ai>

Define a structure representing the h/w sync object (SOB).

a SOB can contain up to 2^15 values. Each signal CS will increment the SOB
by 1, so after some time we will reach the maximum number the SOB can
represent. When that happens, the driver needs to move to a different SOB
for the signal operation.

A SOB can be in 1 of 4 states:

1. Working state with value < 2^15

2. We reached a value of 2^15, but the signal operations weren't completed
yet OR there are pending waits on this signal. For the next submission, the
driver will move to another SOB.

3. ALL the signal operations on the SOB have finished AND there are no more
pending waits on the SOB AND we reached a value of 2^15 (This basically
means the refcnt of the SOB is 0 - see explanation below). When that
happens, the driver can clear the SOB by simply doing WREG32 0 to it and
set the refcnt back to 1.

4. The SOB is cleared and can be used next time by the driver when it needs
to reuse an SOB.

Per SOB, the driver will maintain a single refcnt, that will be initialized
to 1. When a signal or wait operation on this SOB is submitted to the PQ,
the refcnt will be incremented. When a signal or wait operation on this SOB
completes, the refcnt will be decremented. After the submission of the
signal operation that increments the SOB to a value of 2^15, the refcnt is
also decremented.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/command_submission.c |  51 +++++----
 drivers/misc/habanalabs/habanalabs.h         |  57 +++++++++-
 drivers/misc/habanalabs/hw_queue.c           | 107 ++++++++++++++++++-
 3 files changed, 188 insertions(+), 27 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index f7d03a35e6a8..a4211cfc752a 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -25,10 +25,10 @@ static const char *hl_fence_get_driver_name(struct dma_fence *fence)
 
 static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
 {
-	struct hl_dma_fence *hl_fence =
-		container_of(fence, struct hl_dma_fence, base_fence);
+	struct hl_cs_compl *hl_cs_compl =
+		container_of(fence, struct hl_cs_compl, base_fence);
 
-	return dev_name(hl_fence->hdev->dev);
+	return dev_name(hl_cs_compl->hdev->dev);
 }
 
 static bool hl_fence_enable_signaling(struct dma_fence *fence)
@@ -38,10 +38,10 @@ static bool hl_fence_enable_signaling(struct dma_fence *fence)
 
 static void hl_fence_release(struct dma_fence *fence)
 {
-	struct hl_dma_fence *hl_fence =
-		container_of(fence, struct hl_dma_fence, base_fence);
+	struct hl_cs_compl *hl_cs_cmpl =
+		container_of(fence, struct hl_cs_compl, base_fence);
 
-	kfree_rcu(hl_fence, base_fence.rcu);
+	kfree_rcu(hl_cs_cmpl, base_fence.rcu);
 }
 
 static const struct dma_fence_ops hl_fence_ops = {
@@ -189,6 +189,17 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 	kfree(job);
 }
 
+void hl_sob_reset_error(struct kref *ref)
+{
+	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
+							kref);
+	struct hl_device *hdev = hw_sob->hdev;
+
+	dev_crit(hdev->dev,
+			"SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
+			hw_sob->q_idx, hw_sob->sob_id);
+}
+
 static void cs_do_release(struct kref *ref)
 {
 	struct hl_cs *cs = container_of(ref, struct hl_cs,
@@ -317,7 +328,7 @@ static void cs_timedout(struct work_struct *work)
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 			struct hl_cs **cs_new)
 {
-	struct hl_dma_fence *fence;
+	struct hl_cs_compl *cs_cmpl;
 	struct dma_fence *other = NULL;
 	struct hl_cs *cs;
 	int rc;
@@ -334,20 +345,20 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	kref_init(&cs->refcount);
 	spin_lock_init(&cs->job_lock);
 
-	fence = kmalloc(sizeof(*fence), GFP_ATOMIC);
-	if (!fence) {
+	cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
+	if (!cs_cmpl) {
 		rc = -ENOMEM;
 		goto free_cs;
 	}
 
-	fence->hdev = hdev;
-	spin_lock_init(&fence->lock);
-	cs->fence = &fence->base_fence;
+	cs_cmpl->hdev = hdev;
+	spin_lock_init(&cs_cmpl->lock);
+	cs->fence = &cs_cmpl->base_fence;
 
 	spin_lock(&ctx->cs_lock);
 
-	fence->cs_seq = ctx->cs_sequence;
-	other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
+	cs_cmpl->cs_seq = ctx->cs_sequence;
+	other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
 	if ((other) && (!dma_fence_is_signaled(other))) {
 		spin_unlock(&ctx->cs_lock);
 		dev_dbg(hdev->dev,
@@ -356,16 +367,16 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 		goto free_fence;
 	}
 
-	dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock,
+	dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
 			ctx->asid, ctx->cs_sequence);
 
-	cs->sequence = fence->cs_seq;
+	cs->sequence = cs_cmpl->cs_seq;
 
-	ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] =
-							&fence->base_fence;
+	ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+							&cs_cmpl->base_fence;
 	ctx->cs_sequence++;
 
-	dma_fence_get(&fence->base_fence);
+	dma_fence_get(&cs_cmpl->base_fence);
 
 	dma_fence_put(other);
 
@@ -376,7 +387,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	return 0;
 
 free_fence:
-	kfree(fence);
+	kfree(cs_cmpl);
 free_cs:
 	kfree(cs);
 	return rc;
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index dd93cd903f91..630ed43eb410 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -51,6 +51,14 @@
 /* MMU */
 #define MMU_HASH_TABLE_BITS		7 /* 1 << 7 buckets */
 
+#define HL_RSVD_SOBS			4
+#define HL_RSVD_MONS			2
+
+#define HL_RSVD_SOBS_IN_USE		2
+#define HL_RSVD_MONS_IN_USE		1
+
+#define HL_MAX_SOB_VAL			(1 << 15)
+
 /**
  * struct pgt_info - MMU hop page info.
  * @node: hash linked-list node for the pgts shadow hash of pgts.
@@ -104,6 +112,26 @@ enum hl_queue_type {
 	QUEUE_TYPE_HW
 };
 
+enum hl_cs_type {
+	CS_TYPE_DEFAULT,
+	CS_TYPE_SIGNAL,
+	CS_TYPE_WAIT
+};
+
+/*
+ * struct hl_hw_sob - H/W SOB info.
+ * @hdev: habanalabs device structure.
+ * @kref: refcount of this SOB. The SOB will reset once the refcount is zero.
+ * @sob_id: id of this SOB.
+ * @q_idx: the H/W queue that uses this SOB.
+ */
+struct hl_hw_sob {
+	struct hl_device	*hdev;
+	struct kref		kref;
+	u32			sob_id;
+	u32			q_idx;
+};
+
 /**
  * struct hw_queue_properties - queue information.
  * @type: queue type.
@@ -260,17 +288,23 @@ struct asic_fixed_properties {
 };
 
 /**
- * struct hl_dma_fence - wrapper for fence object used by command submissions.
+ * struct hl_cs_compl - command submission completion object.
  * @base_fence: kernel fence object.
  * @lock: spinlock to protect fence.
  * @hdev: habanalabs device structure.
+ * @hw_sob: the H/W SOB used in this signal/wait CS.
  * @cs_seq: command submission sequence number.
+ * @type: type of the CS - signal/wait.
+ * @sob_val: the SOB value that is used in this signal/wait CS.
  */
-struct hl_dma_fence {
+struct hl_cs_compl {
 	struct dma_fence	base_fence;
 	spinlock_t		lock;
 	struct hl_device	*hdev;
+	struct hl_hw_sob	*hw_sob;
 	u64			cs_seq;
+	enum hl_cs_type		type;
+	u16			sob_val;
 };
 
 /*
@@ -368,6 +402,7 @@ struct hl_cs_job;
 
 /**
  * struct hl_hw_queue - describes a H/W transport queue.
+ * @hw_sob: array of the used H/W SOBs by this H/W queue.
  * @shadow_queue: pointer to a shadow queue that holds pointers to jobs.
  * @queue_type: type of queue.
  * @kernel_address: holds the queue's kernel virtual address.
@@ -378,10 +413,16 @@ struct hl_cs_job;
  * @cq_id: the id for the corresponding CQ for this H/W queue.
  * @msi_vec: the IRQ number of the H/W queue.
  * @int_queue_len: length of internal queue (number of entries).
+ * @next_sob_val: the next value to use for the currently used SOB.
+ * @base_sob_id: the base SOB id of the SOBs used by this queue.
+ * @base_mon_id: the base MON id of the MONs used by this queue.
  * @valid: is the queue valid (we have array of 32 queues, not all of them
- *		exists).
+ *         exist).
+ * @curr_sob_offset: the id offset to the currently used SOB from the
+ *                   HL_RSVD_SOBS that are being used by this queue.
  */
 struct hl_hw_queue {
+	struct hl_hw_sob	hw_sob[HL_RSVD_SOBS];
 	struct hl_cs_job	**shadow_queue;
 	enum hl_queue_type	queue_type;
 	u64			kernel_address;
@@ -392,7 +433,11 @@ struct hl_hw_queue {
 	u32			cq_id;
 	u32			msi_vec;
 	u16			int_queue_len;
+	u16			next_sob_val;
+	u16			base_sob_id;
+	u16			base_mon_id;
 	u8			valid;
+	u8			curr_sob_offset;
 };
 
 /**
@@ -796,10 +841,13 @@ struct hl_userptr {
  * @job_lock: spinlock for the CS's jobs list. Needed for free_job.
  * @refcount: reference counter for usage of the CS.
  * @fence: pointer to the fence object of this CS.
+ * @signal_fence: pointer to the fence object of the signal CS (used by wait
+ *                CS only).
  * @work_tdr: delayed work node for TDR.
  * @mirror_node : node in device mirror list of command submissions.
  * @debugfs_list: node in debugfs list of command submissions.
  * @sequence: the sequence number of this CS.
+ * @type: CS_TYPE_*.
  * @submitted: true if CS was submitted to H/W.
  * @completed: true if CS was completed by device.
  * @timedout : true if CS was timedout.
@@ -814,10 +862,12 @@ struct hl_cs {
 	spinlock_t		job_lock;
 	struct kref		refcount;
 	struct dma_fence	*fence;
+	struct dma_fence	*signal_fence;
 	struct delayed_work	work_tdr;
 	struct list_head	mirror_node;
 	struct list_head	debugfs_list;
 	u64			sequence;
+	enum hl_cs_type		type;
 	u8			submitted;
 	u8			completed;
 	u8			timedout;
@@ -1620,6 +1670,7 @@ int hl_cb_pool_fini(struct hl_device *hdev);
 void hl_cs_rollback_all(struct hl_device *hdev);
 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 		enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
+void hl_sob_reset_error(struct kref *ref);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index a5abc224399d..f4434b39ef1b 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -403,20 +403,110 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
 	 * checked in hw_queue_sanity_checks
 	 */
 	cq = &hdev->completion_queue[q->cq_id];
+
 	cq->pi = hl_cq_inc_ptr(cq->pi);
 
 	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
 /*
- * hl_hw_queue_schedule_cs - schedule a command submission
- *
- * @job        : pointer to the CS
+ * init_signal_wait_cs - initialize a signal/wait CS
+ * @cs: pointer to the signal/wait CS
  *
+ * H/W queues spinlock should be taken before calling this function
+ */
+static void init_signal_wait_cs(struct hl_cs *cs)
+{
+	struct hl_ctx *ctx = cs->ctx;
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_hw_queue *hw_queue;
+	struct hl_cs_compl *cs_cmpl =
+			container_of(cs->fence, struct hl_cs_compl, base_fence);
+
+	struct hl_hw_sob *hw_sob;
+	struct hl_cs_job *job;
+	u32 q_idx;
+
+	/* There is only one job in a signal/wait CS */
+	job = list_first_entry(&cs->job_list, struct hl_cs_job,
+				cs_node);
+	q_idx = job->hw_queue_id;
+	hw_queue = &hdev->kernel_queues[q_idx];
+
+	if (cs->type & CS_TYPE_SIGNAL) {
+		hw_sob = &hw_queue->hw_sob[hw_queue->curr_sob_offset];
+
+		cs_cmpl->hw_sob = hw_sob;
+		cs_cmpl->sob_val = hw_queue->next_sob_val++;
+
+		dev_dbg(hdev->dev,
+			"generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n",
+			cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx);
+
+		hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
+					cs_cmpl->hw_sob->sob_id);
+
+		kref_get(&hw_sob->kref);
+
+		/* check for wraparound */
+		if (hw_queue->next_sob_val == HL_MAX_SOB_VAL) {
+			/*
+			 * Decrement as we reached the max value.
+			 * The release function won't be called here as we've
+			 * just incremented the refcount.
+			 */
+			kref_put(&hw_sob->kref, hl_sob_reset_error);
+			hw_queue->next_sob_val = 1;
+			/* only two SOBs are currently in use */
+			hw_queue->curr_sob_offset =
+					(hw_queue->curr_sob_offset + 1) %
+						HL_RSVD_SOBS_IN_USE;
+
+			dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
+					hw_queue->curr_sob_offset, q_idx);
+		}
+	} else if (cs->type & CS_TYPE_WAIT) {
+		struct hl_cs_compl *signal_cs_cmpl;
+
+		signal_cs_cmpl = container_of(cs->signal_fence,
+						struct hl_cs_compl,
+						base_fence);
+
+		/* copy the the SOB id and value of the signal CS */
+		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+
+		dev_dbg(hdev->dev,
+			"generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d\n",
+			cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
+			hw_queue->base_mon_id, q_idx);
+
+		hdev->asic_funcs->gen_wait_cb(hdev, job->patched_cb,
+						cs_cmpl->hw_sob->sob_id,
+						cs_cmpl->sob_val,
+						hw_queue->base_mon_id,
+						q_idx);
+
+		kref_get(&cs_cmpl->hw_sob->kref);
+		/*
+		 * Must put the signal fence after the SOB refcnt increment so
+		 * the SOB refcnt won't turn 0 and reset the SOB before the
+		 * wait CS was submitted.
+		 */
+		mb();
+		dma_fence_put(cs->signal_fence);
+		cs->signal_fence = NULL;
+	}
+}
+
+/*
+ * hl_hw_queue_schedule_cs - schedule a command submission
+ * @cs: pointer to the CS
  */
 int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 {
-	struct hl_device *hdev = cs->ctx->hdev;
+	struct hl_ctx *ctx = cs->ctx;
+	struct hl_device *hdev = ctx->hdev;
 	struct hl_cs_job *job, *tmp;
 	struct hl_hw_queue *q;
 	int rc = 0, i, cq_cnt;
@@ -462,6 +552,9 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 		}
 	}
 
+	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT))
+		init_signal_wait_cs(cs);
+
 	spin_lock(&hdev->hw_queues_mirror_lock);
 	list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list);
 
@@ -570,6 +663,9 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
 	q->ci = 0;
 	q->pi = 0;
 
+	if (!is_cpu_queue)
+		hdev->asic_funcs->ext_queue_init(hdev, q->hw_queue_id);
+
 	return 0;
 
 free_queue:
@@ -792,5 +888,8 @@ void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
 			((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
 			continue;
 		q->pi = q->ci = 0;
+
+		if (q->queue_type == QUEUE_TYPE_EXT)
+			hdev->asic_funcs->ext_queue_reset(hdev, q->hw_queue_id);
 	}
 }
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 4/4] habanalabs: add signal/wait to CS IOCTL operations
  2020-05-07 14:36 [PATCH 1/4] uapi: habanalabs: add signal/wait operations Oded Gabbay
  2020-05-07 14:36 ` [PATCH 2/4] habanalabs: define ASIC-dependent interface for signal/wait Oded Gabbay
  2020-05-07 14:37 ` [PATCH 3/4] habanalabs: handle the h/w sync object Oded Gabbay
@ 2020-05-07 14:37 ` Oded Gabbay
  2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2020-05-07 14:37 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh

From: Omer Shpigelman <oshpigelman@habana.ai>

Add the following two operations to the CS IOCTL:

Signal:

The signal operation is basically a command submission, that is created by
the driver upon user request. It will be implemented using a dedicated PQE
that will increment a specific SOB. There will be a new flag:
HL_CS_FLAGS_SIGNAL. When the user set this flag in the CS IOCTL structure,
the driver will execute a dedicated code path that will prepare this
special PQE and submit it. The user only needs to provide a queue index on
which to put the signal.

Wait:

The wait operation is also a command submission that is created by the
driver upon user request. It will be implemented using a dedicated PQE that
will contain packets of "ARM a monitor" + FENCE packet. There will be a new
flag: HL_CS_FLAGS_WAIT. When the user set this flag in the CS structure,
the driver will execute a dedicated code path that will prepare this
special PQE and submit it.

The user needs to provide the following parameters:
1. queue ID
2. an array of signal_seq numbers and the number of signals to wait on
   (the length of signal_seq_arr).

The IOCTL will return the CS sequence number of the wait it put on the
queue ID.

Currently, the code supports signal_seq_nr==1. But this API definition will
allow us to put a single PQE that waits on multiple signals.

To correctly configure the monitor and fence, the driver will need to
retrieve the specified signal CS object that contains the relevant SOB and
its expected value. In case the signal CS has already been completed, there
is no point of adding a wait operation. In this case, the driver will
return to the user *without* putting anything on the PQ. The return code
should reflect to the user that the signal was completed, as we won't
return a CS sequence number for this wait.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/command_submission.c | 341 +++++++++++++++++--
 drivers/misc/habanalabs/context.c            |   8 -
 drivers/misc/habanalabs/habanalabs.h         |   2 +
 3 files changed, 323 insertions(+), 28 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index a4211cfc752a..7da9847eb9d6 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -18,6 +18,26 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
 		struct hl_ctx *ctx, u64 timeout_us, u64 seq);
 static void cs_do_release(struct kref *ref);
 
+static void hl_sob_reset(struct kref *ref)
+{
+	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
+							kref);
+	struct hl_device *hdev = hw_sob->hdev;
+
+	hdev->asic_funcs->reset_sob(hdev, hw_sob);
+}
+
+void hl_sob_reset_error(struct kref *ref)
+{
+	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
+							kref);
+	struct hl_device *hdev = hw_sob->hdev;
+
+	dev_crit(hdev->dev,
+			"SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
+			hw_sob->q_idx, hw_sob->sob_id);
+}
+
 static const char *hl_fence_get_driver_name(struct dma_fence *fence)
 {
 	return "HabanaLabs";
@@ -40,6 +60,37 @@ static void hl_fence_release(struct dma_fence *fence)
 {
 	struct hl_cs_compl *hl_cs_cmpl =
 		container_of(fence, struct hl_cs_compl, base_fence);
+	struct hl_device *hdev = hl_cs_cmpl->hdev;
+
+	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
+			(hl_cs_cmpl->type == CS_TYPE_WAIT)) {
+
+		dev_dbg(hdev->dev,
+			"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
+			hl_cs_cmpl->cs_seq,
+			hl_cs_cmpl->type,
+			hl_cs_cmpl->hw_sob->sob_id,
+			hl_cs_cmpl->sob_val);
+
+		/*
+		 * A signal CS can get completion while the corresponding wait
+		 * for signal CS is on its way to the PQ. The wait for signal CS
+		 * will get stuck if the signal CS incremented the SOB to its
+		 * max value and there are no pending (submitted) waits on this
+		 * SOB.
+		 * We do the following to void this situation:
+		 * 1. The wait for signal CS must get a ref for the signal CS as
+		 *    soon as possible in cs_ioctl_signal_wait() and put it
+		 *    before being submitted to the PQ but after it incremented
+		 *    the SOB refcnt in init_signal_wait_cs().
+		 * 2. Signal/Wait for signal CS will decrement the SOB refcnt
+		 *    here.
+		 * These two measures guarantee that the wait for signal CS will
+		 * reset the SOB upon completion rather than the signal CS and
+		 * hence the above scenario is avoided.
+		 */
+		kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
+	}
 
 	kfree_rcu(hl_cs_cmpl, base_fence.rcu);
 }
@@ -189,17 +240,6 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 	kfree(job);
 }
 
-void hl_sob_reset_error(struct kref *ref)
-{
-	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
-							kref);
-	struct hl_device *hdev = hw_sob->hdev;
-
-	dev_crit(hdev->dev,
-			"SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
-			hw_sob->q_idx, hw_sob->sob_id);
-}
-
 static void cs_do_release(struct kref *ref)
 {
 	struct hl_cs *cs = container_of(ref, struct hl_cs,
@@ -273,6 +313,12 @@ static void cs_do_release(struct kref *ref)
 
 			spin_unlock(&hdev->hw_queues_mirror_lock);
 		}
+	} else if (cs->type == CS_TYPE_WAIT) {
+		/*
+		 * In case the wait for signal CS was submitted, the put occurs
+		 * in init_signal_wait_cs() right before hanging on the PQ.
+		 */
+		dma_fence_put(cs->signal_fence);
 	}
 
 	/*
@@ -326,7 +372,7 @@ static void cs_timedout(struct work_struct *work)
 }
 
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
-			struct hl_cs **cs_new)
+			enum hl_cs_type cs_type, struct hl_cs **cs_new)
 {
 	struct hl_cs_compl *cs_cmpl;
 	struct dma_fence *other = NULL;
@@ -340,6 +386,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	cs->ctx = ctx;
 	cs->submitted = false;
 	cs->completed = false;
+	cs->type = cs_type;
 	INIT_LIST_HEAD(&cs->job_list);
 	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
 	kref_init(&cs->refcount);
@@ -352,6 +399,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	}
 
 	cs_cmpl->hdev = hdev;
+	cs_cmpl->type = cs->type;
 	spin_lock_init(&cs_cmpl->lock);
 	cs->fence = &cs_cmpl->base_fence;
 
@@ -513,8 +561,8 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 	return job;
 }
 
-static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
-			u32 num_chunks, u64 *cs_seq)
+static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
+				u32 num_chunks, u64 *cs_seq)
 {
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_cs_chunk *cs_chunk_array;
@@ -552,7 +600,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	/* increment refcnt for context */
 	hl_ctx_get(hdev, hpriv->ctx);
 
-	rc = allocate_cs(hdev, hpriv->ctx, &cs);
+	rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT, &cs);
 	if (rc) {
 		hl_ctx_put(hpriv->ctx);
 		goto free_cs_chunk_array;
@@ -666,12 +714,229 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	return rc;
 }
 
+static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
+				void __user *chunks, u32 num_chunks,
+				u64 *cs_seq)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_ctx *ctx = hpriv->ctx;
+	struct hl_cs_chunk *cs_chunk_array, *chunk;
+	struct hw_queue_properties *hw_queue_prop;
+	struct dma_fence *sig_fence = NULL;
+	struct hl_cs_job *job;
+	struct hl_cs *cs;
+	struct hl_cb *cb;
+	u64 *signal_seq_arr = NULL, signal_seq;
+	u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size;
+	int rc;
+
+	*cs_seq = ULLONG_MAX;
+
+	if (num_chunks > HL_MAX_JOBS_PER_CS) {
+		dev_err(hdev->dev,
+			"Number of chunks can NOT be larger than %d\n",
+			HL_MAX_JOBS_PER_CS);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array),
+					GFP_ATOMIC);
+	if (!cs_chunk_array) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
+	if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) {
+		dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
+		rc = -EFAULT;
+		goto free_cs_chunk_array;
+	}
+
+	/* currently it is guaranteed to have only one chunk */
+	chunk = &cs_chunk_array[0];
+	q_idx = chunk->queue_index;
+	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
+
+	if ((q_idx >= HL_MAX_QUEUES) ||
+			(hw_queue_prop->type != QUEUE_TYPE_EXT)) {
+		dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx);
+		rc = -EINVAL;
+		goto free_cs_chunk_array;
+	}
+
+	if (cs_type == CS_TYPE_WAIT) {
+		struct hl_cs_compl *sig_waitcs_cmpl;
+
+		signal_seq_arr_len = chunk->num_signal_seq_arr;
+
+		/* currently only one signal seq is supported */
+		if (signal_seq_arr_len != 1) {
+			dev_err(hdev->dev,
+				"Wait for signal CS supports only one signal CS seq\n");
+			rc = -EINVAL;
+			goto free_cs_chunk_array;
+		}
+
+		signal_seq_arr = kmalloc_array(signal_seq_arr_len,
+						sizeof(*signal_seq_arr),
+						GFP_ATOMIC);
+		if (!signal_seq_arr) {
+			rc = -ENOMEM;
+			goto free_cs_chunk_array;
+		}
+
+		size_to_copy = chunk->num_signal_seq_arr *
+				sizeof(*signal_seq_arr);
+		if (copy_from_user(signal_seq_arr,
+					(void __user *) chunk->signal_seq_arr,
+					size_to_copy)) {
+			dev_err(hdev->dev,
+				"Failed to copy signal seq array from user\n");
+			rc = -EFAULT;
+			goto free_signal_seq_array;
+		}
+
+		/* currently it is guaranteed to have only one signal seq */
+		signal_seq = signal_seq_arr[0];
+		sig_fence = hl_ctx_get_fence(ctx, signal_seq);
+		if (IS_ERR(sig_fence)) {
+			dev_err(hdev->dev,
+				"Failed to get signal CS with seq 0x%llx\n",
+				signal_seq);
+			rc = PTR_ERR(sig_fence);
+			goto free_signal_seq_array;
+		}
+
+		if (!sig_fence) {
+			/* signal CS already finished */
+			rc = 0;
+			goto free_signal_seq_array;
+		}
+
+		sig_waitcs_cmpl =
+			container_of(sig_fence, struct hl_cs_compl, base_fence);
+
+		if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL) {
+			dev_err(hdev->dev,
+				"CS seq 0x%llx is not of a signal CS\n",
+				signal_seq);
+			dma_fence_put(sig_fence);
+			rc = -EINVAL;
+			goto free_signal_seq_array;
+		}
+
+		if (dma_fence_is_signaled(sig_fence)) {
+			/* signal CS already finished */
+			dma_fence_put(sig_fence);
+			rc = 0;
+			goto free_signal_seq_array;
+		}
+	}
+
+	/* increment refcnt for context */
+	hl_ctx_get(hdev, ctx);
+
+	rc = allocate_cs(hdev, ctx, cs_type, &cs);
+	if (rc) {
+		if (cs_type == CS_TYPE_WAIT)
+			dma_fence_put(sig_fence);
+		hl_ctx_put(ctx);
+		goto free_signal_seq_array;
+	}
+
+	/*
+	 * Save the signal CS fence for later initialization right before
+	 * hanging the wait CS on the queue.
+	 */
+	if (cs->type == CS_TYPE_WAIT)
+		cs->signal_fence = sig_fence;
+
+	hl_debugfs_add_cs(cs);
+
+	*cs_seq = cs->sequence;
+
+	job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
+	if (!job) {
+		dev_err(hdev->dev, "Failed to allocate a new job\n");
+		rc = -ENOMEM;
+		goto put_cs;
+	}
+
+	cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+	if (!cb) {
+		kfree(job);
+		rc = -EFAULT;
+		goto put_cs;
+	}
+
+	if (cs->type == CS_TYPE_WAIT)
+		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
+	else
+		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
+
+	job->id = 0;
+	job->cs = cs;
+	job->user_cb = cb;
+	job->user_cb->cs_cnt++;
+	job->user_cb_size = cb_size;
+	job->hw_queue_id = q_idx;
+
+	/*
+	 * No need in parsing, user CB is the patched CB.
+	 * We call hl_cb_destroy() out of two reasons - we don't need the CB in
+	 * the CB idr anymore and to decrement its refcount as it was
+	 * incremented inside hl_cb_kernel_create().
+	 */
+	job->patched_cb = job->user_cb;
+	job->job_cb_size = job->user_cb_size;
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+
+	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+	list_add_tail(&job->cs_node, &cs->job_list);
+
+	/* increment refcount as for external queues we get completion */
+	cs_get(cs);
+
+	hl_debugfs_add_job(hdev, job);
+
+	rc = hl_hw_queue_schedule_cs(cs);
+	if (rc) {
+		if (rc != -EAGAIN)
+			dev_err(hdev->dev,
+				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
+				ctx->asid, cs->sequence, rc);
+		goto free_cs_object;
+	}
+
+	rc = HL_CS_STATUS_SUCCESS;
+	goto put_cs;
+
+free_cs_object:
+	cs_rollback(hdev, cs);
+	*cs_seq = ULLONG_MAX;
+	/* The path below is both for good and erroneous exits */
+put_cs:
+	/* We finished with the CS in this function, so put the ref */
+	cs_put(cs);
+free_signal_seq_array:
+	if (cs_type == CS_TYPE_WAIT)
+		kfree(signal_seq_arr);
+free_cs_chunk_array:
+	kfree(cs_chunk_array);
+out:
+	return rc;
+}
+
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	struct hl_device *hdev = hpriv->hdev;
 	union hl_cs_args *args = data;
 	struct hl_ctx *ctx = hpriv->ctx;
 	void __user *chunks_execute, *chunks_restore;
+	enum hl_cs_type cs_type;
 	u32 num_chunks_execute, num_chunks_restore, sig_wait_flags;
 	u64 cs_seq = ULONG_MAX;
 	int rc, do_ctx_switch;
@@ -687,6 +952,14 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT;
 
+	if (unlikely(sig_wait_flags == HL_CS_FLAGS_SIG_WAIT)) {
+		dev_err(hdev->dev,
+			"Signal and wait CS flags are mutually exclusive, context %d\n",
+		ctx->asid);
+		rc = -EINVAL;
+		goto out;
+	}
+
 	if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) &&
 			(!hdev->supports_sync_stream))) {
 		dev_err(hdev->dev, "Sync stream CS is not supported\n");
@@ -694,12 +967,27 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		goto out;
 	}
 
+	if (args->in.cs_flags & HL_CS_FLAGS_SIGNAL)
+		cs_type = CS_TYPE_SIGNAL;
+	else if (args->in.cs_flags & HL_CS_FLAGS_WAIT)
+		cs_type = CS_TYPE_WAIT;
+	else
+		cs_type = CS_TYPE_DEFAULT;
+
 	chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute;
 	num_chunks_execute = args->in.num_chunks_execute;
 
-	if (!num_chunks_execute) {
+	if (cs_type == CS_TYPE_DEFAULT) {
+		if (!num_chunks_execute) {
+			dev_err(hdev->dev,
+				"Got execute CS with 0 chunks, context %d\n",
+				ctx->asid);
+			rc = -EINVAL;
+			goto out;
+		}
+	} else if (num_chunks_execute != 1) {
 		dev_err(hdev->dev,
-			"Got execute CS with 0 chunks, context %d\n",
+			"Sync stream CS mandates one chunk only, context %d\n",
 			ctx->asid);
 		rc = -EINVAL;
 		goto out;
@@ -745,7 +1033,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 			"Need to run restore phase but restore CS is empty\n");
 			rc = 0;
 		} else {
-			rc = _hl_cs_ioctl(hpriv, chunks_restore,
+			rc = cs_ioctl_default(hpriv, chunks_restore,
 						num_chunks_restore, &cs_seq);
 		}
 
@@ -787,7 +1075,12 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		}
 	}
 
-	rc = _hl_cs_ioctl(hpriv, chunks_execute, num_chunks_execute, &cs_seq);
+	if (cs_type == CS_TYPE_DEFAULT)
+		rc = cs_ioctl_default(hpriv, chunks_execute, num_chunks_execute,
+					&cs_seq);
+	else
+		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks_execute,
+						num_chunks_execute, &cs_seq);
 
 out:
 	if (rc != -EAGAIN) {
@@ -819,6 +1112,10 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
 	fence = hl_ctx_get_fence(ctx, seq);
 	if (IS_ERR(fence)) {
 		rc = PTR_ERR(fence);
+		if (rc == -EINVAL)
+			dev_notice_ratelimited(hdev->dev,
+				"Can't wait on seq %llu because current CS is at seq %llu\n",
+				seq, ctx->cs_sequence);
 	} else if (fence) {
 		rc = dma_fence_wait_timeout(fence, true, timeout);
 		if (fence->error == -ETIMEDOUT)
@@ -826,8 +1123,12 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
 		else if (fence->error == -EIO)
 			rc = -EIO;
 		dma_fence_put(fence);
-	} else
+	} else {
+		dev_dbg(hdev->dev,
+			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
+			seq, ctx->cs_sequence);
 		rc = 1;
+	}
 
 	hl_ctx_put(ctx);
 
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index 2df6fb87e7ff..ec92b3506b1f 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -170,24 +170,16 @@ int hl_ctx_put(struct hl_ctx *ctx)
 
 struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 {
-	struct hl_device *hdev = ctx->hdev;
 	struct dma_fence *fence;
 
 	spin_lock(&ctx->cs_lock);
 
 	if (seq >= ctx->cs_sequence) {
-		dev_notice_ratelimited(hdev->dev,
-			"Can't wait on seq %llu because current CS is at seq %llu\n",
-			seq, ctx->cs_sequence);
 		spin_unlock(&ctx->cs_lock);
 		return ERR_PTR(-EINVAL);
 	}
 
-
 	if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
-		dev_dbg(hdev->dev,
-			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
-			seq, ctx->cs_sequence);
 		spin_unlock(&ctx->cs_lock);
 		return NULL;
 	}
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 630ed43eb410..a68df32094f1 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -843,6 +843,7 @@ struct hl_userptr {
  * @fence: pointer to the fence object of this CS.
  * @signal_fence: pointer to the fence object of the signal CS (used by wait
  *                CS only).
+ * @finish_work: workqueue object to run when CS is completed by H/W.
  * @work_tdr: delayed work node for TDR.
  * @mirror_node : node in device mirror list of command submissions.
  * @debugfs_list: node in debugfs list of command submissions.
@@ -863,6 +864,7 @@ struct hl_cs {
 	struct kref		refcount;
 	struct dma_fence	*fence;
 	struct dma_fence	*signal_fence;
+	struct work_struct	finish_work;
 	struct delayed_work	work_tdr;
 	struct list_head	mirror_node;
 	struct list_head	debugfs_list;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-05-07 14:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-07 14:36 [PATCH 1/4] uapi: habanalabs: add signal/wait operations Oded Gabbay
2020-05-07 14:36 ` [PATCH 2/4] habanalabs: define ASIC-dependent interface for signal/wait Oded Gabbay
2020-05-07 14:37 ` [PATCH 3/4] habanalabs: handle the h/w sync object Oded Gabbay
2020-05-07 14:37 ` [PATCH 4/4] habanalabs: add signal/wait to CS IOCTL operations Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).