All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kari Argillander <kari.argillander@stargateuniverse.net>
To: Cai Huoqing <cai.huoqing@linux.dev>
Cc: "Maarten Lankhorst" <maarten.lankhorst@linux.intel.com>,
	"Maxime Ripard" <mripard@kernel.org>,
	"Thomas Zimmermann" <tzimmermann@suse.de>,
	"David Airlie" <airlied@linux.ie>,
	"Daniel Vetter" <daniel@ffwll.ch>,
	"Sumit Semwal" <sumit.semwal@linaro.org>,
	"Christian König" <christian.koenig@amd.com>,
	linux-kernel@vger.kernel.org, dri-devel@lists.freedesktop.org,
	linux-media@vger.kernel.org, linaro-mm-sig@lists.linaro.org
Subject: Re: [PATCH 2/2] drm/nvdla: Add driver support for NVDLA
Date: Fri, 22 Apr 2022 01:01:14 +0300	[thread overview]
Message-ID: <0fa9ab41-c18e-a569-e6fe-a0e9d965905e@stargateuniverse.net> (raw)
In-Reply-To: <20220419135908.39606-3-cai.huoqing@linux.dev>

This is just quick look up. I basically check some style issues and did
some basic static analyzing.

I have run
  - cppcheck (which found couple mistakes)
  - flawfinder (did not found anything to my eyes)
  - codespell (did find couple typo)

You can run these yourself also or check below.

Couple common things which you can ignore or not	.
- Usually in this code there is goto exit and it is just return. Maybe
   use just return straight away. No need to jump.
- Some comments start capital others not. Maybe all should start
   capital. Very small nit, but makes nice touch to the code.
- Lot of oneline comments are unneccessary three line comments.

On 19.4.2022 16.59, Cai Huoqing wrote:
> The NVIDIA Deep Learning Accelerator (NVDLA) is an open source IP
> which is integrated into NVIDIA Jetson AGX Xavier,
> so add driver support for this accelerator.
> 
> Signed-off-by: Cai Huoqing <cai.huoqing@linux.dev>

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_bdma.c b/drivers/gpu/drm/nvdla/nvdla_bdma.c
> new file mode 100644
> index 000000000000..225613f27acf
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_bdma.c

... snip

> +static int32_t
> +processor_bdma_program_slot(struct dla_engine *engine,
> +							struct dla_bdma_surface_desc *bdma_surface,
> +							struct dla_bdma_transfer_desc *transfer)
> +{
> +	int32_t ret = 0;
> +	uint64_t source_addr = 0;
> +	uint64_t destination_addr = 0;
> +	uint32_t high, low, reg;
> +	uint8_t  bdma_free_slots = 0;
> +
> +	/* make sure there're enough free slots */
> +	if (bdma_free_slots <= 0) {

This is always true right now.

> +		do {
> +			reg = bdma_reg_read(engine, STATUS);
> +			reg = (reg & MASK(BDMA_STATUS_0, FREE_SLOT)) >>
> +					SHIFT(BDMA_STATUS_0, FREE_SLOT);
> +		} while (reg == 0);
> +		bdma_free_slots = (uint8_t)reg;
> +	}
> +
> +	dla_get_dma_address(engine->driver_context, engine->task->task_data,
> +						transfer->source_address,
> +						(void *)&source_addr,
> +						DESTINATION_DMA);
> +	dla_get_dma_address(engine->driver_context, engine->task->task_data,
> +						transfer->destination_address,
> +						(void *)&destination_addr,
> +						DESTINATION_DMA);
> +
> +	ASSERT_GOTO((transfer->line_repeat <= 8192),
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO((transfer->surface_repeat <= 8192),
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO((transfer->line_size % 32) == 0,
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO(transfer->source_line >= transfer->line_size,
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO(transfer->destination_line >= transfer->line_size,
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO(transfer->source_surface >=
> +			(transfer->source_line * transfer->line_repeat),
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO(transfer->destination_surface >=
> +			(transfer->destination_line * transfer->line_repeat),
> +				ret, -EINVAL, exit);
> +
> +	/* config registers */
> +	high = upper_32_bits(source_addr);
> +	low = lower_32_bits(source_addr);
> +	bdma_reg_write(engine, CFG_SRC_ADDR_LOW, low);
> +	bdma_reg_write(engine, CFG_SRC_ADDR_HIGH, high);
> +	high = upper_32_bits(destination_addr);
> +	low = lower_32_bits(destination_addr);
> +	bdma_reg_write(engine, CFG_DST_ADDR_LOW, low);
> +	bdma_reg_write(engine, CFG_DST_ADDR_HIGH, high);
> +	bdma_reg_write(engine, CFG_LINE, (transfer->line_size >> 5) - 1);
> +	reg = (map_mem[bdma_surface->source_type] <<
> +				SHIFT(BDMA_CFG_CMD_0, SRC_RAM_TYPE)) |
> +		(map_mem[bdma_surface->destination_type] <<
> +				SHIFT(BDMA_CFG_CMD_0, DST_RAM_TYPE));
> +	bdma_reg_write(engine, CFG_CMD, reg);
> +	bdma_reg_write(engine, CFG_LINE_REPEAT, transfer->line_repeat - 1);
> +	bdma_reg_write(engine, CFG_SRC_LINE, transfer->source_line);
> +	bdma_reg_write(engine, CFG_DST_LINE, transfer->destination_line);
> +	bdma_reg_write(engine, CFG_SURF_REPEAT, transfer->surface_repeat - 1);
> +	bdma_reg_write(engine, CFG_SRC_SURF, transfer->source_surface);
> +	bdma_reg_write(engine, CFG_DST_SURF, transfer->destination_surface);
> +	bdma_reg_write(engine, CFG_OP, FIELD_ENUM(BDMA_CFG_OP_0, EN, ENABLE));
> +
> +exit:
> +	return ret;
> +}

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_cache.c b/drivers/gpu/drm/nvdla/nvdla_cache.c
> new file mode 100644
> index 000000000000..f8bd7b514aab
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_cache.c
> @@ -0,0 +1,215 @@
> +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
> +/*
> + * Copyright (C) 2017-2018 NVIDIA CORPORATION
> + * Copyright (C) 2022 Cai Huoqing
> + */
> +
> +#include "nvdla_common.h"
> +#include "nvdla_drm.h"
> +#include "nvdla_reg.h"
> +#include "nvdla_engine.h"
> +
> +#define DLA_OP_CACHE_SIZE (DLA_NUM_GROUPS * ((DLA_OP_NUM + 2) * 2))
> +
> +static struct dla_common_op_desc desc_cache[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
> +static int32_t desc_refcount[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
> +
> +void
> +dla_get_refcount(struct dla_common_op_desc *op_desc)
> +{
> +	int32_t i;
> +	struct dla_common_op_desc *desc = NULL;
> +
> +	if (op_desc == NULL)
> +		return;
> +
> +	if (op_desc->index == -1)
> +		return;
> +
> +	desc = &desc_cache[op_desc->op_type][0];
> +
> +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> +		if (desc->index == op_desc->index &&
> +				desc->roi_index == op_desc->roi_index) {

reverse if

		if (desc->index != op_desc->index)
			continue;
		if (desc->roi_index != op_desc->roi_index)
			continue;

> +			desc_refcount[op_desc->op_type][i]++;
> +			return;
> +		}
> +	}
> +}
> +
> +struct dla_common_op_desc *
> +dla_get_op_desc(struct dla_engine *engine,
> +				struct dla_task *task, int16_t index,
> +				uint8_t op_type, uint8_t roi_index)
> +{
> +	int32_t i;
> +	int32_t ret;
> +	uint64_t op_base;
> +	uint64_t dep_graph_addr;
> +	struct dla_common_op_desc *desc = NULL;
> +
> +	if (index == -1) {
> +		pr_debug("no desc get due to index==-1\n");
> +		goto exit;
> +	}
> +
> +	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
> +				engine->network->num_operations * roi_index);
> +
> +	desc = &desc_cache[op_type][0];
> +
> +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> +		if (desc->index == index && desc->roi_index == roi_index) {
> +			if (desc->op_type != op_type) {
> +				pr_err("op_cache[op=%u] contains incorrect entry of op[%u]\n",
> +					   op_type, desc->op_type);
> +				continue;
> +			}

reverse if so this will be pretty clean

		if (desc->index != index)
			continue;
		if (desc->roi_index != roi_index)
			continue;
		if (desc->op_type != op_type) {
			pr_err("op_cache[op=%u] contains incorrect entry of op[%u]\n",
					op_type, desc->op_type);
			continue;
		}


> +			desc_refcount[op_type][i]++;
> +			goto exit;
> +		}
> +	}
> +
> +	desc = &desc_cache[op_type][0];
> +
> +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> +		if (desc->index == -1) {

reverse if
		if (desc->index != -1)
			continue;

> +			op_base = dep_graph_addr +
> +					(sizeof(struct dla_common_op_desc) *
> +					(uint64_t)index);
> +			ret = dla_data_read(engine->driver_context,
> +					task->task_data,
> +					task->dependency_graph_addr,
> +					(void *)(desc),
> +					sizeof(struct dla_common_op_desc),
> +					op_base);
> +			if (ret) {
> +				desc = NULL;
> +				goto exit;
> +			}
> +
> +			if (op_type != desc->op_type) {
> +				/*
> +				 * op_type of entry read from DRAM should not
> +				 * mismatch with given op_type. If they
> +				 * mismatches, then wrong entry is fetched, so
> +				 * report this issue by throwing error.
> +				 */
> +				pr_err("Fetched [op_type=%u] from DRAM doesn't match with op_type[%u]\n",
> +					   desc->op_type, op_type);
> +				desc->op_type = op_type;
> +				desc->index = -1;
> +				desc->roi_index = -1;
> +				desc = NULL;
> +				goto exit;
> +			}
> +
> +			desc->index = index;
> +			desc->roi_index = roi_index;
> +
> +			desc_refcount[op_type][i]++;
> +			goto exit;
> +		}
> +	}
> +
> +exit:
> +	return desc;
> +}
> +
> +static void
> +dla_free_op_desc(struct dla_engine *engine, struct dla_common_op_desc *op_desc)
> +{
> +	uint64_t op_base;
> +	uint64_t dep_graph_addr;
> +	struct dla_task *task;
> +
> +	pr_debug("Enter: %s op desc index %u ROI %d\n", __func__,
> +				op_desc->index, op_desc->roi_index);

Possiple null pointer dereference

> +	task = engine->task;
> +	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
> +				engine->network->num_operations *
> +				op_desc->roi_index);
> +
> +	if (op_desc->index == -1)
> +		goto exit;

Possiple null pointer dereference

> +	if (op_desc == NULL)
> +		goto exit;

Or this is unnecessary.

> +
> +	/**
> +	 * TODO: keeping the depth value hardcoded as 0 for now,
> +	 * need to replace it once corresponding implementation is done.
> +	 */
> +	op_base = (dep_graph_addr +
> +			(sizeof(struct dla_common_op_desc) *
> +			(uint64_t)op_desc->index));
> +
> +	/**
> +	 * Flush descriptor to DRAM
> +	 */
> +	dla_data_write(engine->driver_context,
> +			task->task_data,
> +			(void *)op_desc,
> +			task->dependency_graph_addr,
> +			sizeof(struct dla_common_op_desc),
> +			op_base);
> +
> +	/**
> +	 * Release it
> +	 */
> +	op_desc->index = -1;
> +	op_desc->roi_index = -1;
> +exit:
> +	return;
> +}
> +
> +void
> +dla_put_op_desc(struct dla_engine *engine, struct dla_common_op_desc *op_desc)
> +{
> +	int32_t i;
> +	struct dla_common_op_desc *desc;
> +
> +	if (op_desc == NULL)
> +		return;
> +
> +	if (op_desc->index == -1)
> +		return;
> +
> +	desc = &desc_cache[op_desc->op_type][0];
> +
> +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> +		if (desc->index == op_desc->index &&
> +				desc->roi_index == op_desc->roi_index) {

Reverse if.

		if (desc->index != op_desc->index)
			continue;
		if (desc->roi_index != op_desc->roi_index)
			continue;

> +
> +			desc_refcount[op_desc->op_type][i]--;
> +
> +			/**
> +			 * Free desc if refcount is 0
> +			 */
Pretty useless comment and totally not needed three line for this.

> +			if (desc_refcount[op_desc->op_type][i] == 0)
> +				dla_free_op_desc(engine, op_desc);
> +
> +			return;
> +		}
> +	}
> +}
> +
> +void
> +dla_init_op_cache(struct dla_engine *engine)
> +{
> +	int32_t i, j;
> +	struct dla_common_op_desc *desc = &desc_cache[0][0];
> +
> +	memset((uint8_t *)&desc_cache[0][0], 0, sizeof(desc_cache));
> +	memset((uint8_t *)&desc_refcount[0][0], 0, sizeof(desc_refcount));
> +
> +	for (i = 0; i < DLA_OP_NUM; i++) {
> +		for (j = 0; j < DLA_OP_CACHE_SIZE; j++) {
> +			desc->index = -1;
> +			desc->roi_index = -1;
> +			desc->op_type = (uint8_t)i;
> +			desc++;
> +		}
> +	}
> +}

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_common.h b/drivers/gpu/drm/nvdla/nvdla_common.h
> new file mode 100644
> index 000000000000..38cf43246890
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_common.h
> @@ -0,0 +1,835 @@

... snip

> +struct dla_conv_op_desc {
> +	/* Performance parameters */
> +
> +	/* dla_conv_mode */
> +	uint8_t conv_mode;
> +	uint8_t data_reuse;
> +	uint8_t weight_reuse;
> +	uint8_t skip_data_rls;
> +
> +	uint8_t skip_weight_rls;
> +	uint8_t reserved0;
> +	uint16_t entry_per_slice;
> +
> +	/* dla_data_format */
> +	uint8_t data_format;
> +	/* dla_pixel_mapping */
> +	uint8_t pixel_mapping;
> +	/* number of free slices before fetch */
> +	uint16_t fetch_grain;
> +
> +	uint8_t reserved_b[8];
> +
> +	/* batch_num */
> +	uint8_t batch;
> +	/* dla_weight_format */
> +	uint8_t weight_format;
> +	uint8_t data_bank;
> +	uint8_t weight_bank;
> +
> +	/* the offset in bytes of each data cube in a batch */
> +	uint32_t batch_stride;
> +
> +	uint8_t post_extension;
> +	uint8_t pixel_override;
> +	/* number of slices need to be released */
> +	uint16_t release;
> +
> +	 /* The input cube dimension for CSC */
> +	uint16_t input_width_csc;
> +	uint16_t input_height_csc;
> +
> +	uint16_t input_channel_csc;
> +	uint16_t kernel_width_csc;
> +
> +	uint16_t kernel_height_csc;
> +	uint16_t kernel_channel_csc;
> +
> +	/* The input cube dimension for CMAC */
> +	uint16_t input_width_cmac;
> +	uint16_t input_height_cmac;
> +
> +	/* actual size in bytes */
> +	uint32_t bytes_per_kernel;
> +
> +	/* Algorithm parameters */
> +
> +	int16_t mean_ry; /* mean value for red in RGB or Y in YUV */
> +	int16_t mean_gu; /* mean value for green in RGB or U in YUV */
> +
> +	int16_t mean_bv; /* mean value for blue in RGB or V in YUV */
> +	int16_t mean_ax;
> +
> +	uint8_t mean_format; /* dla_mean_format */
> +	uint8_t conv_stride_x;
> +	uint8_t conv_stride_y;
> +	uint8_t pad_x_left;
> +
> +	uint8_t pad_x_right;
> +	uint8_t pad_y_top;
> +	uint8_t pad_y_bottom;
> +	uint8_t dilation_x;
> +
> +	uint8_t dilation_y;
> +	uint8_t reserved2[2];
> +
> +	/* Precision parameters */
> +	uint8_t pra_truncate;
> +
> +	uint8_t in_precision;
> +	/* The output precision from CONV, it's the MAC processing precison */

./nvdla_common.h:428: precison ==> precision

> +	uint8_t out_precision;
> +	int16_t pad_val;
> +
> +	/* input converter parameters */
> +	struct dla_cvt_param in_cvt;
> +	/* output converter parameters, support truncate only */
> +	struct dla_cvt_param out_cvt;
> +
> +} __packed __aligned(4);
> +
> +struct dla_conv_stat_desc {
> +	uint32_t data_read_stall;
> +	uint32_t weight_read_stall;
> +	uint32_t data_read_latency;
> +	uint32_t weight_read_latency;
> +	uint32_t saturation_count;
> +	uint32_t nan_data_num;
> +	uint32_t nan_weight_num;
> +	uint32_t inf_data_num;
> +	uint32_t inf_weight_num;
> +} __packed __aligned(4);
> +
> +/**
> + * @ingroup SDP
> + * @name Activation functions
> + * @brief Activation functions supported in SDP
> + * @{
> + */
> +#define ACTIVATION_NONE		0
> +#define ACTIVATION_RELU		1
> +#define ACTIVATION_LUT		2
> +#define ACTIVATION_PRELU	3
> +/** @} */
> +
> +/**
> + * @ingroup LUT
> + * @name LUT size
> + * @brief LUT sizes for linear and exponentila LUT
> + * @{
> + */
> +#define LUT_LINEAR_EXP_TABLE_ENTRY_LOG2		6
> +#define LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2	8
> +/** @} */
> +
> +/**
> + * @ingroup LUT
> + * @name LUT types
> + * @brief DLA supports two types of LUT, linear and exonential
> + * @{
> + */
> +#define LUT_LINEAR_EXP_TABLE		0
> +#define LUT_LINEAR_ONLY_TABLE		1
> +/** @} */
> +
> +/**
> + * @ingroup LUT
> + * @name LUT methods
> + * @brief DLA supports two types of LUT, linear and exonential
> + * @{
> + */
> +#define LUT_METHOD_EXPONENTIAL		0
> +#define LUT_METHOD_LINEAR		1
> +/** @} */
> +
> +/**
> + * @ingroup LUT
> + * @name LUT
> + * @brief DLA supports two types of LUT, linear and exonential
> + * @{
> + */
> +#define LUT_PRI_LINEAR_EXP		0
> +#define LUT_PRI_LINEAR_ONLY		1
> +/** @} */
> +
> +union dla_lut_offset {
> +	/**
> +	 * Number should be substracted on log domain before look up

./nvdla_common.h:505: substracted ==> subtracted

> +	 * exponetial table it has the same definition as hardware

./nvdla_common.h:506: exponetial ==> exponential

> +	 * thus input scaling should also take into account when
> +	 * set this field.
> +	 */
> +	int8_t exp_offset;
> +	/**
> +	 * Number of bits should be right shift before looking
> +	 * up linear table
> +	 */
> +	int8_t frac_bits;
> +	uint16_t reserved0;
> +};

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_drm.c b/drivers/gpu/drm/nvdla/nvdla_drm.c
> new file mode 100644
> index 000000000000..9217eee1de3b
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_drm.c
> @@ -0,0 +1,695 @@
> +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
> +/*
> + * Copyright (C) 2017-2018 NVIDIA CORPORATION
> + * Copyright (C) 2022 Cai Huoqing
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/fs.h>
> +#include <linux/interrupt.h>
> +#include <linux/irq.h>
> +#include <linux/irqdomain.h>
> +#include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/of_device.h>
> +#include <linux/of_irq.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/printk.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/time.h>
> +#include <linux/uaccess.h>
> +#include <linux/types.h>
> +
> +#include "nvdla_drm.h"
> +#include "nvdla_ioctl.h"
> +#include "nvdla_engine.h"
> +
> +static struct nvdla_config nvdla_config_os_initial = {
> +	.atom_size = 32,
> +	.bdma_enable = true,
> +	.rubik_enable = true,
> +	.weight_compress_support = true,
> +};
> +
> +static struct nvdla_config nvdla_config_small = {
> +	//.atom_size = 8,
> +	.atom_size = 32,  // nv_large config
> +	.bdma_enable = false,
> +	.rubik_enable = false,
> +	.weight_compress_support = false,
> +};
> +
> +int64_t dla_get_time_us(void)

Funtion is never used.

> +{
> +	return ktime_get_ns() / NSEC_PER_USEC;
> +}
> +
> +void dla_reg_write(void *driver_context, uint32_t addr, uint32_t reg)
> +{
> +	struct nvdla_device *nvdla_dev =
> +			(struct nvdla_device *)driver_context;
> +
> +	if (!nvdla_dev)
> +		return;
> +
> +	writel(reg, nvdla_dev->base + addr);
> +}
> +
> +uint32_t dla_reg_read(void *driver_context, uint32_t addr)
> +{
> +	struct nvdla_device *nvdla_dev =
> +			(struct nvdla_device *)driver_context;
> +
> +	if (!nvdla_dev)
> +		return 0;
> +
> +	return readl(nvdla_dev->base + addr);
> +}
> +
> +static irqreturn_t nvdla_engine_isr(int32_t irq, void *data)
> +{
> +	unsigned long flags;
> +	uint32_t mask;
> +	uint32_t reg;
> +	struct dla_processor *processor = NULL;
> +	struct dla_processor_group *group;
> +	struct dla_engine *engine;
> +	struct nvdla_device *nvdla_dev = (struct nvdla_device *)data;
> +
> +	if (!nvdla_dev)
> +		return IRQ_NONE;
> +
> +	engine = nvdla_dev->engine_context;
> +	spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags);
> +
> +	mask = glb_reg_read(engine, S_INTR_MASK);

Never used. It would be nice so that static analyzer will not complain
these anymore, but your choice what you want to do.

> +	reg = glb_reg_read(engine, S_INTR_STATUS);
> +
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_SDP];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_SDP];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_CDP];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_CDP];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_RUBIK];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_RUBIK];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_PDP];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_PDP];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_BDMA];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_BDMA];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
> +	}
> +
> +	glb_reg_write(engine, S_INTR_STATUS, reg);
> +	mask = glb_reg_read(engine, S_INTR_MASK);

Never used

> +	reg = glb_reg_read(engine, S_INTR_STATUS);

Never used.

> +
> +	complete(&nvdla_dev->event_notifier);
> +	spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags);
> +
> +	return IRQ_HANDLED;
> +}

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_gem.c b/drivers/gpu/drm/nvdla/nvdla_gem.c
> new file mode 100644
> index 000000000000..cccf6d01a564
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_gem.c

... snip

> +static const struct drm_ioctl_desc nvdla_drm_ioctls[] = {
> +	DRM_IOCTL_DEF_DRV(NVDLA_SUBMIT, nvdla_submit, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(NVDLA_GEM_CREATE, nvdla_gem_create, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(NVDLA_GEM_MMAP, nvdla_gem_map_offset, DRM_RENDER_ALLOW),
> +	/* use DRM_IOCTL_MODE_DESTROY_DUMB to destory */

./nvdla_gem.c:347: destory ==> destroy

> +};

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_scheduler.c b/drivers/gpu/drm/nvdla/nvdla_scheduler.c
> new file mode 100644
> index 000000000000..b814077478c6
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_scheduler.c

... snip

> +static int
> +dla_update_dependency(struct dla_engine *engine,
> +					  struct dla_consumer *consumer,
> +					  struct dla_common_op_desc *op_desc,
> +					  uint8_t event, uint8_t roi_index)
> +{
> +	int32_t ret = 0;
> +	struct dla_processor *processor;
> +
> +	if (consumer->index == -1)
> +		goto exit;
> +
> +	/* Update dependency only if event matches */
> +	if (event != consumer->event)
> +		goto exit;
> +
> +	/**
> +	 * If consumer index is valid but op desc is NULL means
> +	 * op desc for consumer was not pre-fetched
> +	 */
> +	if (op_desc == NULL) {
> +		ret = -EINVAL;
> +		pr_err("Operation descriptor is NULL, consumer index %d",
> +				consumer->index);
> +		goto exit;
> +	}
> +
> +	pr_debug("Update dependency operation index %d ROI %d DEP_COUNT=%d\n",
> +					op_desc->index, op_desc->roi_index,
> +					op_desc->dependency_count);
> +	op_desc->dependency_count--;
> +
> +	if (op_desc->dependency_count == 0) {
> +		processor = &engine->processors[op_desc->op_type];
> +		pr_debug("enable %s in %s as depdency are resolved\n",

./nvdla_scheduler.c:455: depdency ==> dependency

> +			processor->name, __func__);
> +
> +		ret = dla_enable_operation(engine, processor, op_desc);
> +		if (ret)
> +			goto exit;
> +	}
> +exit:
> +	return ret;
> +}

... snip

> +int
> +dla_process_events(struct dla_engine *engine, uint32_t *task_complete)
> +{
> +	int32_t i;
> +	int32_t ret = 0;
> +
> +	for (i = 0; i < DLA_OP_NUM; i++) {
> +		struct dla_processor *processor;
> +
> +		processor = &engine->processors[i];
> +		ret = dla_handle_events(engine, processor);
> +		/**
> +		 * Incase engine status is non-zero, then don't

./nvdla_scheduler.c:905: Incase ==> In case

> +		 * update the engine status. We should keep its
> +		 * status for later cleaning of engine.
> +		 */
> +		if (!engine->status)
> +			engine->status = ret;
> +	}
> +
> +	if (engine->network->num_operations == engine->num_proc_hwl)
> +		*task_complete = 1;
> +
> +	return ret;
> +}

... snip

   Argillander

WARNING: multiple messages have this Message-ID (diff)
From: Kari Argillander <kari.argillander@stargateuniverse.net>
To: Cai Huoqing <cai.huoqing@linux.dev>
Cc: "Thomas Zimmermann" <tzimmermann@suse.de>,
	"David Airlie" <airlied@linux.ie>,
	linux-kernel@vger.kernel.org,
	"Christian König" <christian.koenig@amd.com>,
	linaro-mm-sig@lists.linaro.org, dri-devel@lists.freedesktop.org,
	"Sumit Semwal" <sumit.semwal@linaro.org>,
	linux-media@vger.kernel.org
Subject: Re: [PATCH 2/2] drm/nvdla: Add driver support for NVDLA
Date: Fri, 22 Apr 2022 01:01:14 +0300	[thread overview]
Message-ID: <0fa9ab41-c18e-a569-e6fe-a0e9d965905e@stargateuniverse.net> (raw)
In-Reply-To: <20220419135908.39606-3-cai.huoqing@linux.dev>

This is just quick look up. I basically check some style issues and did
some basic static analyzing.

I have run
  - cppcheck (which found couple mistakes)
  - flawfinder (did not found anything to my eyes)
  - codespell (did find couple typo)

You can run these yourself also or check below.

Couple common things which you can ignore or not	.
- Usually in this code there is goto exit and it is just return. Maybe
   use just return straight away. No need to jump.
- Some comments start capital others not. Maybe all should start
   capital. Very small nit, but makes nice touch to the code.
- Lot of oneline comments are unneccessary three line comments.

On 19.4.2022 16.59, Cai Huoqing wrote:
> The NVIDIA Deep Learning Accelerator (NVDLA) is an open source IP
> which is integrated into NVIDIA Jetson AGX Xavier,
> so add driver support for this accelerator.
> 
> Signed-off-by: Cai Huoqing <cai.huoqing@linux.dev>

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_bdma.c b/drivers/gpu/drm/nvdla/nvdla_bdma.c
> new file mode 100644
> index 000000000000..225613f27acf
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_bdma.c

... snip

> +static int32_t
> +processor_bdma_program_slot(struct dla_engine *engine,
> +							struct dla_bdma_surface_desc *bdma_surface,
> +							struct dla_bdma_transfer_desc *transfer)
> +{
> +	int32_t ret = 0;
> +	uint64_t source_addr = 0;
> +	uint64_t destination_addr = 0;
> +	uint32_t high, low, reg;
> +	uint8_t  bdma_free_slots = 0;
> +
> +	/* make sure there're enough free slots */
> +	if (bdma_free_slots <= 0) {

This is always true right now.

> +		do {
> +			reg = bdma_reg_read(engine, STATUS);
> +			reg = (reg & MASK(BDMA_STATUS_0, FREE_SLOT)) >>
> +					SHIFT(BDMA_STATUS_0, FREE_SLOT);
> +		} while (reg == 0);
> +		bdma_free_slots = (uint8_t)reg;
> +	}
> +
> +	dla_get_dma_address(engine->driver_context, engine->task->task_data,
> +						transfer->source_address,
> +						(void *)&source_addr,
> +						DESTINATION_DMA);
> +	dla_get_dma_address(engine->driver_context, engine->task->task_data,
> +						transfer->destination_address,
> +						(void *)&destination_addr,
> +						DESTINATION_DMA);
> +
> +	ASSERT_GOTO((transfer->line_repeat <= 8192),
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO((transfer->surface_repeat <= 8192),
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO((transfer->line_size % 32) == 0,
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO(transfer->source_line >= transfer->line_size,
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO(transfer->destination_line >= transfer->line_size,
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO(transfer->source_surface >=
> +			(transfer->source_line * transfer->line_repeat),
> +				ret, -EINVAL, exit);
> +	ASSERT_GOTO(transfer->destination_surface >=
> +			(transfer->destination_line * transfer->line_repeat),
> +				ret, -EINVAL, exit);
> +
> +	/* config registers */
> +	high = upper_32_bits(source_addr);
> +	low = lower_32_bits(source_addr);
> +	bdma_reg_write(engine, CFG_SRC_ADDR_LOW, low);
> +	bdma_reg_write(engine, CFG_SRC_ADDR_HIGH, high);
> +	high = upper_32_bits(destination_addr);
> +	low = lower_32_bits(destination_addr);
> +	bdma_reg_write(engine, CFG_DST_ADDR_LOW, low);
> +	bdma_reg_write(engine, CFG_DST_ADDR_HIGH, high);
> +	bdma_reg_write(engine, CFG_LINE, (transfer->line_size >> 5) - 1);
> +	reg = (map_mem[bdma_surface->source_type] <<
> +				SHIFT(BDMA_CFG_CMD_0, SRC_RAM_TYPE)) |
> +		(map_mem[bdma_surface->destination_type] <<
> +				SHIFT(BDMA_CFG_CMD_0, DST_RAM_TYPE));
> +	bdma_reg_write(engine, CFG_CMD, reg);
> +	bdma_reg_write(engine, CFG_LINE_REPEAT, transfer->line_repeat - 1);
> +	bdma_reg_write(engine, CFG_SRC_LINE, transfer->source_line);
> +	bdma_reg_write(engine, CFG_DST_LINE, transfer->destination_line);
> +	bdma_reg_write(engine, CFG_SURF_REPEAT, transfer->surface_repeat - 1);
> +	bdma_reg_write(engine, CFG_SRC_SURF, transfer->source_surface);
> +	bdma_reg_write(engine, CFG_DST_SURF, transfer->destination_surface);
> +	bdma_reg_write(engine, CFG_OP, FIELD_ENUM(BDMA_CFG_OP_0, EN, ENABLE));
> +
> +exit:
> +	return ret;
> +}

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_cache.c b/drivers/gpu/drm/nvdla/nvdla_cache.c
> new file mode 100644
> index 000000000000..f8bd7b514aab
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_cache.c
> @@ -0,0 +1,215 @@
> +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
> +/*
> + * Copyright (C) 2017-2018 NVIDIA CORPORATION
> + * Copyright (C) 2022 Cai Huoqing
> + */
> +
> +#include "nvdla_common.h"
> +#include "nvdla_drm.h"
> +#include "nvdla_reg.h"
> +#include "nvdla_engine.h"
> +
> +#define DLA_OP_CACHE_SIZE (DLA_NUM_GROUPS * ((DLA_OP_NUM + 2) * 2))
> +
> +static struct dla_common_op_desc desc_cache[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
> +static int32_t desc_refcount[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
> +
> +void
> +dla_get_refcount(struct dla_common_op_desc *op_desc)
> +{
> +	int32_t i;
> +	struct dla_common_op_desc *desc = NULL;
> +
> +	if (op_desc == NULL)
> +		return;
> +
> +	if (op_desc->index == -1)
> +		return;
> +
> +	desc = &desc_cache[op_desc->op_type][0];
> +
> +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> +		if (desc->index == op_desc->index &&
> +				desc->roi_index == op_desc->roi_index) {

reverse if

		if (desc->index != op_desc->index)
			continue;
		if (desc->roi_index != op_desc->roi_index)
			continue;

> +			desc_refcount[op_desc->op_type][i]++;
> +			return;
> +		}
> +	}
> +}
> +
> +struct dla_common_op_desc *
> +dla_get_op_desc(struct dla_engine *engine,
> +				struct dla_task *task, int16_t index,
> +				uint8_t op_type, uint8_t roi_index)
> +{
> +	int32_t i;
> +	int32_t ret;
> +	uint64_t op_base;
> +	uint64_t dep_graph_addr;
> +	struct dla_common_op_desc *desc = NULL;
> +
> +	if (index == -1) {
> +		pr_debug("no desc get due to index==-1\n");
> +		goto exit;
> +	}
> +
> +	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
> +				engine->network->num_operations * roi_index);
> +
> +	desc = &desc_cache[op_type][0];
> +
> +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> +		if (desc->index == index && desc->roi_index == roi_index) {
> +			if (desc->op_type != op_type) {
> +				pr_err("op_cache[op=%u] contains incorrect entry of op[%u]\n",
> +					   op_type, desc->op_type);
> +				continue;
> +			}

reverse if so this will be pretty clean

		if (desc->index != index)
			continue;
		if (desc->roi_index != roi_index)
			continue;
		if (desc->op_type != op_type) {
			pr_err("op_cache[op=%u] contains incorrect entry of op[%u]\n",
					op_type, desc->op_type);
			continue;
		}


> +			desc_refcount[op_type][i]++;
> +			goto exit;
> +		}
> +	}
> +
> +	desc = &desc_cache[op_type][0];
> +
> +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> +		if (desc->index == -1) {

reverse if
		if (desc->index != -1)
			continue;

> +			op_base = dep_graph_addr +
> +					(sizeof(struct dla_common_op_desc) *
> +					(uint64_t)index);
> +			ret = dla_data_read(engine->driver_context,
> +					task->task_data,
> +					task->dependency_graph_addr,
> +					(void *)(desc),
> +					sizeof(struct dla_common_op_desc),
> +					op_base);
> +			if (ret) {
> +				desc = NULL;
> +				goto exit;
> +			}
> +
> +			if (op_type != desc->op_type) {
> +				/*
> +				 * op_type of entry read from DRAM should not
> +				 * mismatch with given op_type. If they
> +				 * mismatches, then wrong entry is fetched, so
> +				 * report this issue by throwing error.
> +				 */
> +				pr_err("Fetched [op_type=%u] from DRAM doesn't match with op_type[%u]\n",
> +					   desc->op_type, op_type);
> +				desc->op_type = op_type;
> +				desc->index = -1;
> +				desc->roi_index = -1;
> +				desc = NULL;
> +				goto exit;
> +			}
> +
> +			desc->index = index;
> +			desc->roi_index = roi_index;
> +
> +			desc_refcount[op_type][i]++;
> +			goto exit;
> +		}
> +	}
> +
> +exit:
> +	return desc;
> +}
> +
> +static void
> +dla_free_op_desc(struct dla_engine *engine, struct dla_common_op_desc *op_desc)
> +{
> +	uint64_t op_base;
> +	uint64_t dep_graph_addr;
> +	struct dla_task *task;
> +
> +	pr_debug("Enter: %s op desc index %u ROI %d\n", __func__,
> +				op_desc->index, op_desc->roi_index);

Possiple null pointer dereference

> +	task = engine->task;
> +	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
> +				engine->network->num_operations *
> +				op_desc->roi_index);
> +
> +	if (op_desc->index == -1)
> +		goto exit;

Possiple null pointer dereference

> +	if (op_desc == NULL)
> +		goto exit;

Or this is unnecessary.

> +
> +	/**
> +	 * TODO: keeping the depth value hardcoded as 0 for now,
> +	 * need to replace it once corresponding implementation is done.
> +	 */
> +	op_base = (dep_graph_addr +
> +			(sizeof(struct dla_common_op_desc) *
> +			(uint64_t)op_desc->index));
> +
> +	/**
> +	 * Flush descriptor to DRAM
> +	 */
> +	dla_data_write(engine->driver_context,
> +			task->task_data,
> +			(void *)op_desc,
> +			task->dependency_graph_addr,
> +			sizeof(struct dla_common_op_desc),
> +			op_base);
> +
> +	/**
> +	 * Release it
> +	 */
> +	op_desc->index = -1;
> +	op_desc->roi_index = -1;
> +exit:
> +	return;
> +}
> +
> +void
> +dla_put_op_desc(struct dla_engine *engine, struct dla_common_op_desc *op_desc)
> +{
> +	int32_t i;
> +	struct dla_common_op_desc *desc;
> +
> +	if (op_desc == NULL)
> +		return;
> +
> +	if (op_desc->index == -1)
> +		return;
> +
> +	desc = &desc_cache[op_desc->op_type][0];
> +
> +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> +		if (desc->index == op_desc->index &&
> +				desc->roi_index == op_desc->roi_index) {

Reverse if.

		if (desc->index != op_desc->index)
			continue;
		if (desc->roi_index != op_desc->roi_index)
			continue;

> +
> +			desc_refcount[op_desc->op_type][i]--;
> +
> +			/**
> +			 * Free desc if refcount is 0
> +			 */
Pretty useless comment and totally not needed three line for this.

> +			if (desc_refcount[op_desc->op_type][i] == 0)
> +				dla_free_op_desc(engine, op_desc);
> +
> +			return;
> +		}
> +	}
> +}
> +
> +void
> +dla_init_op_cache(struct dla_engine *engine)
> +{
> +	int32_t i, j;
> +	struct dla_common_op_desc *desc = &desc_cache[0][0];
> +
> +	memset((uint8_t *)&desc_cache[0][0], 0, sizeof(desc_cache));
> +	memset((uint8_t *)&desc_refcount[0][0], 0, sizeof(desc_refcount));
> +
> +	for (i = 0; i < DLA_OP_NUM; i++) {
> +		for (j = 0; j < DLA_OP_CACHE_SIZE; j++) {
> +			desc->index = -1;
> +			desc->roi_index = -1;
> +			desc->op_type = (uint8_t)i;
> +			desc++;
> +		}
> +	}
> +}

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_common.h b/drivers/gpu/drm/nvdla/nvdla_common.h
> new file mode 100644
> index 000000000000..38cf43246890
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_common.h
> @@ -0,0 +1,835 @@

... snip

> +struct dla_conv_op_desc {
> +	/* Performance parameters */
> +
> +	/* dla_conv_mode */
> +	uint8_t conv_mode;
> +	uint8_t data_reuse;
> +	uint8_t weight_reuse;
> +	uint8_t skip_data_rls;
> +
> +	uint8_t skip_weight_rls;
> +	uint8_t reserved0;
> +	uint16_t entry_per_slice;
> +
> +	/* dla_data_format */
> +	uint8_t data_format;
> +	/* dla_pixel_mapping */
> +	uint8_t pixel_mapping;
> +	/* number of free slices before fetch */
> +	uint16_t fetch_grain;
> +
> +	uint8_t reserved_b[8];
> +
> +	/* batch_num */
> +	uint8_t batch;
> +	/* dla_weight_format */
> +	uint8_t weight_format;
> +	uint8_t data_bank;
> +	uint8_t weight_bank;
> +
> +	/* the offset in bytes of each data cube in a batch */
> +	uint32_t batch_stride;
> +
> +	uint8_t post_extension;
> +	uint8_t pixel_override;
> +	/* number of slices need to be released */
> +	uint16_t release;
> +
> +	 /* The input cube dimension for CSC */
> +	uint16_t input_width_csc;
> +	uint16_t input_height_csc;
> +
> +	uint16_t input_channel_csc;
> +	uint16_t kernel_width_csc;
> +
> +	uint16_t kernel_height_csc;
> +	uint16_t kernel_channel_csc;
> +
> +	/* The input cube dimension for CMAC */
> +	uint16_t input_width_cmac;
> +	uint16_t input_height_cmac;
> +
> +	/* actual size in bytes */
> +	uint32_t bytes_per_kernel;
> +
> +	/* Algorithm parameters */
> +
> +	int16_t mean_ry; /* mean value for red in RGB or Y in YUV */
> +	int16_t mean_gu; /* mean value for green in RGB or U in YUV */
> +
> +	int16_t mean_bv; /* mean value for blue in RGB or V in YUV */
> +	int16_t mean_ax;
> +
> +	uint8_t mean_format; /* dla_mean_format */
> +	uint8_t conv_stride_x;
> +	uint8_t conv_stride_y;
> +	uint8_t pad_x_left;
> +
> +	uint8_t pad_x_right;
> +	uint8_t pad_y_top;
> +	uint8_t pad_y_bottom;
> +	uint8_t dilation_x;
> +
> +	uint8_t dilation_y;
> +	uint8_t reserved2[2];
> +
> +	/* Precision parameters */
> +	uint8_t pra_truncate;
> +
> +	uint8_t in_precision;
> +	/* The output precision from CONV, it's the MAC processing precison */

./nvdla_common.h:428: precison ==> precision

> +	uint8_t out_precision;
> +	int16_t pad_val;
> +
> +	/* input converter parameters */
> +	struct dla_cvt_param in_cvt;
> +	/* output converter parameters, support truncate only */
> +	struct dla_cvt_param out_cvt;
> +
> +} __packed __aligned(4);
> +
> +struct dla_conv_stat_desc {
> +	uint32_t data_read_stall;
> +	uint32_t weight_read_stall;
> +	uint32_t data_read_latency;
> +	uint32_t weight_read_latency;
> +	uint32_t saturation_count;
> +	uint32_t nan_data_num;
> +	uint32_t nan_weight_num;
> +	uint32_t inf_data_num;
> +	uint32_t inf_weight_num;
> +} __packed __aligned(4);
> +
> +/**
> + * @ingroup SDP
> + * @name Activation functions
> + * @brief Activation functions supported in SDP
> + * @{
> + */
> +#define ACTIVATION_NONE		0
> +#define ACTIVATION_RELU		1
> +#define ACTIVATION_LUT		2
> +#define ACTIVATION_PRELU	3
> +/** @} */
> +
> +/**
> + * @ingroup LUT
> + * @name LUT size
> + * @brief LUT sizes for linear and exponentila LUT
> + * @{
> + */
> +#define LUT_LINEAR_EXP_TABLE_ENTRY_LOG2		6
> +#define LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2	8
> +/** @} */
> +
> +/**
> + * @ingroup LUT
> + * @name LUT types
> + * @brief DLA supports two types of LUT, linear and exonential
> + * @{
> + */
> +#define LUT_LINEAR_EXP_TABLE		0
> +#define LUT_LINEAR_ONLY_TABLE		1
> +/** @} */
> +
> +/**
> + * @ingroup LUT
> + * @name LUT methods
> + * @brief DLA supports two types of LUT, linear and exonential
> + * @{
> + */
> +#define LUT_METHOD_EXPONENTIAL		0
> +#define LUT_METHOD_LINEAR		1
> +/** @} */
> +
> +/**
> + * @ingroup LUT
> + * @name LUT
> + * @brief DLA supports two types of LUT, linear and exonential
> + * @{
> + */
> +#define LUT_PRI_LINEAR_EXP		0
> +#define LUT_PRI_LINEAR_ONLY		1
> +/** @} */
> +
> +union dla_lut_offset {
> +	/**
> +	 * Number should be substracted on log domain before look up

./nvdla_common.h:505: substracted ==> subtracted

> +	 * exponetial table it has the same definition as hardware

./nvdla_common.h:506: exponetial ==> exponential

> +	 * thus input scaling should also take into account when
> +	 * set this field.
> +	 */
> +	int8_t exp_offset;
> +	/**
> +	 * Number of bits should be right shift before looking
> +	 * up linear table
> +	 */
> +	int8_t frac_bits;
> +	uint16_t reserved0;
> +};

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_drm.c b/drivers/gpu/drm/nvdla/nvdla_drm.c
> new file mode 100644
> index 000000000000..9217eee1de3b
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_drm.c
> @@ -0,0 +1,695 @@
> +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
> +/*
> + * Copyright (C) 2017-2018 NVIDIA CORPORATION
> + * Copyright (C) 2022 Cai Huoqing
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/fs.h>
> +#include <linux/interrupt.h>
> +#include <linux/irq.h>
> +#include <linux/irqdomain.h>
> +#include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/of_device.h>
> +#include <linux/of_irq.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/printk.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/time.h>
> +#include <linux/uaccess.h>
> +#include <linux/types.h>
> +
> +#include "nvdla_drm.h"
> +#include "nvdla_ioctl.h"
> +#include "nvdla_engine.h"
> +
> +static struct nvdla_config nvdla_config_os_initial = {
> +	.atom_size = 32,
> +	.bdma_enable = true,
> +	.rubik_enable = true,
> +	.weight_compress_support = true,
> +};
> +
> +static struct nvdla_config nvdla_config_small = {
> +	//.atom_size = 8,
> +	.atom_size = 32,  // nv_large config
> +	.bdma_enable = false,
> +	.rubik_enable = false,
> +	.weight_compress_support = false,
> +};
> +
> +int64_t dla_get_time_us(void)

Funtion is never used.

> +{
> +	return ktime_get_ns() / NSEC_PER_USEC;
> +}
> +
> +void dla_reg_write(void *driver_context, uint32_t addr, uint32_t reg)
> +{
> +	struct nvdla_device *nvdla_dev =
> +			(struct nvdla_device *)driver_context;
> +
> +	if (!nvdla_dev)
> +		return;
> +
> +	writel(reg, nvdla_dev->base + addr);
> +}
> +
> +uint32_t dla_reg_read(void *driver_context, uint32_t addr)
> +{
> +	struct nvdla_device *nvdla_dev =
> +			(struct nvdla_device *)driver_context;
> +
> +	if (!nvdla_dev)
> +		return 0;
> +
> +	return readl(nvdla_dev->base + addr);
> +}
> +
> +static irqreturn_t nvdla_engine_isr(int32_t irq, void *data)
> +{
> +	unsigned long flags;
> +	uint32_t mask;
> +	uint32_t reg;
> +	struct dla_processor *processor = NULL;
> +	struct dla_processor_group *group;
> +	struct dla_engine *engine;
> +	struct nvdla_device *nvdla_dev = (struct nvdla_device *)data;
> +
> +	if (!nvdla_dev)
> +		return IRQ_NONE;
> +
> +	engine = nvdla_dev->engine_context;
> +	spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags);
> +
> +	mask = glb_reg_read(engine, S_INTR_MASK);

Never used. It would be nice so that static analyzer will not complain
these anymore, but your choice what you want to do.

> +	reg = glb_reg_read(engine, S_INTR_STATUS);
> +
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_SDP];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_SDP];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_CDP];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_CDP];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_RUBIK];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_RUBIK];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_PDP];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_PDP];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_BDMA];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_BDMA];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS0)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[0];
> +		group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
> +	}
> +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS1)) {
> +		processor = &engine->processors[DLA_OP_CONV];
> +		group = &processor->groups[1];
> +		group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
> +	}
> +
> +	glb_reg_write(engine, S_INTR_STATUS, reg);
> +	mask = glb_reg_read(engine, S_INTR_MASK);

Never used

> +	reg = glb_reg_read(engine, S_INTR_STATUS);

Never used.

> +
> +	complete(&nvdla_dev->event_notifier);
> +	spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags);
> +
> +	return IRQ_HANDLED;
> +}

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_gem.c b/drivers/gpu/drm/nvdla/nvdla_gem.c
> new file mode 100644
> index 000000000000..cccf6d01a564
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_gem.c

... snip

> +static const struct drm_ioctl_desc nvdla_drm_ioctls[] = {
> +	DRM_IOCTL_DEF_DRV(NVDLA_SUBMIT, nvdla_submit, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(NVDLA_GEM_CREATE, nvdla_gem_create, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(NVDLA_GEM_MMAP, nvdla_gem_map_offset, DRM_RENDER_ALLOW),
> +	/* use DRM_IOCTL_MODE_DESTROY_DUMB to destory */

./nvdla_gem.c:347: destory ==> destroy

> +};

... snip

> diff --git a/drivers/gpu/drm/nvdla/nvdla_scheduler.c b/drivers/gpu/drm/nvdla/nvdla_scheduler.c
> new file mode 100644
> index 000000000000..b814077478c6
> --- /dev/null
> +++ b/drivers/gpu/drm/nvdla/nvdla_scheduler.c

... snip

> +static int
> +dla_update_dependency(struct dla_engine *engine,
> +					  struct dla_consumer *consumer,
> +					  struct dla_common_op_desc *op_desc,
> +					  uint8_t event, uint8_t roi_index)
> +{
> +	int32_t ret = 0;
> +	struct dla_processor *processor;
> +
> +	if (consumer->index == -1)
> +		goto exit;
> +
> +	/* Update dependency only if event matches */
> +	if (event != consumer->event)
> +		goto exit;
> +
> +	/**
> +	 * If consumer index is valid but op desc is NULL means
> +	 * op desc for consumer was not pre-fetched
> +	 */
> +	if (op_desc == NULL) {
> +		ret = -EINVAL;
> +		pr_err("Operation descriptor is NULL, consumer index %d",
> +				consumer->index);
> +		goto exit;
> +	}
> +
> +	pr_debug("Update dependency operation index %d ROI %d DEP_COUNT=%d\n",
> +					op_desc->index, op_desc->roi_index,
> +					op_desc->dependency_count);
> +	op_desc->dependency_count--;
> +
> +	if (op_desc->dependency_count == 0) {
> +		processor = &engine->processors[op_desc->op_type];
> +		pr_debug("enable %s in %s as depdency are resolved\n",

./nvdla_scheduler.c:455: depdency ==> dependency

> +			processor->name, __func__);
> +
> +		ret = dla_enable_operation(engine, processor, op_desc);
> +		if (ret)
> +			goto exit;
> +	}
> +exit:
> +	return ret;
> +}

... snip

> +int
> +dla_process_events(struct dla_engine *engine, uint32_t *task_complete)
> +{
> +	int32_t i;
> +	int32_t ret = 0;
> +
> +	for (i = 0; i < DLA_OP_NUM; i++) {
> +		struct dla_processor *processor;
> +
> +		processor = &engine->processors[i];
> +		ret = dla_handle_events(engine, processor);
> +		/**
> +		 * Incase engine status is non-zero, then don't

./nvdla_scheduler.c:905: Incase ==> In case

> +		 * update the engine status. We should keep its
> +		 * status for later cleaning of engine.
> +		 */
> +		if (!engine->status)
> +			engine->status = ret;
> +	}
> +
> +	if (engine->network->num_operations == engine->num_proc_hwl)
> +		*task_complete = 1;
> +
> +	return ret;
> +}

... snip

   Argillander

  parent reply	other threads:[~2022-04-21 22:01 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-19 13:58 [PATCH 0/2] drm/nvdla: Add driver support for NVDLA Cai Huoqing
2022-04-19 13:58 ` Cai Huoqing
2022-04-19 13:58 ` [PATCH 1/2] MAINTAINERS: Add the driver info of the NVDLA Cai Huoqing
2022-04-19 13:58   ` Cai Huoqing
2022-04-19 13:59 ` [PATCH 2/2] drm/nvdla: Add driver support for NVDLA Cai Huoqing
2022-04-19 13:59   ` Cai Huoqing
2022-04-19 14:07   ` Christian König
2022-04-19 14:07     ` Christian König
2022-04-19 14:35     ` Cai Huoqing
2022-04-19 14:35       ` Cai Huoqing
2022-04-20  7:53   ` kernel test robot
2022-04-20  7:53     ` kernel test robot
2022-04-20  9:57   ` kernel test robot
2022-04-20  9:57     ` kernel test robot
2022-04-20 10:26   ` Thomas Zimmermann
2022-04-20 10:26     ` Thomas Zimmermann
2022-04-21  8:09   ` Cai Huoqing
2022-04-21  8:30   ` Thomas Zimmermann
2022-04-21  8:30     ` Thomas Zimmermann
2022-04-21  8:34     ` Christian König
2022-04-21  8:34       ` Christian König
2022-04-21  8:57       ` Thomas Zimmermann
2022-04-21  8:57         ` Thomas Zimmermann
2022-04-21  9:07       ` Thomas Zimmermann
2022-04-21  9:07         ` Thomas Zimmermann
2022-04-21  9:13       ` Thomas Zimmermann
2022-04-21  9:13         ` Thomas Zimmermann
2022-04-21  9:23         ` Christian König
2022-04-21  9:23           ` Christian König
2022-04-21 22:01   ` Kari Argillander [this message]
2022-04-21 22:01     ` Kari Argillander
2022-04-25 14:28     ` Cai Huoqing
2022-04-25 14:28       ` Cai Huoqing
2022-04-27 15:23   ` Daniel Vetter
2022-04-27 15:23     ` Daniel Vetter
2022-04-26  5:20 ` [PATCH 0/2] " Peter Robinson
2022-04-26  5:20   ` Peter Robinson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=0fa9ab41-c18e-a569-e6fe-a0e9d965905e@stargateuniverse.net \
    --to=kari.argillander@stargateuniverse.net \
    --cc=airlied@linux.ie \
    --cc=cai.huoqing@linux.dev \
    --cc=christian.koenig@amd.com \
    --cc=daniel@ffwll.ch \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=linaro-mm-sig@lists.linaro.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-media@vger.kernel.org \
    --cc=maarten.lankhorst@linux.intel.com \
    --cc=mripard@kernel.org \
    --cc=sumit.semwal@linaro.org \
    --cc=tzimmermann@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.