Re: [PATCH 03/15] habanalabs: add basic Goya support

From: Oded Gabbay <oded.gabbay@gmail.com>
To: Mike Rapoport <rppt@linux.ibm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	"Linux-Kernel@Vger. Kernel. Org" <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH 03/15] habanalabs: add basic Goya support
Date: Fri, 25 Jan 2019 22:32:55 +0200	[thread overview]
Message-ID: <CAFCwf12KD2gynDF2GfXrLF9ivZnWN6QGwdAZryBPupr2=QB8Fw@mail.gmail.com> (raw)
In-Reply-To: <20190123122817.GB4747@rapoport-lnx>

On Wed, Jan 23, 2019 at 2:28 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
>
> On Wed, Jan 23, 2019 at 02:00:45AM +0200, Oded Gabbay wrote:
> > This patch adds a basic support for the Goya device. The code initializes
> > the device's PCI controller and PCI bars. It also initializes various S/W
> > structures and adds some basic helper functions.
> >
> > Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
> > ---
> >  drivers/misc/habanalabs/Makefile            |   5 +-
> >  drivers/misc/habanalabs/device.c            |  71 +++
> >  drivers/misc/habanalabs/goya/Makefile       |   3 +
> >  drivers/misc/habanalabs/goya/goya.c         | 633 ++++++++++++++++++++
> >  drivers/misc/habanalabs/goya/goyaP.h        | 125 ++++
> >  drivers/misc/habanalabs/habanalabs.h        | 131 ++++
> >  drivers/misc/habanalabs/habanalabs_drv.c    |   3 +
> >  drivers/misc/habanalabs/include/goya/goya.h | 115 ++++
> >  8 files changed, 1085 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/misc/habanalabs/goya/Makefile
> >  create mode 100644 drivers/misc/habanalabs/goya/goya.c
> >  create mode 100644 drivers/misc/habanalabs/goya/goyaP.h
> >  create mode 100644 drivers/misc/habanalabs/include/goya/goya.h
> >
> > diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
> > index b41433a09e02..6f1ead69bd77 100644
> > --- a/drivers/misc/habanalabs/Makefile
> > +++ b/drivers/misc/habanalabs/Makefile
> > @@ -4,4 +4,7 @@
> >
> >  obj-m        := habanalabs.o
> >
> > -habanalabs-y := habanalabs_drv.o device.o
> > \ No newline at end of file
> > +habanalabs-y := habanalabs_drv.o device.o
> > +
> > +include $(src)/goya/Makefile
> > +habanalabs-y += $(HL_GOYA_FILES)
> > diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
> > index 376b55eb73d4..a4276ef559b3 100644
> > --- a/drivers/misc/habanalabs/device.c
> > +++ b/drivers/misc/habanalabs/device.c
> > @@ -116,8 +116,11 @@ static int device_setup_cdev(struct hl_device *hdev, struct class *hclass,
> >   */
> >  static int device_early_init(struct hl_device *hdev)
> >  {
> > +     int rc;
> > +
> >       switch (hdev->asic_type) {
> >       case ASIC_GOYA:
> > +             goya_set_asic_funcs(hdev);
> >               sprintf(hdev->asic_name, "GOYA");
> >               break;
> >       default:
> > @@ -126,6 +129,10 @@ static int device_early_init(struct hl_device *hdev)
> >               return -EINVAL;
> >       }
> >
> > +     rc = hdev->asic_funcs->early_init(hdev);
> > +     if (rc)
> > +             return rc;
> > +
> >       return 0;
> >  }
> >
> > @@ -137,6 +144,10 @@ static int device_early_init(struct hl_device *hdev)
> >   */
> >  static void device_early_fini(struct hl_device *hdev)
> >  {
> > +
> > +     if (hdev->asic_funcs->early_fini)
> > +             hdev->asic_funcs->early_fini(hdev);
> > +
> >  }
> >
> >  /**
> > @@ -150,8 +161,15 @@ static void device_early_fini(struct hl_device *hdev)
> >   */
> >  int hl_device_suspend(struct hl_device *hdev)
> >  {
> > +     int rc;
> > +
> >       pci_save_state(hdev->pdev);
> >
> > +     rc = hdev->asic_funcs->suspend(hdev);
> > +     if (rc)
> > +             dev_err(hdev->dev,
> > +                     "Failed to disable PCI access of device CPU\n");
> > +
> >       /* Shut down the device */
> >       pci_disable_device(hdev->pdev);
> >       pci_set_power_state(hdev->pdev, PCI_D3hot);
> > @@ -181,6 +199,13 @@ int hl_device_resume(struct hl_device *hdev)
> >               return rc;
> >       }
> >
> > +     rc = hdev->asic_funcs->resume(hdev);
> > +     if (rc) {
> > +             dev_err(hdev->dev,
> > +                     "Failed to enable PCI access from device CPU\n");
> > +             return rc;
> > +     }
> > +
> >       return 0;
> >  }
> >
> > @@ -208,11 +233,21 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
> >       if (rc)
> >               goto release_device;
> >
> > +     /*
> > +      * Start calling ASIC initialization. First S/W then H/W and finally
> > +      * late init
> > +      */
> > +     rc = hdev->asic_funcs->sw_init(hdev);
> > +     if (rc)
> > +             goto early_fini;
> > +
> >       dev_notice(hdev->dev,
> >               "Successfully added device to habanalabs driver\n");
> >
> >       return 0;
> >
> > +early_fini:
> > +     device_early_fini(hdev);
> >  release_device:
> >       device_destroy(hclass, hdev->dev->devt);
> >       cdev_del(&hdev->cdev);
> > @@ -243,6 +278,9 @@ void hl_device_fini(struct hl_device *hdev)
> >       /* Mark device as disabled */
> >       hdev->disabled = true;
> >
> > +     /* Call ASIC S/W finalize function */
> > +     hdev->asic_funcs->sw_fini(hdev);
> > +
> >       device_early_fini(hdev);
> >
> >       /* Hide device from user */
> > @@ -329,3 +367,36 @@ int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
> >
> >       return (*val ? 0 : -ETIMEDOUT);
> >  }
> > +
> > +/*
> > + * MMIO register access helper functions.
> > + */
> > +
> > +/**
> > + * hl_rreg - Read an MMIO register
> > + *
> > + * @hdev: pointer to habanalabs device structure
> > + * @reg: MMIO register offset (in bytes)
> > + *
> > + * Returns the value of the MMIO register we are asked to read
> > + *
> > + */
> > +inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
> > +{
> > +     return readl(hdev->rmmio + reg);
> > +}
> > +
> > +/**
> > + * hl_wreg - Write to an MMIO register
> > + *
> > + * @hdev: pointer to habanalabs device structure
> > + * @reg: MMIO register offset (in bytes)
> > + * @val: 32-bit value
> > + *
> > + * Writes the 32-bit value into the MMIO register
> > + *
> > + */
> > +inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
> > +{
> > +     writel(val, hdev->rmmio + reg);
> > +}
> > diff --git a/drivers/misc/habanalabs/goya/Makefile b/drivers/misc/habanalabs/goya/Makefile
> > new file mode 100644
> > index 000000000000..5ebf3d0d5794
> > --- /dev/null
> > +++ b/drivers/misc/habanalabs/goya/Makefile
> > @@ -0,0 +1,3 @@
> > +subdir-ccflags-y += -I$(src)
> > +
> > +HL_GOYA_FILES :=  goya/goya.o
> > \ No newline at end of file
> > diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
> > new file mode 100644
> > index 000000000000..b2952296b890
> > --- /dev/null
> > +++ b/drivers/misc/habanalabs/goya/goya.c
> > @@ -0,0 +1,633 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +/*
> > + * Copyright 2016-2018 HabanaLabs, Ltd.
> > + * All Rights Reserved.
> > + */
> > +
> > +#include "goyaP.h"
> > +#include "include/goya/asic_reg/goya_masks.h"
> > +
> > +#include <linux/fs.h>
> > +#include <linux/delay.h>
> > +#include <linux/vmalloc.h>
> > +#include <linux/sched.h>
> > +#include <linux/genalloc.h>
> > +#include <linux/sysfs.h>
> > +#include <linux/kfifo.h>
> > +#include <linux/dma-mapping.h>
> > +#include <linux/firmware.h>
> > +#include <linux/log2.h>
> > +#include <linux/hwmon.h>
> > +#include <linux/string.h>
> > +#include <linux/io.h>
> > +
> > +/*
> > + * GOYA security scheme:
> > + *
> > + * 1. Host is protected by:
> > + *        - Range registers (When MMU is enabled, DMA RR does NOT protect host)
> > + *        - MMU
> > + *
> > + * 2. DRAM is protected by:
> > + *        - Range registers (protect the first 512MB)
> > + *        - MMU (isolation between users)
> > + *
> > + * 3. Configuration is protected by:
> > + *        - Range registers
> > + *        - Protection bits
> > + *
> > + * When MMU is disabled:
> > + *
> > + * QMAN DMA: PQ, CQ, CP, DMA are secured.
> > + * PQ, CB and the data are on the host.
> > + *
> > + * QMAN TPC/MME:
> > + * PQ, CQ and CP are not secured.
> > + * PQ, CB and the data are on the SRAM/DRAM.
> > + *
> > + * Since QMAN DMA is secured, KMD is parsing the DMA CB:
> > + *     - KMD checks DMA pointer
> > + *     - WREG, MSG_PROT are not allowed.
> > + *     - MSG_LONG/SHORT are allowed.
> > + *
> > + * A read/write transaction by the QMAN to a protected area will succeed if
> > + * and only if the QMAN's CP is secured and MSG_PROT is used
> > + *
> > + *
> > + * When MMU is enabled:
> > + *
> > + * QMAN DMA: PQ, CQ and CP are secured.
> > + * MMU is set to bypass on the Secure props register of the QMAN.
> > + * The reasons we don't enable MMU for PQ, CQ and CP are:
> > + *     - PQ entry is in kernel address space and KMD doesn't map it.
> > + *     - CP writes to MSIX register and to kernel address space (completion
> > + *       queue).
> > + *
> > + * DMA is not secured but because CP is secured, KMD still needs to parse the
> > + * CB, but doesn't need to check the DMA addresses.
> > + *
> > + * For QMAN DMA 0, DMA is also secured because only KMD uses this DMA and KMD
> > + * doesn't map memory in MMU.
> > + *
> > + * QMAN TPC/MME: PQ, CQ and CP aren't secured (no change from MMU disabled mode)
> > + *
> > + * DMA RR does NOT protect host because DMA is not secured
> > + *
> > + */
> > +
> > +#define GOYA_MMU_REGS_NUM            61
> > +
> > +#define GOYA_DMA_POOL_BLK_SIZE               0x100           /* 256 bytes */
> > +
> > +#define GOYA_RESET_TIMEOUT_MSEC              500             /* 500ms */
> > +#define GOYA_PLDM_RESET_TIMEOUT_MSEC 20000           /* 20s */
> > +#define GOYA_RESET_WAIT_MSEC         1               /* 1ms */
> > +#define GOYA_CPU_RESET_WAIT_MSEC     100             /* 100ms */
> > +#define GOYA_PLDM_RESET_WAIT_MSEC    1000            /* 1s */
> > +#define GOYA_CPU_TIMEOUT_USEC                10000000        /* 10s */
> > +#define GOYA_TEST_QUEUE_WAIT_USEC    100000          /* 100ms */
> > +
> > +#define GOYA_QMAN0_FENCE_VAL         0xD169B243
> > +
> > +#define GOYA_MAX_INITIATORS          20
> > +
> > +static void goya_get_fixed_properties(struct hl_device *hdev)
> > +{
> > +     struct asic_fixed_properties *prop = &hdev->asic_prop;
> > +
> > +     prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
> > +
> > +     prop->dram_base_address = DRAM_PHYS_BASE;
> > +     prop->dram_size = DRAM_PHYS_DEFAULT_SIZE;
> > +     prop->dram_end_address = prop->dram_base_address + prop->dram_size;
> > +     prop->dram_user_base_address = DRAM_BASE_ADDR_USER;
> > +
> > +     prop->sram_base_address = SRAM_BASE_ADDR;
> > +     prop->sram_size = SRAM_SIZE;
> > +     prop->sram_end_address = prop->sram_base_address + prop->sram_size;
> > +     prop->sram_user_base_address = prop->sram_base_address +
> > +                                             SRAM_USER_BASE_OFFSET;
> > +
> > +     prop->host_phys_base_address = HOST_PHYS_BASE;
> > +     prop->va_space_host_start_address = VA_HOST_SPACE_START;
> > +     prop->va_space_host_end_address = VA_HOST_SPACE_END;
> > +     prop->va_space_dram_start_address = VA_DDR_SPACE_START;
> > +     prop->va_space_dram_end_address = VA_DDR_SPACE_END;
> > +     prop->cfg_size = CFG_SIZE;
> > +     prop->max_asid = MAX_ASID;
> > +     prop->tpc_enabled_mask = TPC_ENABLED_MASK;
> > +
> > +     prop->high_pll = PLL_HIGH_DEFAULT;
> > +}
> > +
> > +/**
> > + * goya_pci_bars_map - Map PCI BARS of Goya device
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + * Request PCI regions and map them to kernel virtual addresses.
> > + * Returns 0 on success
> > + *
> > + */
> > +int goya_pci_bars_map(struct hl_device *hdev)
> > +{
> > +     struct pci_dev *pdev = hdev->pdev;
> > +     int rc;
>
> You could just init rc= -ENODEV here and avoid the hassle below.

But the next line assigns rc the return value of pci_request_regions()...
I could set rc = -ENODEV before the calls to pci_ioremap_bar(), but then if
this function changes in the future and a different error code becomes
possible, it will look strange.
I honestly prefer to write driver code as explicitly as possible,
even if that means a bit more code.

> > +
> > +     rc = pci_request_regions(pdev, HL_NAME);
> > +     if (rc) {
> > +             dev_err(hdev->dev, "Cannot obtain PCI resources\n");
> > +             return rc;
> > +     }
> > +
> > +     hdev->pcie_bar[SRAM_CFG_BAR_ID] =
> > +                     pci_ioremap_bar(pdev, SRAM_CFG_BAR_ID);
> > +     if (!hdev->pcie_bar[SRAM_CFG_BAR_ID]) {
> > +             dev_err(hdev->dev, "pci_ioremap_bar failed for CFG\n");
> > +             rc = -ENODEV;
> > +             goto err_release_regions;
> > +     }
> > +
> > +     hdev->pcie_bar[MSIX_BAR_ID] = pci_ioremap_bar(pdev, MSIX_BAR_ID);
> > +     if (!hdev->pcie_bar[MSIX_BAR_ID]) {
> > +             dev_err(hdev->dev, "pci_ioremap_bar failed for MSIX\n");
> > +             rc = -ENODEV;
> > +             goto err_unmap_sram_cfg;
> > +     }
> > +
> > +     hdev->pcie_bar[DDR_BAR_ID] = pci_ioremap_wc_bar(pdev, DDR_BAR_ID);
> > +     if (!hdev->pcie_bar[DDR_BAR_ID]) {
> > +             dev_err(hdev->dev, "pci_ioremap_bar failed for DDR\n");
> > +             rc = -ENODEV;
> > +             goto err_unmap_msix;
> > +     }
> > +
> > +     hdev->rmmio = hdev->pcie_bar[SRAM_CFG_BAR_ID] +
> > +                             (CFG_BASE - SRAM_BASE_ADDR);
> > +
> > +     return 0;
> > +
> > +err_unmap_msix:
> > +     iounmap(hdev->pcie_bar[MSIX_BAR_ID]);
> > +err_unmap_sram_cfg:
> > +     iounmap(hdev->pcie_bar[SRAM_CFG_BAR_ID]);
> > +err_release_regions:
> > +     pci_release_regions(pdev);
> > +
> > +     return rc;
> > +}
> > +
> > +/**
> > + * goya_pci_bars_unmap - Unmap PCI BARS of Goya device
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + * Release all PCI BARS and unmap their virtual addresses
> > + *
> > + */
> > +static void goya_pci_bars_unmap(struct hl_device *hdev)
> > +{
> > +     struct pci_dev *pdev = hdev->pdev;
> > +
> > +     iounmap(hdev->pcie_bar[DDR_BAR_ID]);
> > +     iounmap(hdev->pcie_bar[MSIX_BAR_ID]);
> > +     iounmap(hdev->pcie_bar[SRAM_CFG_BAR_ID]);
> > +     pci_release_regions(pdev);
> > +}
> > +
> > +/**
> > + * goya_elbi_write - Write through the ELBI interface
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + * return 0 on success, -1 on failure
> > + *
> > + */
> > +static int goya_elbi_write(struct hl_device *hdev, u64 addr, u32 data)
> > +{
> > +     struct pci_dev *pdev = hdev->pdev;
> > +     ktime_t timeout;
> > +     u32 val;
> > +
> > +     /* Clear previous status */
> > +     pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, 0);
> > +
> > +     pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_ADDR, (u32) addr);
> > +     pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_DATA, data);
> > +     pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_CTRL,
> > +                             PCI_CONFIG_ELBI_CTRL_WRITE);
> > +
> > +     timeout = ktime_add_ms(ktime_get(), 10);
> > +     for (;;) {
> > +             pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, &val);
> > +             if (val & PCI_CONFIG_ELBI_STS_MASK)
> > +                     break;
> > +             if (ktime_compare(ktime_get(), timeout) > 0) {
> > +                     pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS,
> > +                                             &val);
> > +                     break;
> > +             }
> > +             usleep_range(300, 500);
> > +     }
> > +
> > +     if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE)
> > +             return 0;
> > +
> > +     if (val & PCI_CONFIG_ELBI_STS_ERR) {
> > +             dev_err(hdev->dev, "Error writing to ELBI\n");
> > +             return -1;
>
> Please change -1 to an error code, say -EIO...
Of course, done.

>
> > +     }
> > +
> > +     if (!(val & PCI_CONFIG_ELBI_STS_MASK)) {
> > +             dev_err(hdev->dev, "ELBI write didn't finish in time\n");
> > +             return -1;
> > +     }
> > +
> > +     dev_err(hdev->dev, "ELBI write has undefined bits in status\n");
> > +     return -1;
> > +}
> > +
> > +/**
> > + * goya_iatu_write - iatu write routine
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + */
> > +static int goya_iatu_write(struct hl_device *hdev, u32 addr, u32 data)
> > +{
> > +     u32 dbi_offset;
> > +     int rc;
> > +
> > +     dbi_offset = addr & 0xFFF;
> > +
> > +     rc = goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0x00300000);
> > +     rc |= goya_elbi_write(hdev, mmPCIE_DBI_BASE + dbi_offset, data);
>
> hmm, error code in goya_elbi_write probably won't work...
> Any reason to try the second write if the first failed?
>
You are correct, it definitely won't work. But I didn't want to put an
if() after each call to that function - it happens a few more times in
the code.
And because the second write won't do any harm either, I thought this
was a more elegant solution that keeps the code more readable.

> > +
> > +     return rc;
> > +}
> > +
> > +void goya_reset_link_through_bridge(struct hl_device *hdev)
> > +{
> > +     struct pci_dev *pdev = hdev->pdev;
> > +     struct pci_dev *parent_port;
> > +     u16 val;
> > +
> > +     parent_port = pdev->bus->self;
> > +     pci_read_config_word(parent_port, PCI_BRIDGE_CONTROL, &val);
> > +     val |= PCI_BRIDGE_CTL_BUS_RESET;
> > +     pci_write_config_word(parent_port, PCI_BRIDGE_CONTROL, val);
> > +     ssleep(1);
> > +
> > +     val &= ~(PCI_BRIDGE_CTL_BUS_RESET);
> > +     pci_write_config_word(parent_port, PCI_BRIDGE_CONTROL, val);
> > +     ssleep(3);
> > +}
> > +
> > +/**
> > + * goya_set_ddr_bar_base - set DDR bar to map specific device address
> > + *
> > + * @hdev: pointer to hl_device structure
> > + * @addr: address in DDR. Must be aligned to DDR bar size
> > + *
> > + * This function configures the iATU so that the DDR bar will start at the
> > + * specified addr.
> > + *
> > + */
> > +static int goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
> > +{
> > +     struct goya_device *goya = hdev->asic_specific;
> > +     int rc;
> > +
> > +     if ((goya) && (goya->ddr_bar_cur_addr == addr))
> > +             return 0;
> > +
> > +     /* Inbound Region 1 - Bar 4 - Point to DDR */
> > +     rc = goya_iatu_write(hdev, 0x314, lower_32_bits(addr));
> > +     rc |= goya_iatu_write(hdev, 0x318, upper_32_bits(addr));
> > +     rc |= goya_iatu_write(hdev, 0x300, 0);
> > +     /* Enable + Bar match + match enable + Bar 4 */
> > +     rc |= goya_iatu_write(hdev, 0x304, 0xC0080400);
> > +
> > +     /* Return the DBI window to the default location */
> > +     rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0);
> > +     rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI_32, 0);
>
> And here as well.
Same remark as the previous one

> > +
> > +     if (rc) {
> > +             dev_err(hdev->dev, "failed to map DDR bar to 0x%08llx\n", addr);
> > +             return rc;
> > +     }
>
> I believe that at least here you'd want to return an error code.
Fixed
>
> > +
> > +     if (goya)
> > +             goya->ddr_bar_cur_addr = addr;
> > +
> > +     return 0;
> > +}
> > +
> > +/**
> > + * goya_init_iatu - Initialize the iATU unit inside the PCI controller
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + * This is needed in case the firmware doesn't initialize the iATU
> > + *
> > + */
> > +static int goya_init_iatu(struct hl_device *hdev)
> > +{
> > +     int rc;
> > +
> > +     /* Inbound Region 0 - Bar 0 - Point to SRAM_BASE_ADDR */
> > +     rc  = goya_iatu_write(hdev, 0x114, lower_32_bits(SRAM_BASE_ADDR));
> > +     rc |= goya_iatu_write(hdev, 0x118, upper_32_bits(SRAM_BASE_ADDR));
> > +     rc |= goya_iatu_write(hdev, 0x100, 0);
> > +     /* Enable + Bar match + match enable */
> > +     rc |= goya_iatu_write(hdev, 0x104, 0xC0080000);
> > +
> > +     /* Inbound Region 1 - Bar 4 - Point to DDR */
> > +     rc |= goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE);
> > +
> > +     /* Outbound Region 0 - Point to Host */
> > +     rc |= goya_iatu_write(hdev, 0x008, lower_32_bits(HOST_PHYS_BASE));
> > +     rc |= goya_iatu_write(hdev, 0x00C, upper_32_bits(HOST_PHYS_BASE));
> > +     rc |= goya_iatu_write(hdev, 0x010,
> > +             lower_32_bits(HOST_PHYS_BASE + HOST_PHYS_SIZE - 1));
> > +     rc |= goya_iatu_write(hdev, 0x014, 0);
> > +     rc |= goya_iatu_write(hdev, 0x018, 0);
> > +     rc |= goya_iatu_write(hdev, 0x020,
> > +             upper_32_bits(HOST_PHYS_BASE + HOST_PHYS_SIZE - 1));
> > +     /* Increase region size */
> > +     rc |= goya_iatu_write(hdev, 0x000, 0x00002000);
> > +     /* Enable */
> > +     rc |= goya_iatu_write(hdev, 0x004, 0x80000000);
> > +
> > +     /* Return the DBI window to the default location */
> > +     rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0);
> > +     rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI_32, 0);
> > +
> > +     return rc;
>
> Ditto
Fixed
>
> > +}
> > +
> > +/**
> > + * goya_early_init - GOYA early initialization code
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + * Verify PCI bars
> > + * Set DMA masks
> > + * PCI controller initialization
> > + * Map PCI bars
> > + *
> > + */
> > +static int goya_early_init(struct hl_device *hdev)
> > +{
> > +     struct asic_fixed_properties *prop = &hdev->asic_prop;
> > +     struct pci_dev *pdev = hdev->pdev;
> > +     u32 val;
> > +     int rc;
> > +
> > +     goya_get_fixed_properties(hdev);
> > +
> > +     /* Check BAR sizes */
> > +     if (pci_resource_len(pdev, SRAM_CFG_BAR_ID) != CFG_BAR_SIZE) {
> > +             dev_err(hdev->dev,
> > +                     "Not " HL_NAME "? BAR %d size %llu, expecting %llu\n",
> > +                     SRAM_CFG_BAR_ID,
> > +                     pci_resource_len(pdev, SRAM_CFG_BAR_ID),
> > +                     CFG_BAR_SIZE);
> > +             return -ENODEV;
> > +     }
> > +
> > +     if (pci_resource_len(pdev, MSIX_BAR_ID) != MSIX_BAR_SIZE) {
> > +             dev_err(hdev->dev,
> > +                     "Not " HL_NAME "? BAR %d size %llu, expecting %llu\n",
> > +                     MSIX_BAR_ID, pci_resource_len(pdev, MSIX_BAR_ID),
> > +                     MSIX_BAR_SIZE);
> > +             return -ENODEV;
> > +     }
> > +
> > +     prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
> > +
> > +     /* set DMA mask for GOYA */
> > +     rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(39));
> > +     if (rc) {
> > +             dev_warn(hdev->dev, "Unable to set pci dma mask to 39 bits\n");
> > +             rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
> > +             if (rc) {
> > +                     dev_err(hdev->dev,
> > +                             "Unable to set pci dma mask to 32 bits\n");
> > +                     return rc;
> > +             }
> > +     }
> > +
> > +     rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(39));
> > +     if (rc) {
> > +             dev_warn(hdev->dev,
> > +                     "Unable to set pci consistent dma mask to 39 bits\n");
> > +             rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
> > +             if (rc) {
> > +                     dev_err(hdev->dev,
> > +                             "Unable to set pci consistent dma mask to 32 bits\n");
> > +                     return rc;
> > +             }
> > +     }
> > +
> > +     if (hdev->reset_pcilink)
> > +             goya_reset_link_through_bridge(hdev);
> > +
> > +     rc = pci_enable_device_mem(pdev);
> > +     if (rc) {
> > +             dev_err(hdev->dev, "can't enable PCI device\n");
> > +             return rc;
> > +     }
> > +
> > +     pci_set_master(pdev);
> > +
> > +     rc = goya_init_iatu(hdev);
> > +     if (rc) {
> > +             dev_err(hdev->dev, "Failed to initialize iATU\n");
> > +             goto disable_device;
> > +     }
> > +
> > +     rc = goya_pci_bars_map(hdev);
> > +     if (rc) {
> > +             dev_err(hdev->dev, "Failed to initialize PCI BARS\n");
> > +             goto disable_device;
> > +     }
> > +
> > +     val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
> > +     if (val & PSOC_GLOBAL_CONF_BOOT_STRAP_PINS_SRIOV_EN_MASK)
> > +             dev_warn(hdev->dev,
> > +                     "PCI strap is not configured correctly, PCI bus errors may occur\n");
> > +
> > +     return 0;
> > +
> > +disable_device:
> > +     pci_clear_master(pdev);
> > +     pci_disable_device(pdev);
> > +
> > +     return rc;
> > +}
> > +
> > +/**
> > + * goya_early_fini - GOYA early finalization code
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + * Unmap PCI bars
> > + *
> > + */
> > +int goya_early_fini(struct hl_device *hdev)
> > +{
> > +     goya_pci_bars_unmap(hdev);
> > +
> > +     pci_clear_master(hdev->pdev);
> > +     pci_disable_device(hdev->pdev);
> > +
> > +     return 0;
> > +}
> > +
> > +/**
> > + * goya_sw_init - Goya software initialization code
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + */
> > +static int goya_sw_init(struct hl_device *hdev)
> > +{
> > +     struct goya_device *goya;
> > +     int rc;
> > +
> > +     /* Allocate device structure */
> > +     goya = kzalloc(sizeof(*goya), GFP_KERNEL);
>
> Consider using devm_k[mz]alloc() for memory allocations throughout the
> driver. I didn't check all the spots where it can be applicable.
I honestly wasn't aware of that. We never used that in AMD drivers
(which is where I spent most of my kernel time).
I'll look into that offline, but for now I don't really want to switch
to it blindly in all locations, unless there is some hard kernel
rule for using that in drivers.

>
> > +     if (!goya)
> > +             return -ENOMEM;
> > +
> > +     /* according to goya_init_iatu */
> > +     goya->ddr_bar_cur_addr = DRAM_PHYS_BASE;
> > +     hdev->asic_specific = goya;
> > +
> > +     /* Create DMA pool for small allocations */
> > +     hdev->dma_pool = dma_pool_create(dev_name(hdev->dev),
> > +                     &hdev->pdev->dev, GOYA_DMA_POOL_BLK_SIZE, 8, 0);
> > +     if (!hdev->dma_pool) {
> > +             dev_err(hdev->dev, "failed to create DMA pool\n");
> > +             rc = -ENOMEM;
> > +             goto free_goya_device;
> > +     }
> > +
> > +     hdev->cpu_accessible_dma_mem =
> > +                     hdev->asic_funcs->dma_alloc_coherent(hdev,
> > +                                     CPU_ACCESSIBLE_MEM_SIZE,
> > +                                     &hdev->cpu_accessible_dma_address,
> > +                                     GFP_KERNEL | __GFP_ZERO);
> > +
> > +     if (!hdev->cpu_accessible_dma_mem) {
> > +             dev_err(hdev->dev,
> > +                     "failed to allocate %d of dma memory for CPU accessible memory space\n",
> > +                     CPU_ACCESSIBLE_MEM_SIZE);
> > +             rc = -ENOMEM;
> > +             goto free_dma_pool;
> > +     }
> > +
> > +     hdev->cpu_accessible_dma_pool = gen_pool_create(CPU_PKT_SHIFT, -1);
> > +     if (!hdev->cpu_accessible_dma_pool) {
> > +             dev_err(hdev->dev,
> > +                     "Failed to create CPU accessible DMA pool\n");
> > +             rc = -ENOMEM;
>
> You could init rc = -ENOMEM at the beginning and save the duplication.
Again, I don't agree with that programming paradigm. If I do that, and
later add code at the beginning of the function along the lines of:
rc = foo()
I will introduce a bug.
So I prefer the duplication, which makes the code more robust to future changes.

>
> > +             goto free_cpu_pq_dma_mem;
> > +     }
> > +
> > +     rc = gen_pool_add(hdev->cpu_accessible_dma_pool,
> > +                             (u64) hdev->cpu_accessible_dma_mem,
> > +                             CPU_ACCESSIBLE_MEM_SIZE, -1);
> > +     if (rc) {
> > +             dev_err(hdev->dev,
> > +                     "Failed to add memory to CPU accessible DMA pool\n");
> > +             rc = -EFAULT;
> > +             goto free_cpu_pq_pool;
> > +     }
> > +
> > +     spin_lock_init(&goya->hw_queues_lock);
> > +
> > +     return 0;
> > +
> > +free_cpu_pq_pool:
> > +     gen_pool_destroy(hdev->cpu_accessible_dma_pool);
> > +free_cpu_pq_dma_mem:
> > +     hdev->asic_funcs->dma_free_coherent(hdev, CPU_ACCESSIBLE_MEM_SIZE,
> > +                     hdev->cpu_accessible_dma_mem,
> > +                     hdev->cpu_accessible_dma_address);
> > +free_dma_pool:
> > +     dma_pool_destroy(hdev->dma_pool);
> > +free_goya_device:
> > +     kfree(goya);
> > +
> > +     return rc;
> > +}
> > +
> > +/**
> > + * goya_sw_fini - Goya software tear-down code
> > + *
> > + * @hdev: pointer to hl_device structure
> > + *
> > + */
> > +int goya_sw_fini(struct hl_device *hdev)
> > +{
> > +     struct goya_device *goya = hdev->asic_specific;
> > +
> > +     gen_pool_destroy(hdev->cpu_accessible_dma_pool);
> > +
> > +     hdev->asic_funcs->dma_free_coherent(hdev, CPU_ACCESSIBLE_MEM_SIZE,
> > +                     hdev->cpu_accessible_dma_mem,
> > +                     hdev->cpu_accessible_dma_address);
> > +
> > +     dma_pool_destroy(hdev->dma_pool);
> > +
> > +     kfree(goya);
> > +
> > +     return 0;
> > +}
> > +
> > +int goya_suspend(struct hl_device *hdev)
> > +{
> > +     return 0;
> > +}
> > +
> > +int goya_resume(struct hl_device *hdev)
> > +{
> > +     return 0;
> > +}
> > +
> > +void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
> > +                                     dma_addr_t *dma_handle, gfp_t flags)
> > +{
> > +     return dma_alloc_coherent(&hdev->pdev->dev, size, dma_handle, flags);
> > +}
> > +
> > +void goya_dma_free_coherent(struct hl_device *hdev, size_t size, void *cpu_addr,
> > +                             dma_addr_t dma_handle)
> > +{
> > +     dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle);
> > +}
> > +
> > +static const struct hl_asic_funcs goya_funcs = {
> > +     .early_init = goya_early_init,
> > +     .early_fini = goya_early_fini,
> > +     .sw_init = goya_sw_init,
> > +     .sw_fini = goya_sw_fini,
> > +     .suspend = goya_suspend,
> > +     .resume = goya_resume,
> > +     .dma_alloc_coherent = goya_dma_alloc_coherent,
> > +     .dma_free_coherent = goya_dma_free_coherent,
>
> Is there any additional functionality that is planned in goya or gaudi in
> these two functions?
> It seems like they are not really needed, at least at the moment and for
> sure that don't need to be part of ASIC ops.

So this relates to the simulator support, because there the
implementation of these two functions is totally different, as I don't
have a PCI device.

>
> > +};
> > +
> > +/**
> > + * goya_set_asic_funcs - set Goya function pointers
> > + *
> > + * @*hdev: pointer to hl_device structure
> > + *
> > + */
> > +void goya_set_asic_funcs(struct hl_device *hdev)
> > +{
> > +     hdev->asic_funcs = &goya_funcs;
> > +}
> > diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
> > new file mode 100644
> > index 000000000000..0e12c56472bd
> > --- /dev/null
> > +++ b/drivers/misc/habanalabs/goya/goyaP.h
> > @@ -0,0 +1,125 @@
> > +/* SPDX-License-Identifier: GPL-2.0
> > + *
> > + * Copyright 2016-2018 HabanaLabs, Ltd.
> > + * All Rights Reserved.
> > + *
> > + */
> > +
> > +#ifndef GOYAP_H_
> > +#define GOYAP_H_
> > +
> > +#include "habanalabs.h"
> > +#include "include/goya/goya.h"
> > +
> > +#define NUMBER_OF_CMPLT_QUEUES               5
> > +#define NUMBER_OF_EXT_HW_QUEUES              5
> > +#define NUMBER_OF_CPU_HW_QUEUES              1
> > +#define NUMBER_OF_INT_HW_QUEUES              9
> > +#define NUMBER_OF_HW_QUEUES          (NUMBER_OF_EXT_HW_QUEUES + \
> > +                                     NUMBER_OF_CPU_HW_QUEUES + \
> > +                                     NUMBER_OF_INT_HW_QUEUES)
> > +
> > +/*
> > + * Number of MSIX interrupts IDS:
> > + * Each completion queue has 1 ID
> > + * The event queue has 1 ID
> > + * ArmCP reset has 1 ID
> > + */
> > +#define NUMBER_OF_INTERRUPTS         (NUMBER_OF_CMPLT_QUEUES + 2)
> > +
> > +#if (NUMBER_OF_HW_QUEUES >= HL_MAX_QUEUES)
> > +#error "Number of H/W queues must be smaller than HL_MAX_QUEUES"
> > +#endif
> > +
> > +#if (NUMBER_OF_INTERRUPTS > GOYA_MSIX_ENTRIES)
> > +#error "Number of MSIX interrupts must be smaller or equal to GOYA_MSIX_ENTRIES"
> > +#endif
> > +
> > +#define QMAN_FENCE_TIMEOUT_USEC              10000   /* 10 ms */
> > +
> > +#define QMAN_STOP_TIMEOUT_USEC               100000  /* 100 ms */
> > +
> > +#define TPC_MAX_NUM                  8
> > +#define TPC_ENABLED_MASK             0xFF
> > +
> > +#define DMA_MAX_NUM                  5
> > +
> > +#define PLL_HIGH_DEFAULT             1575000000      /* 1.575 GHz */
> > +
> > +#define GOYA_ARMCP_INFO_TIMEOUT              10000000        /* 10s */
> > +
> > +#define DRAM_PHYS_DEFAULT_SIZE               0x100000000ull  /* 4GB */
> > +
> > +/*
> > + * SRAM Memory Map for KMD
> > + *
> > + * KMD occupies KMD_SRAM_SIZE bytes from the start of SRAM. It is used for
> > + * MME/TPC QMANs
> > + *
> > + */
> > +
> > +#define MME_QMAN_BASE_OFFSET 0x000000        /* Must be 0 */
> > +#define MME_QMAN_LENGTH              64
> > +#define TPC_QMAN_LENGTH              64
> > +
> > +#define TPC0_QMAN_BASE_OFFSET        (MME_QMAN_BASE_OFFSET + \
> > +                             (MME_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +#define TPC1_QMAN_BASE_OFFSET        (TPC0_QMAN_BASE_OFFSET + \
> > +                             (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +#define TPC2_QMAN_BASE_OFFSET        (TPC1_QMAN_BASE_OFFSET + \
> > +                             (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +#define TPC3_QMAN_BASE_OFFSET        (TPC2_QMAN_BASE_OFFSET + \
> > +                             (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +#define TPC4_QMAN_BASE_OFFSET        (TPC3_QMAN_BASE_OFFSET + \
> > +                             (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +#define TPC5_QMAN_BASE_OFFSET        (TPC4_QMAN_BASE_OFFSET + \
> > +                             (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +#define TPC6_QMAN_BASE_OFFSET        (TPC5_QMAN_BASE_OFFSET + \
> > +                             (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +#define TPC7_QMAN_BASE_OFFSET        (TPC6_QMAN_BASE_OFFSET + \
> > +                             (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +
> > +#define SRAM_KMD_RES_OFFSET  (TPC7_QMAN_BASE_OFFSET + \
> > +                             (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
> > +
> > +#if (SRAM_KMD_RES_OFFSET >= KMD_SRAM_RESERVED_SIZE)
> > +#error "MME/TPC QMANs SRAM space exceeds limit"
> > +#endif
> > +
> > +#define SRAM_USER_BASE_OFFSET        KMD_SRAM_RESERVED_SIZE
> > +
> > +#define DMA_MAX_TRANSFER_SIZE        0xFFFFFFFF
> > +
> > +#define HW_CAP_PLL           0x00000001
> > +#define HW_CAP_DDR_0         0x00000002
> > +#define HW_CAP_DDR_1         0x00000004
> > +#define HW_CAP_MME           0x00000008
> > +#define HW_CAP_CPU           0x00000010
> > +#define HW_CAP_DMA           0x00000020
> > +#define HW_CAP_MSIX          0x00000040
> > +#define HW_CAP_CPU_Q         0x00000080
> > +#define HW_CAP_MMU           0x00000100
> > +#define HW_CAP_TPC_MBIST     0x00000200
> > +#define HW_CAP_GOLDEN                0x00000400
> > +#define HW_CAP_TPC           0x00000800
> > +
> > +#define CPU_PKT_SHIFT                5
> > +#define CPU_PKT_SIZE         (1 << CPU_PKT_SHIFT)
> > +#define CPU_PKT_MASK         (~((1 << CPU_PKT_SHIFT) - 1))
> > +#define CPU_MAX_PKTS_IN_CB   32
> > +#define CPU_CB_SIZE          (CPU_PKT_SIZE * CPU_MAX_PKTS_IN_CB)
> > +#define CPU_ACCESSIBLE_MEM_SIZE      (HL_QUEUE_LENGTH * CPU_CB_SIZE)
> > +
> > +enum goya_fw_component {
> > +     FW_COMP_UBOOT,
> > +     FW_COMP_PREBOOT
> > +};
> > +
> > +struct goya_device {
> > +     /* TODO: remove hw_queues_lock after moving to scheduler code */
> > +     spinlock_t      hw_queues_lock;
> > +     u64             ddr_bar_cur_addr;
> > +     u32             hw_cap_initialized;
> > +};
> > +
> > +#endif /* GOYAP_H_ */
> > diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
> > index 7e1b088b677c..97844825f7a8 100644
> > --- a/drivers/misc/habanalabs/habanalabs.h
> > +++ b/drivers/misc/habanalabs/habanalabs.h
> > @@ -21,11 +21,64 @@
> >
> >  #define HL_NAME                              "habanalabs"
> >
> > +#define HL_MAX_QUEUES                        128
> > +
> >  struct hl_device;
> >
> >
> >
> >
> > +/**
> > + * struct asic_fixed_properties - ASIC specific immutable properties.
> > + * @sram_base_address: SRAM physical start address.
> > + * @sram_end_address: SRAM physical end address.
> > + * @sram_user_base_address: SRAM physical start address for user access.
> > + * @dram_base_address: DRAM physical start address.
> > + * @dram_end_address: DRAM physical end address.
> > + * @dram_user_base_address: DRAM physical start address for user access.
> > + * @dram_size: DRAM total size.
> > + * @dram_pci_bar_size: size of PCI bar towards DRAM.
> > + * @host_phys_base_address: base physical address of host memory for
> > + *                           transactions that the device generates.
> > + * @va_space_host_start_address: base address of virtual memory range for
> > + *                               mapping host memory.
> > + * @va_space_host_end_address: end address of virtual memory range for
> > + *                             mapping host memory.
> > + * @va_space_dram_start_address: base address of virtual memory range for
> > + *                               mapping DRAM memory.
> > + * @va_space_dram_end_address: end address of virtual memory range for
> > + *                             mapping DRAM memory.
> > + * @cfg_size: configuration space size on SRAM.
> > + * @sram_size: total size of SRAM.
> > + * @max_asid: maximum number of open contexts (ASIDs).
> > + * @completion_queues_count: number of completion queues.
> > + * @high_pll: high PLL frequency used by the device.
> > + * @tpc_enabled_mask: which TPCs are enabled.
> > + */
> > +struct asic_fixed_properties {
> > +     u64                     sram_base_address;
> > +     u64                     sram_end_address;
> > +     u64                     sram_user_base_address;
> > +     u64                     dram_base_address;
> > +     u64                     dram_end_address;
> > +     u64                     dram_user_base_address;
> > +     u64                     dram_size;
> > +     u64                     dram_pci_bar_size;
> > +     u64                     host_phys_base_address;
> > +     u64                     va_space_host_start_address;
> > +     u64                     va_space_host_end_address;
> > +     u64                     va_space_dram_start_address;
> > +     u64                     va_space_dram_end_address;
> > +     u32                     cfg_size;
> > +     u32                     sram_size;
> > +     u32                     max_asid;
> > +     u32                     high_pll;
> > +     u8                      completion_queues_count;
> > +     u8                      tpc_enabled_mask;
> > +};
> > +
> > +
> > +#define HL_QUEUE_LENGTH                      256
> >
> >
> >  /*
> > @@ -47,6 +100,30 @@ enum hl_asic_type {
> >
> >
> >
> > +/**
> > + * struct hl_asic_funcs - ASIC specific functions that can be called from
> > + *                        common code.
> > + * @early_init: sets up early driver state (pre sw_init), doesn't configure H/W.
> > + * @early_fini: tears down what was done in early_init.
> > + * @sw_init: sets up driver state, does not configure H/W.
> > + * @sw_fini: tears down driver state, does not configure H/W.
> > + * @suspend: handles IP specific H/W or SW changes for suspend.
> > + * @resume: handles IP specific H/W or SW changes for resume.
> > + * @dma_alloc_coherent: DMA allocate coherent memory.
> > + * @dma_free_coherent: free DMA allocation.
> > + */
> > +struct hl_asic_funcs {
> > +     int (*early_init)(struct hl_device *hdev);
> > +     int (*early_fini)(struct hl_device *hdev);
> > +     int (*sw_init)(struct hl_device *hdev);
> > +     int (*sw_fini)(struct hl_device *hdev);
> > +     int (*suspend)(struct hl_device *hdev);
> > +     int (*resume)(struct hl_device *hdev);
> > +     void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size,
> > +                                     dma_addr_t *dma_handle, gfp_t flag);
> > +     void (*dma_free_coherent)(struct hl_device *hdev, size_t size,
> > +                                     void *cpu_addr, dma_addr_t dma_handle);
> > +};
> >
> >  /*
> >   * FILE PRIVATE STRUCTURE
> > @@ -78,26 +155,78 @@ struct hl_fpriv {
> >   */
> >  #define HL_MAX_MINORS        256
> >
> > +/*
> > + * Registers read & write functions.
> > + */
> > +
> > +u32 hl_rreg(struct hl_device *hdev, u32 reg);
> > +void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
> > +
> > +#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \
> > +     readl_poll_timeout(hdev->rmmio + addr, val, cond, sleep_us, timeout_us)
> > +
> > +#define RREG32(reg) hl_rreg(hdev, (reg))
> > +#define WREG32(reg, v) hl_wreg(hdev, (reg), (v))
> > +#define DREG32(reg) pr_info("REGISTER: " #reg " : 0x%08X\n", \
> > +                             hl_rreg(hdev, (reg)))
> > +
> > +#define WREG32_P(reg, val, mask)                             \
> > +     do {                                                    \
> > +             u32 tmp_ = RREG32(reg);                         \
> > +             tmp_ &= (mask);                                 \
> > +             tmp_ |= ((val) & ~(mask));                      \
> > +             WREG32(reg, tmp_);                              \
> > +     } while (0)
> > +#define WREG32_AND(reg, and) WREG32_P(reg, 0, and)
> > +#define WREG32_OR(reg, or) WREG32_P(reg, or, ~(or))
> > +
> > +#define REG_FIELD_SHIFT(reg, field) reg##_##field##_SHIFT
> > +#define REG_FIELD_MASK(reg, field) reg##_##field##_MASK
> > +#define WREG32_FIELD(reg, field, val)        \
> > +     WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \
> > +                     (val) << REG_FIELD_SHIFT(reg, field))
> > +
> >  /**
> >   * struct hl_device - habanalabs device structure.
> >   * @pdev: pointer to PCI device, can be NULL in case of simulator device.
> > + * @pcie_bar: array of available PCIe bars.
> > + * @rmmio: configuration area address on SRAM.
> >   * @cdev: related char device.
> >   * @dev: related kernel basic device structure.
> >   * @asic_name: ASIC specific name.
> >   * @asic_type: ASIC specific type.
> > + * @dma_pool: DMA pool for small allocations.
> > + * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address.
> > + * @cpu_accessible_dma_address: KMD <-> ArmCP shared memory DMA address.
> > + * @cpu_accessible_dma_pool: KMD <-> ArmCP shared memory pool.
> > + * @asic_prop: ASIC specific immutable properties.
> > + * @asic_funcs: ASIC specific functions.
> > + * @asic_specific: ASIC specific information to use only from ASIC files.
> >   * @major: habanalabs KMD major.
> >   * @id: device minor.
> >   * @disabled: is device disabled.
> >   */
> >  struct hl_device {
> >       struct pci_dev                  *pdev;
> > +     void __iomem                    *pcie_bar[6];
> > +     void __iomem                    *rmmio;
> >       struct cdev                     cdev;
> >       struct device                   *dev;
> >       char                            asic_name[16];
> >       enum hl_asic_type               asic_type;
> > +     struct dma_pool                 *dma_pool;
> > +     void                            *cpu_accessible_dma_mem;
> > +     dma_addr_t                      cpu_accessible_dma_address;
> > +     struct gen_pool                 *cpu_accessible_dma_pool;
> > +     struct asic_fixed_properties    asic_prop;
> > +     const struct hl_asic_funcs      *asic_funcs;
> > +     void                            *asic_specific;
> >       u32                             major;
> >       u16                             id;
> >       u8                              disabled;
> > +
> > +     /* Parameters for bring-up */
> > +     u8                              reset_pcilink;
> >  };
> >
> >  /*
> > @@ -146,4 +275,6 @@ void hl_device_fini(struct hl_device *hdev);
> >  int hl_device_suspend(struct hl_device *hdev);
> >  int hl_device_resume(struct hl_device *hdev);
> >
> > +void goya_set_asic_funcs(struct hl_device *hdev);
> > +
> >  #endif /* HABANALABSP_H_ */
> > diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
> > index 15217975327b..79545003b7c2 100644
> > --- a/drivers/misc/habanalabs/habanalabs_drv.c
> > +++ b/drivers/misc/habanalabs/habanalabs_drv.c
> > @@ -136,6 +136,9 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
> >
> >       hdev->major = hl_major;
> >
> > +     /* Parameters for bring-up - set them to defaults */
> > +     hdev->reset_pcilink = 0;
> > +
> >       hdev->disabled = true;
> >       hdev->pdev = pdev; /* can be NULL in case of simulator device */
> >
> > diff --git a/drivers/misc/habanalabs/include/goya/goya.h b/drivers/misc/habanalabs/include/goya/goya.h
> > new file mode 100644
> > index 000000000000..192a1450cbb1
> > --- /dev/null
> > +++ b/drivers/misc/habanalabs/include/goya/goya.h
> > @@ -0,0 +1,115 @@
> > +/* SPDX-License-Identifier: GPL-2.0
> > + *
> > + * Copyright 2016-2018 HabanaLabs, Ltd.
> > + * All Rights Reserved.
> > + *
> > + * Author: Oded Gabbay <oded.gabbay@gmail.com>
> > + *
> > + */
> > +
> > +#ifndef GOYA_H
> > +#define GOYA_H
> > +
> > +#include "asic_reg/goya_regs.h"
> > +
> > +#include <linux/types.h>
> > +
> > +#define SRAM_CFG_BAR_ID              0
> > +#define MSIX_BAR_ID          2
> > +#define DDR_BAR_ID           4
> > +
> > +#define CFG_BAR_SIZE         0x10000000ull           /* 256MB */
> > +#define MSIX_BAR_SIZE                0x1000ull               /* 4KB */
> > +
> > +#define CFG_BASE             0x7FFC000000ull
> > +#define CFG_SIZE             0x4000000               /* 32MB CFG + 32MB DBG*/
> > +
> > +#define SRAM_BASE_ADDR               0x7FF0000000ull
> > +#define SRAM_SIZE            0x32A0000               /* 50.625MB */
> > +#define KMD_SRAM_RESERVED_SIZE       0x8000                  /* 32KB */
> > +
> > +#define SRAM_BASE_ADDR_USER  (0x7FF0000000ull + KMD_SRAM_RESERVED_SIZE)
> > +#define SRAM_SIZE_USER               (SRAM_SIZE - KMD_SRAM_RESERVED_SIZE)
> > +
> > +#define DRAM_PHYS_BASE               0x0ull
> > +
> > +#define CPU_FW_IMAGE_SIZE    0x10000000      /* 256MB */
> > +#define MMU_PAGE_TABLES_SIZE 0x0E000000      /* 224MB */
> > +#define CPU_PQ_PKT_SIZE              0x00001000      /* 4KB */
> > +#define CPU_PQ_DATA_SIZE     0x01FFF000      /* 32MB - 4KB  */
> > +
> > +#define CPU_FW_IMAGE_ADDR    DRAM_PHYS_BASE
> > +#define MMU_PAGE_TABLES_ADDR (CPU_FW_IMAGE_ADDR + CPU_FW_IMAGE_SIZE)
> > +#define CPU_PQ_PKT_ADDR              (MMU_PAGE_TABLES_ADDR + MMU_PAGE_TABLES_SIZE)
> > +#define CPU_PQ_DATA_ADDR     (CPU_PQ_PKT_ADDR + CPU_PQ_PKT_SIZE)
> > +#define DRAM_BASE_ADDR_USER  (CPU_PQ_DATA_ADDR + CPU_PQ_DATA_SIZE)
> > +
> > +#define HOST_PHYS_BASE               0x8000000000ull         /* 0.5TB */
> > +#define HOST_PHYS_SIZE               0x1000000000000ull      /* 0.25PB (48 bits) */
> > +
> > +#define VA_HOST_SPACE_START  0x1000000000000ull      /* 256TB */
> > +#define VA_HOST_SPACE_END    0x3FF8000000000ull      /* 1PB - 512GB */
> > +#define VA_HOST_SPACE_SIZE   (VA_HOST_SPACE_END - \
> > +                                     VA_HOST_SPACE_START) /* 767.5TB */
> > +
> > +#define VA_DDR_SPACE_START   0x800000000ull          /* 32GB */
> > +#define VA_DDR_SPACE_END     0x2000000000ull         /* 128GB */
> > +#define VA_DDR_SPACE_SIZE    (VA_DDR_SPACE_END - \
> > +                                     VA_DDR_SPACE_START)     /* 96GB */
> > +
> > +#define CPU_BOOT_ADDR                0x7FF8040000ull
> > +
> > +#define UBOOT_FW_OFFSET              0x100000                /* 1MB in SRAM */
> > +#define LINUX_FW_OFFSET              0x800000                /* 8MB in DDR */
> > +
> > +#define GOYA_MSIX_ENTRIES    8
> > +#define EVENT_QUEUE_MSIX_IDX 5
> > +#define ARMCP_RESET_MSIX_IDX 6
> > +
> > +#define QMAN_PQ_ENTRY_SIZE   16                      /* Bytes */
> > +
> > +#define MAX_ASID             1024
> > +
> > +#define PROT_BITS_OFFS               0xF80
> > +
> > +/*
> > + * Queue Numbering
> > + *
> > + * The external queues (DMA channels + CPU) MUST be before the internal queues
> > + * and each group (DMA channels + CPU and internal) must be contiguous inside
> > + * itself but there can be a gap between the two groups (although not
> > + * recommended)
> > + */
> > +
> > +enum goya_queue_id {
> > +     GOYA_QUEUE_ID_DMA_0 = 0,
> > +     GOYA_QUEUE_ID_DMA_1,
> > +     GOYA_QUEUE_ID_DMA_2,
> > +     GOYA_QUEUE_ID_DMA_3,
> > +     GOYA_QUEUE_ID_DMA_4,
> > +     GOYA_QUEUE_ID_CPU_PQ,
> > +     GOYA_QUEUE_ID_MME,
> > +     GOYA_QUEUE_ID_TPC0,
> > +     GOYA_QUEUE_ID_TPC1,
> > +     GOYA_QUEUE_ID_TPC2,
> > +     GOYA_QUEUE_ID_TPC3,
> > +     GOYA_QUEUE_ID_TPC4,
> > +     GOYA_QUEUE_ID_TPC5,
> > +     GOYA_QUEUE_ID_TPC6,
> > +     GOYA_QUEUE_ID_TPC7,
> > +     GOYA_QUEUE_ID_SIZE
> > +};
> > +
> > +enum goya_pll_index {
> > +     CPU_PLL = 0,
> > +     IC_PLL,
> > +     MC_PLL,
> > +     MME_PLL,
> > +     PCI_PLL,
> > +     EMMC_PLL,
> > +     TPC_PLL
> > +};
> > +
> > +#define GOYA_PLL_FREQ_LOW            50000000 /* 50 MHz */
> > +
> > +#endif /* GOYA_H */
> > --
> > 2.17.1
> >
>
> --
> Sincerely yours,
> Mike.
>