virtualization.lists.linux-foundation.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
@ 2024-03-27 11:21 Srujana Challa
  2024-03-29  4:28 ` Jason Wang
                   ` (3 more replies)
  0 siblings, 4 replies; 28+ messages in thread
From: Srujana Challa @ 2024-03-27 11:21 UTC (permalink / raw)
  To: virtualization, mst, jasowang
  Cc: xuanzhuo, schalla, vattunuru, sthotton, ndabilpuram, jerinj

This commit introduces a new vDPA driver specifically designed for
managing the virtio control plane over the vDPA bus for OCTEON DPU
devices. The driver consists of two layers:

1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
operations and configurations related to the DPU device.

2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
implements device operations for the vDPA bus. It handles device
probing, bus attachment, vring operations, and other relevant tasks.

Signed-off-by: Srujana Challa <schalla@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Shijith Thotton <sthotton@marvell.com>
Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
---
 MAINTAINERS                              |   5 +
 drivers/vdpa/Kconfig                     |   9 +
 drivers/vdpa/Makefile                    |   1 +
 drivers/vdpa/octeon_ep/Makefile          |   4 +
 drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
 drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
 drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903 +++++++++++++++++++++++
 7 files changed, 1472 insertions(+)
 create mode 100644 drivers/vdpa/octeon_ep/Makefile
 create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
 create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
 create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c

diff --git a/MAINTAINERS b/MAINTAINERS
index cfe44a876d8a..539ce209a960 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13127,6 +13127,11 @@ S:	Supported
 F:	Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
 F:	drivers/mmc/host/sdhci-xenon*
 
+MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
+R:	schalla@marvell.com
+R:	vattunuru@marvell.com
+F:	drivers/vdpa/octeon_ep/
+
 MATROX FRAMEBUFFER DRIVER
 L:	linux-fbdev@vger.kernel.org
 S:	Orphan
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 656c1cb541de..775149fad476 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -126,4 +126,13 @@ config PDS_VDPA
 	  With this driver, the VirtIO dataplane can be
 	  offloaded to an AMD/Pensando DSC device.
 
+config OCTEONEP_VDPA
+	tristate "vDPA driver for Octeon DPU devices"
+	depends on m
+	depends on PCI_MSI
+	help
+	  vDPA driver for Marvell's Octeon DPU devices.
+	  With this driver, the VirtIO dataplane can be
+	  offloaded to a Octeon DPU device.
+
 endif # VDPA
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
index 8f53c6f3cca7..5654d36707af 100644
--- a/drivers/vdpa/Makefile
+++ b/drivers/vdpa/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
 obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
 obj-$(CONFIG_SNET_VDPA) += solidrun/
 obj-$(CONFIG_PDS_VDPA) += pds/
+obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
diff --git a/drivers/vdpa/octeon_ep/Makefile b/drivers/vdpa/octeon_ep/Makefile
new file mode 100644
index 000000000000..e23e2ff14f33
--- /dev/null
+++ b/drivers/vdpa/octeon_ep/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
+octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
+octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h
new file mode 100644
index 000000000000..60d2efc8f8b8
--- /dev/null
+++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ * Copyright (C) 2024 Marvell.
+ */
+#ifndef __OCTEP_VDPA_H__
+#define __OCTEP_VDPA_H__
+
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_pci_modern.h>
+#include <uapi/linux/virtio_net.h>
+#include <uapi/linux/virtio_blk.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_pci.h>
+#include <uapi/linux/vdpa.h>
+
+#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
+#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
+#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
+#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
+#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
+#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
+
+#define OCTEP_HW_MBOX_BAR 0
+#define OCTEP_HW_CAPS_BAR 4
+
+#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
+
+#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
+#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
+#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
+
+#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
+#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
+
+#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
+#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
+
+enum octep_vdpa_dev_status {
+	OCTEP_VDPA_DEV_STATUS_INVALID,
+	OCTEP_VDPA_DEV_STATUS_ALLOC,
+	OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
+	OCTEP_VDPA_DEV_STATUS_INIT,
+	OCTEP_VDPA_DEV_STATUS_READY,
+	OCTEP_VDPA_DEV_STATUS_UNINIT
+};
+
+struct octep_vring_info {
+	struct vdpa_callback cb;
+	void __iomem *notify_addr;
+	u32 __iomem *cb_notify_addr;
+	phys_addr_t notify_pa;
+	char msix_name[256];
+};
+
+struct octep_hw {
+	struct pci_dev *pdev;
+	u8 __iomem *base[PCI_STD_NUM_BARS];
+	struct virtio_pci_common_cfg __iomem *common_cfg;
+	u8 __iomem *dev_cfg;
+	u8 __iomem *isr;
+	void __iomem *notify_base;
+	phys_addr_t notify_base_pa;
+	u32 notify_off_multiplier;
+	u8 notify_bar;
+	struct octep_vring_info *vqs;
+	struct vdpa_callback config_cb;
+	u64 features;
+	u64 drv_features;
+	u16 nr_vring;
+	u32 config_size;
+	int irq;
+};
+
+u8 octep_hw_get_status(struct octep_hw *oct_hw);
+void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
+void octep_hw_reset(struct octep_hw *oct_hw);
+void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw);
+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length);
+int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
+			 u64 device_area);
+void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
+void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
+bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
+int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state);
+int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state);
+u16 octep_get_vq_size(struct octep_hw *oct_hw);
+int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
+u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
+void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
+
+#endif /* __OCTEP_VDPA_H__ */
diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
new file mode 100644
index 000000000000..0a2f1d09c4ab
--- /dev/null
+++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
@@ -0,0 +1,457 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Marvell. */
+
+#include <linux/iopoll.h>
+
+#include "octep_vdpa.h"
+
+enum octep_mbox_ids {
+	OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
+	OCTEP_MBOX_MSG_GET_VQ_STATE,
+};
+
+#define OCTEP_HW_TIMEOUT       10000000
+
+#define MBOX_OFFSET            64
+#define MBOX_RSP_MASK          0x00000001
+#define MBOX_RC_MASK           0x0000FFFE
+
+#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
+#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
+#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK | MBOX_RSP_MASK))
+
+struct octep_mbox_hdr {
+	u8 ver;
+	u8 rsvd1;
+	u16 id;
+	u16 rsvd2;
+#define MBOX_REQ_SIG (0xdead)
+#define MBOX_RSP_SIG (0xbeef)
+	u16 sig;
+};
+
+struct octep_mbox_sts {
+	u16 rsp:1;
+	u16 rc:15;
+	u16 rsvd;
+};
+
+struct octep_mbox {
+	struct octep_mbox_hdr hdr;
+	struct octep_mbox_sts sts;
+	u64 rsvd;
+	u32 data[];
+};
+
+static inline struct octep_mbox __iomem *octep_get_mbox(struct octep_hw *oct_hw)
+{
+	return (struct octep_mbox __iomem *)(oct_hw->dev_cfg + MBOX_OFFSET);
+}
+
+static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem *mbox)
+{
+	u32 val;
+
+	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_AVAIL(val), 10,
+				  OCTEP_HW_TIMEOUT);
+}
+
+static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem *mbox)
+{
+	u32 val;
+
+	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val), 10,
+				  OCTEP_HW_TIMEOUT);
+}
+
+static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16 id, u16 sig)
+{
+	iowrite16(id, &mbox->hdr.id);
+	iowrite16(sig, &mbox->hdr.sig);
+}
+
+static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
+{
+	return ioread16(&mbox->hdr.sig);
+}
+
+static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32 sts)
+{
+	iowrite32(sts, &mbox->sts);
+}
+
+static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
+{
+	return ioread32(&mbox->sts);
+}
+
+static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox, u16 word_idx)
+{
+	return ioread32(&mbox->data[word_idx]);
+}
+
+static inline void octep_write32_word(struct octep_mbox __iomem *mbox, u16 word_idx, u32 word)
+{
+	return iowrite32(word, &mbox->data[word_idx]);
+}
+
+static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *buffer,
+			      u32 buf_size, bool write)
+{
+	struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
+	struct pci_dev *pdev = oct_hw->pdev;
+	u32 *p = (u32 *)buffer;
+	u16 data_wds;
+	int ret, i;
+	u32 val;
+
+	if (!IS_ALIGNED(buf_size, 4))
+		return -EINVAL;
+
+	/* Make sure mbox space is available */
+	ret = octep_wait_for_mbox_avail(mbox);
+	if (ret) {
+		dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to be consumed\n");
+		return ret;
+	}
+	data_wds = buf_size / 4;
+
+	if (write) {
+		for (i = 1; i <= data_wds; i++) {
+			octep_write32_word(mbox, i, *p);
+			p++;
+		}
+	}
+	octep_write32_word(mbox, 0, (u32)qid);
+	octep_write_sts(mbox, 0);
+
+	octep_write_hdr(mbox, id, MBOX_REQ_SIG);
+
+	ret = octep_wait_for_mbox_rsp(mbox);
+	if (ret) {
+		dev_warn(&pdev->dev, "Timeout waiting for mbox : %d response\n", id);
+		return ret;
+	}
+
+	val = octep_read_sig(mbox);
+	if ((val & 0xFFFF) != MBOX_RSP_SIG) {
+		dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id);
+		return ret;
+	}
+
+	val = octep_read_sts(mbox);
+	if (val & MBOX_RC_MASK) {
+		ret = MBOX_RSP_TO_ERR(val);
+		dev_warn(&pdev->dev, "Error while processing mbox : %d, err %d\n", id, ret);
+		return ret;
+	}
+
+	if (!write)
+		for (i = 1; i <= data_wds; i++)
+			*p++ = octep_read32_word(mbox, i);
+
+	return 0;
+}
+
+static void octep_mbox_init(struct octep_mbox __iomem *mbox)
+{
+	iowrite32(1, &mbox->sts);
+}
+
+u8 octep_hw_get_status(struct octep_hw *oct_hw)
+{
+	return ioread8(&oct_hw->common_cfg->device_status);
+}
+
+void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
+{
+	iowrite8(status, &oct_hw->common_cfg->device_status);
+}
+
+void octep_hw_reset(struct octep_hw *oct_hw)
+{
+	u8 val;
+
+	octep_hw_set_status(oct_hw, 0 | BIT(7));
+	if (readx_poll_timeout(ioread8, &oct_hw->common_cfg->device_status, val, !val, 10,
+			       OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
+		return;
+	}
+}
+
+u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
+{
+	u32 features_lo, features_hi;
+	u32 val, select;
+
+	select = 0;
+	iowrite32(select | BIT(15), &oct_hw->common_cfg->device_feature_select);
+
+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->device_feature_select, val,
+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
+		return 0ULL;
+	}
+	features_lo = ioread32(&oct_hw->common_cfg->device_feature);
+
+	select = 1;
+	iowrite32(select | BIT(15), &oct_hw->common_cfg->device_feature_select);
+
+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->device_feature_select, val,
+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
+		return 0ULL;
+	}
+	features_hi = ioread32(&oct_hw->common_cfg->device_feature);
+
+	return ((u64)features_hi << 32) | features_lo;
+}
+
+void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
+{
+	u32 val, select;
+
+	select = 0;
+	iowrite32(select | BIT(15), &oct_hw->common_cfg->guest_feature_select);
+
+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->guest_feature_select, val,
+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
+		return;
+	}
+	iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg->guest_feature);
+
+	select = 1;
+	iowrite32(select | BIT(15), &oct_hw->common_cfg->guest_feature_select);
+
+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->guest_feature_select, val,
+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
+		return;
+	}
+	iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
+}
+
+void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw)
+{
+	u16 val;
+
+	iowrite16(queue_id | BIT(15), &oct_hw->common_cfg->queue_select);
+
+	if (readx_poll_timeout(ioread16, &oct_hw->common_cfg->queue_select, val, val == queue_id,
+			       10, OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
+		return;
+	}
+}
+
+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
+{
+	iowrite16(qid, oct_hw->vqs[qid].notify_addr);
+}
+
+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length)
+{
+	u8 old_gen, new_gen, *p;
+	int i;
+
+	WARN_ON(offset + length > oct_hw->config_size);
+	do {
+		old_gen = ioread8(&oct_hw->common_cfg->config_generation);
+		p = dst;
+		for (i = 0; i < length; i++)
+			*p++ = ioread8(oct_hw->dev_cfg + offset + i);
+
+		new_gen = ioread8(&oct_hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
+			 u64 device_area)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
+
+	octep_write_queue_select(qid, oct_hw);
+	vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
+			     &cfg->queue_desc_hi);
+	vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
+			     &cfg->queue_avail_hi);
+	vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
+			     &cfg->queue_used_hi);
+
+	return 0;
+}
+
+int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state)
+{
+	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
+				  sizeof(*state), 0);
+}
+
+int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state)
+{
+	struct vdpa_vq_state q_state;
+
+	memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
+	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
+				  sizeof(*state), 1);
+}
+
+void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
+
+	octep_write_queue_select(qid, oct_hw);
+	iowrite16(num, &cfg->queue_size);
+}
+
+void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
+
+	octep_write_queue_select(qid, oct_hw);
+	iowrite16(ready, &cfg->queue_enable);
+}
+
+bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
+
+	octep_write_queue_select(qid, oct_hw);
+	return ioread16(&cfg->queue_enable);
+}
+
+u16 octep_get_vq_size(struct octep_hw *oct_hw)
+{
+	octep_write_queue_select(0, oct_hw);
+	return ioread16(&oct_hw->common_cfg->queue_size);
+}
+
+static u32 octep_get_config_size(struct octep_hw *oct_hw)
+{
+	return sizeof(struct virtio_net_config);
+}
+
+static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap)
+{
+	struct device *dev = &oct_hw->pdev->dev;
+	u32 length = cap->length;
+	u32 offset = cap->offset;
+	u8  bar    = cap->bar;
+	u32 len;
+
+	if (bar != OCTEP_HW_CAPS_BAR) {
+		dev_err(dev, "Invalid bar: %u\n", bar);
+		return NULL;
+	}
+	if (offset + length < offset) {
+		dev_err(dev, "offset(%u) + length(%u) overflows\n",
+			offset, length);
+		return NULL;
+	}
+	len = pci_resource_len(oct_hw->pdev, bar);
+	if (offset + length > len) {
+		dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
+			offset + length, len);
+		return NULL;
+	}
+	return oct_hw->base[bar] + offset;
+}
+
+static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len, off_t offset)
+{
+	u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
+	u8 *p = buf;
+	size_t i;
+
+	for (i = 0; i < len; i++)
+		*p++ = ioread8(bar + offset + i);
+}
+
+static int pci_signature_verify(struct octep_hw *oct_hw)
+{
+	u32 signature[2];
+
+	pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
+
+	if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
+		return -1;
+
+	if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
+		return -1;
+
+	return 0;
+}
+
+int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
+{
+	struct octep_mbox __iomem *mbox;
+	struct device *dev = &pdev->dev;
+	struct virtio_pci_cap cap;
+	int ret;
+	u8 pos;
+
+	oct_hw->pdev = pdev;
+	ret = pci_signature_verify(oct_hw);
+	if (ret) {
+		dev_err(dev, "Octeon Virtio FW is not initialized\n");
+		return -EIO;
+	}
+
+	pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
+
+	while (pos) {
+		pci_caps_read(oct_hw, &cap, 2, pos);
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			dev_err(dev, "Found invalid capability vndr id: %d\n", cap.cap_vndr);
+			break;
+		}
+
+		pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
+
+		dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n",
+			 pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
+				      4, pos + sizeof(cap));
+
+			oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
+			oct_hw->notify_bar = cap.bar;
+			oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) + cap.offset;
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			oct_hw->isr = get_cap_addr(oct_hw, &cap);
+			break;
+		}
+
+		pos = cap.cap_next;
+	}
+	if (!oct_hw->common_cfg || !oct_hw->notify_base ||
+	    !oct_hw->dev_cfg    || !oct_hw->isr) {
+		dev_err(dev, "Incomplete PCI capabilities");
+		return -EIO;
+	}
+	oct_hw->config_size = octep_get_config_size(oct_hw);
+
+	mbox = octep_get_mbox(oct_hw);
+	octep_mbox_init(mbox);
+
+	dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw->common_cfg);
+	dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw->dev_cfg);
+	dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
+	dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
+		 (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
+	dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
+
+	return 0;
+}
diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
new file mode 100644
index 000000000000..845fd35368ff
--- /dev/null
+++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
@@ -0,0 +1,903 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Marvell. */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/iommu.h>
+#include "octep_vdpa.h"
+
+#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
+
+struct octep_pf {
+	u8 __iomem *base[PCI_STD_NUM_BARS];
+	struct pci_dev *pdev;
+	struct resource res;
+	u64 vf_base;
+	int enabled_vfs;
+	u32 vf_stride;
+	u16 vf_devid;
+};
+
+struct octep_vdpa {
+	struct vdpa_device vdpa;
+	struct octep_hw *oct_hw;
+	struct pci_dev *pdev;
+};
+
+struct octep_vdpa_mgmt_dev {
+	struct vdpa_mgmt_dev mdev;
+	struct octep_hw oct_hw;
+	struct pci_dev *pdev;
+	/* Work entry to handle device setup */
+	struct work_struct setup_task;
+	/* Device status */
+	atomic_t status;
+};
+
+static int verify_features(u64 features)
+{
+	/* Minimum features to expect */
+	if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
+		return -EOPNOTSUPP;
+
+	if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
+		return -EOPNOTSUPP;
+
+	if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
+		return -EOPNOTSUPP;
+
+	/* Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
+	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
+	 */
+	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
+	    BIT_ULL(VIRTIO_NET_F_MQ))
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
+{
+	struct octep_vdpa *oct_vdpa;
+
+	oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
+
+	return oct_vdpa->oct_hw;
+}
+
+static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
+{
+	struct octep_hw *oct_hw = data;
+	int i;
+
+	for (i = 0; i < oct_hw->nr_vring; i++) {
+		if (oct_hw->vqs[i].cb.callback && *oct_hw->vqs[i].cb_notify_addr) {
+			*oct_hw->vqs[i].cb_notify_addr = 0;
+			oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void octep_free_irqs(struct octep_hw *oct_hw)
+{
+	struct pci_dev *pdev = oct_hw->pdev;
+
+	if (oct_hw->irq != -1) {
+		devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
+		oct_hw->irq = -1;
+	}
+	pci_free_irq_vectors(pdev);
+}
+
+static int octep_request_irqs(struct octep_hw *oct_hw)
+{
+	struct pci_dev *pdev = oct_hw->pdev;
+	int ret, irq;
+
+	/* Use one ring/interrupt per VF for virtio call interface. */
+	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to alloc msix vector");
+		return ret;
+	}
+
+	snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
+		 OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
+
+	irq = pci_irq_vector(pdev, 0);
+	ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
+			       oct_hw->vqs->msix_name, oct_hw);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to register interrupt handler\n");
+		goto free_irq_vec;
+	}
+	oct_hw->irq = irq;
+
+	return 0;
+
+free_irq_vec:
+	pci_free_irq_vectors(pdev);
+	return ret;
+}
+
+static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return oct_hw->features;
+}
+
+static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	int ret;
+
+	pr_debug("Driver Features: %llx\n", features);
+	ret = verify_features(features);
+	if (ret)
+		return ret;
+
+	octep_hw_set_drv_features(oct_hw, features);
+	oct_hw->drv_features = features;
+
+	return 0;
+}
+
+static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return oct_hw->features & oct_hw->drv_features;
+}
+
+static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_hw_get_status(oct_hw);
+}
+
+static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	u8 status_old;
+
+	status_old = octep_hw_get_status(oct_hw);
+
+	if (status_old == status)
+		return;
+
+	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+	    !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
+		if (octep_request_irqs(oct_hw))
+			status = status_old | VIRTIO_CONFIG_S_FAILED;
+	}
+	octep_hw_set_status(oct_hw, status);
+}
+
+static int octep_vdpa_reset(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	u8 status = octep_hw_get_status(oct_hw);
+	u16 qid;
+
+	if (status == 0)
+		return 0;
+
+	for (qid = 0; qid < oct_hw->nr_vring; qid++) {
+		oct_hw->vqs[qid].cb.callback = NULL;
+		oct_hw->vqs[qid].cb.private = NULL;
+		oct_hw->config_cb.callback = NULL;
+		oct_hw->config_cb.private = NULL;
+	}
+	octep_hw_reset(oct_hw);
+
+	if (status & VIRTIO_CONFIG_S_DRIVER_OK)
+		octep_free_irqs(oct_hw);
+
+	return 0;
+}
+
+static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_get_vq_size(oct_hw);
+}
+
+static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				   struct vdpa_vq_state *state)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_get_vq_state(oct_hw, qid, state);
+}
+
+static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				   const struct vdpa_vq_state *state)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_set_vq_state(oct_hw, qid, state);
+}
+
+static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, struct vdpa_callback *cb)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	oct_hw->vqs[qid].cb = *cb;
+}
+
+static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	octep_set_vq_ready(oct_hw, qid, ready);
+}
+
+static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_get_vq_ready(oct_hw, qid);
+}
+
+static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	octep_set_vq_num(oct_hw, qid, num);
+}
+
+static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, u64 desc_area,
+				     u64 driver_area, u64 device_area)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
+	pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
+	pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
+
+	return octep_set_vq_address(oct_hw, qid, desc_area, driver_area, device_area);
+}
+
+static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	octep_notify_queue(oct_hw, qid);
+}
+
+static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev, u32 data)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	u16 idx = data & 0xFFFF;
+
+	vp_iowrite32(data, oct_hw->vqs[idx].notify_addr);
+}
+
+static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return vp_ioread8(&oct_hw->common_cfg->config_generation);
+}
+
+static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
+{
+	return VIRTIO_ID_NET;
+}
+
+static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
+{
+	return PCI_VENDOR_ID_CAVIUM;
+}
+
+static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
+{
+	return PAGE_SIZE;
+}
+
+static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return oct_hw->config_size;
+}
+
+static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf,
+				  unsigned int len)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	octep_read_dev_config(oct_hw, offset, buf, len);
+}
+
+static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned int offset,
+				  const void *buf, unsigned int len)
+{
+	/* Not supported */
+}
+
+static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct vdpa_callback *cb)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	oct_hw->config_cb.callback = cb->callback;
+	oct_hw->config_cb.private = cb->private;
+}
+
+static struct vdpa_notification_area octep_get_vq_notification(struct vdpa_device *vdpa_dev,
+							       u16 idx)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	struct vdpa_notification_area area;
+
+	area.addr = oct_hw->vqs[idx].notify_pa;
+	area.size = PAGE_SIZE;
+
+	return area;
+}
+
+static int octep_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
+			      struct vhost_iotlb *iotlb)
+{
+	return 0;
+}
+
+static struct vdpa_config_ops octep_vdpa_ops = {
+	.get_device_features = octep_vdpa_get_device_features,
+	.set_driver_features = octep_vdpa_set_driver_features,
+	.get_driver_features = octep_vdpa_get_driver_features,
+	.get_status	= octep_vdpa_get_status,
+	.set_status	= octep_vdpa_set_status,
+	.reset		= octep_vdpa_reset,
+	.get_vq_num_max	= octep_vdpa_get_vq_num_max,
+	.get_vq_state	= octep_vdpa_get_vq_state,
+	.set_vq_state	= octep_vdpa_set_vq_state,
+	.set_vq_cb	= octep_vdpa_set_vq_cb,
+	.set_vq_ready	= octep_vdpa_set_vq_ready,
+	.get_vq_ready	= octep_vdpa_get_vq_ready,
+	.set_vq_num	= octep_vdpa_set_vq_num,
+	.set_vq_address	= octep_vdpa_set_vq_address,
+	.get_vq_irq	= NULL,
+	.kick_vq	= octep_vdpa_kick_vq,
+	.kick_vq_with_data	= octep_vdpa_kick_vq_with_data,
+	.get_generation	= octep_vdpa_get_generation,
+	.get_device_id	= octep_vdpa_get_device_id,
+	.get_vendor_id	= octep_vdpa_get_vendor_id,
+	.get_vq_align	= octep_vdpa_get_vq_align,
+	.get_config_size	= octep_vdpa_get_config_size,
+	.get_config	= octep_vdpa_get_config,
+	.set_config	= octep_vdpa_set_config,
+	.set_config_cb  = octep_vdpa_set_config_cb,
+	.get_vq_notification = octep_get_vq_notification,
+};
+
+static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
+{
+	int ret;
+
+	ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar);
+		return ret;
+	}
+
+	tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
+	if (!tbl[bar]) {
+		dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
+		pci_release_region(pdev, bar);
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
+{
+	pci_iounmap(pdev, tbl[bar]);
+	pci_release_region(pdev, bar);
+}
+
+static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf)
+{
+	struct pci_dev *pf_dev = octpf->pdev;
+	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
+	struct pci_bus_region bus_region;
+
+	octpf->res.start = res->start;
+	octpf->res.end = res->end;
+	octpf->vf_base = res->start;
+
+	bus_region.start = res->start;
+	bus_region.end = res->start - 1;
+
+	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
+}
+
+static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf)
+{
+	struct pci_dev *pf_dev = octpf->pdev;
+	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
+	struct pci_bus_region bus_region;
+
+	bus_region.start = octpf->res.start;
+	bus_region.end = octpf->res.end;
+
+	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
+}
+
+static void octep_vdpa_remove_pf(struct pci_dev *pdev)
+{
+	struct octep_pf *octpf = pci_get_drvdata(pdev);
+
+	pci_disable_sriov(pdev);
+
+	if (octpf->base[OCTEP_HW_CAPS_BAR])
+		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_CAPS_BAR);
+
+	if (octpf->base[OCTEP_HW_MBOX_BAR])
+		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
+
+	octep_vdpa_pf_bar_expand(octpf);
+}
+
+static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev)
+{
+	struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
+
+	memset(vf_res, 0, sizeof(*vf_res));
+}
+
+static void octep_vdpa_remove_vf(struct pci_dev *pdev)
+{
+	struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
+	struct octep_hw *oct_hw;
+	int status;
+
+	oct_hw = &mgmt_dev->oct_hw;
+	status = atomic_read(&mgmt_dev->status);
+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT);
+
+	if (status == OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT) {
+		cancel_work_sync(&mgmt_dev->setup_task);
+	} else if (status == OCTEP_VDPA_DEV_STATUS_READY) {
+		vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
+		kfree(mgmt_dev->oct_hw.vqs);
+	}
+
+	if (oct_hw->base[OCTEP_HW_CAPS_BAR])
+		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
+
+	if (oct_hw->base[OCTEP_HW_MBOX_BAR])
+		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_MBOX_BAR);
+
+	octep_vdpa_vf_bar_shrink(pdev);
+}
+
+static void octep_vdpa_remove(struct pci_dev *pdev)
+{
+	if (pdev->is_virtfn)
+		octep_vdpa_remove_vf(pdev);
+	else
+		octep_vdpa_remove_pf(pdev);
+}
+
+static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			      const struct vdpa_dev_set_config *config)
+{
+	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct octep_vdpa_mgmt_dev, mdev);
+	struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
+	struct pci_dev *pdev = oct_hw->pdev;
+	struct vdpa_device *vdpa_dev;
+	struct octep_vdpa *oct_vdpa;
+	u64 device_features;
+	u16 notify_off;
+	int i, ret;
+
+	oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, 1, 1,
+				     NULL, false);
+	if (IS_ERR(oct_vdpa)) {
+		dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device");
+		return PTR_ERR(oct_vdpa);
+	}
+
+	oct_vdpa->pdev = pdev;
+	oct_vdpa->vdpa.dma_dev = &pdev->dev;
+	oct_vdpa->vdpa.mdev = mdev;
+	oct_vdpa->oct_hw = oct_hw;
+	vdpa_dev = &oct_vdpa->vdpa;
+
+	device_features = oct_hw->features;
+	if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
+		if (config->device_features & ~device_features) {
+			dev_err(&pdev->dev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
+				config->device_features, device_features);
+			return -EINVAL;
+		}
+		device_features &= config->device_features;
+	}
+
+	oct_hw->features = device_features;
+
+	if (verify_features(device_features)) {
+		dev_warn(mdev->device,
+			 "Must provision minimum features 0x%llx for this device",
+			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
+			 BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) | BIT_ULL(VIRTIO_F_IN_ORDER));
+		return -EOPNOTSUPP;
+	}
+
+	oct_hw->vqs = kcalloc(oct_hw->nr_vring, sizeof(*oct_hw->vqs), GFP_KERNEL);
+	if (!oct_hw->vqs)
+		return -ENOMEM;
+
+	oct_hw->irq = -1;
+
+	dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
+	dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
+
+	for (i = 0; i < oct_hw->nr_vring; i++) {
+		octep_write_queue_select(i, oct_hw);
+		notify_off = vp_ioread16(&oct_hw->common_cfg->queue_notify_off);
+		oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
+			notify_off * oct_hw->notify_off_multiplier;
+		oct_hw->vqs[i].cb_notify_addr = (u32 *)oct_hw->vqs[i].notify_addr + 1;
+		oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
+			notify_off * oct_hw->notify_off_multiplier;
+	}
+
+	if (name)
+		ret = dev_set_name(&vdpa_dev->dev, "%s", name);
+	else
+		ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index);
+
+	ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to register to vDPA bus");
+		goto free_vqs;
+	}
+	return 0;
+
+free_vqs:
+	put_device(&oct_vdpa->vdpa.dev);
+	kfree(oct_hw->vqs);
+	return ret;
+}
+
+static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
+{
+	_vdpa_unregister_device(dev);
+}
+
+static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
+	.dev_add = octep_vdpa_dev_add,
+	.dev_del = octep_vdpa_dev_del
+};
+
+static bool get_device_ready_status(u8 __iomem *addr)
+{
+	u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
+
+	if (signature == OCTEP_DEV_READY_SIGNATURE) {
+		writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
+		return true;
+	}
+
+	return false;
+}
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static void octep_vdpa_setup_task(struct work_struct *work)
+{
+	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct octep_vdpa_mgmt_dev,
+							    setup_task);
+	struct pci_dev *pdev = mgmt_dev->pdev;
+	struct device *dev = &pdev->dev;
+	struct octep_hw *oct_hw;
+	unsigned long timeout;
+	int ret;
+
+	oct_hw = &mgmt_dev->oct_hw;
+
+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
+
+	/* Wait for a maximum of 5 sec */
+	timeout = jiffies + msecs_to_jiffies(5000);
+	while (!time_after(jiffies, timeout)) {
+		if (get_device_ready_status(oct_hw->base[OCTEP_HW_MBOX_BAR])) {
+			atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_INIT);
+			break;
+		}
+
+		if (atomic_read(&mgmt_dev->status) >= OCTEP_VDPA_DEV_STATUS_READY) {
+			dev_info(dev, "Stopping vDPA setup task.\n");
+			return;
+		}
+
+		usleep_range(1000, 1500);
+	}
+
+	if (atomic_read(&mgmt_dev->status) != OCTEP_VDPA_DEV_STATUS_INIT) {
+		dev_err(dev, "BAR initialization is timed out\n");
+		return;
+	}
+
+	ret = octep_iomap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
+	if (ret)
+		return;
+
+	ret = octep_hw_caps_read(oct_hw, pdev);
+	if (ret < 0)
+		goto unmap_region;
+
+	oct_hw->features = octep_hw_get_dev_features(oct_hw);
+	ret = verify_features(oct_hw->features);
+	if (ret) {
+		dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
+		goto unmap_region;
+	}
+	oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg->num_queues);
+
+	mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
+	mgmt_dev->mdev.id_table = id_table;
+	mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
+	mgmt_dev->mdev.supported_features = oct_hw->features;
+	mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES);
+	mgmt_dev->mdev.device = dev;
+
+	ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
+	if (ret) {
+		dev_err(dev, "Failed to register vdpa management interface\n");
+		goto unmap_region;
+	}
+
+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY);
+
+	return;
+
+unmap_region:
+	octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
+	oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL;
+}
+
+static int octep_vdpa_probe_vf(struct pci_dev *pdev)
+{
+	struct octep_vdpa_mgmt_dev *mgmt_dev;
+	struct device *dev = &pdev->dev;
+	struct iommu_domain *domain;
+	int ret;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		dev_err(dev, "Failed to enable device\n");
+		return ret;
+	}
+
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+	if (ret) {
+		dev_err(dev, "No usable DMA configuration\n");
+		return ret;
+	}
+	pci_set_master(pdev);
+
+	domain = iommu_get_domain_for_dev(dev);
+	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
+		dev_info(dev, "NO-IOMMU\n");
+		octep_vdpa_ops.set_map = octep_vdpa_set_map;
+	}
+
+	mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev), GFP_KERNEL);
+	if (!mgmt_dev)
+		return -ENOMEM;
+
+	ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base, OCTEP_HW_MBOX_BAR);
+	if (ret)
+		return ret;
+
+	mgmt_dev->pdev = pdev;
+	pci_set_drvdata(pdev, mgmt_dev);
+
+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC);
+	INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task);
+	schedule_work(&mgmt_dev->setup_task);
+	dev_info(&pdev->dev, "octep vdpa mgmt device setup task is queued\n");
+
+	return 0;
+}
+
+static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct pci_dev *pf_dev, u8 idx)
+{
+	struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4;
+	struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES + 4;
+	struct octep_pf *pf = pci_get_drvdata(pf_dev);
+	struct pci_bus_region bus_region;
+
+	vf_res->name = pci_name(vf_dev);
+	vf_res->flags = pf_res->flags;
+	vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent;
+
+	bus_region.start = pf->vf_base + idx * pf->vf_stride;
+	bus_region.end = bus_region.start + pf->vf_stride - 1;
+	pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region);
+}
+
+static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int num_vfs)
+{
+	struct octep_pf *pf = pci_get_drvdata(pdev);
+	u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR];
+	int ret, i;
+
+	if (num_vfs > 0) {
+		struct pci_dev *vf_pdev = NULL;
+		bool done = false;
+		int index = 0;
+
+		ret = pci_enable_sriov(pdev, num_vfs);
+		if (ret)
+			return ret;
+
+		pf->enabled_vfs = num_vfs;
+
+		while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, vf_pdev))) {
+			if (vf_pdev->device != pf->vf_devid)
+				continue;
+
+			octep_vdpa_assign_barspace(vf_pdev, pdev, index);
+			if (++index == num_vfs) {
+				done = true;
+				break;
+			}
+		}
+
+		if (done) {
+			for (i = 0; i < pf->enabled_vfs; i++)
+				writeq(OCTEP_DEV_READY_SIGNATURE, addr + OCTEP_PF_MBOX_DATA(i));
+		}
+	} else {
+		if (!pci_num_vf(pdev))
+			return 0;
+
+		pci_disable_sriov(pdev);
+		pf->enabled_vfs = 0;
+	}
+
+	return num_vfs;
+}
+
+static u16 octep_get_vf_devid(struct pci_dev *pdev)
+{
+	u16 did;
+
+	switch (pdev->device) {
+	case OCTEP_VDPA_DEVID_CN106K_PF:
+		did = OCTEP_VDPA_DEVID_CN106K_VF;
+		break;
+	case OCTEP_VDPA_DEVID_CN105K_PF:
+		did = OCTEP_VDPA_DEVID_CN105K_VF;
+		break;
+	case OCTEP_VDPA_DEVID_CN103K_PF:
+		did = OCTEP_VDPA_DEVID_CN103K_VF;
+		break;
+	default:
+		did = 0xFFFF;
+		break;
+	}
+
+	return did;
+}
+
+static int octep_vdpa_pf_setup(struct octep_pf *octpf)
+{
+	u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR];
+	struct pci_dev *pdev = octpf->pdev;
+	int totalvfs;
+	u64 val, len;
+
+	totalvfs = pci_sriov_get_totalvfs(pdev);
+	if (unlikely(!totalvfs)) {
+		dev_info(&pdev->dev, "Total VFs are %d in PF sriov configuration\n", totalvfs);
+		return 0;
+	}
+
+	addr = octpf->base[OCTEP_HW_MBOX_BAR];
+	val = readq(addr + OCTEP_EPF_RINFO(0));
+	if (val == 0) {
+		dev_err(&pdev->dev, "Invalid device configuration\n");
+		return -EINVAL;
+	}
+
+	if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) {
+		val &= ~GENMASK_ULL(35, 32);
+		val |= BIT_ULL(32);
+		writeq(val, addr + OCTEP_EPF_RINFO(0));
+	}
+
+	len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR);
+
+	octpf->vf_stride = len / totalvfs;
+	octpf->vf_devid = octep_get_vf_devid(pdev);
+
+	octep_vdpa_pf_bar_shrink(octpf);
+
+	return 0;
+}
+
+static int octep_vdpa_probe_pf(struct pci_dev *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct octep_pf *octpf;
+	int ret;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		dev_err(dev, "Failed to enable device\n");
+		return ret;
+	}
+
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+	if (ret) {
+		dev_err(dev, "No usable DMA configuration\n");
+		return ret;
+	}
+	octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL);
+	if (!octpf)
+		return -ENOMEM;
+
+	ret = octep_iomap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
+	if (ret)
+		return ret;
+
+	pci_set_master(pdev);
+	pci_set_drvdata(pdev, octpf);
+	octpf->pdev = pdev;
+
+	ret = octep_vdpa_pf_setup(octpf);
+	if (ret)
+		goto unmap_region;
+
+	return 0;
+
+unmap_region:
+	octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
+	return ret;
+}
+
+static int octep_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	if (pdev->is_virtfn)
+		return octep_vdpa_probe_vf(pdev);
+	else
+		return octep_vdpa_probe_pf(pdev);
+}
+
+static struct pci_device_id octep_pci_vdpa_map[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_VF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_VF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_VF) },
+	{ 0 },
+};
+
+static struct pci_driver octep_pci_vdpa = {
+	.name     = OCTEP_VDPA_DRIVER_NAME,
+	.id_table = octep_pci_vdpa_map,
+	.probe    = octep_vdpa_probe,
+	.remove   = octep_vdpa_remove,
+	.sriov_configure = octep_vdpa_sriov_configure
+};
+
+module_pci_driver(octep_pci_vdpa);
+
+MODULE_AUTHOR("Marvell");
+MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver");
+MODULE_LICENSE("GPL");
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-27 11:21 [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices Srujana Challa
@ 2024-03-29  4:28 ` Jason Wang
  2024-03-29 12:34   ` [EXTERNAL] " Srujana Challa
  2024-03-29 11:37 ` Stefano Garzarella
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2024-03-29  4:28 UTC (permalink / raw)
  To: Srujana Challa
  Cc: virtualization, mst, xuanzhuo, vattunuru, sthotton, ndabilpuram,
	jerinj, eperezma

On Wed, Mar 27, 2024 at 7:22 PM Srujana Challa <schalla@marvell.com> wrote:
>
> This commit introduces a new vDPA driver specifically designed for
> managing the virtio control plane over the vDPA bus for OCTEON DPU
> devices. The driver consists of two layers:
>
> 1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
> operations and configurations related to the DPU device.
>
> 2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
> implements device operations for the vDPA bus. It handles device
> probing, bus attachment, vring operations, and other relevant tasks.
>
> Signed-off-by: Srujana Challa <schalla@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
> ---
>  MAINTAINERS                              |   5 +
>  drivers/vdpa/Kconfig                     |   9 +
>  drivers/vdpa/Makefile                    |   1 +
>  drivers/vdpa/octeon_ep/Makefile          |   4 +
>  drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
>  drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
>  drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903 +++++++++++++++++++++++
>  7 files changed, 1472 insertions(+)
>  create mode 100644 drivers/vdpa/octeon_ep/Makefile
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cfe44a876d8a..539ce209a960 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13127,6 +13127,11 @@ S:     Supported
>  F:     Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
>  F:     drivers/mmc/host/sdhci-xenon*
>
> +MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
> +R:     schalla@marvell.com
> +R:     vattunuru@marvell.com
> +F:     drivers/vdpa/octeon_ep/
> +
>  MATROX FRAMEBUFFER DRIVER
>  L:     linux-fbdev@vger.kernel.org
>  S:     Orphan
> diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
> index 656c1cb541de..775149fad476 100644
> --- a/drivers/vdpa/Kconfig
> +++ b/drivers/vdpa/Kconfig
> @@ -126,4 +126,13 @@ config PDS_VDPA
>           With this driver, the VirtIO dataplane can be
>           offloaded to an AMD/Pensando DSC device.
>
> +config OCTEONEP_VDPA
> +       tristate "vDPA driver for Octeon DPU devices"
> +       depends on m
> +       depends on PCI_MSI
> +       help
> +         vDPA driver for Marvell's Octeon DPU devices.
> +         With this driver, the VirtIO dataplane can be
> +         offloaded to a Octeon DPU device.
> +
>  endif # VDPA
> diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
> index 8f53c6f3cca7..5654d36707af 100644
> --- a/drivers/vdpa/Makefile
> +++ b/drivers/vdpa/Makefile
> @@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
>  obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
>  obj-$(CONFIG_SNET_VDPA) += solidrun/
>  obj-$(CONFIG_PDS_VDPA) += pds/
> +obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
> diff --git a/drivers/vdpa/octeon_ep/Makefile b/drivers/vdpa/octeon_ep/Makefile
> new file mode 100644
> index 000000000000..e23e2ff14f33
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/Makefile
> @@ -0,0 +1,4 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
> +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
> +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h
> new file mode 100644
> index 000000000000..60d2efc8f8b8
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
> @@ -0,0 +1,93 @@
> +/* SPDX-License-Identifier: GPL-2.0-only
> + * Copyright (C) 2024 Marvell.
> + */
> +#ifndef __OCTEP_VDPA_H__
> +#define __OCTEP_VDPA_H__
> +
> +#include <linux/pci.h>
> +#include <linux/pci_regs.h>
> +#include <linux/vdpa.h>
> +#include <linux/virtio_pci_modern.h>
> +#include <uapi/linux/virtio_net.h>
> +#include <uapi/linux/virtio_blk.h>
> +#include <uapi/linux/virtio_config.h>
> +#include <uapi/linux/virtio_pci.h>
> +#include <uapi/linux/vdpa.h>
> +
> +#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
> +#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
> +#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
> +#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
> +#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
> +#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
> +
> +#define OCTEP_HW_MBOX_BAR 0
> +#define OCTEP_HW_CAPS_BAR 4
> +
> +#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
> +
> +#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
> +#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
> +#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
> +
> +#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
> +#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
> +
> +#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
> +#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
> +
> +enum octep_vdpa_dev_status {
> +       OCTEP_VDPA_DEV_STATUS_INVALID,
> +       OCTEP_VDPA_DEV_STATUS_ALLOC,
> +       OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
> +       OCTEP_VDPA_DEV_STATUS_INIT,
> +       OCTEP_VDPA_DEV_STATUS_READY,
> +       OCTEP_VDPA_DEV_STATUS_UNINIT
> +};
> +
> +struct octep_vring_info {
> +       struct vdpa_callback cb;
> +       void __iomem *notify_addr;
> +       u32 __iomem *cb_notify_addr;
> +       phys_addr_t notify_pa;
> +       char msix_name[256];
> +};
> +
> +struct octep_hw {
> +       struct pci_dev *pdev;
> +       u8 __iomem *base[PCI_STD_NUM_BARS];
> +       struct virtio_pci_common_cfg __iomem *common_cfg;

This is a hint that any chance we can reuse the modern virtio-pci
library (virtio_pci_modern_dev.c) instead of duplicating codes?

> +       u8 __iomem *dev_cfg;
> +       u8 __iomem *isr;
> +       void __iomem *notify_base;
> +       phys_addr_t notify_base_pa;
> +       u32 notify_off_multiplier;
> +       u8 notify_bar;
> +       struct octep_vring_info *vqs;
> +       struct vdpa_callback config_cb;
> +       u64 features;
> +       u64 drv_features;
> +       u16 nr_vring;
> +       u32 config_size;
> +       int irq;
> +};
> +
> +u8 octep_hw_get_status(struct octep_hw *oct_hw);
> +void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
> +void octep_hw_reset(struct octep_hw *oct_hw);
> +void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw);
> +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
> +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length);
> +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
> +                        u64 device_area);
> +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
> +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
> +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
> +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state);
> +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state);
> +u16 octep_get_vq_size(struct octep_hw *oct_hw);
> +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
> +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
> +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
> +
> +#endif /* __OCTEP_VDPA_H__ */
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> new file mode 100644
> index 000000000000..0a2f1d09c4ab
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> @@ -0,0 +1,457 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Marvell. */
> +
> +#include <linux/iopoll.h>
> +
> +#include "octep_vdpa.h"
> +
> +enum octep_mbox_ids {
> +       OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
> +       OCTEP_MBOX_MSG_GET_VQ_STATE,
> +};
> +
> +#define OCTEP_HW_TIMEOUT       10000000
> +
> +#define MBOX_OFFSET            64
> +#define MBOX_RSP_MASK          0x00000001
> +#define MBOX_RC_MASK           0x0000FFFE
> +
> +#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
> +#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
> +#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK | MBOX_RSP_MASK))
> +
> +struct octep_mbox_hdr {
> +       u8 ver;
> +       u8 rsvd1;
> +       u16 id;
> +       u16 rsvd2;
> +#define MBOX_REQ_SIG (0xdead)
> +#define MBOX_RSP_SIG (0xbeef)
> +       u16 sig;
> +};
> +
> +struct octep_mbox_sts {
> +       u16 rsp:1;
> +       u16 rc:15;
> +       u16 rsvd;
> +};
> +
> +struct octep_mbox {
> +       struct octep_mbox_hdr hdr;
> +       struct octep_mbox_sts sts;
> +       u64 rsvd;
> +       u32 data[];
> +};
> +
> +static inline struct octep_mbox __iomem *octep_get_mbox(struct octep_hw *oct_hw)
> +{
> +       return (struct octep_mbox __iomem *)(oct_hw->dev_cfg + MBOX_OFFSET);
> +}
> +
> +static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem *mbox)
> +{
> +       u32 val;
> +
> +       return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_AVAIL(val), 10,
> +                                 OCTEP_HW_TIMEOUT);
> +}
> +
> +static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem *mbox)
> +{
> +       u32 val;
> +
> +       return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val), 10,
> +                                 OCTEP_HW_TIMEOUT);
> +}
> +
> +static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16 id, u16 sig)
> +{
> +       iowrite16(id, &mbox->hdr.id);
> +       iowrite16(sig, &mbox->hdr.sig);
> +}
> +
> +static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
> +{
> +       return ioread16(&mbox->hdr.sig);
> +}
> +
> +static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32 sts)
> +{
> +       iowrite32(sts, &mbox->sts);
> +}
> +
> +static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
> +{
> +       return ioread32(&mbox->sts);
> +}
> +
> +static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox, u16 word_idx)
> +{
> +       return ioread32(&mbox->data[word_idx]);
> +}
> +
> +static inline void octep_write32_word(struct octep_mbox __iomem *mbox, u16 word_idx, u32 word)
> +{
> +       return iowrite32(word, &mbox->data[word_idx]);
> +}
> +
> +static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *buffer,
> +                             u32 buf_size, bool write)
> +{
> +       struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
> +       struct pci_dev *pdev = oct_hw->pdev;
> +       u32 *p = (u32 *)buffer;
> +       u16 data_wds;
> +       int ret, i;
> +       u32 val;
> +
> +       if (!IS_ALIGNED(buf_size, 4))
> +               return -EINVAL;
> +
> +       /* Make sure mbox space is available */
> +       ret = octep_wait_for_mbox_avail(mbox);
> +       if (ret) {
> +               dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to be consumed\n");
> +               return ret;
> +       }
> +       data_wds = buf_size / 4;
> +
> +       if (write) {
> +               for (i = 1; i <= data_wds; i++) {
> +                       octep_write32_word(mbox, i, *p);
> +                       p++;
> +               }
> +       }
> +       octep_write32_word(mbox, 0, (u32)qid);
> +       octep_write_sts(mbox, 0);
> +
> +       octep_write_hdr(mbox, id, MBOX_REQ_SIG);
> +
> +       ret = octep_wait_for_mbox_rsp(mbox);
> +       if (ret) {
> +               dev_warn(&pdev->dev, "Timeout waiting for mbox : %d response\n", id);
> +               return ret;
> +       }
> +
> +       val = octep_read_sig(mbox);
> +       if ((val & 0xFFFF) != MBOX_RSP_SIG) {
> +               dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id);
> +               return ret;
> +       }
> +
> +       val = octep_read_sts(mbox);
> +       if (val & MBOX_RC_MASK) {
> +               ret = MBOX_RSP_TO_ERR(val);
> +               dev_warn(&pdev->dev, "Error while processing mbox : %d, err %d\n", id, ret);
> +               return ret;
> +       }
> +
> +       if (!write)
> +               for (i = 1; i <= data_wds; i++)
> +                       *p++ = octep_read32_word(mbox, i);
> +
> +       return 0;
> +}
> +
> +static void octep_mbox_init(struct octep_mbox __iomem *mbox)
> +{
> +       iowrite32(1, &mbox->sts);
> +}
> +
> +u8 octep_hw_get_status(struct octep_hw *oct_hw)
> +{
> +       return ioread8(&oct_hw->common_cfg->device_status);
> +}
> +
> +void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
> +{
> +       iowrite8(status, &oct_hw->common_cfg->device_status);
> +}
> +
> +void octep_hw_reset(struct octep_hw *oct_hw)
> +{
> +       u8 val;
> +
> +       octep_hw_set_status(oct_hw, 0 | BIT(7));

Could we have macros for the magic numbers like 7 and 15 below?

> +       if (readx_poll_timeout(ioread8, &oct_hw->common_cfg->device_status, val, !val, 10,
> +                              OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
> +               return;
> +       }
> +}
> +
> +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
> +{
> +       u32 features_lo, features_hi;
> +       u32 val, select;
> +
> +       select = 0;
> +       iowrite32(select | BIT(15), &oct_hw->common_cfg->device_feature_select);

Ok I see something different with virtio. For any "select" it seems to
require BIT(15).

> +
> +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->device_feature_select, val,
> +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +               return 0ULL;
> +       }
> +       features_lo = ioread32(&oct_hw->common_cfg->device_feature);
> +
> +       select = 1;
> +       iowrite32(select | BIT(15), &oct_hw->common_cfg->device_feature_select);
> +
> +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->device_feature_select, val,
> +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +               return 0ULL;
> +       }
> +       features_hi = ioread32(&oct_hw->common_cfg->device_feature);
> +
> +       return ((u64)features_hi << 32) | features_lo;
> +}
> +
> +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
> +{
> +       u32 val, select;
> +
> +       select = 0;
> +       iowrite32(select | BIT(15), &oct_hw->common_cfg->guest_feature_select);
> +
> +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->guest_feature_select, val,
> +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +               return;
> +       }

Could we introduce a helper for such write and readback && timeout
logic? It seems it is used in a lot of places.

> +       iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg->guest_feature);
> +
> +       select = 1;
> +       iowrite32(select | BIT(15), &oct_hw->common_cfg->guest_feature_select);
> +
> +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->guest_feature_select, val,
> +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +               return;
> +       }
> +       iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
> +}
> +
> +void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw)
> +{
> +       u16 val;
> +
> +       iowrite16(queue_id | BIT(15), &oct_hw->common_cfg->queue_select);
> +
> +       if (readx_poll_timeout(ioread16, &oct_hw->common_cfg->queue_select, val, val == queue_id,
> +                              10, OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
> +               return;
> +       }
> +}
> +
> +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
> +{
> +       iowrite16(qid, oct_hw->vqs[qid].notify_addr);
> +}
> +
> +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length)
> +{
> +       u8 old_gen, new_gen, *p;
> +       int i;
> +
> +       WARN_ON(offset + length > oct_hw->config_size);
> +       do {
> +               old_gen = ioread8(&oct_hw->common_cfg->config_generation);
> +               p = dst;
> +               for (i = 0; i < length; i++)
> +                       *p++ = ioread8(oct_hw->dev_cfg + offset + i);
> +
> +               new_gen = ioread8(&oct_hw->common_cfg->config_generation);
> +       } while (old_gen != new_gen);
> +}

This looks similar to vp_vdpa_get_config(), any chance to unify them?

> +
> +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
> +                        u64 device_area)
> +{
> +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +       octep_write_queue_select(qid, oct_hw);
> +       vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> +                            &cfg->queue_desc_hi);
> +       vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> +                            &cfg->queue_avail_hi);
> +       vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> +                            &cfg->queue_used_hi);

It's kind of interesting that there's no read back for validation here.

> +
> +       return 0;
> +}
> +
> +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state)
> +{
> +       return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
> +                                 sizeof(*state), 0);
> +}
> +
> +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state)
> +{
> +       struct vdpa_vq_state q_state;
> +
> +       memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
> +       return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
> +                                 sizeof(*state), 1);
> +}
> +
> +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
> +{
> +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +       octep_write_queue_select(qid, oct_hw);
> +       iowrite16(num, &cfg->queue_size);
> +}
> +
> +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
> +{
> +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +       octep_write_queue_select(qid, oct_hw);
> +       iowrite16(ready, &cfg->queue_enable);
> +}
> +
> +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
> +{
> +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +       octep_write_queue_select(qid, oct_hw);
> +       return ioread16(&cfg->queue_enable);
> +}
> +
> +u16 octep_get_vq_size(struct octep_hw *oct_hw)
> +{
> +       octep_write_queue_select(0, oct_hw);
> +       return ioread16(&oct_hw->common_cfg->queue_size);
> +}
> +
> +static u32 octep_get_config_size(struct octep_hw *oct_hw)
> +{
> +       return sizeof(struct virtio_net_config);
> +}
> +
> +static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap)
> +{
> +       struct device *dev = &oct_hw->pdev->dev;
> +       u32 length = cap->length;
> +       u32 offset = cap->offset;
> +       u8  bar    = cap->bar;
> +       u32 len;
> +
> +       if (bar != OCTEP_HW_CAPS_BAR) {
> +               dev_err(dev, "Invalid bar: %u\n", bar);
> +               return NULL;
> +       }
> +       if (offset + length < offset) {
> +               dev_err(dev, "offset(%u) + length(%u) overflows\n",
> +                       offset, length);
> +               return NULL;
> +       }
> +       len = pci_resource_len(oct_hw->pdev, bar);
> +       if (offset + length > len) {
> +               dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
> +                       offset + length, len);
> +               return NULL;
> +       }
> +       return oct_hw->base[bar] + offset;
> +}
> +
> +static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len, off_t offset)
> +{
> +       u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
> +       u8 *p = buf;
> +       size_t i;
> +
> +       for (i = 0; i < len; i++)
> +               *p++ = ioread8(bar + offset + i);
> +}
> +
> +static int pci_signature_verify(struct octep_hw *oct_hw)
> +{
> +       u32 signature[2];
> +
> +       pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
> +
> +       if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
> +               return -1;
> +
> +       if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
> +               return -1;
> +
> +       return 0;
> +}
> +
> +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
> +{
> +       struct octep_mbox __iomem *mbox;
> +       struct device *dev = &pdev->dev;
> +       struct virtio_pci_cap cap;
> +       int ret;
> +       u8 pos;
> +
> +       oct_hw->pdev = pdev;
> +       ret = pci_signature_verify(oct_hw);
> +       if (ret) {
> +               dev_err(dev, "Octeon Virtio FW is not initialized\n");
> +               return -EIO;
> +       }
> +
> +       pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
> +
> +       while (pos) {
> +               pci_caps_read(oct_hw, &cap, 2, pos);
> +
> +               if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
> +                       dev_err(dev, "Found invalid capability vndr id: %d\n", cap.cap_vndr);
> +                       break;
> +               }
> +
> +               pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
> +
> +               dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n",
> +                        pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
> +
> +               switch (cap.cfg_type) {
> +               case VIRTIO_PCI_CAP_COMMON_CFG:
> +                       oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
> +                       break;
> +               case VIRTIO_PCI_CAP_NOTIFY_CFG:
> +                       pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
> +                                     4, pos + sizeof(cap));
> +
> +                       oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
> +                       oct_hw->notify_bar = cap.bar;
> +                       oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) + cap.offset;
> +                       break;
> +               case VIRTIO_PCI_CAP_DEVICE_CFG:
> +                       oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
> +                       break;
> +               case VIRTIO_PCI_CAP_ISR_CFG:
> +                       oct_hw->isr = get_cap_addr(oct_hw, &cap);
> +                       break;
> +               }
> +
> +               pos = cap.cap_next;
> +       }
> +       if (!oct_hw->common_cfg || !oct_hw->notify_base ||
> +           !oct_hw->dev_cfg    || !oct_hw->isr) {
> +               dev_err(dev, "Incomplete PCI capabilities");
> +               return -EIO;
> +       }
> +       oct_hw->config_size = octep_get_config_size(oct_hw);
> +
> +       mbox = octep_get_mbox(oct_hw);
> +       octep_mbox_init(mbox);
> +
> +       dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw->common_cfg);
> +       dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw->dev_cfg);
> +       dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
> +       dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
> +                (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
> +       dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
> +
> +       return 0;
> +}

PDS vDPA reuses vp_modern_probe(), can we reuse it as well here?

For example, vp_modern_probe() allows vendor specific bars and id
check which looks to be useful here.

> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> new file mode 100644
> index 000000000000..845fd35368ff
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> @@ -0,0 +1,903 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Marvell. */
> +
> +#include <linux/interrupt.h>
> +#include <linux/module.h>
> +#include <linux/iommu.h>
> +#include "octep_vdpa.h"
> +
> +#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
> +
> +struct octep_pf {
> +       u8 __iomem *base[PCI_STD_NUM_BARS];
> +       struct pci_dev *pdev;
> +       struct resource res;
> +       u64 vf_base;
> +       int enabled_vfs;
> +       u32 vf_stride;
> +       u16 vf_devid;
> +};
> +
> +struct octep_vdpa {
> +       struct vdpa_device vdpa;
> +       struct octep_hw *oct_hw;
> +       struct pci_dev *pdev;
> +};
> +
> +struct octep_vdpa_mgmt_dev {
> +       struct vdpa_mgmt_dev mdev;
> +       struct octep_hw oct_hw;
> +       struct pci_dev *pdev;
> +       /* Work entry to handle device setup */
> +       struct work_struct setup_task;
> +       /* Device status */
> +       atomic_t status;
> +};
> +
> +static int verify_features(u64 features)
> +{
> +       /* Minimum features to expect */
> +       if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
> +               return -EOPNOTSUPP;
> +
> +       if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
> +               return -EOPNOTSUPP;

Any reason we need to mandate the NOTIFICATION_DATA. It seems can work
without that becasue:

void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
{
        iowrite16(qid, oct_hw->vqs[qid].notify_addr);
}

> +
> +       if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
> +               return -EOPNOTSUPP;

Does this mean the packed virtqueue is mandatory?

> +
> +       /* Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
> +        * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
> +        */

Right, but we have a lot of other features that depend on the CVQ, why
is MQ special here?

> +       if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
> +           BIT_ULL(VIRTIO_NET_F_MQ))
> +               return -EINVAL;
> +
> +       return 0;
> +}
> +
> +static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_vdpa *oct_vdpa;
> +
> +       oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
> +
> +       return oct_vdpa->oct_hw;
> +}
> +
> +static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
> +{
> +       struct octep_hw *oct_hw = data;
> +       int i;
> +
> +       for (i = 0; i < oct_hw->nr_vring; i++) {
> +               if (oct_hw->vqs[i].cb.callback && *oct_hw->vqs[i].cb_notify_addr) {
> +                       *oct_hw->vqs[i].cb_notify_addr = 0;

I didn't get how cb_notify_addr is useful here.

> +                       oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
> +               }
> +       }
> +
> +       return IRQ_HANDLED;
> +}
> +
> +static void octep_free_irqs(struct octep_hw *oct_hw)
> +{
> +       struct pci_dev *pdev = oct_hw->pdev;
> +
> +       if (oct_hw->irq != -1) {
> +               devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
> +               oct_hw->irq = -1;
> +       }
> +       pci_free_irq_vectors(pdev);
> +}
> +
> +static int octep_request_irqs(struct octep_hw *oct_hw)
> +{
> +       struct pci_dev *pdev = oct_hw->pdev;
> +       int ret, irq;
> +
> +       /* Use one ring/interrupt per VF for virtio call interface. */

Is this a hardware limitation? If not, it would be slow.

> +       ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
> +       if (ret < 0) {
> +               dev_err(&pdev->dev, "Failed to alloc msix vector");
> +               return ret;
> +       }
> +
> +       snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
> +                OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
> +
> +       irq = pci_irq_vector(pdev, 0);
> +       ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
> +                              oct_hw->vqs->msix_name, oct_hw);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Failed to register interrupt handler\n");
> +               goto free_irq_vec;
> +       }
> +       oct_hw->irq = irq;
> +
> +       return 0;
> +
> +free_irq_vec:
> +       pci_free_irq_vectors(pdev);
> +       return ret;
> +}
> +
> +static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return oct_hw->features;
> +}
> +
> +static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +       int ret;
> +
> +       pr_debug("Driver Features: %llx\n", features);
> +       ret = verify_features(features);
> +       if (ret)
> +               return ret;
> +
> +       octep_hw_set_drv_features(oct_hw, features);
> +       oct_hw->drv_features = features;

It is not guaranteed that the device will accept all those features or
it would be painful to maintain per fw feature white/blacklist.

More below.

> +
> +       return 0;
> +}
> +
> +static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return oct_hw->features & oct_hw->drv_features;

So here, the drv_features needs to be read from the device then
everything is fine.

> +}
> +
> +static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return octep_hw_get_status(oct_hw);
> +}
> +
> +static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +       u8 status_old;
> +
> +       status_old = octep_hw_get_status(oct_hw);
> +
> +       if (status_old == status)
> +               return;
> +
> +       if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
> +           !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
> +               if (octep_request_irqs(oct_hw))
> +                       status = status_old | VIRTIO_CONFIG_S_FAILED;
> +       }
> +       octep_hw_set_status(oct_hw, status);
> +}
> +
> +static int octep_vdpa_reset(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +       u8 status = octep_hw_get_status(oct_hw);
> +       u16 qid;
> +
> +       if (status == 0)
> +               return 0;
> +
> +       for (qid = 0; qid < oct_hw->nr_vring; qid++) {
> +               oct_hw->vqs[qid].cb.callback = NULL;
> +               oct_hw->vqs[qid].cb.private = NULL;
> +               oct_hw->config_cb.callback = NULL;
> +               oct_hw->config_cb.private = NULL;
> +       }
> +       octep_hw_reset(oct_hw);
> +
> +       if (status & VIRTIO_CONFIG_S_DRIVER_OK)
> +               octep_free_irqs(oct_hw);
> +
> +       return 0;
> +}
> +
> +static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return octep_get_vq_size(oct_hw);
> +}
> +
> +static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> +                                  struct vdpa_vq_state *state)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return octep_get_vq_state(oct_hw, qid, state);
> +}
> +
> +static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> +                                  const struct vdpa_vq_state *state)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return octep_set_vq_state(oct_hw, qid, state);
> +}
> +
> +static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, struct vdpa_callback *cb)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       oct_hw->vqs[qid].cb = *cb;
> +}
> +
> +static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       octep_set_vq_ready(oct_hw, qid, ready);
> +}
> +
> +static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return octep_get_vq_ready(oct_hw, qid);
> +}
> +
> +static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       octep_set_vq_num(oct_hw, qid, num);
> +}
> +
> +static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, u64 desc_area,
> +                                    u64 driver_area, u64 device_area)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
> +       pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
> +       pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
> +
> +       return octep_set_vq_address(oct_hw, qid, desc_area, driver_area, device_area);
> +}
> +
> +static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       octep_notify_queue(oct_hw, qid);
> +}
> +
> +static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev, u32 data)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +       u16 idx = data & 0xFFFF;
> +
> +       vp_iowrite32(data, oct_hw->vqs[idx].notify_addr);
> +}
> +
> +static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return vp_ioread8(&oct_hw->common_cfg->config_generation);
> +}
> +
> +static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
> +{
> +       return VIRTIO_ID_NET;
> +}
> +
> +static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
> +{
> +       return PCI_VENDOR_ID_CAVIUM;
> +}
> +
> +static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
> +{
> +       return PAGE_SIZE;
> +}
> +
> +static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return oct_hw->config_size;
> +}
> +
> +static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf,
> +                                 unsigned int len)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       octep_read_dev_config(oct_hw, offset, buf, len);
> +}
> +
> +static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned int offset,
> +                                 const void *buf, unsigned int len)
> +{
> +       /* Not supported */

We probably need to filter out VIRTIO_NET_F_ANNOUNCE and other
features that depend on the config writing.

> +}
> +
> +static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct vdpa_callback *cb)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       oct_hw->config_cb.callback = cb->callback;
> +       oct_hw->config_cb.private = cb->private;
> +}
> +
> +static struct vdpa_notification_area octep_get_vq_notification(struct vdpa_device *vdpa_dev,
> +                                                              u16 idx)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +       struct vdpa_notification_area area;
> +
> +       area.addr = oct_hw->vqs[idx].notify_pa;
> +       area.size = PAGE_SIZE;
> +
> +       return area;
> +}
> +
> +static int octep_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
> +                             struct vhost_iotlb *iotlb)
> +{
> +       return 0;
> +}
> +
> +static struct vdpa_config_ops octep_vdpa_ops = {
> +       .get_device_features = octep_vdpa_get_device_features,
> +       .set_driver_features = octep_vdpa_set_driver_features,
> +       .get_driver_features = octep_vdpa_get_driver_features,
> +       .get_status     = octep_vdpa_get_status,
> +       .set_status     = octep_vdpa_set_status,
> +       .reset          = octep_vdpa_reset,
> +       .get_vq_num_max = octep_vdpa_get_vq_num_max,
> +       .get_vq_state   = octep_vdpa_get_vq_state,
> +       .set_vq_state   = octep_vdpa_set_vq_state,
> +       .set_vq_cb      = octep_vdpa_set_vq_cb,
> +       .set_vq_ready   = octep_vdpa_set_vq_ready,
> +       .get_vq_ready   = octep_vdpa_get_vq_ready,
> +       .set_vq_num     = octep_vdpa_set_vq_num,
> +       .set_vq_address = octep_vdpa_set_vq_address,
> +       .get_vq_irq     = NULL,
> +       .kick_vq        = octep_vdpa_kick_vq,
> +       .kick_vq_with_data      = octep_vdpa_kick_vq_with_data,
> +       .get_generation = octep_vdpa_get_generation,
> +       .get_device_id  = octep_vdpa_get_device_id,
> +       .get_vendor_id  = octep_vdpa_get_vendor_id,
> +       .get_vq_align   = octep_vdpa_get_vq_align,
> +       .get_config_size        = octep_vdpa_get_config_size,
> +       .get_config     = octep_vdpa_get_config,
> +       .set_config     = octep_vdpa_set_config,
> +       .set_config_cb  = octep_vdpa_set_config_cb,
> +       .get_vq_notification = octep_get_vq_notification,
> +};
> +
> +static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
> +{
> +       int ret;
> +
> +       ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar);
> +               return ret;
> +       }
> +
> +       tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
> +       if (!tbl[bar]) {
> +               dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
> +               pci_release_region(pdev, bar);
> +               ret = -ENOMEM;
> +       }
> +
> +       return ret;
> +}
> +
> +static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
> +{
> +       pci_iounmap(pdev, tbl[bar]);
> +       pci_release_region(pdev, bar);
> +}
> +
> +static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf)
> +{
> +       struct pci_dev *pf_dev = octpf->pdev;
> +       struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +       struct pci_bus_region bus_region;
> +
> +       octpf->res.start = res->start;
> +       octpf->res.end = res->end;
> +       octpf->vf_base = res->start;
> +
> +       bus_region.start = res->start;
> +       bus_region.end = res->start - 1;

Is this inteneded?

> +
> +       pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> +}
> +
> +static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf)
> +{
> +       struct pci_dev *pf_dev = octpf->pdev;
> +       struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +       struct pci_bus_region bus_region;
> +
> +       bus_region.start = octpf->res.start;
> +       bus_region.end = octpf->res.end;
> +
> +       pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> +}
> +
> +static void octep_vdpa_remove_pf(struct pci_dev *pdev)
> +{
> +       struct octep_pf *octpf = pci_get_drvdata(pdev);
> +
> +       pci_disable_sriov(pdev);
> +
> +       if (octpf->base[OCTEP_HW_CAPS_BAR])
> +               octep_iounmap_region(pdev, octpf->base, OCTEP_HW_CAPS_BAR);
> +
> +       if (octpf->base[OCTEP_HW_MBOX_BAR])
> +               octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +
> +       octep_vdpa_pf_bar_expand(octpf);
> +}
> +
> +static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev)
> +{
> +       struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
> +
> +       memset(vf_res, 0, sizeof(*vf_res));
> +}
> +
> +static void octep_vdpa_remove_vf(struct pci_dev *pdev)
> +{
> +       struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
> +       struct octep_hw *oct_hw;
> +       int status;
> +
> +       oct_hw = &mgmt_dev->oct_hw;
> +       status = atomic_read(&mgmt_dev->status);
> +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT);
> +
> +       if (status == OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT) {
> +               cancel_work_sync(&mgmt_dev->setup_task);

This seems to be racy, can we call cancel_work_sync() unconditionally?

> +       } else if (status == OCTEP_VDPA_DEV_STATUS_READY) {
> +               vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
> +               kfree(mgmt_dev->oct_hw.vqs);
> +       }
> +
> +       if (oct_hw->base[OCTEP_HW_CAPS_BAR])
> +               octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +
> +       if (oct_hw->base[OCTEP_HW_MBOX_BAR])
> +               octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_MBOX_BAR);
> +
> +       octep_vdpa_vf_bar_shrink(pdev);
> +}
> +
> +static void octep_vdpa_remove(struct pci_dev *pdev)
> +{
> +       if (pdev->is_virtfn)
> +               octep_vdpa_remove_vf(pdev);
> +       else
> +               octep_vdpa_remove_pf(pdev);
> +}
> +
> +static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
> +                             const struct vdpa_dev_set_config *config)
> +{
> +       struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct octep_vdpa_mgmt_dev, mdev);
> +       struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
> +       struct pci_dev *pdev = oct_hw->pdev;
> +       struct vdpa_device *vdpa_dev;
> +       struct octep_vdpa *oct_vdpa;
> +       u64 device_features;
> +       u16 notify_off;
> +       int i, ret;
> +
> +       oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, 1, 1,
> +                                    NULL, false);
> +       if (IS_ERR(oct_vdpa)) {
> +               dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device");
> +               return PTR_ERR(oct_vdpa);
> +       }
> +
> +       oct_vdpa->pdev = pdev;
> +       oct_vdpa->vdpa.dma_dev = &pdev->dev;
> +       oct_vdpa->vdpa.mdev = mdev;
> +       oct_vdpa->oct_hw = oct_hw;
> +       vdpa_dev = &oct_vdpa->vdpa;
> +
> +       device_features = oct_hw->features;
> +       if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> +               if (config->device_features & ~device_features) {
> +                       dev_err(&pdev->dev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
> +                               config->device_features, device_features);
> +                       return -EINVAL;
> +               }
> +               device_features &= config->device_features;
> +       }
> +
> +       oct_hw->features = device_features;
> +
> +       if (verify_features(device_features)) {
> +               dev_warn(mdev->device,
> +                        "Must provision minimum features 0x%llx for this device",
> +                        BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
> +                        BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) | BIT_ULL(VIRTIO_F_IN_ORDER));
> +               return -EOPNOTSUPP;
> +       }
> +
> +       oct_hw->vqs = kcalloc(oct_hw->nr_vring, sizeof(*oct_hw->vqs), GFP_KERNEL);
> +       if (!oct_hw->vqs)
> +               return -ENOMEM;
> +
> +       oct_hw->irq = -1;
> +
> +       dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
> +       dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
> +
> +       for (i = 0; i < oct_hw->nr_vring; i++) {
> +               octep_write_queue_select(i, oct_hw);
> +               notify_off = vp_ioread16(&oct_hw->common_cfg->queue_notify_off);
> +               oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
> +                       notify_off * oct_hw->notify_off_multiplier;
> +               oct_hw->vqs[i].cb_notify_addr = (u32 *)oct_hw->vqs[i].notify_addr + 1;
> +               oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
> +                       notify_off * oct_hw->notify_off_multiplier;
> +       }

I think the reason we can't do the above in the probe is that we need
to wait for the device ready then we can get those information?

> +
> +       if (name)
> +               ret = dev_set_name(&vdpa_dev->dev, "%s", name);
> +       else
> +               ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index);
> +
> +       ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Failed to register to vDPA bus");
> +               goto free_vqs;
> +       }
> +       return 0;
> +
> +free_vqs:
> +       put_device(&oct_vdpa->vdpa.dev);
> +       kfree(oct_hw->vqs);
> +       return ret;
> +}
> +
> +static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
> +{
> +       _vdpa_unregister_device(dev);
> +}
> +
> +static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
> +       .dev_add = octep_vdpa_dev_add,
> +       .dev_del = octep_vdpa_dev_del
> +};
> +
> +static bool get_device_ready_status(u8 __iomem *addr)
> +{
> +       u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
> +
> +       if (signature == OCTEP_DEV_READY_SIGNATURE) {
> +               writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
> +               return true;
> +       }
> +
> +       return false;
> +}
> +
> +static struct virtio_device_id id_table[] = {
> +       { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
> +       { 0 },
> +};
> +
> +static void octep_vdpa_setup_task(struct work_struct *work)
> +{
> +       struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct octep_vdpa_mgmt_dev,
> +                                                           setup_task);
> +       struct pci_dev *pdev = mgmt_dev->pdev;
> +       struct device *dev = &pdev->dev;
> +       struct octep_hw *oct_hw;
> +       unsigned long timeout;
> +       int ret;
> +
> +       oct_hw = &mgmt_dev->oct_hw;
> +
> +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
> +
> +       /* Wait for a maximum of 5 sec */
> +       timeout = jiffies + msecs_to_jiffies(5000);
> +       while (!time_after(jiffies, timeout)) {
> +               if (get_device_ready_status(oct_hw->base[OCTEP_HW_MBOX_BAR])) {
> +                       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_INIT);
> +                       break;
> +               }
> +
> +               if (atomic_read(&mgmt_dev->status) >= OCTEP_VDPA_DEV_STATUS_READY) {
> +                       dev_info(dev, "Stopping vDPA setup task.\n");
> +                       return;
> +               }
> +
> +               usleep_range(1000, 1500);
> +       }
> +
> +       if (atomic_read(&mgmt_dev->status) != OCTEP_VDPA_DEV_STATUS_INIT) {
> +               dev_err(dev, "BAR initialization is timed out\n");
> +               return;
> +       }
> +
> +       ret = octep_iomap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +       if (ret)
> +               return;
> +
> +       ret = octep_hw_caps_read(oct_hw, pdev);
> +       if (ret < 0)
> +               goto unmap_region;
> +
> +       oct_hw->features = octep_hw_get_dev_features(oct_hw);
> +       ret = verify_features(oct_hw->features);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
> +               goto unmap_region;
> +       }
> +       oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg->num_queues);
> +
> +       mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
> +       mgmt_dev->mdev.id_table = id_table;
> +       mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
> +       mgmt_dev->mdev.supported_features = oct_hw->features;
> +       mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES);
> +       mgmt_dev->mdev.device = dev;
> +
> +       ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
> +       if (ret) {
> +               dev_err(dev, "Failed to register vdpa management interface\n");
> +               goto unmap_region;
> +       }
> +
> +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY);
> +
> +       return;
> +
> +unmap_region:
> +       octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +       oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL;
> +}
> +
> +static int octep_vdpa_probe_vf(struct pci_dev *pdev)
> +{
> +       struct octep_vdpa_mgmt_dev *mgmt_dev;
> +       struct device *dev = &pdev->dev;
> +       struct iommu_domain *domain;
> +       int ret;
> +
> +       ret = pcim_enable_device(pdev);
> +       if (ret) {
> +               dev_err(dev, "Failed to enable device\n");
> +               return ret;
> +       }
> +
> +       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> +       if (ret) {
> +               dev_err(dev, "No usable DMA configuration\n");
> +               return ret;
> +       }
> +       pci_set_master(pdev);
> +
> +       domain = iommu_get_domain_for_dev(dev);
> +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> +               dev_info(dev, "NO-IOMMU\n");
> +               octep_vdpa_ops.set_map = octep_vdpa_set_map;

Is this a shortcut to have get better performance? DMA API should have
those greacefully I think.

> +       }
> +
> +       mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev), GFP_KERNEL);
> +       if (!mgmt_dev)
> +               return -ENOMEM;
> +
> +       ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base, OCTEP_HW_MBOX_BAR);
> +       if (ret)
> +               return ret;
> +
> +       mgmt_dev->pdev = pdev;
> +       pci_set_drvdata(pdev, mgmt_dev);
> +
> +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC);
> +       INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task);
> +       schedule_work(&mgmt_dev->setup_task);
> +       dev_info(&pdev->dev, "octep vdpa mgmt device setup task is queued\n");
> +
> +       return 0;
> +}
> +
> +static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct pci_dev *pf_dev, u8 idx)
> +{
> +       struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4;
> +       struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +       struct octep_pf *pf = pci_get_drvdata(pf_dev);
> +       struct pci_bus_region bus_region;
> +
> +       vf_res->name = pci_name(vf_dev);
> +       vf_res->flags = pf_res->flags;
> +       vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent;
> +
> +       bus_region.start = pf->vf_base + idx * pf->vf_stride;
> +       bus_region.end = bus_region.start + pf->vf_stride - 1;
> +       pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region);
> +}
> +
> +static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int num_vfs)
> +{
> +       struct octep_pf *pf = pci_get_drvdata(pdev);
> +       u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR];
> +       int ret, i;
> +
> +       if (num_vfs > 0) {
> +               struct pci_dev *vf_pdev = NULL;
> +               bool done = false;
> +               int index = 0;
> +
> +               ret = pci_enable_sriov(pdev, num_vfs);
> +               if (ret)
> +                       return ret;
> +
> +               pf->enabled_vfs = num_vfs;
> +
> +               while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, vf_pdev))) {

Do we have an exisiting helper to iterate all vf?

> +                       if (vf_pdev->device != pf->vf_devid)
> +                               continue;
> +
> +                       octep_vdpa_assign_barspace(vf_pdev, pdev, index);
> +                       if (++index == num_vfs) {
> +                               done = true;
> +                               break;
> +                       }
> +               }
> +
> +               if (done) {
> +                       for (i = 0; i < pf->enabled_vfs; i++)
> +                               writeq(OCTEP_DEV_READY_SIGNATURE, addr + OCTEP_PF_MBOX_DATA(i));
> +               }
> +       } else {
> +               if (!pci_num_vf(pdev))
> +                       return 0;
> +
> +               pci_disable_sriov(pdev);
> +               pf->enabled_vfs = 0;
> +       }
> +
> +       return num_vfs;
> +}
> +
> +static u16 octep_get_vf_devid(struct pci_dev *pdev)
> +{
> +       u16 did;
> +
> +       switch (pdev->device) {
> +       case OCTEP_VDPA_DEVID_CN106K_PF:
> +               did = OCTEP_VDPA_DEVID_CN106K_VF;
> +               break;
> +       case OCTEP_VDPA_DEVID_CN105K_PF:
> +               did = OCTEP_VDPA_DEVID_CN105K_VF;
> +               break;
> +       case OCTEP_VDPA_DEVID_CN103K_PF:
> +               did = OCTEP_VDPA_DEVID_CN103K_VF;
> +               break;
> +       default:
> +               did = 0xFFFF;
> +               break;
> +       }
> +
> +       return did;
> +}
> +
> +static int octep_vdpa_pf_setup(struct octep_pf *octpf)
> +{
> +       u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR];
> +       struct pci_dev *pdev = octpf->pdev;
> +       int totalvfs;
> +       u64 val, len;
> +
> +       totalvfs = pci_sriov_get_totalvfs(pdev);
> +       if (unlikely(!totalvfs)) {
> +               dev_info(&pdev->dev, "Total VFs are %d in PF sriov configuration\n", totalvfs);
> +               return 0;
> +       }
> +
> +       addr = octpf->base[OCTEP_HW_MBOX_BAR];
> +       val = readq(addr + OCTEP_EPF_RINFO(0));
> +       if (val == 0) {
> +               dev_err(&pdev->dev, "Invalid device configuration\n");
> +               return -EINVAL;
> +       }
> +
> +       if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) {
> +               val &= ~GENMASK_ULL(35, 32);
> +               val |= BIT_ULL(32);
> +               writeq(val, addr + OCTEP_EPF_RINFO(0));
> +       }
> +
> +       len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR);
> +
> +       octpf->vf_stride = len / totalvfs;
> +       octpf->vf_devid = octep_get_vf_devid(pdev);
> +
> +       octep_vdpa_pf_bar_shrink(octpf);
> +
> +       return 0;
> +}
> +
> +static int octep_vdpa_probe_pf(struct pci_dev *pdev)
> +{
> +       struct device *dev = &pdev->dev;
> +       struct octep_pf *octpf;
> +       int ret;
> +
> +       ret = pcim_enable_device(pdev);
> +       if (ret) {
> +               dev_err(dev, "Failed to enable device\n");
> +               return ret;
> +       }
> +
> +       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> +       if (ret) {
> +               dev_err(dev, "No usable DMA configuration\n");
> +               return ret;
> +       }
> +       octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL);
> +       if (!octpf)
> +               return -ENOMEM;
> +
> +       ret = octep_iomap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +       if (ret)
> +               return ret;
> +
> +       pci_set_master(pdev);
> +       pci_set_drvdata(pdev, octpf);
> +       octpf->pdev = pdev;
> +
> +       ret = octep_vdpa_pf_setup(octpf);
> +       if (ret)
> +               goto unmap_region;
> +
> +       return 0;
> +
> +unmap_region:
> +       octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +       return ret;
> +}
> +
> +static int octep_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> +{
> +       if (pdev->is_virtfn)
> +               return octep_vdpa_probe_vf(pdev);
> +       else
> +               return octep_vdpa_probe_pf(pdev);
> +}
> +
> +static struct pci_device_id octep_pci_vdpa_map[] = {
> +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_PF) },
> +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_VF) },
> +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_PF) },
> +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_VF) },
> +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_PF) },
> +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_VF) },
> +       { 0 },
> +};
> +
> +static struct pci_driver octep_pci_vdpa = {
> +       .name     = OCTEP_VDPA_DRIVER_NAME,
> +       .id_table = octep_pci_vdpa_map,
> +       .probe    = octep_vdpa_probe,
> +       .remove   = octep_vdpa_remove,
> +       .sriov_configure = octep_vdpa_sriov_configure
> +};
> +
> +module_pci_driver(octep_pci_vdpa);
> +
> +MODULE_AUTHOR("Marvell");
> +MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver");
> +MODULE_LICENSE("GPL");
> --
> 2.25.1
>

Thanks


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-27 11:21 [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices Srujana Challa
  2024-03-29  4:28 ` Jason Wang
@ 2024-03-29 11:37 ` Stefano Garzarella
  2024-03-29 13:02   ` [EXTERNAL] " Srujana Challa
  2024-04-22 20:44 ` Michael S. Tsirkin
  2024-04-24 19:08 ` [PATCH v2] " Srujana Challa
  3 siblings, 1 reply; 28+ messages in thread
From: Stefano Garzarella @ 2024-03-29 11:37 UTC (permalink / raw)
  To: Srujana Challa
  Cc: virtualization, mst, jasowang, xuanzhuo, vattunuru, sthotton,
	ndabilpuram, jerinj

On Wed, Mar 27, 2024 at 04:51:36PM +0530, Srujana Challa wrote:
>This commit introduces a new vDPA driver specifically designed for
>managing the virtio control plane over the vDPA bus for OCTEON DPU
>devices. The driver consists of two layers:
>
>1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
>operations and configurations related to the DPU device.
>
>2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
>implements device operations for the vDPA bus. It handles device
>probing, bus attachment, vring operations, and other relevant tasks.
>
>Signed-off-by: Srujana Challa <schalla@marvell.com>
>Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>Signed-off-by: Shijith Thotton <sthotton@marvell.com>
>Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>---
> MAINTAINERS                              |   5 +
> drivers/vdpa/Kconfig                     |   9 +
> drivers/vdpa/Makefile                    |   1 +
> drivers/vdpa/octeon_ep/Makefile          |   4 +
> drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
> drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
> drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903 +++++++++++++++++++++++
> 7 files changed, 1472 insertions(+)
> create mode 100644 drivers/vdpa/octeon_ep/Makefile
> create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
> create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c

Building with C=2 I see several warnings, I suggest to take a look:

$ make O=build M=drivers/vdpa -j6 W=1 C=2 modules
   ...
   CHECK   ../drivers/vdpa/octeon_ep/octep_vdpa_main.c
   CHECK   ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c
../drivers/vdpa/octeon_ep/octep_vdpa_main.c:548:50: warning: cast removes address space '__iomem' of expression
../drivers/vdpa/octeon_ep/octep_vdpa_main.c:548:47: warning: incorrect type in assignment (different address spaces)
../drivers/vdpa/octeon_ep/octep_vdpa_main.c:548:47:    expected unsigned int [noderef] [usertype] __iomem *cb_notify_addr
../drivers/vdpa/octeon_ep/octep_vdpa_main.c:548:47:    got unsigned int [usertype] *
../drivers/vdpa/octeon_ep/octep_vdpa_main.c:74:66: warning: dereference of noderef expression
../drivers/vdpa/octeon_ep/octep_vdpa_main.c:75:40: warning: dereference of noderef expression
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:339:25: warning: incorrect type in initializer (different base types)
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:339:25:    expected unsigned int [usertype] length
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:339:25:    got restricted __le32 [usertype] length
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:340:25: warning: incorrect type in initializer (different base types)
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:340:25:    expected unsigned int [usertype] offset
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:340:25:    got restricted __le32 [usertype] offset
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:427:89: warning: restricted __le32 degrades to integer
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:449:9: warning: cast removes address space '__iomem' of expression
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:450:9: warning: cast removes address space '__iomem' of expression
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:451:9: warning: cast removes address space '__iomem' of expression
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:452:9: warning: cast removes address space '__iomem' of expression
../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:454:9: warning: cast removes 
address space '__iomem' of expression

>
>diff --git a/MAINTAINERS b/MAINTAINERS
>index cfe44a876d8a..539ce209a960 100644
>--- a/MAINTAINERS
>+++ b/MAINTAINERS
>@@ -13127,6 +13127,11 @@ S:	Supported
> F:	Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
> F:	drivers/mmc/host/sdhci-xenon*
>
>+MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
>+R:	schalla@marvell.com
>+R:	vattunuru@marvell.com
>+F:	drivers/vdpa/octeon_ep/
>+
> MATROX FRAMEBUFFER DRIVER
> L:	linux-fbdev@vger.kernel.org
> S:	Orphan
>diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
>index 656c1cb541de..775149fad476 100644
>--- a/drivers/vdpa/Kconfig
>+++ b/drivers/vdpa/Kconfig
>@@ -126,4 +126,13 @@ config PDS_VDPA
> 	  With this driver, the VirtIO dataplane can be
> 	  offloaded to an AMD/Pensando DSC device.
>
>+config OCTEONEP_VDPA
>+	tristate "vDPA driver for Octeon DPU devices"
>+	depends on m

Just for curiosity, why restrict it to module-only?

>+	depends on PCI_MSI
>+	help
>+	  vDPA driver for Marvell's Octeon DPU devices.
>+	  With this driver, the VirtIO dataplane can be
>+	  offloaded to a Octeon DPU device.
>+
> endif # VDPA
>diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
>index 8f53c6f3cca7..5654d36707af 100644
>--- a/drivers/vdpa/Makefile
>+++ b/drivers/vdpa/Makefile
>@@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
> obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
> obj-$(CONFIG_SNET_VDPA) += solidrun/
> obj-$(CONFIG_PDS_VDPA) += pds/
>+obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
>diff --git a/drivers/vdpa/octeon_ep/Makefile b/drivers/vdpa/octeon_ep/Makefile
>new file mode 100644
>index 000000000000..e23e2ff14f33
>--- /dev/null
>+++ b/drivers/vdpa/octeon_ep/Makefile
>@@ -0,0 +1,4 @@
>+# SPDX-License-Identifier: GPL-2.0
>+obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
>+octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
>+octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
>diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h
>new file mode 100644
>index 000000000000..60d2efc8f8b8
>--- /dev/null
>+++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
>@@ -0,0 +1,93 @@
>+/* SPDX-License-Identifier: GPL-2.0-only
>+ * Copyright (C) 2024 Marvell.
>+ */
>+#ifndef __OCTEP_VDPA_H__
>+#define __OCTEP_VDPA_H__
>+
>+#include <linux/pci.h>
>+#include <linux/pci_regs.h>
>+#include <linux/vdpa.h>
>+#include <linux/virtio_pci_modern.h>
>+#include <uapi/linux/virtio_net.h>
>+#include <uapi/linux/virtio_blk.h>

It looks like virtio_blk.h is not needed.
I suggest to check also other includes.

>+#include <uapi/linux/virtio_config.h>
>+#include <uapi/linux/virtio_pci.h>
>+#include <uapi/linux/vdpa.h>
>+
>+#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
>+#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
>+#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
>+#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
>+#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
>+#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
>+
>+#define OCTEP_HW_MBOX_BAR 0
>+#define OCTEP_HW_CAPS_BAR 4
>+
>+#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
>+
>+#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
>+#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
>+#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
>+
>+#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
>+#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
>+
>+#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
>+#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
>+
>+enum octep_vdpa_dev_status {
>+	OCTEP_VDPA_DEV_STATUS_INVALID,
>+	OCTEP_VDPA_DEV_STATUS_ALLOC,
>+	OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
>+	OCTEP_VDPA_DEV_STATUS_INIT,
>+	OCTEP_VDPA_DEV_STATUS_READY,
>+	OCTEP_VDPA_DEV_STATUS_UNINIT
>+};
>+
>+struct octep_vring_info {
>+	struct vdpa_callback cb;
>+	void __iomem *notify_addr;
>+	u32 __iomem *cb_notify_addr;
>+	phys_addr_t notify_pa;
>+	char msix_name[256];
>+};
>+
>+struct octep_hw {
>+	struct pci_dev *pdev;
>+	u8 __iomem *base[PCI_STD_NUM_BARS];
>+	struct virtio_pci_common_cfg __iomem *common_cfg;
>+	u8 __iomem *dev_cfg;
>+	u8 __iomem *isr;
>+	void __iomem *notify_base;
>+	phys_addr_t notify_base_pa;
>+	u32 notify_off_multiplier;
>+	u8 notify_bar;
>+	struct octep_vring_info *vqs;
>+	struct vdpa_callback config_cb;
>+	u64 features;
>+	u64 drv_features;
>+	u16 nr_vring;
>+	u32 config_size;
>+	int irq;
>+};
>+
>+u8 octep_hw_get_status(struct octep_hw *oct_hw);
>+void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
>+void octep_hw_reset(struct octep_hw *oct_hw);
>+void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw);
>+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
>+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length);
>+int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
>+			 u64 device_area);
>+void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
>+void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
>+bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
>+int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state);
>+int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state);
>+u16 octep_get_vq_size(struct octep_hw *oct_hw);
>+int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
>+u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
>+void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
>+
>+#endif /* __OCTEP_VDPA_H__ */
>diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
>new file mode 100644
>index 000000000000..0a2f1d09c4ab
>--- /dev/null
>+++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
>@@ -0,0 +1,457 @@
>+// SPDX-License-Identifier: GPL-2.0-only
>+/* Copyright (C) 2024 Marvell. */
>+
>+#include <linux/iopoll.h>
>+
>+#include "octep_vdpa.h"
>+
>+enum octep_mbox_ids {
>+	OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
>+	OCTEP_MBOX_MSG_GET_VQ_STATE,
>+};
>+
>+#define OCTEP_HW_TIMEOUT       10000000
>+
>+#define MBOX_OFFSET            64
>+#define MBOX_RSP_MASK          0x00000001
>+#define MBOX_RC_MASK           0x0000FFFE
>+
>+#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
>+#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
>+#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK | MBOX_RSP_MASK))
>+
>+struct octep_mbox_hdr {
>+	u8 ver;
>+	u8 rsvd1;
>+	u16 id;
>+	u16 rsvd2;
>+#define MBOX_REQ_SIG (0xdead)
>+#define MBOX_RSP_SIG (0xbeef)
>+	u16 sig;
>+};
>+
>+struct octep_mbox_sts {
>+	u16 rsp:1;
>+	u16 rc:15;
>+	u16 rsvd;
>+};
>+
>+struct octep_mbox {
>+	struct octep_mbox_hdr hdr;
>+	struct octep_mbox_sts sts;
>+	u64 rsvd;
>+	u32 data[];
>+};
>+
>+static inline struct octep_mbox __iomem *octep_get_mbox(struct octep_hw *oct_hw)
>+{
>+	return (struct octep_mbox __iomem *)(oct_hw->dev_cfg + MBOX_OFFSET);
>+}
>+
>+static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem *mbox)
>+{
>+	u32 val;
>+
>+	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_AVAIL(val), 10,
>+				  OCTEP_HW_TIMEOUT);
>+}
>+
>+static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem *mbox)
>+{
>+	u32 val;
>+
>+	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val), 10,
>+				  OCTEP_HW_TIMEOUT);
>+}
>+
>+static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16 id, u16 sig)
>+{
>+	iowrite16(id, &mbox->hdr.id);
>+	iowrite16(sig, &mbox->hdr.sig);
>+}
>+
>+static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
>+{
>+	return ioread16(&mbox->hdr.sig);
>+}
>+
>+static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32 sts)
>+{
>+	iowrite32(sts, &mbox->sts);
>+}
>+
>+static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
>+{
>+	return ioread32(&mbox->sts);
>+}
>+
>+static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox, u16 word_idx)
>+{
>+	return ioread32(&mbox->data[word_idx]);
>+}
>+
>+static inline void octep_write32_word(struct octep_mbox __iomem *mbox, u16 word_idx, u32 word)
>+{
>+	return iowrite32(word, &mbox->data[word_idx]);
>+}
>+
>+static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *buffer,
>+			      u32 buf_size, bool write)
>+{
>+	struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
>+	struct pci_dev *pdev = oct_hw->pdev;
>+	u32 *p = (u32 *)buffer;
>+	u16 data_wds;
>+	int ret, i;
>+	u32 val;
>+
>+	if (!IS_ALIGNED(buf_size, 4))
>+		return -EINVAL;
>+
>+	/* Make sure mbox space is available */
>+	ret = octep_wait_for_mbox_avail(mbox);
>+	if (ret) {
>+		dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to be consumed\n");
>+		return ret;
>+	}
>+	data_wds = buf_size / 4;
>+
>+	if (write) {
>+		for (i = 1; i <= data_wds; i++) {
>+			octep_write32_word(mbox, i, *p);
>+			p++;
>+		}
>+	}
>+	octep_write32_word(mbox, 0, (u32)qid);
>+	octep_write_sts(mbox, 0);
>+
>+	octep_write_hdr(mbox, id, MBOX_REQ_SIG);
>+
>+	ret = octep_wait_for_mbox_rsp(mbox);
>+	if (ret) {
>+		dev_warn(&pdev->dev, "Timeout waiting for mbox : %d response\n", id);
>+		return ret;
>+	}
>+
>+	val = octep_read_sig(mbox);
>+	if ((val & 0xFFFF) != MBOX_RSP_SIG) {
>+		dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id);
>+		return ret;

`ret` here is 0, is it right to return 0 in this case?
If yes, maybe better to return 0 directly.

>+	}
>+
>+	val = octep_read_sts(mbox);
>+	if (val & MBOX_RC_MASK) {
>+		ret = MBOX_RSP_TO_ERR(val);
>+		dev_warn(&pdev->dev, "Error while processing mbox : %d, err %d\n", id, ret);
>+		return ret;
>+	}
>+
>+	if (!write)
>+		for (i = 1; i <= data_wds; i++)
>+			*p++ = octep_read32_word(mbox, i);
>+
>+	return 0;
>+}
>+
>+static void octep_mbox_init(struct octep_mbox __iomem *mbox)
>+{
>+	iowrite32(1, &mbox->sts);
>+}
>+
>+u8 octep_hw_get_status(struct octep_hw *oct_hw)
>+{
>+	return ioread8(&oct_hw->common_cfg->device_status);
>+}
>+
>+void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
>+{
>+	iowrite8(status, &oct_hw->common_cfg->device_status);
>+}
>+
>+void octep_hw_reset(struct octep_hw *oct_hw)
>+{
>+	u8 val;
>+
>+	octep_hw_set_status(oct_hw, 0 | BIT(7));
>+	if (readx_poll_timeout(ioread8, &oct_hw->common_cfg->device_status, val, !val, 10,
>+			       OCTEP_HW_TIMEOUT)) {
>+		dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
>+		return;
>+	}
>+}
>+
>+u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
>+{
>+	u32 features_lo, features_hi;
>+	u32 val, select;
>+
>+	select = 0;
>+	iowrite32(select | BIT(15), &oct_hw->common_cfg->device_feature_select);
>+
>+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->device_feature_select, val,
>+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
>+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
>+		return 0ULL;

nit: `return 0;` should be fine too.

>+	}
>+	features_lo = ioread32(&oct_hw->common_cfg->device_feature);
>+
>+	select = 1;
>+	iowrite32(select | BIT(15), &oct_hw->common_cfg->device_feature_select);
>+
>+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->device_feature_select, val,
>+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
>+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
>+		return 0ULL;
>+	}
>+	features_hi = ioread32(&oct_hw->common_cfg->device_feature);
>+
>+	return ((u64)features_hi << 32) | features_lo;
>+}
>+
>+void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
>+{
>+	u32 val, select;
>+
>+	select = 0;
>+	iowrite32(select | BIT(15), &oct_hw->common_cfg->guest_feature_select);
>+
>+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->guest_feature_select, val,
>+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
>+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
>+		return;
>+	}
>+	iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg->guest_feature);
>+
>+	select = 1;
>+	iowrite32(select | BIT(15), &oct_hw->common_cfg->guest_feature_select);
>+
>+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->guest_feature_select, val,
>+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
>+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
>+		return;
>+	}
>+	iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
>+}
>+
>+void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw)

In almost all other fuctions, `oct_hw` is the first parameter, I would
try to unify that here as well.

>+{
>+	u16 val;
>+
>+	iowrite16(queue_id | BIT(15), &oct_hw->common_cfg->queue_select);
>+
>+	if (readx_poll_timeout(ioread16, &oct_hw->common_cfg->queue_select, val, val == queue_id,
>+			       10, OCTEP_HW_TIMEOUT)) {
>+		dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
>+		return;
>+	}
>+}
>+
>+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
>+{
>+	iowrite16(qid, oct_hw->vqs[qid].notify_addr);
>+}
>+
>+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length)
>+{
>+	u8 old_gen, new_gen, *p;
>+	int i;
>+
>+	WARN_ON(offset + length > oct_hw->config_size);
                ^
Should we return early in this case?

>+	do {
>+		old_gen = ioread8(&oct_hw->common_cfg->config_generation);
>+		p = dst;
>+		for (i = 0; i < length; i++)
>+			*p++ = ioread8(oct_hw->dev_cfg + offset + i);
>+
>+		new_gen = ioread8(&oct_hw->common_cfg->config_generation);
>+	} while (old_gen != new_gen);
>+}
>+
>+int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
>+			 u64 device_area)
>+{
>+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
>+
>+	octep_write_queue_select(qid, oct_hw);
>+	vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
>+			     &cfg->queue_desc_hi);
>+	vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
>+			     &cfg->queue_avail_hi);
>+	vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
>+			     &cfg->queue_used_hi);
>+
>+	return 0;
>+}
>+
>+int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state)
>+{
>+	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
>+				  sizeof(*state), 0);
>+}
>+
>+int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state)
>+{
>+	struct vdpa_vq_state q_state;
>+
>+	memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
>+	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
>+				  sizeof(*state), 1);
>+}
>+
>+void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
>+{
>+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
>+
>+	octep_write_queue_select(qid, oct_hw);
>+	iowrite16(num, &cfg->queue_size);
>+}
>+
>+void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
>+{
>+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
>+
>+	octep_write_queue_select(qid, oct_hw);
>+	iowrite16(ready, &cfg->queue_enable);
>+}
>+
>+bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
>+{
>+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
>+
>+	octep_write_queue_select(qid, oct_hw);
>+	return ioread16(&cfg->queue_enable);
>+}
>+
>+u16 octep_get_vq_size(struct octep_hw *oct_hw)
>+{
>+	octep_write_queue_select(0, oct_hw);
>+	return ioread16(&oct_hw->common_cfg->queue_size);
>+}
>+
>+static u32 octep_get_config_size(struct octep_hw *oct_hw)
>+{
>+	return sizeof(struct virtio_net_config);
>+}
>+
>+static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap)
>+{
>+	struct device *dev = &oct_hw->pdev->dev;
>+	u32 length = cap->length;
>+	u32 offset = cap->offset;

Should we use le32_to_cpu() to access that fields (lenght, offset)?

>+	u8  bar    = cap->bar;
>+	u32 len;
>+
>+	if (bar != OCTEP_HW_CAPS_BAR) {
>+		dev_err(dev, "Invalid bar: %u\n", bar);
>+		return NULL;
>+	}
>+	if (offset + length < offset) {
>+		dev_err(dev, "offset(%u) + length(%u) overflows\n",
>+			offset, length);
>+		return NULL;
>+	}
>+	len = pci_resource_len(oct_hw->pdev, bar);
>+	if (offset + length > len) {
>+		dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
>+			offset + length, len);
>+		return NULL;
>+	}
>+	return oct_hw->base[bar] + offset;
>+}
>+
>+static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len, off_t offset)
>+{
>+	u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
>+	u8 *p = buf;
>+	size_t i;
>+
>+	for (i = 0; i < len; i++)
>+		*p++ = ioread8(bar + offset + i);
>+}
>+
>+static int pci_signature_verify(struct octep_hw *oct_hw)
>+{
>+	u32 signature[2];
>+
>+	pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
>+
>+	if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
>+		return -1;
>+
>+	if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
>+		return -1;
>+
>+	return 0;
>+}
>+
>+int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
>+{
>+	struct octep_mbox __iomem *mbox;
>+	struct device *dev = &pdev->dev;
>+	struct virtio_pci_cap cap;
>+	int ret;
>+	u8 pos;
>+
>+	oct_hw->pdev = pdev;
>+	ret = pci_signature_verify(oct_hw);
>+	if (ret) {
>+		dev_err(dev, "Octeon Virtio FW is not initialized\n");
>+		return -EIO;
>+	}
>+
>+	pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
>+
>+	while (pos) {
>+		pci_caps_read(oct_hw, &cap, 2, pos);
>+
>+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
>+			dev_err(dev, "Found invalid capability vndr id: %d\n", cap.cap_vndr);
>+			break;
>+		}
>+
>+		pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
>+
>+		dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n",
>+			 pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
>+
>+		switch (cap.cfg_type) {
>+		case VIRTIO_PCI_CAP_COMMON_CFG:
>+			oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
>+			break;
>+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
>+			pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
>+				      4, pos + sizeof(cap));
>+
>+			oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
>+			oct_hw->notify_bar = cap.bar;
>+			oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) + cap.offset;

Should we use le32_to_cpu() to access cap.offset?

>+			break;
>+		case VIRTIO_PCI_CAP_DEVICE_CFG:
>+			oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
>+			break;
>+		case VIRTIO_PCI_CAP_ISR_CFG:
>+			oct_hw->isr = get_cap_addr(oct_hw, &cap);
>+			break;
>+		}
>+
>+		pos = cap.cap_next;
>+	}
>+	if (!oct_hw->common_cfg || !oct_hw->notify_base ||
>+	    !oct_hw->dev_cfg    || !oct_hw->isr) {
>+		dev_err(dev, "Incomplete PCI capabilities");
>+		return -EIO;
>+	}
>+	oct_hw->config_size = octep_get_config_size(oct_hw);
>+
>+	mbox = octep_get_mbox(oct_hw);
>+	octep_mbox_init(mbox);
>+
>+	dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw->common_cfg);
>+	dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw->dev_cfg);
>+	dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
>+	dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
>+		 (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
>+	dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
>+
>+	return 0;
>+}
>diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
>new file mode 100644
>index 000000000000..845fd35368ff
>--- /dev/null
>+++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
>@@ -0,0 +1,903 @@
>+// SPDX-License-Identifier: GPL-2.0-only
>+/* Copyright (C) 2024 Marvell. */
>+
>+#include <linux/interrupt.h>
>+#include <linux/module.h>
>+#include <linux/iommu.h>
>+#include "octep_vdpa.h"
>+
>+#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
>+
>+struct octep_pf {
>+	u8 __iomem *base[PCI_STD_NUM_BARS];
>+	struct pci_dev *pdev;
>+	struct resource res;
>+	u64 vf_base;
>+	int enabled_vfs;
>+	u32 vf_stride;
>+	u16 vf_devid;
>+};
>+
>+struct octep_vdpa {
>+	struct vdpa_device vdpa;
>+	struct octep_hw *oct_hw;
>+	struct pci_dev *pdev;
>+};
>+
>+struct octep_vdpa_mgmt_dev {
>+	struct vdpa_mgmt_dev mdev;
>+	struct octep_hw oct_hw;
>+	struct pci_dev *pdev;
>+	/* Work entry to handle device setup */
>+	struct work_struct setup_task;
>+	/* Device status */
>+	atomic_t status;
>+};
>+
>+static int verify_features(u64 features)
>+{
>+	/* Minimum features to expect */
>+	if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
>+		return -EOPNOTSUPP;
>+
>+	if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
>+		return -EOPNOTSUPP;
>+
>+	if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
>+		return -EOPNOTSUPP;
>+
>+	/* Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
>+	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
>+	 */
>+	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
>+	    BIT_ULL(VIRTIO_NET_F_MQ))
>+		return -EINVAL;
>+
>+	return 0;
>+}
>+
>+static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
>+{
>+	struct octep_vdpa *oct_vdpa;
>+
>+	oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
>+
>+	return oct_vdpa->oct_hw;
>+}
>+
>+static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
>+{
>+	struct octep_hw *oct_hw = data;
>+	int i;
>+
>+	for (i = 0; i < oct_hw->nr_vring; i++) {
>+		if (oct_hw->vqs[i].cb.callback && *oct_hw->vqs[i].cb_notify_addr) {
>+			*oct_hw->vqs[i].cb_notify_addr = 0;
>+			oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
>+		}
>+	}
>+
>+	return IRQ_HANDLED;
>+}
>+
>+static void octep_free_irqs(struct octep_hw *oct_hw)
>+{
>+	struct pci_dev *pdev = oct_hw->pdev;
>+
>+	if (oct_hw->irq != -1) {
>+		devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
>+		oct_hw->irq = -1;
>+	}
>+	pci_free_irq_vectors(pdev);
>+}
>+
>+static int octep_request_irqs(struct octep_hw *oct_hw)
>+{
>+	struct pci_dev *pdev = oct_hw->pdev;
>+	int ret, irq;
>+
>+	/* Use one ring/interrupt per VF for virtio call interface. */
>+	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
>+	if (ret < 0) {
>+		dev_err(&pdev->dev, "Failed to alloc msix vector");
>+		return ret;
>+	}
>+
>+	snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
>+		 OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
>+
>+	irq = pci_irq_vector(pdev, 0);
>+	ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
>+			       oct_hw->vqs->msix_name, oct_hw);
>+	if (ret) {
>+		dev_err(&pdev->dev, "Failed to register interrupt handler\n");
>+		goto free_irq_vec;
>+	}
>+	oct_hw->irq = irq;
>+
>+	return 0;
>+
>+free_irq_vec:
>+	pci_free_irq_vectors(pdev);
>+	return ret;
>+}
>+
>+static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return oct_hw->features;
>+}
>+
>+static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+	int ret;
>+
>+	pr_debug("Driver Features: %llx\n", features);
>+	ret = verify_features(features);
>+	if (ret)
>+		return ret;
>+
>+	octep_hw_set_drv_features(oct_hw, features);
>+	oct_hw->drv_features = features;
>+
>+	return 0;
>+}
>+
>+static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return oct_hw->features & oct_hw->drv_features;
>+}
>+
>+static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return octep_hw_get_status(oct_hw);
>+}
>+
>+static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+	u8 status_old;
>+
>+	status_old = octep_hw_get_status(oct_hw);
>+
>+	if (status_old == status)
>+		return;
>+
>+	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
>+	    !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
>+		if (octep_request_irqs(oct_hw))
>+			status = status_old | VIRTIO_CONFIG_S_FAILED;
>+	}
>+	octep_hw_set_status(oct_hw, status);
>+}
>+
>+static int octep_vdpa_reset(struct vdpa_device *vdpa_dev)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+	u8 status = octep_hw_get_status(oct_hw);
>+	u16 qid;
>+
>+	if (status == 0)
>+		return 0;
>+
>+	for (qid = 0; qid < oct_hw->nr_vring; qid++) {
>+		oct_hw->vqs[qid].cb.callback = NULL;
>+		oct_hw->vqs[qid].cb.private = NULL;
>+		oct_hw->config_cb.callback = NULL;
>+		oct_hw->config_cb.private = NULL;
>+	}
>+	octep_hw_reset(oct_hw);
>+
>+	if (status & VIRTIO_CONFIG_S_DRIVER_OK)
>+		octep_free_irqs(oct_hw);
>+
>+	return 0;
>+}
>+
>+static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return octep_get_vq_size(oct_hw);
>+}
>+
>+static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
>+				   struct vdpa_vq_state *state)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return octep_get_vq_state(oct_hw, qid, state);
>+}
>+
>+static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
>+				   const struct vdpa_vq_state *state)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return octep_set_vq_state(oct_hw, qid, state);
>+}
>+
>+static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, struct vdpa_callback *cb)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	oct_hw->vqs[qid].cb = *cb;
>+}
>+
>+static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	octep_set_vq_ready(oct_hw, qid, ready);
>+}
>+
>+static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return octep_get_vq_ready(oct_hw, qid);
>+}
>+
>+static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	octep_set_vq_num(oct_hw, qid, num);
>+}
>+
>+static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, u64 desc_area,
>+				     u64 driver_area, u64 device_area)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
>+	pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
>+	pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
>+
>+	return octep_set_vq_address(oct_hw, qid, desc_area, driver_area, device_area);
>+}
>+
>+static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	octep_notify_queue(oct_hw, qid);
>+}
>+
>+static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev, u32 data)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+	u16 idx = data & 0xFFFF;
>+
>+	vp_iowrite32(data, oct_hw->vqs[idx].notify_addr);
>+}
>+
>+static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return vp_ioread8(&oct_hw->common_cfg->config_generation);
>+}
>+
>+static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
>+{
>+	return VIRTIO_ID_NET;
>+}
>+
>+static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
>+{
>+	return PCI_VENDOR_ID_CAVIUM;
>+}
>+
>+static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
>+{
>+	return PAGE_SIZE;
>+}
>+
>+static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	return oct_hw->config_size;
>+}
>+
>+static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf,
>+				  unsigned int len)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	octep_read_dev_config(oct_hw, offset, buf, len);
>+}
>+
>+static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned int offset,
>+				  const void *buf, unsigned int len)
>+{
>+	/* Not supported */
>+}
>+
>+static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct vdpa_callback *cb)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+
>+	oct_hw->config_cb.callback = cb->callback;
>+	oct_hw->config_cb.private = cb->private;
>+}
>+
>+static struct vdpa_notification_area octep_get_vq_notification(struct vdpa_device *vdpa_dev,
>+							       u16 idx)
>+{
>+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
>+	struct vdpa_notification_area area;
>+
>+	area.addr = oct_hw->vqs[idx].notify_pa;
>+	area.size = PAGE_SIZE;
>+
>+	return area;
>+}
>+
>+static int octep_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
>+			      struct vhost_iotlb *iotlb)
>+{
>+	return 0;
>+}
>+
>+static struct vdpa_config_ops octep_vdpa_ops = {
>+	.get_device_features = octep_vdpa_get_device_features,
>+	.set_driver_features = octep_vdpa_set_driver_features,
>+	.get_driver_features = octep_vdpa_get_driver_features,
>+	.get_status	= octep_vdpa_get_status,
>+	.set_status	= octep_vdpa_set_status,
>+	.reset		= octep_vdpa_reset,
>+	.get_vq_num_max	= octep_vdpa_get_vq_num_max,
>+	.get_vq_state	= octep_vdpa_get_vq_state,
>+	.set_vq_state	= octep_vdpa_set_vq_state,
>+	.set_vq_cb	= octep_vdpa_set_vq_cb,
>+	.set_vq_ready	= octep_vdpa_set_vq_ready,
>+	.get_vq_ready	= octep_vdpa_get_vq_ready,
>+	.set_vq_num	= octep_vdpa_set_vq_num,
>+	.set_vq_address	= octep_vdpa_set_vq_address,
>+	.get_vq_irq	= NULL,
>+	.kick_vq	= octep_vdpa_kick_vq,
>+	.kick_vq_with_data	= octep_vdpa_kick_vq_with_data,
>+	.get_generation	= octep_vdpa_get_generation,
>+	.get_device_id	= octep_vdpa_get_device_id,
>+	.get_vendor_id	= octep_vdpa_get_vendor_id,
>+	.get_vq_align	= octep_vdpa_get_vq_align,
>+	.get_config_size	= octep_vdpa_get_config_size,
>+	.get_config	= octep_vdpa_get_config,
>+	.set_config	= octep_vdpa_set_config,
>+	.set_config_cb  = octep_vdpa_set_config_cb,
>+	.get_vq_notification = octep_get_vq_notification,
>+};
>+
>+static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
>+{
>+	int ret;
>+
>+	ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
>+	if (ret) {
>+		dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar);
>+		return ret;
>+	}
>+
>+	tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
>+	if (!tbl[bar]) {
>+		dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
>+		pci_release_region(pdev, bar);
>+		ret = -ENOMEM;
>+	}
>+
>+	return ret;
>+}
>+
>+static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
>+{
>+	pci_iounmap(pdev, tbl[bar]);
>+	pci_release_region(pdev, bar);
>+}
>+
>+static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf)
>+{
>+	struct pci_dev *pf_dev = octpf->pdev;
>+	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
>+	struct pci_bus_region bus_region;
>+
>+	octpf->res.start = res->start;
>+	octpf->res.end = res->end;
>+	octpf->vf_base = res->start;
>+
>+	bus_region.start = res->start;
>+	bus_region.end = res->start - 1;
>+
>+	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
>+}
>+
>+static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf)
>+{
>+	struct pci_dev *pf_dev = octpf->pdev;
>+	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
>+	struct pci_bus_region bus_region;
>+
>+	bus_region.start = octpf->res.start;
>+	bus_region.end = octpf->res.end;
>+
>+	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
>+}
>+
>+static void octep_vdpa_remove_pf(struct pci_dev *pdev)
>+{
>+	struct octep_pf *octpf = pci_get_drvdata(pdev);
>+
>+	pci_disable_sriov(pdev);
>+
>+	if (octpf->base[OCTEP_HW_CAPS_BAR])
>+		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_CAPS_BAR);
>+
>+	if (octpf->base[OCTEP_HW_MBOX_BAR])
>+		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
>+
>+	octep_vdpa_pf_bar_expand(octpf);
>+}
>+
>+static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev)
>+{
>+	struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
>+
>+	memset(vf_res, 0, sizeof(*vf_res));
>+}
>+
>+static void octep_vdpa_remove_vf(struct pci_dev *pdev)
>+{
>+	struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
>+	struct octep_hw *oct_hw;
>+	int status;
>+
>+	oct_hw = &mgmt_dev->oct_hw;
>+	status = atomic_read(&mgmt_dev->status);
>+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT);
>+
>+	if (status == OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT) {
>+		cancel_work_sync(&mgmt_dev->setup_task);
>+	} else if (status == OCTEP_VDPA_DEV_STATUS_READY) {
>+		vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
>+		kfree(mgmt_dev->oct_hw.vqs);
>+	}
>+
>+	if (oct_hw->base[OCTEP_HW_CAPS_BAR])
>+		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
>+
>+	if (oct_hw->base[OCTEP_HW_MBOX_BAR])
>+		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_MBOX_BAR);
>+
>+	octep_vdpa_vf_bar_shrink(pdev);
>+}
>+
>+static void octep_vdpa_remove(struct pci_dev *pdev)
>+{
>+	if (pdev->is_virtfn)
>+		octep_vdpa_remove_vf(pdev);
>+	else
>+		octep_vdpa_remove_pf(pdev);
>+}
>+
>+static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
>+			      const struct vdpa_dev_set_config *config)
>+{
>+	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct octep_vdpa_mgmt_dev, mdev);
>+	struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
>+	struct pci_dev *pdev = oct_hw->pdev;
>+	struct vdpa_device *vdpa_dev;
>+	struct octep_vdpa *oct_vdpa;
>+	u64 device_features;
>+	u16 notify_off;
>+	int i, ret;
>+
>+	oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, 1, 1,
>+				     NULL, false);
>+	if (IS_ERR(oct_vdpa)) {
>+		dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device");
>+		return PTR_ERR(oct_vdpa);
>+	}
>+
>+	oct_vdpa->pdev = pdev;
>+	oct_vdpa->vdpa.dma_dev = &pdev->dev;
>+	oct_vdpa->vdpa.mdev = mdev;
>+	oct_vdpa->oct_hw = oct_hw;
>+	vdpa_dev = &oct_vdpa->vdpa;
>+
>+	device_features = oct_hw->features;
>+	if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
>+		if (config->device_features & ~device_features) {
>+			dev_err(&pdev->dev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
>+				config->device_features, device_features);
>+			return -EINVAL;
>+		}
>+		device_features &= config->device_features;
>+	}
>+
>+	oct_hw->features = device_features;
>+
>+	if (verify_features(device_features)) {
>+		dev_warn(mdev->device,
>+			 "Must provision minimum features 0x%llx for this device",
>+			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
>+			 BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) | BIT_ULL(VIRTIO_F_IN_ORDER));
>+		return -EOPNOTSUPP;

What about returning the return value of verify_features()?

>+	}
>+
>+	oct_hw->vqs = kcalloc(oct_hw->nr_vring, sizeof(*oct_hw->vqs), GFP_KERNEL);

What about using devm_kcalloc()?
Memory allocated with that should be automatically freed on driver detach.

>+	if (!oct_hw->vqs)
>+		return -ENOMEM;
>+
>+	oct_hw->irq = -1;
>+
>+	dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
>+	dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
>+
>+	for (i = 0; i < oct_hw->nr_vring; i++) {
>+		octep_write_queue_select(i, oct_hw);
>+		notify_off = vp_ioread16(&oct_hw->common_cfg->queue_notify_off);
>+		oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
>+			notify_off * oct_hw->notify_off_multiplier;
>+		oct_hw->vqs[i].cb_notify_addr = (u32 *)oct_hw->vqs[i].notify_addr + 1;
>+		oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
>+			notify_off * oct_hw->notify_off_multiplier;
>+	}
>+
>+	if (name)
>+		ret = dev_set_name(&vdpa_dev->dev, "%s", name);
>+	else
>+		ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index);
>+
>+	ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
>+	if (ret) {
>+		dev_err(&pdev->dev, "Failed to register to vDPA bus");
>+		goto free_vqs;
>+	}
>+	return 0;
>+
>+free_vqs:
>+	put_device(&oct_vdpa->vdpa.dev);
>+	kfree(oct_hw->vqs);
>+	return ret;
>+}
>+
>+static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
>+{
>+	_vdpa_unregister_device(dev);
>+}
>+
>+static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
>+	.dev_add = octep_vdpa_dev_add,
>+	.dev_del = octep_vdpa_dev_del
>+};
>+
>+static bool get_device_ready_status(u8 __iomem *addr)
>+{
>+	u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
>+
>+	if (signature == OCTEP_DEV_READY_SIGNATURE) {
>+		writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
>+		return true;
>+	}
>+
>+	return false;
>+}
>+
>+static struct virtio_device_id id_table[] = {
>+	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
>+	{ 0 },
>+};
>+
>+static void octep_vdpa_setup_task(struct work_struct *work)
>+{
>+	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct octep_vdpa_mgmt_dev,
>+							    setup_task);
>+	struct pci_dev *pdev = mgmt_dev->pdev;
>+	struct device *dev = &pdev->dev;
>+	struct octep_hw *oct_hw;
>+	unsigned long timeout;
>+	int ret;
>+
>+	oct_hw = &mgmt_dev->oct_hw;
>+
>+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
>+
>+	/* Wait for a maximum of 5 sec */
>+	timeout = jiffies + msecs_to_jiffies(5000);
>+	while (!time_after(jiffies, timeout)) {
>+		if (get_device_ready_status(oct_hw->base[OCTEP_HW_MBOX_BAR])) {
>+			atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_INIT);
>+			break;
>+		}
>+
>+		if (atomic_read(&mgmt_dev->status) >= OCTEP_VDPA_DEV_STATUS_READY) {
>+			dev_info(dev, "Stopping vDPA setup task.\n");
>+			return;
>+		}
>+
>+		usleep_range(1000, 1500);
>+	}
>+
>+	if (atomic_read(&mgmt_dev->status) != OCTEP_VDPA_DEV_STATUS_INIT) {
>+		dev_err(dev, "BAR initialization is timed out\n");
>+		return;
>+	}
>+
>+	ret = octep_iomap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
>+	if (ret)
>+		return;
>+
>+	ret = octep_hw_caps_read(oct_hw, pdev);
>+	if (ret < 0)
>+		goto unmap_region;
>+
>+	oct_hw->features = octep_hw_get_dev_features(oct_hw);
>+	ret = verify_features(oct_hw->features);
>+	if (ret) {
>+		dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
>+		goto unmap_region;
>+	}
>+	oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg->num_queues);
>+
>+	mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
>+	mgmt_dev->mdev.id_table = id_table;
>+	mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
>+	mgmt_dev->mdev.supported_features = oct_hw->features;
>+	mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES);
>+	mgmt_dev->mdev.device = dev;
>+
>+	ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
>+	if (ret) {
>+		dev_err(dev, "Failed to register vdpa management interface\n");
>+		goto unmap_region;
>+	}
>+
>+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY);
>+
>+	return;
>+
>+unmap_region:
>+	octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
>+	oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL;
>+}
>+
>+static int octep_vdpa_probe_vf(struct pci_dev *pdev)
>+{
>+	struct octep_vdpa_mgmt_dev *mgmt_dev;
>+	struct device *dev = &pdev->dev;
>+	struct iommu_domain *domain;
>+	int ret;
>+
>+	ret = pcim_enable_device(pdev);
>+	if (ret) {
>+		dev_err(dev, "Failed to enable device\n");
>+		return ret;
>+	}
>+
>+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
>+	if (ret) {
>+		dev_err(dev, "No usable DMA configuration\n");
>+		return ret;
>+	}
>+	pci_set_master(pdev);
>+
>+	domain = iommu_get_domain_for_dev(dev);
>+	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
>+		dev_info(dev, "NO-IOMMU\n");
>+		octep_vdpa_ops.set_map = octep_vdpa_set_map;
>+	}
>+
>+	mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev), GFP_KERNEL);
>+	if (!mgmt_dev)
>+		return -ENOMEM;
>+
>+	ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base, OCTEP_HW_MBOX_BAR);
>+	if (ret)
>+		return ret;
>+
>+	mgmt_dev->pdev = pdev;
>+	pci_set_drvdata(pdev, mgmt_dev);
>+
>+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC);
>+	INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task);
>+	schedule_work(&mgmt_dev->setup_task);
>+	dev_info(&pdev->dev, "octep vdpa mgmt device setup task is queued\n");
>+
>+	return 0;
>+}
>+
>+static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct pci_dev *pf_dev, u8 idx)
>+{
>+	struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4;
>+	struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES + 4;
>+	struct octep_pf *pf = pci_get_drvdata(pf_dev);
>+	struct pci_bus_region bus_region;
>+
>+	vf_res->name = pci_name(vf_dev);
>+	vf_res->flags = pf_res->flags;
>+	vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent;
>+
>+	bus_region.start = pf->vf_base + idx * pf->vf_stride;
>+	bus_region.end = bus_region.start + pf->vf_stride - 1;
>+	pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region);
>+}
>+
>+static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int num_vfs)
>+{
>+	struct octep_pf *pf = pci_get_drvdata(pdev);
>+	u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR];
>+	int ret, i;
>+
>+	if (num_vfs > 0) {
>+		struct pci_dev *vf_pdev = NULL;
>+		bool done = false;
>+		int index = 0;
>+
>+		ret = pci_enable_sriov(pdev, num_vfs);
>+		if (ret)
>+			return ret;
>+
>+		pf->enabled_vfs = num_vfs;
>+
>+		while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, vf_pdev))) {
>+			if (vf_pdev->device != pf->vf_devid)
>+				continue;
>+
>+			octep_vdpa_assign_barspace(vf_pdev, pdev, index);
>+			if (++index == num_vfs) {
>+				done = true;
>+				break;
>+			}
>+		}
>+
>+		if (done) {
>+			for (i = 0; i < pf->enabled_vfs; i++)
>+				writeq(OCTEP_DEV_READY_SIGNATURE, addr + OCTEP_PF_MBOX_DATA(i));
>+		}
>+	} else {

I suggest moving the smaller branch to the top and possibly returning
earlier.

>+		if (!pci_num_vf(pdev))
>+			return 0;
>+
>+		pci_disable_sriov(pdev);
>+		pf->enabled_vfs = 0;
>+	}
>+
>+	return num_vfs;
>+}
>+
>+static u16 octep_get_vf_devid(struct pci_dev *pdev)
>+{
>+	u16 did;
>+
>+	switch (pdev->device) {
>+	case OCTEP_VDPA_DEVID_CN106K_PF:
>+		did = OCTEP_VDPA_DEVID_CN106K_VF;
>+		break;
>+	case OCTEP_VDPA_DEVID_CN105K_PF:
>+		did = OCTEP_VDPA_DEVID_CN105K_VF;
>+		break;
>+	case OCTEP_VDPA_DEVID_CN103K_PF:
>+		did = OCTEP_VDPA_DEVID_CN103K_VF;
>+		break;
>+	default:
>+		did = 0xFFFF;
>+		break;
>+	}
>+
>+	return did;
>+}
>+
>+static int octep_vdpa_pf_setup(struct octep_pf *octpf)
>+{
>+	u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR];
>+	struct pci_dev *pdev = octpf->pdev;
>+	int totalvfs;
>+	u64 val, len;
>+
>+	totalvfs = pci_sriov_get_totalvfs(pdev);
>+	if (unlikely(!totalvfs)) {
>+		dev_info(&pdev->dev, "Total VFs are %d in PF sriov configuration\n", totalvfs);
>+		return 0;
>+	}
>+
>+	addr = octpf->base[OCTEP_HW_MBOX_BAR];
>+	val = readq(addr + OCTEP_EPF_RINFO(0));
>+	if (val == 0) {
>+		dev_err(&pdev->dev, "Invalid device configuration\n");
>+		return -EINVAL;
>+	}
>+
>+	if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) {
>+		val &= ~GENMASK_ULL(35, 32);
>+		val |= BIT_ULL(32);
>+		writeq(val, addr + OCTEP_EPF_RINFO(0));
>+	}
>+
>+	len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR);
>+
>+	octpf->vf_stride = len / totalvfs;
>+	octpf->vf_devid = octep_get_vf_devid(pdev);
>+
>+	octep_vdpa_pf_bar_shrink(octpf);
>+
>+	return 0;
>+}
>+
>+static int octep_vdpa_probe_pf(struct pci_dev *pdev)
>+{
>+	struct device *dev = &pdev->dev;
>+	struct octep_pf *octpf;
>+	int ret;
>+
>+	ret = pcim_enable_device(pdev);
>+	if (ret) {
>+		dev_err(dev, "Failed to enable device\n");
>+		return ret;
>+	}
>+
>+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
>+	if (ret) {
>+		dev_err(dev, "No usable DMA configuration\n");
>+		return ret;
>+	}
>+	octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL);
>+	if (!octpf)
>+		return -ENOMEM;
>+
>+	ret = octep_iomap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
>+	if (ret)
>+		return ret;
>+
>+	pci_set_master(pdev);
>+	pci_set_drvdata(pdev, octpf);
>+	octpf->pdev = pdev;
>+
>+	ret = octep_vdpa_pf_setup(octpf);
>+	if (ret)
>+		goto unmap_region;
>+
>+	return 0;
>+
>+unmap_region:
>+	octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
>+	return ret;
>+}
>+
>+static int octep_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>+{
>+	if (pdev->is_virtfn)
>+		return octep_vdpa_probe_vf(pdev);
>+	else
>+		return octep_vdpa_probe_pf(pdev);
>+}
>+
>+static struct pci_device_id octep_pci_vdpa_map[] = {
>+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_PF) },
>+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_VF) },
>+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_PF) },
>+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_VF) },
>+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_PF) },
>+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_VF) },
>+	{ 0 },
>+};
>+
>+static struct pci_driver octep_pci_vdpa = {
>+	.name     = OCTEP_VDPA_DRIVER_NAME,
>+	.id_table = octep_pci_vdpa_map,
>+	.probe    = octep_vdpa_probe,
>+	.remove   = octep_vdpa_remove,
>+	.sriov_configure = octep_vdpa_sriov_configure
>+};
>+
>+module_pci_driver(octep_pci_vdpa);
>+
>+MODULE_AUTHOR("Marvell");
>+MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver");
>+MODULE_LICENSE("GPL");
>-- 
>2.25.1
>
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-29  4:28 ` Jason Wang
@ 2024-03-29 12:34   ` Srujana Challa
  2024-03-31 11:31     ` Michael S. Tsirkin
  2024-04-07  3:34     ` Jason Wang
  0 siblings, 2 replies; 28+ messages in thread
From: Srujana Challa @ 2024-03-29 12:34 UTC (permalink / raw)
  To: Jason Wang
  Cc: virtualization, mst, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

> Subject: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON
> DPU devices
> 
> Prioritize security for external emails: Confirm sender and content safety
> before clicking links or opening attachments
> 
> ----------------------------------------------------------------------
> On Wed, Mar 27, 2024 at 7:22 PM Srujana Challa <schalla@marvell.com>
> wrote:
> >
> > This commit introduces a new vDPA driver specifically designed for
> > managing the virtio control plane over the vDPA bus for OCTEON DPU
> > devices. The driver consists of two layers:
> >
> > 1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
> > operations and configurations related to the DPU device.
> >
> > 2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
> > implements device operations for the vDPA bus. It handles device
> > probing, bus attachment, vring operations, and other relevant tasks.
> >
> > Signed-off-by: Srujana Challa <schalla@marvell.com>
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> > Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
> > ---
> >  MAINTAINERS                              |   5 +
> >  drivers/vdpa/Kconfig                     |   9 +
> >  drivers/vdpa/Makefile                    |   1 +
> >  drivers/vdpa/octeon_ep/Makefile          |   4 +
> >  drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
> >  drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
> >  drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903
> +++++++++++++++++++++++
> >  7 files changed, 1472 insertions(+)
> >  create mode 100644 drivers/vdpa/octeon_ep/Makefile
> >  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
> >  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> >  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index cfe44a876d8a..539ce209a960 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -13127,6 +13127,11 @@ S:     Supported
> >  F:     Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
> >  F:     drivers/mmc/host/sdhci-xenon*
> >
> > +MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
> > +R:     schalla@marvell.com
> > +R:     vattunuru@marvell.com
> > +F:     drivers/vdpa/octeon_ep/
> > +
> >  MATROX FRAMEBUFFER DRIVER
> >  L:     linux-fbdev@vger.kernel.org
> >  S:     Orphan
> > diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
> > index 656c1cb541de..775149fad476 100644
> > --- a/drivers/vdpa/Kconfig
> > +++ b/drivers/vdpa/Kconfig
> > @@ -126,4 +126,13 @@ config PDS_VDPA
> >           With this driver, the VirtIO dataplane can be
> >           offloaded to an AMD/Pensando DSC device.
> >
> > +config OCTEONEP_VDPA
> > +       tristate "vDPA driver for Octeon DPU devices"
> > +       depends on m
> > +       depends on PCI_MSI
> > +       help
> > +         vDPA driver for Marvell's Octeon DPU devices.
> > +         With this driver, the VirtIO dataplane can be
> > +         offloaded to a Octeon DPU device.
> > +
> >  endif # VDPA
> > diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
> > index 8f53c6f3cca7..5654d36707af 100644
> > --- a/drivers/vdpa/Makefile
> > +++ b/drivers/vdpa/Makefile
> > @@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
> >  obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
> >  obj-$(CONFIG_SNET_VDPA) += solidrun/
> >  obj-$(CONFIG_PDS_VDPA) += pds/
> > +obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
> > diff --git a/drivers/vdpa/octeon_ep/Makefile
> b/drivers/vdpa/octeon_ep/Makefile
> > new file mode 100644
> > index 000000000000..e23e2ff14f33
> > --- /dev/null
> > +++ b/drivers/vdpa/octeon_ep/Makefile
> > @@ -0,0 +1,4 @@
> > +# SPDX-License-Identifier: GPL-2.0
> > +obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
> > +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
> > +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
> > diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h
> b/drivers/vdpa/octeon_ep/octep_vdpa.h
> > new file mode 100644
> > index 000000000000..60d2efc8f8b8
> > --- /dev/null
> > +++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
> > @@ -0,0 +1,93 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only
> > + * Copyright (C) 2024 Marvell.
> > + */
> > +#ifndef __OCTEP_VDPA_H__
> > +#define __OCTEP_VDPA_H__
> > +
> > +#include <linux/pci.h>
> > +#include <linux/pci_regs.h>
> > +#include <linux/vdpa.h>
> > +#include <linux/virtio_pci_modern.h>
> > +#include <uapi/linux/virtio_net.h>
> > +#include <uapi/linux/virtio_blk.h>
> > +#include <uapi/linux/virtio_config.h>
> > +#include <uapi/linux/virtio_pci.h>
> > +#include <uapi/linux/vdpa.h>
> > +
> > +#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
> > +#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
> > +#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
> > +#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
> > +#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
> > +#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
> > +
> > +#define OCTEP_HW_MBOX_BAR 0
> > +#define OCTEP_HW_CAPS_BAR 4
> > +
> > +#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
> > +
> > +#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
> > +#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
> > +#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
> > +
> > +#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
> > +#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
> > +
> > +#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
> > +#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
> > +
> > +enum octep_vdpa_dev_status {
> > +       OCTEP_VDPA_DEV_STATUS_INVALID,
> > +       OCTEP_VDPA_DEV_STATUS_ALLOC,
> > +       OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
> > +       OCTEP_VDPA_DEV_STATUS_INIT,
> > +       OCTEP_VDPA_DEV_STATUS_READY,
> > +       OCTEP_VDPA_DEV_STATUS_UNINIT
> > +};
> > +
> > +struct octep_vring_info {
> > +       struct vdpa_callback cb;
> > +       void __iomem *notify_addr;
> > +       u32 __iomem *cb_notify_addr;
> > +       phys_addr_t notify_pa;
> > +       char msix_name[256];
> > +};
> > +
> > +struct octep_hw {
> > +       struct pci_dev *pdev;
> > +       u8 __iomem *base[PCI_STD_NUM_BARS];
> > +       struct virtio_pci_common_cfg __iomem *common_cfg;
> 
> This is a hint that any chance we can reuse the modern virtio-pci
> library (virtio_pci_modern_dev.c) instead of duplicating codes?
Yes, the modern pci library would be a great help but in current
Octeon DPU device, the virtio config space completely emulated
by the device's firmware. So, the standard pci config read apis can't
be used for probing the virtio configuration. Hence, it's not possible to
use vp_modern_probe() here.

> 
> > +       u8 __iomem *dev_cfg;
> > +       u8 __iomem *isr;
> > +       void __iomem *notify_base;
> > +       phys_addr_t notify_base_pa;
> > +       u32 notify_off_multiplier;
> > +       u8 notify_bar;
> > +       struct octep_vring_info *vqs;
> > +       struct vdpa_callback config_cb;
> > +       u64 features;
> > +       u64 drv_features;
> > +       u16 nr_vring;
> > +       u32 config_size;
> > +       int irq;
> > +};
> > +
> > +u8 octep_hw_get_status(struct octep_hw *oct_hw);
> > +void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
> > +void octep_hw_reset(struct octep_hw *oct_hw);
> > +void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw);
> > +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
> > +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void
> *dst, int length);
> > +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64
> desc_area, u64 driver_area,
> > +                        u64 device_area);
> > +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
> > +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
> > +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
> > +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct
> vdpa_vq_state *state);
> > +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct
> vdpa_vq_state *state);
> > +u16 octep_get_vq_size(struct octep_hw *oct_hw);
> > +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
> > +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
> > +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
> > +
> > +#endif /* __OCTEP_VDPA_H__ */
> > diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> > new file mode 100644
> > index 000000000000..0a2f1d09c4ab
> > --- /dev/null
> > +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> > @@ -0,0 +1,457 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/* Copyright (C) 2024 Marvell. */
> > +
> > +#include <linux/iopoll.h>
> > +
> > +#include "octep_vdpa.h"
> > +
> > +enum octep_mbox_ids {
> > +       OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
> > +       OCTEP_MBOX_MSG_GET_VQ_STATE,
> > +};
> > +
> > +#define OCTEP_HW_TIMEOUT       10000000
> > +
> > +#define MBOX_OFFSET            64
> > +#define MBOX_RSP_MASK          0x00000001
> > +#define MBOX_RC_MASK           0x0000FFFE
> > +
> > +#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
> > +#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
> > +#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK |
> MBOX_RSP_MASK))
> > +
> > +struct octep_mbox_hdr {
> > +       u8 ver;
> > +       u8 rsvd1;
> > +       u16 id;
> > +       u16 rsvd2;
> > +#define MBOX_REQ_SIG (0xdead)
> > +#define MBOX_RSP_SIG (0xbeef)
> > +       u16 sig;
> > +};
> > +
> > +struct octep_mbox_sts {
> > +       u16 rsp:1;
> > +       u16 rc:15;
> > +       u16 rsvd;
> > +};
> > +
> > +struct octep_mbox {
> > +       struct octep_mbox_hdr hdr;
> > +       struct octep_mbox_sts sts;
> > +       u64 rsvd;
> > +       u32 data[];
> > +};
> > +
> > +static inline struct octep_mbox __iomem *octep_get_mbox(struct
> octep_hw *oct_hw)
> > +{
> > +       return (struct octep_mbox __iomem *)(oct_hw->dev_cfg +
> MBOX_OFFSET);
> > +}
> > +
> > +static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem
> *mbox)
> > +{
> > +       u32 val;
> > +
> > +       return readx_poll_timeout(ioread32, &mbox->sts, val,
> MBOX_AVAIL(val), 10,
> > +                                 OCTEP_HW_TIMEOUT);
> > +}
> > +
> > +static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem
> *mbox)
> > +{
> > +       u32 val;
> > +
> > +       return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val),
> 10,
> > +                                 OCTEP_HW_TIMEOUT);
> > +}
> > +
> > +static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16
> id, u16 sig)
> > +{
> > +       iowrite16(id, &mbox->hdr.id);
> > +       iowrite16(sig, &mbox->hdr.sig);
> > +}
> > +
> > +static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
> > +{
> > +       return ioread16(&mbox->hdr.sig);
> > +}
> > +
> > +static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32
> sts)
> > +{
> > +       iowrite32(sts, &mbox->sts);
> > +}
> > +
> > +static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
> > +{
> > +       return ioread32(&mbox->sts);
> > +}
> > +
> > +static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox,
> u16 word_idx)
> > +{
> > +       return ioread32(&mbox->data[word_idx]);
> > +}
> > +
> > +static inline void octep_write32_word(struct octep_mbox __iomem *mbox,
> u16 word_idx, u32 word)
> > +{
> > +       return iowrite32(word, &mbox->data[word_idx]);
> > +}
> > +
> > +static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid,
> void *buffer,
> > +                             u32 buf_size, bool write)
> > +{
> > +       struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
> > +       struct pci_dev *pdev = oct_hw->pdev;
> > +       u32 *p = (u32 *)buffer;
> > +       u16 data_wds;
> > +       int ret, i;
> > +       u32 val;
> > +
> > +       if (!IS_ALIGNED(buf_size, 4))
> > +               return -EINVAL;
> > +
> > +       /* Make sure mbox space is available */
> > +       ret = octep_wait_for_mbox_avail(mbox);
> > +       if (ret) {
> > +               dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to
> be consumed\n");
> > +               return ret;
> > +       }
> > +       data_wds = buf_size / 4;
> > +
> > +       if (write) {
> > +               for (i = 1; i <= data_wds; i++) {
> > +                       octep_write32_word(mbox, i, *p);
> > +                       p++;
> > +               }
> > +       }
> > +       octep_write32_word(mbox, 0, (u32)qid);
> > +       octep_write_sts(mbox, 0);
> > +
> > +       octep_write_hdr(mbox, id, MBOX_REQ_SIG);
> > +
> > +       ret = octep_wait_for_mbox_rsp(mbox);
> > +       if (ret) {
> > +               dev_warn(&pdev->dev, "Timeout waiting for mbox : %d
> response\n", id);
> > +               return ret;
> > +       }
> > +
> > +       val = octep_read_sig(mbox);
> > +       if ((val & 0xFFFF) != MBOX_RSP_SIG) {
> > +               dev_warn(&pdev->dev, "Invalid Signature from mbox : %d
> response\n", id);
> > +               return ret;
> > +       }
> > +
> > +       val = octep_read_sts(mbox);
> > +       if (val & MBOX_RC_MASK) {
> > +               ret = MBOX_RSP_TO_ERR(val);
> > +               dev_warn(&pdev->dev, "Error while processing mbox : %d, err
> %d\n", id, ret);
> > +               return ret;
> > +       }
> > +
> > +       if (!write)
> > +               for (i = 1; i <= data_wds; i++)
> > +                       *p++ = octep_read32_word(mbox, i);
> > +
> > +       return 0;
> > +}
> > +
> > +static void octep_mbox_init(struct octep_mbox __iomem *mbox)
> > +{
> > +       iowrite32(1, &mbox->sts);
> > +}
> > +
> > +u8 octep_hw_get_status(struct octep_hw *oct_hw)
> > +{
> > +       return ioread8(&oct_hw->common_cfg->device_status);
> > +}
> > +
> > +void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
> > +{
> > +       iowrite8(status, &oct_hw->common_cfg->device_status);
> > +}
> > +
> > +void octep_hw_reset(struct octep_hw *oct_hw)
> > +{
> > +       u8 val;
> > +
> > +       octep_hw_set_status(oct_hw, 0 | BIT(7));
> 
> Could we have macros for the magic numbers like 7 and 15 below?
Sure, will add proper macros in next version.

> 
> > +       if (readx_poll_timeout(ioread8, &oct_hw->common_cfg-
> >device_status, val, !val, 10,
> > +                              OCTEP_HW_TIMEOUT)) {
> > +               dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
> > +               return;
> > +       }
> > +}
> > +
> > +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
> > +{
> > +       u32 features_lo, features_hi;
> > +       u32 val, select;
> > +
> > +       select = 0;
> > +       iowrite32(select | BIT(15), &oct_hw->common_cfg-
> >device_feature_select);
> 
> Ok I see something different with virtio. For any "select" it seems to
> require BIT(15).
> 
> > +
> > +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> >device_feature_select, val,
> > +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> > +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> timeout\n", select);
> > +               return 0ULL;
> > +       }
> > +       features_lo = ioread32(&oct_hw->common_cfg->device_feature);
> > +
> > +       select = 1;
> > +       iowrite32(select | BIT(15), &oct_hw->common_cfg-
> >device_feature_select);
> > +
> > +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> >device_feature_select, val,
> > +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> > +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> timeout\n", select);
> > +               return 0ULL;
> > +       }
> > +       features_hi = ioread32(&oct_hw->common_cfg->device_feature);
> > +
> > +       return ((u64)features_hi << 32) | features_lo;
> > +}
> > +
> > +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
> > +{
> > +       u32 val, select;
> > +
> > +       select = 0;
> > +       iowrite32(select | BIT(15), &oct_hw->common_cfg-
> >guest_feature_select);
> > +
> > +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> >guest_feature_select, val,
> > +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> > +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> timeout\n", select);
> > +               return;
> > +       }
> 
> Could we introduce a helper for such write and readback && timeout
> logic? It seems it is used in a lot of places.
Sure, I will move repetitive ones(ioread32/iowrite32) to helper function.

> 
> > +       iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg-
> >guest_feature);
> > +
> > +       select = 1;
> > +       iowrite32(select | BIT(15), &oct_hw->common_cfg-
> >guest_feature_select);
> > +
> > +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> >guest_feature_select, val,
> > +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> > +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> timeout\n", select);
> > +               return;
> > +       }
> > +       iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
> > +}
> > +
> > +void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw)
> > +{
> > +       u16 val;
> > +
> > +       iowrite16(queue_id | BIT(15), &oct_hw->common_cfg->queue_select);
> > +
> > +       if (readx_poll_timeout(ioread16, &oct_hw->common_cfg-
> >queue_select, val, val == queue_id,
> > +                              10, OCTEP_HW_TIMEOUT)) {
> > +               dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
> > +               return;
> > +       }
> > +}
> > +
> > +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
> > +{
> > +       iowrite16(qid, oct_hw->vqs[qid].notify_addr);
> > +}
> > +
> > +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void
> *dst, int length)
> > +{
> > +       u8 old_gen, new_gen, *p;
> > +       int i;
> > +
> > +       WARN_ON(offset + length > oct_hw->config_size);
> > +       do {
> > +               old_gen = ioread8(&oct_hw->common_cfg->config_generation);
> > +               p = dst;
> > +               for (i = 0; i < length; i++)
> > +                       *p++ = ioread8(oct_hw->dev_cfg + offset + i);
> > +
> > +               new_gen = ioread8(&oct_hw->common_cfg->config_generation);
> > +       } while (old_gen != new_gen);
> > +}
> 
> This looks similar to vp_vdpa_get_config(), any chance to unify them?
It might not be workout as we can't use vp_modern_probe().

> 
> > +
> > +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64
> desc_area, u64 driver_area,
> > +                        u64 device_area)
> > +{
> > +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> > +
> > +       octep_write_queue_select(qid, oct_hw);
> > +       vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> > +                            &cfg->queue_desc_hi);
> > +       vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> > +                            &cfg->queue_avail_hi);
> > +       vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> > +                            &cfg->queue_used_hi);
> 
> It's kind of interesting that there's no read back for validation here.
Read back validation is not required for these writes. Firmware takes care of it.

> 
> > +
> > +       return 0;
> > +}
> > +
> > +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct
> vdpa_vq_state *state)
> > +{
> > +       return octep_process_mbox(oct_hw,
> OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
> > +                                 sizeof(*state), 0);
> > +}
> > +
> > +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct
> vdpa_vq_state *state)
> > +{
> > +       struct vdpa_vq_state q_state;
> > +
> > +       memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
> > +       return octep_process_mbox(oct_hw,
> OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
> > +                                 sizeof(*state), 1);
> > +}
> > +
> > +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
> > +{
> > +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> > +
> > +       octep_write_queue_select(qid, oct_hw);
> > +       iowrite16(num, &cfg->queue_size);
> > +}
> > +
> > +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
> > +{
> > +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> > +
> > +       octep_write_queue_select(qid, oct_hw);
> > +       iowrite16(ready, &cfg->queue_enable);
> > +}
> > +
> > +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
> > +{
> > +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> > +
> > +       octep_write_queue_select(qid, oct_hw);
> > +       return ioread16(&cfg->queue_enable);
> > +}
> > +
> > +u16 octep_get_vq_size(struct octep_hw *oct_hw)
> > +{
> > +       octep_write_queue_select(0, oct_hw);
> > +       return ioread16(&oct_hw->common_cfg->queue_size);
> > +}
> > +
> > +static u32 octep_get_config_size(struct octep_hw *oct_hw)
> > +{
> > +       return sizeof(struct virtio_net_config);
> > +}
> > +
> > +static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct
> virtio_pci_cap *cap)
> > +{
> > +       struct device *dev = &oct_hw->pdev->dev;
> > +       u32 length = cap->length;
> > +       u32 offset = cap->offset;
> > +       u8  bar    = cap->bar;
> > +       u32 len;
> > +
> > +       if (bar != OCTEP_HW_CAPS_BAR) {
> > +               dev_err(dev, "Invalid bar: %u\n", bar);
> > +               return NULL;
> > +       }
> > +       if (offset + length < offset) {
> > +               dev_err(dev, "offset(%u) + length(%u) overflows\n",
> > +                       offset, length);
> > +               return NULL;
> > +       }
> > +       len = pci_resource_len(oct_hw->pdev, bar);
> > +       if (offset + length > len) {
> > +               dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
> > +                       offset + length, len);
> > +               return NULL;
> > +       }
> > +       return oct_hw->base[bar] + offset;
> > +}
> > +
> > +static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len,
> off_t offset)
> > +{
> > +       u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
> > +       u8 *p = buf;
> > +       size_t i;
> > +
> > +       for (i = 0; i < len; i++)
> > +               *p++ = ioread8(bar + offset + i);
> > +}
> > +
> > +static int pci_signature_verify(struct octep_hw *oct_hw)
> > +{
> > +       u32 signature[2];
> > +
> > +       pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
> > +
> > +       if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
> > +               return -1;
> > +
> > +       if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
> > +               return -1;
> > +
> > +       return 0;
> > +}
> > +
> > +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
> > +{
> > +       struct octep_mbox __iomem *mbox;
> > +       struct device *dev = &pdev->dev;
> > +       struct virtio_pci_cap cap;
> > +       int ret;
> > +       u8 pos;
> > +
> > +       oct_hw->pdev = pdev;
> > +       ret = pci_signature_verify(oct_hw);
> > +       if (ret) {
> > +               dev_err(dev, "Octeon Virtio FW is not initialized\n");
> > +               return -EIO;
> > +       }
> > +
> > +       pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
> > +
> > +       while (pos) {
> > +               pci_caps_read(oct_hw, &cap, 2, pos);
> > +
> > +               if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
> > +                       dev_err(dev, "Found invalid capability vndr id: %d\n",
> cap.cap_vndr);
> > +                       break;
> > +               }
> > +
> > +               pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
> > +
> > +               dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len:
> %u\n",
> > +                        pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
> > +
> > +               switch (cap.cfg_type) {
> > +               case VIRTIO_PCI_CAP_COMMON_CFG:
> > +                       oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
> > +                       break;
> > +               case VIRTIO_PCI_CAP_NOTIFY_CFG:
> > +                       pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
> > +                                     4, pos + sizeof(cap));
> > +
> > +                       oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
> > +                       oct_hw->notify_bar = cap.bar;
> > +                       oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) +
> cap.offset;
> > +                       break;
> > +               case VIRTIO_PCI_CAP_DEVICE_CFG:
> > +                       oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
> > +                       break;
> > +               case VIRTIO_PCI_CAP_ISR_CFG:
> > +                       oct_hw->isr = get_cap_addr(oct_hw, &cap);
> > +                       break;
> > +               }
> > +
> > +               pos = cap.cap_next;
> > +       }
> > +       if (!oct_hw->common_cfg || !oct_hw->notify_base ||
> > +           !oct_hw->dev_cfg    || !oct_hw->isr) {
> > +               dev_err(dev, "Incomplete PCI capabilities");
> > +               return -EIO;
> > +       }
> > +       oct_hw->config_size = octep_get_config_size(oct_hw);
> > +
> > +       mbox = octep_get_mbox(oct_hw);
> > +       octep_mbox_init(mbox);
> > +
> > +       dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw-
> >common_cfg);
> > +       dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw-
> >dev_cfg);
> > +       dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
> > +       dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
> > +                (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
> > +       dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
> > +
> > +       return 0;
> > +}
> 
> PDS vDPA reuses vp_modern_probe(), can we reuse it as well here?
> 
> For example, vp_modern_probe() allows vendor specific bars and id
> check which looks to be useful here.
> 
> > diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> > new file mode 100644
> > index 000000000000..845fd35368ff
> > --- /dev/null
> > +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> > @@ -0,0 +1,903 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/* Copyright (C) 2024 Marvell. */
> > +
> > +#include <linux/interrupt.h>
> > +#include <linux/module.h>
> > +#include <linux/iommu.h>
> > +#include "octep_vdpa.h"
> > +
> > +#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
> > +
> > +struct octep_pf {
> > +       u8 __iomem *base[PCI_STD_NUM_BARS];
> > +       struct pci_dev *pdev;
> > +       struct resource res;
> > +       u64 vf_base;
> > +       int enabled_vfs;
> > +       u32 vf_stride;
> > +       u16 vf_devid;
> > +};
> > +
> > +struct octep_vdpa {
> > +       struct vdpa_device vdpa;
> > +       struct octep_hw *oct_hw;
> > +       struct pci_dev *pdev;
> > +};
> > +
> > +struct octep_vdpa_mgmt_dev {
> > +       struct vdpa_mgmt_dev mdev;
> > +       struct octep_hw oct_hw;
> > +       struct pci_dev *pdev;
> > +       /* Work entry to handle device setup */
> > +       struct work_struct setup_task;
> > +       /* Device status */
> > +       atomic_t status;
> > +};
> > +
> > +static int verify_features(u64 features)
> > +{
> > +       /* Minimum features to expect */
> > +       if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
> > +               return -EOPNOTSUPP;
> > +
> > +       if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
> > +               return -EOPNOTSUPP;
> 
> Any reason we need to mandate the NOTIFICATION_DATA. It seems can work
> without that becasue:
We are only supporting with NOTIFICATION_DATA feature enabled currently.
I will remove below op in next version.

> 
> void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
> {
>         iowrite16(qid, oct_hw->vqs[qid].notify_addr);
> }
> 
> > +
> > +       if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
> > +               return -EOPNOTSUPP;
> 
> Does this mean the packed virtqueue is mandatory?
Yes.

> 
> > +
> > +       /* Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
> > +        * requirements: "VIRTIO_NET_F_MQ Requires
> VIRTIO_NET_F_CTRL_VQ".
> > +        */
> 
> Right, but we have a lot of other features that depend on the CVQ, why
> is MQ special here?
It's not just MQ. I will check whether this condition is required or not.

> 
> > +       if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) |
> BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
> > +           BIT_ULL(VIRTIO_NET_F_MQ))
> > +               return -EINVAL;
> > +
> > +       return 0;
> > +}
> > +
> > +static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
> > +{
> > +       struct octep_vdpa *oct_vdpa;
> > +
> > +       oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
> > +
> > +       return oct_vdpa->oct_hw;
> > +}
> > +
> > +static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
> > +{
> > +       struct octep_hw *oct_hw = data;
> > +       int i;
> > +
> > +       for (i = 0; i < oct_hw->nr_vring; i++) {
> > +               if (oct_hw->vqs[i].cb.callback && *oct_hw->vqs[i].cb_notify_addr)
> {
> > +                       *oct_hw->vqs[i].cb_notify_addr = 0;
> 
> I didn't get how cb_notify_addr is useful here.
We are using this as acknowledgment to the device's firmware. We will rename
the variable.

> 
> > +                       oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
> > +               }
> > +       }
> > +
> > +       return IRQ_HANDLED;
> > +}
> > +
> > +static void octep_free_irqs(struct octep_hw *oct_hw)
> > +{
> > +       struct pci_dev *pdev = oct_hw->pdev;
> > +
> > +       if (oct_hw->irq != -1) {
> > +               devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
> > +               oct_hw->irq = -1;
> > +       }
> > +       pci_free_irq_vectors(pdev);
> > +}
> > +
> > +static int octep_request_irqs(struct octep_hw *oct_hw)
> > +{
> > +       struct pci_dev *pdev = oct_hw->pdev;
> > +       int ret, irq;
> > +
> > +       /* Use one ring/interrupt per VF for virtio call interface. */
> 
> Is this a hardware limitation? If not, it would be slow.
Yes, it's a hardware limitation currently.

> 
> > +       ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
> > +       if (ret < 0) {
> > +               dev_err(&pdev->dev, "Failed to alloc msix vector");
> > +               return ret;
> > +       }
> > +
> > +       snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
> > +                OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
> > +
> > +       irq = pci_irq_vector(pdev, 0);
> > +       ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
> > +                              oct_hw->vqs->msix_name, oct_hw);
> > +       if (ret) {
> > +               dev_err(&pdev->dev, "Failed to register interrupt handler\n");
> > +               goto free_irq_vec;
> > +       }
> > +       oct_hw->irq = irq;
> > +
> > +       return 0;
> > +
> > +free_irq_vec:
> > +       pci_free_irq_vectors(pdev);
> > +       return ret;
> > +}
> > +
> > +static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return oct_hw->features;
> > +}
> > +
> > +static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev,
> u64 features)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +       int ret;
> > +
> > +       pr_debug("Driver Features: %llx\n", features);
> > +       ret = verify_features(features);
> > +       if (ret)
> > +               return ret;
> > +
> > +       octep_hw_set_drv_features(oct_hw, features);
> > +       oct_hw->drv_features = features;
> 
> It is not guaranteed that the device will accept all those features or
> it would be painful to maintain per fw feature white/blacklist.
> 
> More below.
> 
> > +
> > +       return 0;
> > +}
> > +
> > +static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return oct_hw->features & oct_hw->drv_features;
> 
> So here, the drv_features needs to be read from the device then
> everything is fine.
Sure, we will change it to read from the device.

> 
> > +}
> > +
> > +static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return octep_hw_get_status(oct_hw);
> > +}
> > +
> > +static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8
> status)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +       u8 status_old;
> > +
> > +       status_old = octep_hw_get_status(oct_hw);
> > +
> > +       if (status_old == status)
> > +               return;
> > +
> > +       if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
> > +           !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
> > +               if (octep_request_irqs(oct_hw))
> > +                       status = status_old | VIRTIO_CONFIG_S_FAILED;
> > +       }
> > +       octep_hw_set_status(oct_hw, status);
> > +}
> > +
> > +static int octep_vdpa_reset(struct vdpa_device *vdpa_dev)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +       u8 status = octep_hw_get_status(oct_hw);
> > +       u16 qid;
> > +
> > +       if (status == 0)
> > +               return 0;
> > +
> > +       for (qid = 0; qid < oct_hw->nr_vring; qid++) {
> > +               oct_hw->vqs[qid].cb.callback = NULL;
> > +               oct_hw->vqs[qid].cb.private = NULL;
> > +               oct_hw->config_cb.callback = NULL;
> > +               oct_hw->config_cb.private = NULL;
> > +       }
> > +       octep_hw_reset(oct_hw);
> > +
> > +       if (status & VIRTIO_CONFIG_S_DRIVER_OK)
> > +               octep_free_irqs(oct_hw);
> > +
> > +       return 0;
> > +}
> > +
> > +static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return octep_get_vq_size(oct_hw);
> > +}
> > +
> > +static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> > +                                  struct vdpa_vq_state *state)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return octep_get_vq_state(oct_hw, qid, state);
> > +}
> > +
> > +static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> > +                                  const struct vdpa_vq_state *state)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return octep_set_vq_state(oct_hw, qid, state);
> > +}
> > +
> > +static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid,
> struct vdpa_callback *cb)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       oct_hw->vqs[qid].cb = *cb;
> > +}
> > +
> > +static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16
> qid, bool ready)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       octep_set_vq_ready(oct_hw, qid, ready);
> > +}
> > +
> > +static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16
> qid)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return octep_get_vq_ready(oct_hw, qid);
> > +}
> > +
> > +static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16
> qid, u32 num)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       octep_set_vq_num(oct_hw, qid, num);
> > +}
> > +
> > +static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16
> qid, u64 desc_area,
> > +                                    u64 driver_area, u64 device_area)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
> > +       pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
> > +       pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
> > +
> > +       return octep_set_vq_address(oct_hw, qid, desc_area, driver_area,
> device_area);
> > +}
> > +
> > +static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       octep_notify_queue(oct_hw, qid);
> > +}
> > +
> > +static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev,
> u32 data)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +       u16 idx = data & 0xFFFF;
> > +
> > +       vp_iowrite32(data, oct_hw->vqs[idx].notify_addr);
> > +}
> > +
> > +static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return vp_ioread8(&oct_hw->common_cfg->config_generation);
> > +}
> > +
> > +static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
> > +{
> > +       return VIRTIO_ID_NET;
> > +}
> > +
> > +static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
> > +{
> > +       return PCI_VENDOR_ID_CAVIUM;
> > +}
> > +
> > +static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
> > +{
> > +       return PAGE_SIZE;
> > +}
> > +
> > +static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       return oct_hw->config_size;
> > +}
> > +
> > +static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned
> int offset, void *buf,
> > +                                 unsigned int len)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       octep_read_dev_config(oct_hw, offset, buf, len);
> > +}
> > +
> > +static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned
> int offset,
> > +                                 const void *buf, unsigned int len)
> > +{
> > +       /* Not supported */
> 
> We probably need to filter out VIRTIO_NET_F_ANNOUNCE and other
> features that depend on the config writing.
As per modern virtio spec(v1.2), device config is read only for the driver.
Still do we need to filter out these features?

> 
> > +}
> > +
> > +static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct
> vdpa_callback *cb)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +
> > +       oct_hw->config_cb.callback = cb->callback;
> > +       oct_hw->config_cb.private = cb->private;
> > +}
> > +
> > +static struct vdpa_notification_area octep_get_vq_notification(struct
> vdpa_device *vdpa_dev,
> > +                                                              u16 idx)
> > +{
> > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > +       struct vdpa_notification_area area;
> > +
> > +       area.addr = oct_hw->vqs[idx].notify_pa;
> > +       area.size = PAGE_SIZE;
> > +
> > +       return area;
> > +}
> > +
> > +static int octep_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
> > +                             struct vhost_iotlb *iotlb)
> > +{
> > +       return 0;
> > +}
> > +
> > +static struct vdpa_config_ops octep_vdpa_ops = {
> > +       .get_device_features = octep_vdpa_get_device_features,
> > +       .set_driver_features = octep_vdpa_set_driver_features,
> > +       .get_driver_features = octep_vdpa_get_driver_features,
> > +       .get_status     = octep_vdpa_get_status,
> > +       .set_status     = octep_vdpa_set_status,
> > +       .reset          = octep_vdpa_reset,
> > +       .get_vq_num_max = octep_vdpa_get_vq_num_max,
> > +       .get_vq_state   = octep_vdpa_get_vq_state,
> > +       .set_vq_state   = octep_vdpa_set_vq_state,
> > +       .set_vq_cb      = octep_vdpa_set_vq_cb,
> > +       .set_vq_ready   = octep_vdpa_set_vq_ready,
> > +       .get_vq_ready   = octep_vdpa_get_vq_ready,
> > +       .set_vq_num     = octep_vdpa_set_vq_num,
> > +       .set_vq_address = octep_vdpa_set_vq_address,
> > +       .get_vq_irq     = NULL,
> > +       .kick_vq        = octep_vdpa_kick_vq,
> > +       .kick_vq_with_data      = octep_vdpa_kick_vq_with_data,
> > +       .get_generation = octep_vdpa_get_generation,
> > +       .get_device_id  = octep_vdpa_get_device_id,
> > +       .get_vendor_id  = octep_vdpa_get_vendor_id,
> > +       .get_vq_align   = octep_vdpa_get_vq_align,
> > +       .get_config_size        = octep_vdpa_get_config_size,
> > +       .get_config     = octep_vdpa_get_config,
> > +       .set_config     = octep_vdpa_set_config,
> > +       .set_config_cb  = octep_vdpa_set_config_cb,
> > +       .get_vq_notification = octep_get_vq_notification,
> > +};
> > +
> > +static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8
> bar)
> > +{
> > +       int ret;
> > +
> > +       ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
> > +       if (ret) {
> > +               dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar);
> > +               return ret;
> > +       }
> > +
> > +       tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
> > +       if (!tbl[bar]) {
> > +               dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
> > +               pci_release_region(pdev, bar);
> > +               ret = -ENOMEM;
> > +       }
> > +
> > +       return ret;
> > +}
> > +
> > +static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl,
> u8 bar)
> > +{
> > +       pci_iounmap(pdev, tbl[bar]);
> > +       pci_release_region(pdev, bar);
> > +}
> > +
> > +static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf)
> > +{
> > +       struct pci_dev *pf_dev = octpf->pdev;
> > +       struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> > +       struct pci_bus_region bus_region;
> > +
> > +       octpf->res.start = res->start;
> > +       octpf->res.end = res->end;
> > +       octpf->vf_base = res->start;
> > +
> > +       bus_region.start = res->start;
> > +       bus_region.end = res->start - 1;
> 
> Is this inteneded?
Yes,  it is required for emulating virtio config space.

> 
> > +
> > +       pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> > +}
> > +
> > +static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf)
> > +{
> > +       struct pci_dev *pf_dev = octpf->pdev;
> > +       struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> > +       struct pci_bus_region bus_region;
> > +
> > +       bus_region.start = octpf->res.start;
> > +       bus_region.end = octpf->res.end;
> > +
> > +       pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> > +}
> > +
> > +static void octep_vdpa_remove_pf(struct pci_dev *pdev)
> > +{
> > +       struct octep_pf *octpf = pci_get_drvdata(pdev);
> > +
> > +       pci_disable_sriov(pdev);
> > +
> > +       if (octpf->base[OCTEP_HW_CAPS_BAR])
> > +               octep_iounmap_region(pdev, octpf->base,
> OCTEP_HW_CAPS_BAR);
> > +
> > +       if (octpf->base[OCTEP_HW_MBOX_BAR])
> > +               octep_iounmap_region(pdev, octpf->base,
> OCTEP_HW_MBOX_BAR);
> > +
> > +       octep_vdpa_pf_bar_expand(octpf);
> > +}
> > +
> > +static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev)
> > +{
> > +       struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
> > +
> > +       memset(vf_res, 0, sizeof(*vf_res));
> > +}
> > +
> > +static void octep_vdpa_remove_vf(struct pci_dev *pdev)
> > +{
> > +       struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
> > +       struct octep_hw *oct_hw;
> > +       int status;
> > +
> > +       oct_hw = &mgmt_dev->oct_hw;
> > +       status = atomic_read(&mgmt_dev->status);
> > +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT);
> > +
> > +       if (status == OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT) {
> > +               cancel_work_sync(&mgmt_dev->setup_task);
> 
> This seems to be racy, can we call cancel_work_sync() unconditionally?
> 
> > +       } else if (status == OCTEP_VDPA_DEV_STATUS_READY) {
> > +               vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
> > +               kfree(mgmt_dev->oct_hw.vqs);
> > +       }
> > +
> > +       if (oct_hw->base[OCTEP_HW_CAPS_BAR])
> > +               octep_iounmap_region(pdev, oct_hw->base,
> OCTEP_HW_CAPS_BAR);
> > +
> > +       if (oct_hw->base[OCTEP_HW_MBOX_BAR])
> > +               octep_iounmap_region(pdev, oct_hw->base,
> OCTEP_HW_MBOX_BAR);
> > +
> > +       octep_vdpa_vf_bar_shrink(pdev);
> > +}
> > +
> > +static void octep_vdpa_remove(struct pci_dev *pdev)
> > +{
> > +       if (pdev->is_virtfn)
> > +               octep_vdpa_remove_vf(pdev);
> > +       else
> > +               octep_vdpa_remove_pf(pdev);
> > +}
> > +
> > +static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char
> *name,
> > +                             const struct vdpa_dev_set_config *config)
> > +{
> > +       struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct
> octep_vdpa_mgmt_dev, mdev);
> > +       struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
> > +       struct pci_dev *pdev = oct_hw->pdev;
> > +       struct vdpa_device *vdpa_dev;
> > +       struct octep_vdpa *oct_vdpa;
> > +       u64 device_features;
> > +       u16 notify_off;
> > +       int i, ret;
> > +
> > +       oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev,
> &octep_vdpa_ops, 1, 1,
> > +                                    NULL, false);
> > +       if (IS_ERR(oct_vdpa)) {
> > +               dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep
> vdpa device");
> > +               return PTR_ERR(oct_vdpa);
> > +       }
> > +
> > +       oct_vdpa->pdev = pdev;
> > +       oct_vdpa->vdpa.dma_dev = &pdev->dev;
> > +       oct_vdpa->vdpa.mdev = mdev;
> > +       oct_vdpa->oct_hw = oct_hw;
> > +       vdpa_dev = &oct_vdpa->vdpa;
> > +
> > +       device_features = oct_hw->features;
> > +       if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> > +               if (config->device_features & ~device_features) {
> > +                       dev_err(&pdev->dev, "The provisioned features 0x%llx are not
> supported by this device with features 0x%llx\n",
> > +                               config->device_features, device_features);
> > +                       return -EINVAL;
> > +               }
> > +               device_features &= config->device_features;
> > +       }
> > +
> > +       oct_hw->features = device_features;
> > +
> > +       if (verify_features(device_features)) {
> > +               dev_warn(mdev->device,
> > +                        "Must provision minimum features 0x%llx for this device",
> > +                        BIT_ULL(VIRTIO_F_VERSION_1) |
> BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
> > +                        BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) |
> BIT_ULL(VIRTIO_F_IN_ORDER));
> > +               return -EOPNOTSUPP;
> > +       }
> > +
> > +       oct_hw->vqs = kcalloc(oct_hw->nr_vring, sizeof(*oct_hw->vqs),
> GFP_KERNEL);
> > +       if (!oct_hw->vqs)
> > +               return -ENOMEM;
> > +
> > +       oct_hw->irq = -1;
> > +
> > +       dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
> > +       dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
> > +
> > +       for (i = 0; i < oct_hw->nr_vring; i++) {
> > +               octep_write_queue_select(i, oct_hw);
> > +               notify_off = vp_ioread16(&oct_hw->common_cfg-
> >queue_notify_off);
> > +               oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
> > +                       notify_off * oct_hw->notify_off_multiplier;
> > +               oct_hw->vqs[i].cb_notify_addr = (u32 *)oct_hw-
> >vqs[i].notify_addr + 1;
> > +               oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
> > +                       notify_off * oct_hw->notify_off_multiplier;
> > +       }
> 
> I think the reason we can't do the above in the probe is that we need
> to wait for the device ready then we can get those information?
Yes, your understanding is correct.

> 
> > +
> > +       if (name)
> > +               ret = dev_set_name(&vdpa_dev->dev, "%s", name);
> > +       else
> > +               ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev-
> >index);
> > +
> > +       ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
> > +       if (ret) {
> > +               dev_err(&pdev->dev, "Failed to register to vDPA bus");
> > +               goto free_vqs;
> > +       }
> > +       return 0;
> > +
> > +free_vqs:
> > +       put_device(&oct_vdpa->vdpa.dev);
> > +       kfree(oct_hw->vqs);
> > +       return ret;
> > +}
> > +
> > +static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct
> vdpa_device *dev)
> > +{
> > +       _vdpa_unregister_device(dev);
> > +}
> > +
> > +static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
> > +       .dev_add = octep_vdpa_dev_add,
> > +       .dev_del = octep_vdpa_dev_del
> > +};
> > +
> > +static bool get_device_ready_status(u8 __iomem *addr)
> > +{
> > +       u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
> > +
> > +       if (signature == OCTEP_DEV_READY_SIGNATURE) {
> > +               writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
> > +               return true;
> > +       }
> > +
> > +       return false;
> > +}
> > +
> > +static struct virtio_device_id id_table[] = {
> > +       { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
> > +       { 0 },
> > +};
> > +
> > +static void octep_vdpa_setup_task(struct work_struct *work)
> > +{
> > +       struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct
> octep_vdpa_mgmt_dev,
> > +                                                           setup_task);
> > +       struct pci_dev *pdev = mgmt_dev->pdev;
> > +       struct device *dev = &pdev->dev;
> > +       struct octep_hw *oct_hw;
> > +       unsigned long timeout;
> > +       int ret;
> > +
> > +       oct_hw = &mgmt_dev->oct_hw;
> > +
> > +       atomic_set(&mgmt_dev->status,
> OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
> > +
> > +       /* Wait for a maximum of 5 sec */
> > +       timeout = jiffies + msecs_to_jiffies(5000);
> > +       while (!time_after(jiffies, timeout)) {
> > +               if (get_device_ready_status(oct_hw-
> >base[OCTEP_HW_MBOX_BAR])) {
> > +                       atomic_set(&mgmt_dev->status,
> OCTEP_VDPA_DEV_STATUS_INIT);
> > +                       break;
> > +               }
> > +
> > +               if (atomic_read(&mgmt_dev->status) >=
> OCTEP_VDPA_DEV_STATUS_READY) {
> > +                       dev_info(dev, "Stopping vDPA setup task.\n");
> > +                       return;
> > +               }
> > +
> > +               usleep_range(1000, 1500);
> > +       }
> > +
> > +       if (atomic_read(&mgmt_dev->status) !=
> OCTEP_VDPA_DEV_STATUS_INIT) {
> > +               dev_err(dev, "BAR initialization is timed out\n");
> > +               return;
> > +       }
> > +
> > +       ret = octep_iomap_region(pdev, oct_hw->base,
> OCTEP_HW_CAPS_BAR);
> > +       if (ret)
> > +               return;
> > +
> > +       ret = octep_hw_caps_read(oct_hw, pdev);
> > +       if (ret < 0)
> > +               goto unmap_region;
> > +
> > +       oct_hw->features = octep_hw_get_dev_features(oct_hw);
> > +       ret = verify_features(oct_hw->features);
> > +       if (ret) {
> > +               dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
> > +               goto unmap_region;
> > +       }
> > +       oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg-
> >num_queues);
> > +
> > +       mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
> > +       mgmt_dev->mdev.id_table = id_table;
> > +       mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
> > +       mgmt_dev->mdev.supported_features = oct_hw->features;
> > +       mgmt_dev->mdev.config_attr_mask = (1 <<
> VDPA_ATTR_DEV_FEATURES);
> > +       mgmt_dev->mdev.device = dev;
> > +
> > +       ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
> > +       if (ret) {
> > +               dev_err(dev, "Failed to register vdpa management interface\n");
> > +               goto unmap_region;
> > +       }
> > +
> > +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY);
> > +
> > +       return;
> > +
> > +unmap_region:
> > +       octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> > +       oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL;
> > +}
> > +
> > +static int octep_vdpa_probe_vf(struct pci_dev *pdev)
> > +{
> > +       struct octep_vdpa_mgmt_dev *mgmt_dev;
> > +       struct device *dev = &pdev->dev;
> > +       struct iommu_domain *domain;
> > +       int ret;
> > +
> > +       ret = pcim_enable_device(pdev);
> > +       if (ret) {
> > +               dev_err(dev, "Failed to enable device\n");
> > +               return ret;
> > +       }
> > +
> > +       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> > +       if (ret) {
> > +               dev_err(dev, "No usable DMA configuration\n");
> > +               return ret;
> > +       }
> > +       pci_set_master(pdev);
> > +
> > +       domain = iommu_get_domain_for_dev(dev);
> > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> > +               dev_info(dev, "NO-IOMMU\n");
> > +               octep_vdpa_ops.set_map = octep_vdpa_set_map;
> 
> Is this a shortcut to have get better performance? DMA API should have
> those greacefully I think.
When IOMMU is disabled on host and set_map/dma_map is not set,
vhost-vdpa is reporting an error "Failed to allocate domain, device is not IOMMU cache coherent capable\n".
Hence we are doing this way to get better performance.

> 
> > +       }
> > +
> > +       mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev),
> GFP_KERNEL);
> > +       if (!mgmt_dev)
> > +               return -ENOMEM;
> > +
> > +       ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base,
> OCTEP_HW_MBOX_BAR);
> > +       if (ret)
> > +               return ret;
> > +
> > +       mgmt_dev->pdev = pdev;
> > +       pci_set_drvdata(pdev, mgmt_dev);
> > +
> > +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC);
> > +       INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task);
> > +       schedule_work(&mgmt_dev->setup_task);
> > +       dev_info(&pdev->dev, "octep vdpa mgmt device setup task is
> queued\n");
> > +
> > +       return 0;
> > +}
> > +
> > +static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct
> pci_dev *pf_dev, u8 idx)
> > +{
> > +       struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4;
> > +       struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> > +       struct octep_pf *pf = pci_get_drvdata(pf_dev);
> > +       struct pci_bus_region bus_region;
> > +
> > +       vf_res->name = pci_name(vf_dev);
> > +       vf_res->flags = pf_res->flags;
> > +       vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent;
> > +
> > +       bus_region.start = pf->vf_base + idx * pf->vf_stride;
> > +       bus_region.end = bus_region.start + pf->vf_stride - 1;
> > +       pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region);
> > +}
> > +
> > +static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int num_vfs)
> > +{
> > +       struct octep_pf *pf = pci_get_drvdata(pdev);
> > +       u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR];
> > +       int ret, i;
> > +
> > +       if (num_vfs > 0) {
> > +               struct pci_dev *vf_pdev = NULL;
> > +               bool done = false;
> > +               int index = 0;
> > +
> > +               ret = pci_enable_sriov(pdev, num_vfs);
> > +               if (ret)
> > +                       return ret;
> > +
> > +               pf->enabled_vfs = num_vfs;
> > +
> > +               while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM,
> PCI_ANY_ID, vf_pdev))) {
> 
> Do we have an exisiting helper to iterate all vf?
> 
> > +                       if (vf_pdev->device != pf->vf_devid)
> > +                               continue;
> > +
> > +                       octep_vdpa_assign_barspace(vf_pdev, pdev, index);
> > +                       if (++index == num_vfs) {
> > +                               done = true;
> > +                               break;
> > +                       }
> > +               }
> > +
> > +               if (done) {
> > +                       for (i = 0; i < pf->enabled_vfs; i++)
> > +                               writeq(OCTEP_DEV_READY_SIGNATURE, addr +
> OCTEP_PF_MBOX_DATA(i));
> > +               }
> > +       } else {
> > +               if (!pci_num_vf(pdev))
> > +                       return 0;
> > +
> > +               pci_disable_sriov(pdev);
> > +               pf->enabled_vfs = 0;
> > +       }
> > +
> > +       return num_vfs;
> > +}
> > +
> > +static u16 octep_get_vf_devid(struct pci_dev *pdev)
> > +{
> > +       u16 did;
> > +
> > +       switch (pdev->device) {
> > +       case OCTEP_VDPA_DEVID_CN106K_PF:
> > +               did = OCTEP_VDPA_DEVID_CN106K_VF;
> > +               break;
> > +       case OCTEP_VDPA_DEVID_CN105K_PF:
> > +               did = OCTEP_VDPA_DEVID_CN105K_VF;
> > +               break;
> > +       case OCTEP_VDPA_DEVID_CN103K_PF:
> > +               did = OCTEP_VDPA_DEVID_CN103K_VF;
> > +               break;
> > +       default:
> > +               did = 0xFFFF;
> > +               break;
> > +       }
> > +
> > +       return did;
> > +}
> > +
> > +static int octep_vdpa_pf_setup(struct octep_pf *octpf)
> > +{
> > +       u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR];
> > +       struct pci_dev *pdev = octpf->pdev;
> > +       int totalvfs;
> > +       u64 val, len;
> > +
> > +       totalvfs = pci_sriov_get_totalvfs(pdev);
> > +       if (unlikely(!totalvfs)) {
> > +               dev_info(&pdev->dev, "Total VFs are %d in PF sriov
> configuration\n", totalvfs);
> > +               return 0;
> > +       }
> > +
> > +       addr = octpf->base[OCTEP_HW_MBOX_BAR];
> > +       val = readq(addr + OCTEP_EPF_RINFO(0));
> > +       if (val == 0) {
> > +               dev_err(&pdev->dev, "Invalid device configuration\n");
> > +               return -EINVAL;
> > +       }
> > +
> > +       if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) {
> > +               val &= ~GENMASK_ULL(35, 32);
> > +               val |= BIT_ULL(32);
> > +               writeq(val, addr + OCTEP_EPF_RINFO(0));
> > +       }
> > +
> > +       len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR);
> > +
> > +       octpf->vf_stride = len / totalvfs;
> > +       octpf->vf_devid = octep_get_vf_devid(pdev);
> > +
> > +       octep_vdpa_pf_bar_shrink(octpf);
> > +
> > +       return 0;
> > +}
> > +
> > +static int octep_vdpa_probe_pf(struct pci_dev *pdev)
> > +{
> > +       struct device *dev = &pdev->dev;
> > +       struct octep_pf *octpf;
> > +       int ret;
> > +
> > +       ret = pcim_enable_device(pdev);
> > +       if (ret) {
> > +               dev_err(dev, "Failed to enable device\n");
> > +               return ret;
> > +       }
> > +
> > +       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> > +       if (ret) {
> > +               dev_err(dev, "No usable DMA configuration\n");
> > +               return ret;
> > +       }
> > +       octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL);
> > +       if (!octpf)
> > +               return -ENOMEM;
> > +
> > +       ret = octep_iomap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> > +       if (ret)
> > +               return ret;
> > +
> > +       pci_set_master(pdev);
> > +       pci_set_drvdata(pdev, octpf);
> > +       octpf->pdev = pdev;
> > +
> > +       ret = octep_vdpa_pf_setup(octpf);
> > +       if (ret)
> > +               goto unmap_region;
> > +
> > +       return 0;
> > +
> > +unmap_region:
> > +       octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> > +       return ret;
> > +}
> > +
> > +static int octep_vdpa_probe(struct pci_dev *pdev, const struct
> pci_device_id *id)
> > +{
> > +       if (pdev->is_virtfn)
> > +               return octep_vdpa_probe_vf(pdev);
> > +       else
> > +               return octep_vdpa_probe_pf(pdev);
> > +}
> > +
> > +static struct pci_device_id octep_pci_vdpa_map[] = {
> > +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN106K_PF) },
> > +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN106K_VF) },
> > +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN105K_PF) },
> > +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN105K_VF) },
> > +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN103K_PF) },
> > +       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN103K_VF) },
> > +       { 0 },
> > +};
> > +
> > +static struct pci_driver octep_pci_vdpa = {
> > +       .name     = OCTEP_VDPA_DRIVER_NAME,
> > +       .id_table = octep_pci_vdpa_map,
> > +       .probe    = octep_vdpa_probe,
> > +       .remove   = octep_vdpa_remove,
> > +       .sriov_configure = octep_vdpa_sriov_configure
> > +};
> > +
> > +module_pci_driver(octep_pci_vdpa);
> > +
> > +MODULE_AUTHOR("Marvell");
> > +MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver");
> > +MODULE_LICENSE("GPL");
> > --
> > 2.25.1
> >
> 
> Thanks

Thanks.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-29 11:37 ` Stefano Garzarella
@ 2024-03-29 13:02   ` Srujana Challa
  2024-03-29 13:19     ` Stefano Garzarella
  0 siblings, 1 reply; 28+ messages in thread
From: Srujana Challa @ 2024-03-29 13:02 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: virtualization, mst, jasowang, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob

> Subject: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON
> DPU devices
> 
> Prioritize security for external emails: Confirm sender and content safety
> before clicking links or opening attachments
> 
> ----------------------------------------------------------------------
> On Wed, Mar 27, 2024 at 04:51:36PM +0530, Srujana Challa wrote:
> >This commit introduces a new vDPA driver specifically designed for
> >managing the virtio control plane over the vDPA bus for OCTEON DPU
> >devices. The driver consists of two layers:
> >
> >1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
> >operations and configurations related to the DPU device.
> >
> >2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
> >implements device operations for the vDPA bus. It handles device
> >probing, bus attachment, vring operations, and other relevant tasks.
> >
> >Signed-off-by: Srujana Challa <schalla@marvell.com>
> >Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> >Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
> >---
> > MAINTAINERS                              |   5 +
> > drivers/vdpa/Kconfig                     |   9 +
> > drivers/vdpa/Makefile                    |   1 +
> > drivers/vdpa/octeon_ep/Makefile          |   4 +
> > drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
> > drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
> > drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903
> +++++++++++++++++++++++
> > 7 files changed, 1472 insertions(+)
> > create mode 100644 drivers/vdpa/octeon_ep/Makefile  create mode
> 100644
> >drivers/vdpa/octeon_ep/octep_vdpa.h
> > create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> > create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c
> 
> Building with C=2 I see several warnings, I suggest to take a look:
> 
> $ make O=build M=drivers/vdpa -j6 W=1 C=2 modules
Sure, I will fix these warnings in next version.

>    ...
>    CHECK   ../drivers/vdpa/octeon_ep/octep_vdpa_main.c
>    CHECK   ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> ../drivers/vdpa/octeon_ep/octep_vdpa_main.c:548:50: warning: cast
> removes address space '__iomem' of expression
> ../drivers/vdpa/octeon_ep/octep_vdpa_main.c:548:47: warning: incorrect
> type in assignment (different address spaces)
> ../drivers/vdpa/octeon_ep/octep_vdpa_main.c:548:47:    expected unsigned
> int [noderef] [usertype] __iomem *cb_notify_addr
> ../drivers/vdpa/octeon_ep/octep_vdpa_main.c:548:47:    got unsigned int
> [usertype] *
> ../drivers/vdpa/octeon_ep/octep_vdpa_main.c:74:66: warning: dereference
> of noderef expression
> ../drivers/vdpa/octeon_ep/octep_vdpa_main.c:75:40: warning: dereference
> of noderef expression
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:339:25: warning: incorrect type
> in initializer (different base types)
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:339:25:    expected unsigned int
> [usertype] length
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:339:25:    got restricted __le32
> [usertype] length
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:340:25: warning: incorrect type
> in initializer (different base types)
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:340:25:    expected unsigned int
> [usertype] offset
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:340:25:    got restricted __le32
> [usertype] offset
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:427:89: warning: restricted
> __le32 degrades to integer
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:449:9: warning: cast removes
> address space '__iomem' of expression
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:450:9: warning: cast removes
> address space '__iomem' of expression
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:451:9: warning: cast removes
> address space '__iomem' of expression
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:452:9: warning: cast removes
> address space '__iomem' of expression
> ../drivers/vdpa/octeon_ep/octep_vdpa_hw.c:454:9: warning: cast removes
> address space '__iomem' of expression
> 
> >
> >diff --git a/MAINTAINERS b/MAINTAINERS
> >index cfe44a876d8a..539ce209a960 100644
> >--- a/MAINTAINERS
> >+++ b/MAINTAINERS
> >@@ -13127,6 +13127,11 @@ S:	Supported
> > F:	Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
> > F:	drivers/mmc/host/sdhci-xenon*
> >
> >+MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
> >+R:	schalla@marvell.com
> >+R:	vattunuru@marvell.com
> >+F:	drivers/vdpa/octeon_ep/
> >+
> > MATROX FRAMEBUFFER DRIVER
> > L:	linux-fbdev@vger.kernel.org
> > S:	Orphan
> >diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index
> >656c1cb541de..775149fad476 100644
> >--- a/drivers/vdpa/Kconfig
> >+++ b/drivers/vdpa/Kconfig
> >@@ -126,4 +126,13 @@ config PDS_VDPA
> > 	  With this driver, the VirtIO dataplane can be
> > 	  offloaded to an AMD/Pensando DSC device.
> >
> >+config OCTEONEP_VDPA
> >+	tristate "vDPA driver for Octeon DPU devices"
> >+	depends on m
> 
> Just for curiosity, why restrict it to module-only?

There is an etherent driver for the same class of devices. So,
we made this vDPA driver as module.

> 
> >+	depends on PCI_MSI
> >+	help
> >+	  vDPA driver for Marvell's Octeon DPU devices.
> >+	  With this driver, the VirtIO dataplane can be
> >+	  offloaded to a Octeon DPU device.
> >+
> > endif # VDPA
> >diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile index
> >8f53c6f3cca7..5654d36707af 100644
> >--- a/drivers/vdpa/Makefile
> >+++ b/drivers/vdpa/Makefile
> >@@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
> > obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
> > obj-$(CONFIG_SNET_VDPA) += solidrun/
> > obj-$(CONFIG_PDS_VDPA) += pds/
> >+obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
> >diff --git a/drivers/vdpa/octeon_ep/Makefile
> >b/drivers/vdpa/octeon_ep/Makefile new file mode 100644 index
> >000000000000..e23e2ff14f33
> >--- /dev/null
> >+++ b/drivers/vdpa/octeon_ep/Makefile
> >@@ -0,0 +1,4 @@
> >+# SPDX-License-Identifier: GPL-2.0
> >+obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
> >+octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
> >+octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
> >diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h
> >b/drivers/vdpa/octeon_ep/octep_vdpa.h
> >new file mode 100644
> >index 000000000000..60d2efc8f8b8
> >--- /dev/null
> >+++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
> >@@ -0,0 +1,93 @@
> >+/* SPDX-License-Identifier: GPL-2.0-only
> >+ * Copyright (C) 2024 Marvell.
> >+ */
> >+#ifndef __OCTEP_VDPA_H__
> >+#define __OCTEP_VDPA_H__
> >+
> >+#include <linux/pci.h>
> >+#include <linux/pci_regs.h>
> >+#include <linux/vdpa.h>
> >+#include <linux/virtio_pci_modern.h>
> >+#include <uapi/linux/virtio_net.h>
> >+#include <uapi/linux/virtio_blk.h>
> 
> It looks like virtio_blk.h is not needed.
> I suggest to check also other includes.
Sure, I will check and fix in next version.
> 
> >+#include <uapi/linux/virtio_config.h>
> >+#include <uapi/linux/virtio_pci.h>
> >+#include <uapi/linux/vdpa.h>
> >+
> >+#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900 #define
> >+OCTEP_VDPA_DEVID_CN106K_VF 0xb903 #define
> OCTEP_VDPA_DEVID_CN105K_PF
> >+0xba00 #define OCTEP_VDPA_DEVID_CN105K_VF 0xba03 #define
> >+OCTEP_VDPA_DEVID_CN103K_PF 0xbd00 #define
> OCTEP_VDPA_DEVID_CN103K_VF
> >+0xbd03
> >+
> >+#define OCTEP_HW_MBOX_BAR 0
> >+#define OCTEP_HW_CAPS_BAR 4
> >+
> >+#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
> >+
> >+#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25)) #define
> >+OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17)) #define
> >+OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
> >+
> >+#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF) #define
> >+OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
> >+
> >+#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED #define
> >+OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
> >+
> >+enum octep_vdpa_dev_status {
> >+	OCTEP_VDPA_DEV_STATUS_INVALID,
> >+	OCTEP_VDPA_DEV_STATUS_ALLOC,
> >+	OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
> >+	OCTEP_VDPA_DEV_STATUS_INIT,
> >+	OCTEP_VDPA_DEV_STATUS_READY,
> >+	OCTEP_VDPA_DEV_STATUS_UNINIT
> >+};
> >+
> >+struct octep_vring_info {
> >+	struct vdpa_callback cb;
> >+	void __iomem *notify_addr;
> >+	u32 __iomem *cb_notify_addr;
> >+	phys_addr_t notify_pa;
> >+	char msix_name[256];
> >+};
> >+
> >+struct octep_hw {
> >+	struct pci_dev *pdev;
> >+	u8 __iomem *base[PCI_STD_NUM_BARS];
> >+	struct virtio_pci_common_cfg __iomem *common_cfg;
> >+	u8 __iomem *dev_cfg;
> >+	u8 __iomem *isr;
> >+	void __iomem *notify_base;
> >+	phys_addr_t notify_base_pa;
> >+	u32 notify_off_multiplier;
> >+	u8 notify_bar;
> >+	struct octep_vring_info *vqs;
> >+	struct vdpa_callback config_cb;
> >+	u64 features;
> >+	u64 drv_features;
> >+	u16 nr_vring;
> >+	u32 config_size;
> >+	int irq;
> >+};
> >+
> >+u8 octep_hw_get_status(struct octep_hw *oct_hw); void
> >+octep_hw_set_status(struct octep_hw *dev, uint8_t status); void
> >+octep_hw_reset(struct octep_hw *oct_hw); void
> >+octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw); void
> >+octep_notify_queue(struct octep_hw *oct_hw, u16 qid); void
> >+octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst,
> >+int length); int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid,
> u64 desc_area, u64 driver_area,
> >+			 u64 device_area);
> >+void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num); void
> >+octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready); bool
> >+octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid); int
> >+octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct
> >+vdpa_vq_state *state); int octep_get_vq_state(struct octep_hw *oct_hw,
> >+u16 qid, struct vdpa_vq_state *state);
> >+u16 octep_get_vq_size(struct octep_hw *oct_hw); int
> >+octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
> >+u64 octep_hw_get_dev_features(struct octep_hw *oct_hw); void
> >+octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
> >+
> >+#endif /* __OCTEP_VDPA_H__ */
> >diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> >b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> >new file mode 100644
> >index 000000000000..0a2f1d09c4ab
> >--- /dev/null
> >+++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> >@@ -0,0 +1,457 @@
> >+// SPDX-License-Identifier: GPL-2.0-only
> >+/* Copyright (C) 2024 Marvell. */
> >+
> >+#include <linux/iopoll.h>
> >+
> >+#include "octep_vdpa.h"
> >+
> >+enum octep_mbox_ids {
> >+	OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
> >+	OCTEP_MBOX_MSG_GET_VQ_STATE,
> >+};
> >+
> >+#define OCTEP_HW_TIMEOUT       10000000
> >+
> >+#define MBOX_OFFSET            64
> >+#define MBOX_RSP_MASK          0x00000001
> >+#define MBOX_RC_MASK           0x0000FFFE
> >+
> >+#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
> >+#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
> >+#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK |
> MBOX_RSP_MASK))
> >+
> >+struct octep_mbox_hdr {
> >+	u8 ver;
> >+	u8 rsvd1;
> >+	u16 id;
> >+	u16 rsvd2;
> >+#define MBOX_REQ_SIG (0xdead)
> >+#define MBOX_RSP_SIG (0xbeef)
> >+	u16 sig;
> >+};
> >+
> >+struct octep_mbox_sts {
> >+	u16 rsp:1;
> >+	u16 rc:15;
> >+	u16 rsvd;
> >+};
> >+
> >+struct octep_mbox {
> >+	struct octep_mbox_hdr hdr;
> >+	struct octep_mbox_sts sts;
> >+	u64 rsvd;
> >+	u32 data[];
> >+};
> >+
> >+static inline struct octep_mbox __iomem *octep_get_mbox(struct
> >+octep_hw *oct_hw) {
> >+	return (struct octep_mbox __iomem *)(oct_hw->dev_cfg +
> MBOX_OFFSET);
> >+}
> >+
> >+static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem
> >+*mbox) {
> >+	u32 val;
> >+
> >+	return readx_poll_timeout(ioread32, &mbox->sts, val,
> MBOX_AVAIL(val), 10,
> >+				  OCTEP_HW_TIMEOUT);
> >+}
> >+
> >+static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem
> >+*mbox) {
> >+	u32 val;
> >+
> >+	return readx_poll_timeout(ioread32, &mbox->sts, val,
> MBOX_RSP(val), 10,
> >+				  OCTEP_HW_TIMEOUT);
> >+}
> >+
> >+static inline void octep_write_hdr(struct octep_mbox __iomem *mbox,
> >+u16 id, u16 sig) {
> >+	iowrite16(id, &mbox->hdr.id);
> >+	iowrite16(sig, &mbox->hdr.sig);
> >+}
> >+
> >+static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox) {
> >+	return ioread16(&mbox->hdr.sig);
> >+}
> >+
> >+static inline void octep_write_sts(struct octep_mbox __iomem *mbox,
> >+u32 sts) {
> >+	iowrite32(sts, &mbox->sts);
> >+}
> >+
> >+static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox) {
> >+	return ioread32(&mbox->sts);
> >+}
> >+
> >+static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox,
> >+u16 word_idx) {
> >+	return ioread32(&mbox->data[word_idx]); }
> >+
> >+static inline void octep_write32_word(struct octep_mbox __iomem *mbox,
> >+u16 word_idx, u32 word) {
> >+	return iowrite32(word, &mbox->data[word_idx]); }
> >+
> >+static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid,
> void *buffer,
> >+			      u32 buf_size, bool write)
> >+{
> >+	struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
> >+	struct pci_dev *pdev = oct_hw->pdev;
> >+	u32 *p = (u32 *)buffer;
> >+	u16 data_wds;
> >+	int ret, i;
> >+	u32 val;
> >+
> >+	if (!IS_ALIGNED(buf_size, 4))
> >+		return -EINVAL;
> >+
> >+	/* Make sure mbox space is available */
> >+	ret = octep_wait_for_mbox_avail(mbox);
> >+	if (ret) {
> >+		dev_warn(&pdev->dev, "Timeout waiting for previous mbox
> data to be consumed\n");
> >+		return ret;
> >+	}
> >+	data_wds = buf_size / 4;
> >+
> >+	if (write) {
> >+		for (i = 1; i <= data_wds; i++) {
> >+			octep_write32_word(mbox, i, *p);
> >+			p++;
> >+		}
> >+	}
> >+	octep_write32_word(mbox, 0, (u32)qid);
> >+	octep_write_sts(mbox, 0);
> >+
> >+	octep_write_hdr(mbox, id, MBOX_REQ_SIG);
> >+
> >+	ret = octep_wait_for_mbox_rsp(mbox);
> >+	if (ret) {
> >+		dev_warn(&pdev->dev, "Timeout waiting for mbox : %d
> response\n", id);
> >+		return ret;
> >+	}
> >+
> >+	val = octep_read_sig(mbox);
> >+	if ((val & 0xFFFF) != MBOX_RSP_SIG) {
> >+		dev_warn(&pdev->dev, "Invalid Signature from mbox : %d
> response\n", id);
> >+		return ret;
> 
> `ret` here is 0, is it right to return 0 in this case?
> If yes, maybe better to return 0 directly.
> 
> >+	}
> >+
> >+	val = octep_read_sts(mbox);
> >+	if (val & MBOX_RC_MASK) {
> >+		ret = MBOX_RSP_TO_ERR(val);
> >+		dev_warn(&pdev->dev, "Error while processing mbox : %d, err
> %d\n", id, ret);
> >+		return ret;
> >+	}
> >+
> >+	if (!write)
> >+		for (i = 1; i <= data_wds; i++)
> >+			*p++ = octep_read32_word(mbox, i);
> >+
> >+	return 0;
> >+}
> >+
> >+static void octep_mbox_init(struct octep_mbox __iomem *mbox) {
> >+	iowrite32(1, &mbox->sts);
> >+}
> >+
> >+u8 octep_hw_get_status(struct octep_hw *oct_hw) {
> >+	return ioread8(&oct_hw->common_cfg->device_status);
> >+}
> >+
> >+void octep_hw_set_status(struct octep_hw *oct_hw, u8 status) {
> >+	iowrite8(status, &oct_hw->common_cfg->device_status);
> >+}
> >+
> >+void octep_hw_reset(struct octep_hw *oct_hw) {
> >+	u8 val;
> >+
> >+	octep_hw_set_status(oct_hw, 0 | BIT(7));
> >+	if (readx_poll_timeout(ioread8, &oct_hw->common_cfg-
> >device_status, val, !val, 10,
> >+			       OCTEP_HW_TIMEOUT)) {
> >+		dev_warn(&oct_hw->pdev->dev, "Octeon device reset
> timeout\n");
> >+		return;
> >+	}
> >+}
> >+
> >+u64 octep_hw_get_dev_features(struct octep_hw *oct_hw) {
> >+	u32 features_lo, features_hi;
> >+	u32 val, select;
> >+
> >+	select = 0;
> >+	iowrite32(select | BIT(15),
> >+&oct_hw->common_cfg->device_feature_select);
> >+
> >+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> >device_feature_select, val,
> >+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
> >+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> timeout\n", select);
> >+		return 0ULL;
> 
> nit: `return 0;` should be fine too.
> 
> >+	}
> >+	features_lo = ioread32(&oct_hw->common_cfg->device_feature);
> >+
> >+	select = 1;
> >+	iowrite32(select | BIT(15),
> >+&oct_hw->common_cfg->device_feature_select);
> >+
> >+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> >device_feature_select, val,
> >+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
> >+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> timeout\n", select);
> >+		return 0ULL;
> >+	}
> >+	features_hi = ioread32(&oct_hw->common_cfg->device_feature);
> >+
> >+	return ((u64)features_hi << 32) | features_lo; }
> >+
> >+void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
> >+{
> >+	u32 val, select;
> >+
> >+	select = 0;
> >+	iowrite32(select | BIT(15),
> >+&oct_hw->common_cfg->guest_feature_select);
> >+
> >+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> >guest_feature_select, val,
> >+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
> >+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> timeout\n", select);
> >+		return;
> >+	}
> >+	iowrite32(features & (BIT_ULL(32) - 1),
> >+&oct_hw->common_cfg->guest_feature);
> >+
> >+	select = 1;
> >+	iowrite32(select | BIT(15),
> >+&oct_hw->common_cfg->guest_feature_select);
> >+
> >+	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> >guest_feature_select, val,
> >+			       val == select, 10, OCTEP_HW_TIMEOUT)) {
> >+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> timeout\n", select);
> >+		return;
> >+	}
> >+	iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
> >+}
> >+
> >+void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw)
> 
> In almost all other fuctions, `oct_hw` is the first parameter, I would try to unify
> that here as well.
> 
> >+{
> >+	u16 val;
> >+
> >+	iowrite16(queue_id | BIT(15), &oct_hw->common_cfg-
> >queue_select);
> >+
> >+	if (readx_poll_timeout(ioread16, &oct_hw->common_cfg-
> >queue_select, val, val == queue_id,
> >+			       10, OCTEP_HW_TIMEOUT)) {
> >+		dev_warn(&oct_hw->pdev->dev, "Queue select write
> timeout\n");
> >+		return;
> >+	}
> >+}
> >+
> >+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid) {
> >+	iowrite16(qid, oct_hw->vqs[qid].notify_addr); }
> >+
> >+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void
> >+*dst, int length) {
> >+	u8 old_gen, new_gen, *p;
> >+	int i;
> >+
> >+	WARN_ON(offset + length > oct_hw->config_size);
>                 ^
> Should we return early in this case?
Yes, I will move this to caller of this function.
> 
> >+	do {
> >+		old_gen = ioread8(&oct_hw->common_cfg-
> >config_generation);
> >+		p = dst;
> >+		for (i = 0; i < length; i++)
> >+			*p++ = ioread8(oct_hw->dev_cfg + offset + i);
> >+
> >+		new_gen = ioread8(&oct_hw->common_cfg-
> >config_generation);
> >+	} while (old_gen != new_gen);
> >+}
> >+
> >+int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64
> desc_area, u64 driver_area,
> >+			 u64 device_area)
> >+{
> >+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> >+
> >+	octep_write_queue_select(qid, oct_hw);
> >+	vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> >+			     &cfg->queue_desc_hi);
> >+	vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> >+			     &cfg->queue_avail_hi);
> >+	vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> >+			     &cfg->queue_used_hi);
> >+
> >+	return 0;
> >+}
> >+
> >+int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct
> >+vdpa_vq_state *state) {
> >+	return octep_process_mbox(oct_hw,
> OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
> >+				  sizeof(*state), 0);
> >+}
> >+
> >+int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct
> >+vdpa_vq_state *state) {
> >+	struct vdpa_vq_state q_state;
> >+
> >+	memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
> >+	return octep_process_mbox(oct_hw,
> OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
> >+				  sizeof(*state), 1);
> >+}
> >+
> >+void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num) {
> >+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> >+
> >+	octep_write_queue_select(qid, oct_hw);
> >+	iowrite16(num, &cfg->queue_size);
> >+}
> >+
> >+void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
> >+{
> >+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> >+
> >+	octep_write_queue_select(qid, oct_hw);
> >+	iowrite16(ready, &cfg->queue_enable); }
> >+
> >+bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid) {
> >+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> >+
> >+	octep_write_queue_select(qid, oct_hw);
> >+	return ioread16(&cfg->queue_enable);
> >+}
> >+
> >+u16 octep_get_vq_size(struct octep_hw *oct_hw) {
> >+	octep_write_queue_select(0, oct_hw);
> >+	return ioread16(&oct_hw->common_cfg->queue_size);
> >+}
> >+
> >+static u32 octep_get_config_size(struct octep_hw *oct_hw) {
> >+	return sizeof(struct virtio_net_config); }
> >+
> >+static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct
> >+virtio_pci_cap *cap) {
> >+	struct device *dev = &oct_hw->pdev->dev;
> >+	u32 length = cap->length;
> >+	u32 offset = cap->offset;
> 
> Should we use le32_to_cpu() to access that fields (lenght, offset)?
> 
> >+	u8  bar    = cap->bar;
> >+	u32 len;
> >+
> >+	if (bar != OCTEP_HW_CAPS_BAR) {
> >+		dev_err(dev, "Invalid bar: %u\n", bar);
> >+		return NULL;
> >+	}
> >+	if (offset + length < offset) {
> >+		dev_err(dev, "offset(%u) + length(%u) overflows\n",
> >+			offset, length);
> >+		return NULL;
> >+	}
> >+	len = pci_resource_len(oct_hw->pdev, bar);
> >+	if (offset + length > len) {
> >+		dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
> >+			offset + length, len);
> >+		return NULL;
> >+	}
> >+	return oct_hw->base[bar] + offset;
> >+}
> >+
> >+static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t
> >+len, off_t offset) {
> >+	u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
> >+	u8 *p = buf;
> >+	size_t i;
> >+
> >+	for (i = 0; i < len; i++)
> >+		*p++ = ioread8(bar + offset + i);
> >+}
> >+
> >+static int pci_signature_verify(struct octep_hw *oct_hw) {
> >+	u32 signature[2];
> >+
> >+	pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
> >+
> >+	if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
> >+		return -1;
> >+
> >+	if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
> >+		return -1;
> >+
> >+	return 0;
> >+}
> >+
> >+int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
> >+{
> >+	struct octep_mbox __iomem *mbox;
> >+	struct device *dev = &pdev->dev;
> >+	struct virtio_pci_cap cap;
> >+	int ret;
> >+	u8 pos;
> >+
> >+	oct_hw->pdev = pdev;
> >+	ret = pci_signature_verify(oct_hw);
> >+	if (ret) {
> >+		dev_err(dev, "Octeon Virtio FW is not initialized\n");
> >+		return -EIO;
> >+	}
> >+
> >+	pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
> >+
> >+	while (pos) {
> >+		pci_caps_read(oct_hw, &cap, 2, pos);
> >+
> >+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
> >+			dev_err(dev, "Found invalid capability vndr id: %d\n",
> cap.cap_vndr);
> >+			break;
> >+		}
> >+
> >+		pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
> >+
> >+		dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len:
> %u\n",
> >+			 pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
> >+
> >+		switch (cap.cfg_type) {
> >+		case VIRTIO_PCI_CAP_COMMON_CFG:
> >+			oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
> >+			break;
> >+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
> >+			pci_caps_read(oct_hw, &oct_hw-
> >notify_off_multiplier,
> >+				      4, pos + sizeof(cap));
> >+
> >+			oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
> >+			oct_hw->notify_bar = cap.bar;
> >+			oct_hw->notify_base_pa = pci_resource_start(pdev,
> cap.bar) +
> >+cap.offset;
> 
> Should we use le32_to_cpu() to access cap.offset?
> 
> >+			break;
> >+		case VIRTIO_PCI_CAP_DEVICE_CFG:
> >+			oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
> >+			break;
> >+		case VIRTIO_PCI_CAP_ISR_CFG:
> >+			oct_hw->isr = get_cap_addr(oct_hw, &cap);
> >+			break;
> >+		}
> >+
> >+		pos = cap.cap_next;
> >+	}
> >+	if (!oct_hw->common_cfg || !oct_hw->notify_base ||
> >+	    !oct_hw->dev_cfg    || !oct_hw->isr) {
> >+		dev_err(dev, "Incomplete PCI capabilities");
> >+		return -EIO;
> >+	}
> >+	oct_hw->config_size = octep_get_config_size(oct_hw);
> >+
> >+	mbox = octep_get_mbox(oct_hw);
> >+	octep_mbox_init(mbox);
> >+
> >+	dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw-
> >common_cfg);
> >+	dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw-
> >dev_cfg);
> >+	dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
> >+	dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
> >+		 (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
> >+	dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
> >+
> >+	return 0;
> >+}
> >diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> >b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> >new file mode 100644
> >index 000000000000..845fd35368ff
> >--- /dev/null
> >+++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> >@@ -0,0 +1,903 @@
> >+// SPDX-License-Identifier: GPL-2.0-only
> >+/* Copyright (C) 2024 Marvell. */
> >+
> >+#include <linux/interrupt.h>
> >+#include <linux/module.h>
> >+#include <linux/iommu.h>
> >+#include "octep_vdpa.h"
> >+
> >+#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
> >+
> >+struct octep_pf {
> >+	u8 __iomem *base[PCI_STD_NUM_BARS];
> >+	struct pci_dev *pdev;
> >+	struct resource res;
> >+	u64 vf_base;
> >+	int enabled_vfs;
> >+	u32 vf_stride;
> >+	u16 vf_devid;
> >+};
> >+
> >+struct octep_vdpa {
> >+	struct vdpa_device vdpa;
> >+	struct octep_hw *oct_hw;
> >+	struct pci_dev *pdev;
> >+};
> >+
> >+struct octep_vdpa_mgmt_dev {
> >+	struct vdpa_mgmt_dev mdev;
> >+	struct octep_hw oct_hw;
> >+	struct pci_dev *pdev;
> >+	/* Work entry to handle device setup */
> >+	struct work_struct setup_task;
> >+	/* Device status */
> >+	atomic_t status;
> >+};
> >+
> >+static int verify_features(u64 features) {
> >+	/* Minimum features to expect */
> >+	if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
> >+		return -EOPNOTSUPP;
> >+
> >+	if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
> >+		return -EOPNOTSUPP;
> >+
> >+	if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
> >+		return -EOPNOTSUPP;
> >+
> >+	/* Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
> >+	 * requirements: "VIRTIO_NET_F_MQ Requires
> VIRTIO_NET_F_CTRL_VQ".
> >+	 */
> >+	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) |
> BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
> >+	    BIT_ULL(VIRTIO_NET_F_MQ))
> >+		return -EINVAL;
> >+
> >+	return 0;
> >+}
> >+
> >+static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
> >+{
> >+	struct octep_vdpa *oct_vdpa;
> >+
> >+	oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
> >+
> >+	return oct_vdpa->oct_hw;
> >+}
> >+
> >+static irqreturn_t octep_vdpa_intr_handler(int irq, void *data) {
> >+	struct octep_hw *oct_hw = data;
> >+	int i;
> >+
> >+	for (i = 0; i < oct_hw->nr_vring; i++) {
> >+		if (oct_hw->vqs[i].cb.callback && *oct_hw-
> >vqs[i].cb_notify_addr) {
> >+			*oct_hw->vqs[i].cb_notify_addr = 0;
> >+			oct_hw->vqs[i].cb.callback(oct_hw-
> >vqs[i].cb.private);
> >+		}
> >+	}
> >+
> >+	return IRQ_HANDLED;
> >+}
> >+
> >+static void octep_free_irqs(struct octep_hw *oct_hw) {
> >+	struct pci_dev *pdev = oct_hw->pdev;
> >+
> >+	if (oct_hw->irq != -1) {
> >+		devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
> >+		oct_hw->irq = -1;
> >+	}
> >+	pci_free_irq_vectors(pdev);
> >+}
> >+
> >+static int octep_request_irqs(struct octep_hw *oct_hw) {
> >+	struct pci_dev *pdev = oct_hw->pdev;
> >+	int ret, irq;
> >+
> >+	/* Use one ring/interrupt per VF for virtio call interface. */
> >+	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
> >+	if (ret < 0) {
> >+		dev_err(&pdev->dev, "Failed to alloc msix vector");
> >+		return ret;
> >+	}
> >+
> >+	snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs-
> >msix_name),
> >+		 OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
> >+
> >+	irq = pci_irq_vector(pdev, 0);
> >+	ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
> >+			       oct_hw->vqs->msix_name, oct_hw);
> >+	if (ret) {
> >+		dev_err(&pdev->dev, "Failed to register interrupt handler\n");
> >+		goto free_irq_vec;
> >+	}
> >+	oct_hw->irq = irq;
> >+
> >+	return 0;
> >+
> >+free_irq_vec:
> >+	pci_free_irq_vectors(pdev);
> >+	return ret;
> >+}
> >+
> >+static u64 octep_vdpa_get_device_features(struct vdpa_device
> >+*vdpa_dev) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return oct_hw->features;
> >+}
> >+
> >+static int octep_vdpa_set_driver_features(struct vdpa_device
> >+*vdpa_dev, u64 features) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+	int ret;
> >+
> >+	pr_debug("Driver Features: %llx\n", features);
> >+	ret = verify_features(features);
> >+	if (ret)
> >+		return ret;
> >+
> >+	octep_hw_set_drv_features(oct_hw, features);
> >+	oct_hw->drv_features = features;
> >+
> >+	return 0;
> >+}
> >+
> >+static u64 octep_vdpa_get_driver_features(struct vdpa_device
> >+*vdpa_dev) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return oct_hw->features & oct_hw->drv_features; }
> >+
> >+static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return octep_hw_get_status(oct_hw);
> >+}
> >+
> >+static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8
> >+status) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+	u8 status_old;
> >+
> >+	status_old = octep_hw_get_status(oct_hw);
> >+
> >+	if (status_old == status)
> >+		return;
> >+
> >+	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
> >+	    !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
> >+		if (octep_request_irqs(oct_hw))
> >+			status = status_old | VIRTIO_CONFIG_S_FAILED;
> >+	}
> >+	octep_hw_set_status(oct_hw, status);
> >+}
> >+
> >+static int octep_vdpa_reset(struct vdpa_device *vdpa_dev) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+	u8 status = octep_hw_get_status(oct_hw);
> >+	u16 qid;
> >+
> >+	if (status == 0)
> >+		return 0;
> >+
> >+	for (qid = 0; qid < oct_hw->nr_vring; qid++) {
> >+		oct_hw->vqs[qid].cb.callback = NULL;
> >+		oct_hw->vqs[qid].cb.private = NULL;
> >+		oct_hw->config_cb.callback = NULL;
> >+		oct_hw->config_cb.private = NULL;
> >+	}
> >+	octep_hw_reset(oct_hw);
> >+
> >+	if (status & VIRTIO_CONFIG_S_DRIVER_OK)
> >+		octep_free_irqs(oct_hw);
> >+
> >+	return 0;
> >+}
> >+
> >+static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return octep_get_vq_size(oct_hw);
> >+}
> >+
> >+static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> >+				   struct vdpa_vq_state *state)
> >+{
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return octep_get_vq_state(oct_hw, qid, state); }
> >+
> >+static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> >+				   const struct vdpa_vq_state *state) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return octep_set_vq_state(oct_hw, qid, state); }
> >+
> >+static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16
> >+qid, struct vdpa_callback *cb) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	oct_hw->vqs[qid].cb = *cb;
> >+}
> >+
> >+static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16
> >+qid, bool ready) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	octep_set_vq_ready(oct_hw, qid, ready); }
> >+
> >+static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16
> >+qid) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return octep_get_vq_ready(oct_hw, qid); }
> >+
> >+static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16
> >+qid, u32 num) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	octep_set_vq_num(oct_hw, qid, num);
> >+}
> >+
> >+static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16
> qid, u64 desc_area,
> >+				     u64 driver_area, u64 device_area) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
> >+	pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
> >+	pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
> >+
> >+	return octep_set_vq_address(oct_hw, qid, desc_area, driver_area,
> >+device_area); }
> >+
> >+static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
> >+{
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	octep_notify_queue(oct_hw, qid);
> >+}
> >+
> >+static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev,
> >+u32 data) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+	u16 idx = data & 0xFFFF;
> >+
> >+	vp_iowrite32(data, oct_hw->vqs[idx].notify_addr); }
> >+
> >+static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return vp_ioread8(&oct_hw->common_cfg->config_generation);
> >+}
> >+
> >+static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev) {
> >+	return VIRTIO_ID_NET;
> >+}
> >+
> >+static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) {
> >+	return PCI_VENDOR_ID_CAVIUM;
> >+}
> >+
> >+static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) {
> >+	return PAGE_SIZE;
> >+}
> >+
> >+static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
> >+{
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	return oct_hw->config_size;
> >+}
> >+
> >+static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned
> int offset, void *buf,
> >+				  unsigned int len)
> >+{
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	octep_read_dev_config(oct_hw, offset, buf, len); }
> >+
> >+static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned
> int offset,
> >+				  const void *buf, unsigned int len) {
> >+	/* Not supported */
> >+}
> >+
> >+static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
> >+struct vdpa_callback *cb) {
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+
> >+	oct_hw->config_cb.callback = cb->callback;
> >+	oct_hw->config_cb.private = cb->private; }
> >+
> >+static struct vdpa_notification_area octep_get_vq_notification(struct
> vdpa_device *vdpa_dev,
> >+							       u16 idx)
> >+{
> >+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> >+	struct vdpa_notification_area area;
> >+
> >+	area.addr = oct_hw->vqs[idx].notify_pa;
> >+	area.size = PAGE_SIZE;
> >+
> >+	return area;
> >+}
> >+
> >+static int octep_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
> >+			      struct vhost_iotlb *iotlb)
> >+{
> >+	return 0;
> >+}
> >+
> >+static struct vdpa_config_ops octep_vdpa_ops = {
> >+	.get_device_features = octep_vdpa_get_device_features,
> >+	.set_driver_features = octep_vdpa_set_driver_features,
> >+	.get_driver_features = octep_vdpa_get_driver_features,
> >+	.get_status	= octep_vdpa_get_status,
> >+	.set_status	= octep_vdpa_set_status,
> >+	.reset		= octep_vdpa_reset,
> >+	.get_vq_num_max	= octep_vdpa_get_vq_num_max,
> >+	.get_vq_state	= octep_vdpa_get_vq_state,
> >+	.set_vq_state	= octep_vdpa_set_vq_state,
> >+	.set_vq_cb	= octep_vdpa_set_vq_cb,
> >+	.set_vq_ready	= octep_vdpa_set_vq_ready,
> >+	.get_vq_ready	= octep_vdpa_get_vq_ready,
> >+	.set_vq_num	= octep_vdpa_set_vq_num,
> >+	.set_vq_address	= octep_vdpa_set_vq_address,
> >+	.get_vq_irq	= NULL,
> >+	.kick_vq	= octep_vdpa_kick_vq,
> >+	.kick_vq_with_data	= octep_vdpa_kick_vq_with_data,
> >+	.get_generation	= octep_vdpa_get_generation,
> >+	.get_device_id	= octep_vdpa_get_device_id,
> >+	.get_vendor_id	= octep_vdpa_get_vendor_id,
> >+	.get_vq_align	= octep_vdpa_get_vq_align,
> >+	.get_config_size	= octep_vdpa_get_config_size,
> >+	.get_config	= octep_vdpa_get_config,
> >+	.set_config	= octep_vdpa_set_config,
> >+	.set_config_cb  = octep_vdpa_set_config_cb,
> >+	.get_vq_notification = octep_get_vq_notification, };
> >+
> >+static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl,
> >+u8 bar) {
> >+	int ret;
> >+
> >+	ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
> >+	if (ret) {
> >+		dev_err(&pdev->dev, "Failed to request BAR:%u region\n",
> bar);
> >+		return ret;
> >+	}
> >+
> >+	tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
> >+	if (!tbl[bar]) {
> >+		dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
> >+		pci_release_region(pdev, bar);
> >+		ret = -ENOMEM;
> >+	}
> >+
> >+	return ret;
> >+}
> >+
> >+static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem
> >+**tbl, u8 bar) {
> >+	pci_iounmap(pdev, tbl[bar]);
> >+	pci_release_region(pdev, bar);
> >+}
> >+
> >+static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf) {
> >+	struct pci_dev *pf_dev = octpf->pdev;
> >+	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> >+	struct pci_bus_region bus_region;
> >+
> >+	octpf->res.start = res->start;
> >+	octpf->res.end = res->end;
> >+	octpf->vf_base = res->start;
> >+
> >+	bus_region.start = res->start;
> >+	bus_region.end = res->start - 1;
> >+
> >+	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region); }
> >+
> >+static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf) {
> >+	struct pci_dev *pf_dev = octpf->pdev;
> >+	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> >+	struct pci_bus_region bus_region;
> >+
> >+	bus_region.start = octpf->res.start;
> >+	bus_region.end = octpf->res.end;
> >+
> >+	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region); }
> >+
> >+static void octep_vdpa_remove_pf(struct pci_dev *pdev) {
> >+	struct octep_pf *octpf = pci_get_drvdata(pdev);
> >+
> >+	pci_disable_sriov(pdev);
> >+
> >+	if (octpf->base[OCTEP_HW_CAPS_BAR])
> >+		octep_iounmap_region(pdev, octpf->base,
> OCTEP_HW_CAPS_BAR);
> >+
> >+	if (octpf->base[OCTEP_HW_MBOX_BAR])
> >+		octep_iounmap_region(pdev, octpf->base,
> OCTEP_HW_MBOX_BAR);
> >+
> >+	octep_vdpa_pf_bar_expand(octpf);
> >+}
> >+
> >+static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev) {
> >+	struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
> >+
> >+	memset(vf_res, 0, sizeof(*vf_res));
> >+}
> >+
> >+static void octep_vdpa_remove_vf(struct pci_dev *pdev) {
> >+	struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
> >+	struct octep_hw *oct_hw;
> >+	int status;
> >+
> >+	oct_hw = &mgmt_dev->oct_hw;
> >+	status = atomic_read(&mgmt_dev->status);
> >+	atomic_set(&mgmt_dev->status,
> OCTEP_VDPA_DEV_STATUS_UNINIT);
> >+
> >+	if (status == OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT) {
> >+		cancel_work_sync(&mgmt_dev->setup_task);
> >+	} else if (status == OCTEP_VDPA_DEV_STATUS_READY) {
> >+		vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
> >+		kfree(mgmt_dev->oct_hw.vqs);
> >+	}
> >+
> >+	if (oct_hw->base[OCTEP_HW_CAPS_BAR])
> >+		octep_iounmap_region(pdev, oct_hw->base,
> OCTEP_HW_CAPS_BAR);
> >+
> >+	if (oct_hw->base[OCTEP_HW_MBOX_BAR])
> >+		octep_iounmap_region(pdev, oct_hw->base,
> OCTEP_HW_MBOX_BAR);
> >+
> >+	octep_vdpa_vf_bar_shrink(pdev);
> >+}
> >+
> >+static void octep_vdpa_remove(struct pci_dev *pdev) {
> >+	if (pdev->is_virtfn)
> >+		octep_vdpa_remove_vf(pdev);
> >+	else
> >+		octep_vdpa_remove_pf(pdev);
> >+}
> >+
> >+static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char
> *name,
> >+			      const struct vdpa_dev_set_config *config) {
> >+	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev,
> struct octep_vdpa_mgmt_dev, mdev);
> >+	struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
> >+	struct pci_dev *pdev = oct_hw->pdev;
> >+	struct vdpa_device *vdpa_dev;
> >+	struct octep_vdpa *oct_vdpa;
> >+	u64 device_features;
> >+	u16 notify_off;
> >+	int i, ret;
> >+
> >+	oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev,
> &octep_vdpa_ops, 1, 1,
> >+				     NULL, false);
> >+	if (IS_ERR(oct_vdpa)) {
> >+		dev_err(&pdev->dev, "Failed to allocate vDPA structure for
> octep vdpa device");
> >+		return PTR_ERR(oct_vdpa);
> >+	}
> >+
> >+	oct_vdpa->pdev = pdev;
> >+	oct_vdpa->vdpa.dma_dev = &pdev->dev;
> >+	oct_vdpa->vdpa.mdev = mdev;
> >+	oct_vdpa->oct_hw = oct_hw;
> >+	vdpa_dev = &oct_vdpa->vdpa;
> >+
> >+	device_features = oct_hw->features;
> >+	if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> >+		if (config->device_features & ~device_features) {
> >+			dev_err(&pdev->dev, "The provisioned features 0x%llx
> are not supported by this device with features 0x%llx\n",
> >+				config->device_features, device_features);
> >+			return -EINVAL;
> >+		}
> >+		device_features &= config->device_features;
> >+	}
> >+
> >+	oct_hw->features = device_features;
> >+
> >+	if (verify_features(device_features)) {
> >+		dev_warn(mdev->device,
> >+			 "Must provision minimum features 0x%llx for this
> device",
> >+			 BIT_ULL(VIRTIO_F_VERSION_1) |
> BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
> >+			 BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) |
> BIT_ULL(VIRTIO_F_IN_ORDER));
> >+		return -EOPNOTSUPP;
> 
> What about returning the return value of verify_features()?
Yes, anything would be fine.
> 
> >+	}
> >+
> >+	oct_hw->vqs = kcalloc(oct_hw->nr_vring, sizeof(*oct_hw->vqs),
> >+GFP_KERNEL);
> 
> What about using devm_kcalloc()?
> Memory allocated with that should be automatically freed on driver detach.
Sure, I will change it to devm_kcalloc.

> 
> >+	if (!oct_hw->vqs)
> >+		return -ENOMEM;
> >+
> >+	oct_hw->irq = -1;
> >+
> >+	dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
> >+	dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw-
> >nr_vring);
> >+
> >+	for (i = 0; i < oct_hw->nr_vring; i++) {
> >+		octep_write_queue_select(i, oct_hw);
> >+		notify_off = vp_ioread16(&oct_hw->common_cfg-
> >queue_notify_off);
> >+		oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
> >+			notify_off * oct_hw->notify_off_multiplier;
> >+		oct_hw->vqs[i].cb_notify_addr = (u32 *)oct_hw-
> >vqs[i].notify_addr + 1;
> >+		oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
> >+			notify_off * oct_hw->notify_off_multiplier;
> >+	}
> >+
> >+	if (name)
> >+		ret = dev_set_name(&vdpa_dev->dev, "%s", name);
> >+	else
> >+		ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev-
> >index);
> >+
> >+	ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
> >+	if (ret) {
> >+		dev_err(&pdev->dev, "Failed to register to vDPA bus");
> >+		goto free_vqs;
> >+	}
> >+	return 0;
> >+
> >+free_vqs:
> >+	put_device(&oct_vdpa->vdpa.dev);
> >+	kfree(oct_hw->vqs);
> >+	return ret;
> >+}
> >+
> >+static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct
> >+vdpa_device *dev) {
> >+	_vdpa_unregister_device(dev);
> >+}
> >+
> >+static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
> >+	.dev_add = octep_vdpa_dev_add,
> >+	.dev_del = octep_vdpa_dev_del
> >+};
> >+
> >+static bool get_device_ready_status(u8 __iomem *addr) {
> >+	u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
> >+
> >+	if (signature == OCTEP_DEV_READY_SIGNATURE) {
> >+		writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
> >+		return true;
> >+	}
> >+
> >+	return false;
> >+}
> >+
> >+static struct virtio_device_id id_table[] = {
> >+	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
> >+	{ 0 },
> >+};
> >+
> >+static void octep_vdpa_setup_task(struct work_struct *work) {
> >+	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work,
> struct octep_vdpa_mgmt_dev,
> >+							    setup_task);
> >+	struct pci_dev *pdev = mgmt_dev->pdev;
> >+	struct device *dev = &pdev->dev;
> >+	struct octep_hw *oct_hw;
> >+	unsigned long timeout;
> >+	int ret;
> >+
> >+	oct_hw = &mgmt_dev->oct_hw;
> >+
> >+	atomic_set(&mgmt_dev->status,
> >+OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
> >+
> >+	/* Wait for a maximum of 5 sec */
> >+	timeout = jiffies + msecs_to_jiffies(5000);
> >+	while (!time_after(jiffies, timeout)) {
> >+		if (get_device_ready_status(oct_hw-
> >base[OCTEP_HW_MBOX_BAR])) {
> >+			atomic_set(&mgmt_dev->status,
> OCTEP_VDPA_DEV_STATUS_INIT);
> >+			break;
> >+		}
> >+
> >+		if (atomic_read(&mgmt_dev->status) >=
> OCTEP_VDPA_DEV_STATUS_READY) {
> >+			dev_info(dev, "Stopping vDPA setup task.\n");
> >+			return;
> >+		}
> >+
> >+		usleep_range(1000, 1500);
> >+	}
> >+
> >+	if (atomic_read(&mgmt_dev->status) !=
> OCTEP_VDPA_DEV_STATUS_INIT) {
> >+		dev_err(dev, "BAR initialization is timed out\n");
> >+		return;
> >+	}
> >+
> >+	ret = octep_iomap_region(pdev, oct_hw->base,
> OCTEP_HW_CAPS_BAR);
> >+	if (ret)
> >+		return;
> >+
> >+	ret = octep_hw_caps_read(oct_hw, pdev);
> >+	if (ret < 0)
> >+		goto unmap_region;
> >+
> >+	oct_hw->features = octep_hw_get_dev_features(oct_hw);
> >+	ret = verify_features(oct_hw->features);
> >+	if (ret) {
> >+		dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
> >+		goto unmap_region;
> >+	}
> >+	oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg-
> >num_queues);
> >+
> >+	mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
> >+	mgmt_dev->mdev.id_table = id_table;
> >+	mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
> >+	mgmt_dev->mdev.supported_features = oct_hw->features;
> >+	mgmt_dev->mdev.config_attr_mask = (1 <<
> VDPA_ATTR_DEV_FEATURES);
> >+	mgmt_dev->mdev.device = dev;
> >+
> >+	ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
> >+	if (ret) {
> >+		dev_err(dev, "Failed to register vdpa management
> interface\n");
> >+		goto unmap_region;
> >+	}
> >+
> >+	atomic_set(&mgmt_dev->status,
> OCTEP_VDPA_DEV_STATUS_READY);
> >+
> >+	return;
> >+
> >+unmap_region:
> >+	octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> >+	oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL; }
> >+
> >+static int octep_vdpa_probe_vf(struct pci_dev *pdev) {
> >+	struct octep_vdpa_mgmt_dev *mgmt_dev;
> >+	struct device *dev = &pdev->dev;
> >+	struct iommu_domain *domain;
> >+	int ret;
> >+
> >+	ret = pcim_enable_device(pdev);
> >+	if (ret) {
> >+		dev_err(dev, "Failed to enable device\n");
> >+		return ret;
> >+	}
> >+
> >+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> >+	if (ret) {
> >+		dev_err(dev, "No usable DMA configuration\n");
> >+		return ret;
> >+	}
> >+	pci_set_master(pdev);
> >+
> >+	domain = iommu_get_domain_for_dev(dev);
> >+	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> >+		dev_info(dev, "NO-IOMMU\n");
> >+		octep_vdpa_ops.set_map = octep_vdpa_set_map;
> >+	}
> >+
> >+	mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev),
> GFP_KERNEL);
> >+	if (!mgmt_dev)
> >+		return -ENOMEM;
> >+
> >+	ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base,
> OCTEP_HW_MBOX_BAR);
> >+	if (ret)
> >+		return ret;
> >+
> >+	mgmt_dev->pdev = pdev;
> >+	pci_set_drvdata(pdev, mgmt_dev);
> >+
> >+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC);
> >+	INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task);
> >+	schedule_work(&mgmt_dev->setup_task);
> >+	dev_info(&pdev->dev, "octep vdpa mgmt device setup task is
> >+queued\n");
> >+
> >+	return 0;
> >+}
> >+
> >+static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct
> >+pci_dev *pf_dev, u8 idx) {
> >+	struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4;
> >+	struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES +
> 4;
> >+	struct octep_pf *pf = pci_get_drvdata(pf_dev);
> >+	struct pci_bus_region bus_region;
> >+
> >+	vf_res->name = pci_name(vf_dev);
> >+	vf_res->flags = pf_res->flags;
> >+	vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent;
> >+
> >+	bus_region.start = pf->vf_base + idx * pf->vf_stride;
> >+	bus_region.end = bus_region.start + pf->vf_stride - 1;
> >+	pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region); }
> >+
> >+static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int
> >+num_vfs) {
> >+	struct octep_pf *pf = pci_get_drvdata(pdev);
> >+	u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR];
> >+	int ret, i;
> >+
> >+	if (num_vfs > 0) {
> >+		struct pci_dev *vf_pdev = NULL;
> >+		bool done = false;
> >+		int index = 0;
> >+
> >+		ret = pci_enable_sriov(pdev, num_vfs);
> >+		if (ret)
> >+			return ret;
> >+
> >+		pf->enabled_vfs = num_vfs;
> >+
> >+		while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM,
> PCI_ANY_ID, vf_pdev))) {
> >+			if (vf_pdev->device != pf->vf_devid)
> >+				continue;
> >+
> >+			octep_vdpa_assign_barspace(vf_pdev, pdev, index);
> >+			if (++index == num_vfs) {
> >+				done = true;
> >+				break;
> >+			}
> >+		}
> >+
> >+		if (done) {
> >+			for (i = 0; i < pf->enabled_vfs; i++)
> >+				writeq(OCTEP_DEV_READY_SIGNATURE, addr
> + OCTEP_PF_MBOX_DATA(i));
> >+		}
> >+	} else {
> 
> I suggest moving the smaller branch to the top and possibly returning earlier.
> 
> >+		if (!pci_num_vf(pdev))
> >+			return 0;
> >+
> >+		pci_disable_sriov(pdev);
> >+		pf->enabled_vfs = 0;
> >+	}
> >+
> >+	return num_vfs;
> >+}
> >+
> >+static u16 octep_get_vf_devid(struct pci_dev *pdev) {
> >+	u16 did;
> >+
> >+	switch (pdev->device) {
> >+	case OCTEP_VDPA_DEVID_CN106K_PF:
> >+		did = OCTEP_VDPA_DEVID_CN106K_VF;
> >+		break;
> >+	case OCTEP_VDPA_DEVID_CN105K_PF:
> >+		did = OCTEP_VDPA_DEVID_CN105K_VF;
> >+		break;
> >+	case OCTEP_VDPA_DEVID_CN103K_PF:
> >+		did = OCTEP_VDPA_DEVID_CN103K_VF;
> >+		break;
> >+	default:
> >+		did = 0xFFFF;
> >+		break;
> >+	}
> >+
> >+	return did;
> >+}
> >+
> >+static int octep_vdpa_pf_setup(struct octep_pf *octpf) {
> >+	u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR];
> >+	struct pci_dev *pdev = octpf->pdev;
> >+	int totalvfs;
> >+	u64 val, len;
> >+
> >+	totalvfs = pci_sriov_get_totalvfs(pdev);
> >+	if (unlikely(!totalvfs)) {
> >+		dev_info(&pdev->dev, "Total VFs are %d in PF sriov
> configuration\n", totalvfs);
> >+		return 0;
> >+	}
> >+
> >+	addr = octpf->base[OCTEP_HW_MBOX_BAR];
> >+	val = readq(addr + OCTEP_EPF_RINFO(0));
> >+	if (val == 0) {
> >+		dev_err(&pdev->dev, "Invalid device configuration\n");
> >+		return -EINVAL;
> >+	}
> >+
> >+	if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) {
> >+		val &= ~GENMASK_ULL(35, 32);
> >+		val |= BIT_ULL(32);
> >+		writeq(val, addr + OCTEP_EPF_RINFO(0));
> >+	}
> >+
> >+	len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR);
> >+
> >+	octpf->vf_stride = len / totalvfs;
> >+	octpf->vf_devid = octep_get_vf_devid(pdev);
> >+
> >+	octep_vdpa_pf_bar_shrink(octpf);
> >+
> >+	return 0;
> >+}
> >+
> >+static int octep_vdpa_probe_pf(struct pci_dev *pdev) {
> >+	struct device *dev = &pdev->dev;
> >+	struct octep_pf *octpf;
> >+	int ret;
> >+
> >+	ret = pcim_enable_device(pdev);
> >+	if (ret) {
> >+		dev_err(dev, "Failed to enable device\n");
> >+		return ret;
> >+	}
> >+
> >+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> >+	if (ret) {
> >+		dev_err(dev, "No usable DMA configuration\n");
> >+		return ret;
> >+	}
> >+	octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL);
> >+	if (!octpf)
> >+		return -ENOMEM;
> >+
> >+	ret = octep_iomap_region(pdev, octpf->base,
> OCTEP_HW_MBOX_BAR);
> >+	if (ret)
> >+		return ret;
> >+
> >+	pci_set_master(pdev);
> >+	pci_set_drvdata(pdev, octpf);
> >+	octpf->pdev = pdev;
> >+
> >+	ret = octep_vdpa_pf_setup(octpf);
> >+	if (ret)
> >+		goto unmap_region;
> >+
> >+	return 0;
> >+
> >+unmap_region:
> >+	octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> >+	return ret;
> >+}
> >+
> >+static int octep_vdpa_probe(struct pci_dev *pdev, const struct
> >+pci_device_id *id) {
> >+	if (pdev->is_virtfn)
> >+		return octep_vdpa_probe_vf(pdev);
> >+	else
> >+		return octep_vdpa_probe_pf(pdev);
> >+}
> >+
> >+static struct pci_device_id octep_pci_vdpa_map[] = {
> >+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN106K_PF) },
> >+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN106K_VF) },
> >+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN105K_PF) },
> >+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN105K_VF) },
> >+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN103K_PF) },
> >+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM,
> OCTEP_VDPA_DEVID_CN103K_VF) },
> >+	{ 0 },
> >+};
> >+
> >+static struct pci_driver octep_pci_vdpa = {
> >+	.name     = OCTEP_VDPA_DRIVER_NAME,
> >+	.id_table = octep_pci_vdpa_map,
> >+	.probe    = octep_vdpa_probe,
> >+	.remove   = octep_vdpa_remove,
> >+	.sriov_configure = octep_vdpa_sriov_configure };
> >+
> >+module_pci_driver(octep_pci_vdpa);
> >+
> >+MODULE_AUTHOR("Marvell");
> >+MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver");
> >+MODULE_LICENSE("GPL");
> >--
> >2.25.1
> >
> >

Thanks.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-29 13:02   ` [EXTERNAL] " Srujana Challa
@ 2024-03-29 13:19     ` Stefano Garzarella
  2024-04-10 12:40       ` Srujana Challa
  0 siblings, 1 reply; 28+ messages in thread
From: Stefano Garzarella @ 2024-03-29 13:19 UTC (permalink / raw)
  To: Srujana Challa
  Cc: virtualization, mst, jasowang, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob

On Fri, Mar 29, 2024 at 01:02:28PM +0000, Srujana Challa wrote:
>> Subject: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON
>> DPU devices
>>
>> Prioritize security for external emails: Confirm sender and content safety
>> before clicking links or opening attachments
>>
>> ----------------------------------------------------------------------
>> On Wed, Mar 27, 2024 at 04:51:36PM +0530, Srujana Challa wrote:
>> >This commit introduces a new vDPA driver specifically designed for
>> >managing the virtio control plane over the vDPA bus for OCTEON DPU
>> >devices. The driver consists of two layers:
>> >
>> >1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
>> >operations and configurations related to the DPU device.
>> >
>> >2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
>> >implements device operations for the vDPA bus. It handles device
>> >probing, bus attachment, vring operations, and other relevant tasks.
>> >
>> >Signed-off-by: Srujana Challa <schalla@marvell.com>
>> >Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>> >Signed-off-by: Shijith Thotton <sthotton@marvell.com>
>> >Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>> >---
>> > MAINTAINERS                              |   5 +
>> > drivers/vdpa/Kconfig                     |   9 +
>> > drivers/vdpa/Makefile                    |   1 +
>> > drivers/vdpa/octeon_ep/Makefile          |   4 +
>> > drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
>> > drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
>> > drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903
>> +++++++++++++++++++++++
>> > 7 files changed, 1472 insertions(+)
>> > create mode 100644 drivers/vdpa/octeon_ep/Makefile  create mode
>> 100644
>> >drivers/vdpa/octeon_ep/octep_vdpa.h
>> > create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
>> > create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c

[...]

>> >diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index
>> >656c1cb541de..775149fad476 100644
>> >--- a/drivers/vdpa/Kconfig
>> >+++ b/drivers/vdpa/Kconfig
>> >@@ -126,4 +126,13 @@ config PDS_VDPA
>> > 	  With this driver, the VirtIO dataplane can be
>> > 	  offloaded to an AMD/Pensando DSC device.
>> >
>> >+config OCTEONEP_VDPA
>> >+	tristate "vDPA driver for Octeon DPU devices"
>> >+	depends on m
>>
>> Just for curiosity, why restrict it to module-only?
>
>There is an etherent driver for the same class of devices. So,
>we made this vDPA driver as module.

Sorry, I didn`t get it.

What would be the problem if we don`t put `depends on m` here?

>
>>
>> >+	depends on PCI_MSI
>> >+	help
>> >+	  vDPA driver for Marvell's Octeon DPU devices.
>> >+	  With this driver, the VirtIO dataplane can be
>> >+	  offloaded to a Octeon DPU device.
>> >+
>> > endif # VDPA

[...]

>> >+
>> >+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid) {
>> >+	iowrite16(qid, oct_hw->vqs[qid].notify_addr); }
>> >+
>> >+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void
>> >+*dst, int length) {
>> >+	u8 old_gen, new_gen, *p;
>> >+	int i;
>> >+
>> >+	WARN_ON(offset + length > oct_hw->config_size);
>>                 ^
>> Should we return early in this case?
>Yes, I will move this to caller of this function.

Otherwise you can do this here:

	if (WARN_ON(offset + length > oct_hw->config_size))
		return;

Thanks,
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-29 12:34   ` [EXTERNAL] " Srujana Challa
@ 2024-03-31 11:31     ` Michael S. Tsirkin
  2024-04-07  3:24       ` Jason Wang
  2024-04-07  3:34     ` Jason Wang
  1 sibling, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2024-03-31 11:31 UTC (permalink / raw)
  To: Srujana Challa
  Cc: Jason Wang, virtualization, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

On Fri, Mar 29, 2024 at 12:34:34PM +0000, Srujana Challa wrote:
> > > +static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned
> > int offset,
> > > +                                 const void *buf, unsigned int len)
> > > +{
> > > +       /* Not supported */
> > 
> > We probably need to filter out VIRTIO_NET_F_ANNOUNCE and other
> > features that depend on the config writing.
> As per modern virtio spec(v1.2), device config is read only for the driver.
> Still do we need to filter out these features?

Yes I didn't get this either. What is VIRTIO_NET_F_ANNOUNCE?
VIRTIO_NET_F_GUEST_ANNOUNCE? And how does it depend on config
writing?

-- 
MST


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-31 11:31     ` Michael S. Tsirkin
@ 2024-04-07  3:24       ` Jason Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2024-04-07  3:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Srujana Challa, virtualization, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

On Sun, Mar 31, 2024 at 7:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Mar 29, 2024 at 12:34:34PM +0000, Srujana Challa wrote:
> > > > +static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned
> > > int offset,
> > > > +                                 const void *buf, unsigned int len)
> > > > +{
> > > > +       /* Not supported */
> > >
> > > We probably need to filter out VIRTIO_NET_F_ANNOUNCE and other
> > > features that depend on the config writing.
> > As per modern virtio spec(v1.2), device config is read only for the driver.
> > Still do we need to filter out these features?
>
> Yes I didn't get this either. What is VIRTIO_NET_F_ANNOUNCE?
> VIRTIO_NET_F_GUEST_ANNOUNCE?

Yes.

> And how does it depend on config
> writing?

My bad, it should have nothing. So the part looks fine.

Thanks

>
> --
> MST
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-29 12:34   ` [EXTERNAL] " Srujana Challa
  2024-03-31 11:31     ` Michael S. Tsirkin
@ 2024-04-07  3:34     ` Jason Wang
  2024-04-10 10:15       ` Srujana Challa
  1 sibling, 1 reply; 28+ messages in thread
From: Jason Wang @ 2024-04-07  3:34 UTC (permalink / raw)
  To: Srujana Challa
  Cc: virtualization, mst, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

On Fri, Mar 29, 2024 at 8:34 PM Srujana Challa <schalla@marvell.com> wrote:
>
> > Subject: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON
> > DPU devices
> >
> > Prioritize security for external emails: Confirm sender and content safety
> > before clicking links or opening attachments
> >
> > ----------------------------------------------------------------------
> > On Wed, Mar 27, 2024 at 7:22 PM Srujana Challa <schalla@marvell.com>
> > wrote:
> > >
> > > This commit introduces a new vDPA driver specifically designed for
> > > managing the virtio control plane over the vDPA bus for OCTEON DPU
> > > devices. The driver consists of two layers:
> > >
> > > 1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
> > > operations and configurations related to the DPU device.
> > >
> > > 2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
> > > implements device operations for the vDPA bus. It handles device
> > > probing, bus attachment, vring operations, and other relevant tasks.
> > >
> > > Signed-off-by: Srujana Challa <schalla@marvell.com>
> > > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > > Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> > > Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
> > > ---
> > >  MAINTAINERS                              |   5 +
> > >  drivers/vdpa/Kconfig                     |   9 +
> > >  drivers/vdpa/Makefile                    |   1 +
> > >  drivers/vdpa/octeon_ep/Makefile          |   4 +
> > >  drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
> > >  drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
> > >  drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903
> > +++++++++++++++++++++++
> > >  7 files changed, 1472 insertions(+)
> > >  create mode 100644 drivers/vdpa/octeon_ep/Makefile
> > >  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
> > >  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> > >  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c
> > >
> > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > index cfe44a876d8a..539ce209a960 100644
> > > --- a/MAINTAINERS
> > > +++ b/MAINTAINERS
> > > @@ -13127,6 +13127,11 @@ S:     Supported
> > >  F:     Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
> > >  F:     drivers/mmc/host/sdhci-xenon*
> > >
> > > +MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
> > > +R:     schalla@marvell.com
> > > +R:     vattunuru@marvell.com
> > > +F:     drivers/vdpa/octeon_ep/
> > > +
> > >  MATROX FRAMEBUFFER DRIVER
> > >  L:     linux-fbdev@vger.kernel.org
> > >  S:     Orphan
> > > diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
> > > index 656c1cb541de..775149fad476 100644
> > > --- a/drivers/vdpa/Kconfig
> > > +++ b/drivers/vdpa/Kconfig
> > > @@ -126,4 +126,13 @@ config PDS_VDPA
> > >           With this driver, the VirtIO dataplane can be
> > >           offloaded to an AMD/Pensando DSC device.
> > >
> > > +config OCTEONEP_VDPA
> > > +       tristate "vDPA driver for Octeon DPU devices"
> > > +       depends on m
> > > +       depends on PCI_MSI
> > > +       help
> > > +         vDPA driver for Marvell's Octeon DPU devices.
> > > +         With this driver, the VirtIO dataplane can be
> > > +         offloaded to a Octeon DPU device.
> > > +
> > >  endif # VDPA
> > > diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
> > > index 8f53c6f3cca7..5654d36707af 100644
> > > --- a/drivers/vdpa/Makefile
> > > +++ b/drivers/vdpa/Makefile
> > > @@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
> > >  obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
> > >  obj-$(CONFIG_SNET_VDPA) += solidrun/
> > >  obj-$(CONFIG_PDS_VDPA) += pds/
> > > +obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
> > > diff --git a/drivers/vdpa/octeon_ep/Makefile
> > b/drivers/vdpa/octeon_ep/Makefile
> > > new file mode 100644
> > > index 000000000000..e23e2ff14f33
> > > --- /dev/null
> > > +++ b/drivers/vdpa/octeon_ep/Makefile
> > > @@ -0,0 +1,4 @@
> > > +# SPDX-License-Identifier: GPL-2.0
> > > +obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
> > > +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
> > > +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
> > > diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h
> > b/drivers/vdpa/octeon_ep/octep_vdpa.h
> > > new file mode 100644
> > > index 000000000000..60d2efc8f8b8
> > > --- /dev/null
> > > +++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
> > > @@ -0,0 +1,93 @@
> > > +/* SPDX-License-Identifier: GPL-2.0-only
> > > + * Copyright (C) 2024 Marvell.
> > > + */
> > > +#ifndef __OCTEP_VDPA_H__
> > > +#define __OCTEP_VDPA_H__
> > > +
> > > +#include <linux/pci.h>
> > > +#include <linux/pci_regs.h>
> > > +#include <linux/vdpa.h>
> > > +#include <linux/virtio_pci_modern.h>
> > > +#include <uapi/linux/virtio_net.h>
> > > +#include <uapi/linux/virtio_blk.h>
> > > +#include <uapi/linux/virtio_config.h>
> > > +#include <uapi/linux/virtio_pci.h>
> > > +#include <uapi/linux/vdpa.h>
> > > +
> > > +#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
> > > +#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
> > > +#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
> > > +#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
> > > +#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
> > > +#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
> > > +
> > > +#define OCTEP_HW_MBOX_BAR 0
> > > +#define OCTEP_HW_CAPS_BAR 4
> > > +
> > > +#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
> > > +
> > > +#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
> > > +#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
> > > +#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
> > > +
> > > +#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
> > > +#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
> > > +
> > > +#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
> > > +#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
> > > +
> > > +enum octep_vdpa_dev_status {
> > > +       OCTEP_VDPA_DEV_STATUS_INVALID,
> > > +       OCTEP_VDPA_DEV_STATUS_ALLOC,
> > > +       OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
> > > +       OCTEP_VDPA_DEV_STATUS_INIT,
> > > +       OCTEP_VDPA_DEV_STATUS_READY,
> > > +       OCTEP_VDPA_DEV_STATUS_UNINIT
> > > +};
> > > +
> > > +struct octep_vring_info {
> > > +       struct vdpa_callback cb;
> > > +       void __iomem *notify_addr;
> > > +       u32 __iomem *cb_notify_addr;
> > > +       phys_addr_t notify_pa;
> > > +       char msix_name[256];
> > > +};
> > > +
> > > +struct octep_hw {
> > > +       struct pci_dev *pdev;
> > > +       u8 __iomem *base[PCI_STD_NUM_BARS];
> > > +       struct virtio_pci_common_cfg __iomem *common_cfg;
> >
> > This is a hint that any chance we can reuse the modern virtio-pci
> > library (virtio_pci_modern_dev.c) instead of duplicating codes?
> Yes, the modern pci library would be a great help but in current
> Octeon DPU device, the virtio config space completely emulated
> by the device's firmware. So, the standard pci config read apis can't
> be used for probing the virtio configuration. Hence, it's not possible to
> use vp_modern_probe() here.

I see, please add a comment somewhere for this.

>
> >
> > > +       u8 __iomem *dev_cfg;
> > > +       u8 __iomem *isr;
> > > +       void __iomem *notify_base;
> > > +       phys_addr_t notify_base_pa;
> > > +       u32 notify_off_multiplier;
> > > +       u8 notify_bar;
> > > +       struct octep_vring_info *vqs;
> > > +       struct vdpa_callback config_cb;
> > > +       u64 features;
> > > +       u64 drv_features;
> > > +       u16 nr_vring;
> > > +       u32 config_size;
> > > +       int irq;
> > > +};
> > > +
> > > +u8 octep_hw_get_status(struct octep_hw *oct_hw);
> > > +void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
> > > +void octep_hw_reset(struct octep_hw *oct_hw);
> > > +void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw);
> > > +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
> > > +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void
> > *dst, int length);
> > > +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64
> > desc_area, u64 driver_area,
> > > +                        u64 device_area);
> > > +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
> > > +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
> > > +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
> > > +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct
> > vdpa_vq_state *state);
> > > +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct
> > vdpa_vq_state *state);
> > > +u16 octep_get_vq_size(struct octep_hw *oct_hw);
> > > +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
> > > +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
> > > +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
> > > +
> > > +#endif /* __OCTEP_VDPA_H__ */
> > > diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> > b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> > > new file mode 100644
> > > index 000000000000..0a2f1d09c4ab
> > > --- /dev/null
> > > +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> > > @@ -0,0 +1,457 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/* Copyright (C) 2024 Marvell. */
> > > +
> > > +#include <linux/iopoll.h>
> > > +
> > > +#include "octep_vdpa.h"
> > > +
> > > +enum octep_mbox_ids {
> > > +       OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
> > > +       OCTEP_MBOX_MSG_GET_VQ_STATE,
> > > +};
> > > +
> > > +#define OCTEP_HW_TIMEOUT       10000000
> > > +
> > > +#define MBOX_OFFSET            64
> > > +#define MBOX_RSP_MASK          0x00000001
> > > +#define MBOX_RC_MASK           0x0000FFFE
> > > +
> > > +#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
> > > +#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
> > > +#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK |
> > MBOX_RSP_MASK))
> > > +
> > > +struct octep_mbox_hdr {
> > > +       u8 ver;
> > > +       u8 rsvd1;
> > > +       u16 id;
> > > +       u16 rsvd2;
> > > +#define MBOX_REQ_SIG (0xdead)
> > > +#define MBOX_RSP_SIG (0xbeef)
> > > +       u16 sig;
> > > +};
> > > +
> > > +struct octep_mbox_sts {
> > > +       u16 rsp:1;
> > > +       u16 rc:15;
> > > +       u16 rsvd;
> > > +};
> > > +
> > > +struct octep_mbox {
> > > +       struct octep_mbox_hdr hdr;
> > > +       struct octep_mbox_sts sts;
> > > +       u64 rsvd;
> > > +       u32 data[];
> > > +};
> > > +
> > > +static inline struct octep_mbox __iomem *octep_get_mbox(struct
> > octep_hw *oct_hw)
> > > +{
> > > +       return (struct octep_mbox __iomem *)(oct_hw->dev_cfg +
> > MBOX_OFFSET);
> > > +}
> > > +
> > > +static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem
> > *mbox)
> > > +{
> > > +       u32 val;
> > > +
> > > +       return readx_poll_timeout(ioread32, &mbox->sts, val,
> > MBOX_AVAIL(val), 10,
> > > +                                 OCTEP_HW_TIMEOUT);
> > > +}
> > > +
> > > +static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem
> > *mbox)
> > > +{
> > > +       u32 val;
> > > +
> > > +       return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val),
> > 10,
> > > +                                 OCTEP_HW_TIMEOUT);
> > > +}
> > > +
> > > +static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16
> > id, u16 sig)
> > > +{
> > > +       iowrite16(id, &mbox->hdr.id);
> > > +       iowrite16(sig, &mbox->hdr.sig);
> > > +}
> > > +
> > > +static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
> > > +{
> > > +       return ioread16(&mbox->hdr.sig);
> > > +}
> > > +
> > > +static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32
> > sts)
> > > +{
> > > +       iowrite32(sts, &mbox->sts);
> > > +}
> > > +
> > > +static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
> > > +{
> > > +       return ioread32(&mbox->sts);
> > > +}
> > > +
> > > +static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox,
> > u16 word_idx)
> > > +{
> > > +       return ioread32(&mbox->data[word_idx]);
> > > +}
> > > +
> > > +static inline void octep_write32_word(struct octep_mbox __iomem *mbox,
> > u16 word_idx, u32 word)
> > > +{
> > > +       return iowrite32(word, &mbox->data[word_idx]);
> > > +}
> > > +
> > > +static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid,
> > void *buffer,
> > > +                             u32 buf_size, bool write)
> > > +{
> > > +       struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
> > > +       struct pci_dev *pdev = oct_hw->pdev;
> > > +       u32 *p = (u32 *)buffer;
> > > +       u16 data_wds;
> > > +       int ret, i;
> > > +       u32 val;
> > > +
> > > +       if (!IS_ALIGNED(buf_size, 4))
> > > +               return -EINVAL;
> > > +
> > > +       /* Make sure mbox space is available */
> > > +       ret = octep_wait_for_mbox_avail(mbox);
> > > +       if (ret) {
> > > +               dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to
> > be consumed\n");
> > > +               return ret;
> > > +       }
> > > +       data_wds = buf_size / 4;
> > > +
> > > +       if (write) {
> > > +               for (i = 1; i <= data_wds; i++) {
> > > +                       octep_write32_word(mbox, i, *p);
> > > +                       p++;
> > > +               }
> > > +       }
> > > +       octep_write32_word(mbox, 0, (u32)qid);
> > > +       octep_write_sts(mbox, 0);
> > > +
> > > +       octep_write_hdr(mbox, id, MBOX_REQ_SIG);
> > > +
> > > +       ret = octep_wait_for_mbox_rsp(mbox);
> > > +       if (ret) {
> > > +               dev_warn(&pdev->dev, "Timeout waiting for mbox : %d
> > response\n", id);
> > > +               return ret;
> > > +       }
> > > +
> > > +       val = octep_read_sig(mbox);
> > > +       if ((val & 0xFFFF) != MBOX_RSP_SIG) {
> > > +               dev_warn(&pdev->dev, "Invalid Signature from mbox : %d
> > response\n", id);
> > > +               return ret;
> > > +       }
> > > +
> > > +       val = octep_read_sts(mbox);
> > > +       if (val & MBOX_RC_MASK) {
> > > +               ret = MBOX_RSP_TO_ERR(val);
> > > +               dev_warn(&pdev->dev, "Error while processing mbox : %d, err
> > %d\n", id, ret);
> > > +               return ret;
> > > +       }
> > > +
> > > +       if (!write)
> > > +               for (i = 1; i <= data_wds; i++)
> > > +                       *p++ = octep_read32_word(mbox, i);
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static void octep_mbox_init(struct octep_mbox __iomem *mbox)
> > > +{
> > > +       iowrite32(1, &mbox->sts);
> > > +}
> > > +
> > > +u8 octep_hw_get_status(struct octep_hw *oct_hw)
> > > +{
> > > +       return ioread8(&oct_hw->common_cfg->device_status);
> > > +}
> > > +
> > > +void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
> > > +{
> > > +       iowrite8(status, &oct_hw->common_cfg->device_status);
> > > +}
> > > +
> > > +void octep_hw_reset(struct octep_hw *oct_hw)
> > > +{
> > > +       u8 val;
> > > +
> > > +       octep_hw_set_status(oct_hw, 0 | BIT(7));
> >
> > Could we have macros for the magic numbers like 7 and 15 below?
> Sure, will add proper macros in next version.
>
> >
> > > +       if (readx_poll_timeout(ioread8, &oct_hw->common_cfg-
> > >device_status, val, !val, 10,
> > > +                              OCTEP_HW_TIMEOUT)) {
> > > +               dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
> > > +               return;
> > > +       }
> > > +}
> > > +
> > > +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
> > > +{
> > > +       u32 features_lo, features_hi;
> > > +       u32 val, select;
> > > +
> > > +       select = 0;
> > > +       iowrite32(select | BIT(15), &oct_hw->common_cfg-
> > >device_feature_select);
> >
> > Ok I see something different with virtio. For any "select" it seems to
> > require BIT(15).
> >
> > > +
> > > +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> > >device_feature_select, val,
> > > +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> > > +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> > timeout\n", select);
> > > +               return 0ULL;
> > > +       }
> > > +       features_lo = ioread32(&oct_hw->common_cfg->device_feature);
> > > +
> > > +       select = 1;
> > > +       iowrite32(select | BIT(15), &oct_hw->common_cfg-
> > >device_feature_select);
> > > +
> > > +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> > >device_feature_select, val,
> > > +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> > > +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> > timeout\n", select);
> > > +               return 0ULL;
> > > +       }
> > > +       features_hi = ioread32(&oct_hw->common_cfg->device_feature);
> > > +
> > > +       return ((u64)features_hi << 32) | features_lo;
> > > +}
> > > +
> > > +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
> > > +{
> > > +       u32 val, select;
> > > +
> > > +       select = 0;
> > > +       iowrite32(select | BIT(15), &oct_hw->common_cfg-
> > >guest_feature_select);
> > > +
> > > +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> > >guest_feature_select, val,
> > > +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> > > +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> > timeout\n", select);
> > > +               return;
> > > +       }
> >
> > Could we introduce a helper for such write and readback && timeout
> > logic? It seems it is used in a lot of places.
> Sure, I will move repetitive ones(ioread32/iowrite32) to helper function.
>
> >
> > > +       iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg-
> > >guest_feature);
> > > +
> > > +       select = 1;
> > > +       iowrite32(select | BIT(15), &oct_hw->common_cfg-
> > >guest_feature_select);
> > > +
> > > +       if (readx_poll_timeout(ioread32, &oct_hw->common_cfg-
> > >guest_feature_select, val,
> > > +                              val == select, 10, OCTEP_HW_TIMEOUT)) {
> > > +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write
> > timeout\n", select);
> > > +               return;
> > > +       }
> > > +       iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
> > > +}
> > > +
> > > +void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw)
> > > +{
> > > +       u16 val;
> > > +
> > > +       iowrite16(queue_id | BIT(15), &oct_hw->common_cfg->queue_select);
> > > +
> > > +       if (readx_poll_timeout(ioread16, &oct_hw->common_cfg-
> > >queue_select, val, val == queue_id,
> > > +                              10, OCTEP_HW_TIMEOUT)) {
> > > +               dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
> > > +               return;
> > > +       }
> > > +}
> > > +
> > > +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
> > > +{
> > > +       iowrite16(qid, oct_hw->vqs[qid].notify_addr);
> > > +}
> > > +
> > > +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void
> > *dst, int length)
> > > +{
> > > +       u8 old_gen, new_gen, *p;
> > > +       int i;
> > > +
> > > +       WARN_ON(offset + length > oct_hw->config_size);
> > > +       do {
> > > +               old_gen = ioread8(&oct_hw->common_cfg->config_generation);
> > > +               p = dst;
> > > +               for (i = 0; i < length; i++)
> > > +                       *p++ = ioread8(oct_hw->dev_cfg + offset + i);
> > > +
> > > +               new_gen = ioread8(&oct_hw->common_cfg->config_generation);
> > > +       } while (old_gen != new_gen);
> > > +}
> >
> > This looks similar to vp_vdpa_get_config(), any chance to unify them?
> It might not be workout as we can't use vp_modern_probe().
>
> >
> > > +
> > > +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64
> > desc_area, u64 driver_area,
> > > +                        u64 device_area)
> > > +{
> > > +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> > > +
> > > +       octep_write_queue_select(qid, oct_hw);
> > > +       vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> > > +                            &cfg->queue_desc_hi);
> > > +       vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> > > +                            &cfg->queue_avail_hi);
> > > +       vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> > > +                            &cfg->queue_used_hi);
> >
> > It's kind of interesting that there's no read back for validation here.
> Read back validation is not required for these writes. Firmware takes care of it.

Ok.

>
> >
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct
> > vdpa_vq_state *state)
> > > +{
> > > +       return octep_process_mbox(oct_hw,
> > OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
> > > +                                 sizeof(*state), 0);
> > > +}
> > > +
> > > +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct
> > vdpa_vq_state *state)
> > > +{
> > > +       struct vdpa_vq_state q_state;
> > > +
> > > +       memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
> > > +       return octep_process_mbox(oct_hw,
> > OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
> > > +                                 sizeof(*state), 1);
> > > +}
> > > +
> > > +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
> > > +{
> > > +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> > > +
> > > +       octep_write_queue_select(qid, oct_hw);
> > > +       iowrite16(num, &cfg->queue_size);
> > > +}
> > > +
> > > +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
> > > +{
> > > +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> > > +
> > > +       octep_write_queue_select(qid, oct_hw);
> > > +       iowrite16(ready, &cfg->queue_enable);
> > > +}
> > > +
> > > +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
> > > +{
> > > +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> > > +
> > > +       octep_write_queue_select(qid, oct_hw);
> > > +       return ioread16(&cfg->queue_enable);
> > > +}
> > > +
> > > +u16 octep_get_vq_size(struct octep_hw *oct_hw)
> > > +{
> > > +       octep_write_queue_select(0, oct_hw);
> > > +       return ioread16(&oct_hw->common_cfg->queue_size);
> > > +}
> > > +
> > > +static u32 octep_get_config_size(struct octep_hw *oct_hw)
> > > +{
> > > +       return sizeof(struct virtio_net_config);
> > > +}
> > > +
> > > +static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct
> > virtio_pci_cap *cap)
> > > +{
> > > +       struct device *dev = &oct_hw->pdev->dev;
> > > +       u32 length = cap->length;
> > > +       u32 offset = cap->offset;
> > > +       u8  bar    = cap->bar;
> > > +       u32 len;
> > > +
> > > +       if (bar != OCTEP_HW_CAPS_BAR) {
> > > +               dev_err(dev, "Invalid bar: %u\n", bar);
> > > +               return NULL;
> > > +       }
> > > +       if (offset + length < offset) {
> > > +               dev_err(dev, "offset(%u) + length(%u) overflows\n",
> > > +                       offset, length);
> > > +               return NULL;
> > > +       }
> > > +       len = pci_resource_len(oct_hw->pdev, bar);
> > > +       if (offset + length > len) {
> > > +               dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
> > > +                       offset + length, len);
> > > +               return NULL;
> > > +       }
> > > +       return oct_hw->base[bar] + offset;
> > > +}
> > > +
> > > +static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len,
> > off_t offset)
> > > +{
> > > +       u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
> > > +       u8 *p = buf;
> > > +       size_t i;
> > > +
> > > +       for (i = 0; i < len; i++)
> > > +               *p++ = ioread8(bar + offset + i);
> > > +}
> > > +
> > > +static int pci_signature_verify(struct octep_hw *oct_hw)
> > > +{
> > > +       u32 signature[2];
> > > +
> > > +       pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
> > > +
> > > +       if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
> > > +               return -1;
> > > +
> > > +       if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
> > > +               return -1;
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
> > > +{
> > > +       struct octep_mbox __iomem *mbox;
> > > +       struct device *dev = &pdev->dev;
> > > +       struct virtio_pci_cap cap;
> > > +       int ret;
> > > +       u8 pos;
> > > +
> > > +       oct_hw->pdev = pdev;
> > > +       ret = pci_signature_verify(oct_hw);
> > > +       if (ret) {
> > > +               dev_err(dev, "Octeon Virtio FW is not initialized\n");
> > > +               return -EIO;
> > > +       }
> > > +
> > > +       pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
> > > +
> > > +       while (pos) {
> > > +               pci_caps_read(oct_hw, &cap, 2, pos);
> > > +
> > > +               if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
> > > +                       dev_err(dev, "Found invalid capability vndr id: %d\n",
> > cap.cap_vndr);
> > > +                       break;
> > > +               }
> > > +
> > > +               pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
> > > +
> > > +               dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len:
> > %u\n",
> > > +                        pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
> > > +
> > > +               switch (cap.cfg_type) {
> > > +               case VIRTIO_PCI_CAP_COMMON_CFG:
> > > +                       oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
> > > +                       break;
> > > +               case VIRTIO_PCI_CAP_NOTIFY_CFG:
> > > +                       pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
> > > +                                     4, pos + sizeof(cap));
> > > +
> > > +                       oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
> > > +                       oct_hw->notify_bar = cap.bar;
> > > +                       oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) +
> > cap.offset;
> > > +                       break;
> > > +               case VIRTIO_PCI_CAP_DEVICE_CFG:
> > > +                       oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
> > > +                       break;
> > > +               case VIRTIO_PCI_CAP_ISR_CFG:
> > > +                       oct_hw->isr = get_cap_addr(oct_hw, &cap);
> > > +                       break;
> > > +               }
> > > +
> > > +               pos = cap.cap_next;
> > > +       }
> > > +       if (!oct_hw->common_cfg || !oct_hw->notify_base ||
> > > +           !oct_hw->dev_cfg    || !oct_hw->isr) {
> > > +               dev_err(dev, "Incomplete PCI capabilities");
> > > +               return -EIO;
> > > +       }
> > > +       oct_hw->config_size = octep_get_config_size(oct_hw);
> > > +
> > > +       mbox = octep_get_mbox(oct_hw);
> > > +       octep_mbox_init(mbox);
> > > +
> > > +       dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw-
> > >common_cfg);
> > > +       dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw-
> > >dev_cfg);
> > > +       dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
> > > +       dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
> > > +                (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
> > > +       dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
> > > +
> > > +       return 0;
> > > +}
> >
> > PDS vDPA reuses vp_modern_probe(), can we reuse it as well here?
> >
> > For example, vp_modern_probe() allows vendor specific bars and id
> > check which looks to be useful here.
> >
> > > diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> > b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> > > new file mode 100644
> > > index 000000000000..845fd35368ff
> > > --- /dev/null
> > > +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> > > @@ -0,0 +1,903 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/* Copyright (C) 2024 Marvell. */
> > > +
> > > +#include <linux/interrupt.h>
> > > +#include <linux/module.h>
> > > +#include <linux/iommu.h>
> > > +#include "octep_vdpa.h"
> > > +
> > > +#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
> > > +
> > > +struct octep_pf {
> > > +       u8 __iomem *base[PCI_STD_NUM_BARS];
> > > +       struct pci_dev *pdev;
> > > +       struct resource res;
> > > +       u64 vf_base;
> > > +       int enabled_vfs;
> > > +       u32 vf_stride;
> > > +       u16 vf_devid;
> > > +};
> > > +
> > > +struct octep_vdpa {
> > > +       struct vdpa_device vdpa;
> > > +       struct octep_hw *oct_hw;
> > > +       struct pci_dev *pdev;
> > > +};
> > > +
> > > +struct octep_vdpa_mgmt_dev {
> > > +       struct vdpa_mgmt_dev mdev;
> > > +       struct octep_hw oct_hw;
> > > +       struct pci_dev *pdev;
> > > +       /* Work entry to handle device setup */
> > > +       struct work_struct setup_task;
> > > +       /* Device status */
> > > +       atomic_t status;
> > > +};
> > > +
> > > +static int verify_features(u64 features)
> > > +{
> > > +       /* Minimum features to expect */
> > > +       if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
> > > +               return -EOPNOTSUPP;
> > > +
> > > +       if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
> > > +               return -EOPNOTSUPP;
> >
> > Any reason we need to mandate the NOTIFICATION_DATA. It seems can work
> > without that becasue:
> We are only supporting with NOTIFICATION_DATA feature enabled currently.
> I will remove below op in next version.
>
> >
> > void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
> > {
> >         iowrite16(qid, oct_hw->vqs[qid].notify_addr);
> > }
> >
> > > +
> > > +       if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
> > > +               return -EOPNOTSUPP;
> >
> > Does this mean the packed virtqueue is mandatory?
> Yes.
>
> >
> > > +
> > > +       /* Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
> > > +        * requirements: "VIRTIO_NET_F_MQ Requires
> > VIRTIO_NET_F_CTRL_VQ".
> > > +        */
> >
> > Right, but we have a lot of other features that depend on the CVQ, why
> > is MQ special here?
> It's not just MQ. I will check whether this condition is required or not.
>
> >
> > > +       if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) |
> > BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
> > > +           BIT_ULL(VIRTIO_NET_F_MQ))
> > > +               return -EINVAL;
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       struct octep_vdpa *oct_vdpa;
> > > +
> > > +       oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
> > > +
> > > +       return oct_vdpa->oct_hw;
> > > +}
> > > +
> > > +static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
> > > +{
> > > +       struct octep_hw *oct_hw = data;
> > > +       int i;
> > > +
> > > +       for (i = 0; i < oct_hw->nr_vring; i++) {
> > > +               if (oct_hw->vqs[i].cb.callback && *oct_hw->vqs[i].cb_notify_addr)
> > {
> > > +                       *oct_hw->vqs[i].cb_notify_addr = 0;
> >
> > I didn't get how cb_notify_addr is useful here.
> We are using this as acknowledgment to the device's firmware. We will rename
> the variable.
>
> >
> > > +                       oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
> > > +               }
> > > +       }
> > > +
> > > +       return IRQ_HANDLED;
> > > +}
> > > +
> > > +static void octep_free_irqs(struct octep_hw *oct_hw)
> > > +{
> > > +       struct pci_dev *pdev = oct_hw->pdev;
> > > +
> > > +       if (oct_hw->irq != -1) {
> > > +               devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
> > > +               oct_hw->irq = -1;
> > > +       }
> > > +       pci_free_irq_vectors(pdev);
> > > +}
> > > +
> > > +static int octep_request_irqs(struct octep_hw *oct_hw)
> > > +{
> > > +       struct pci_dev *pdev = oct_hw->pdev;
> > > +       int ret, irq;
> > > +
> > > +       /* Use one ring/interrupt per VF for virtio call interface. */
> >
> > Is this a hardware limitation? If not, it would be slow.
> Yes, it's a hardware limitation currently.

Let's add a comment or TODO here.

>
> >
> > > +       ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
> > > +       if (ret < 0) {
> > > +               dev_err(&pdev->dev, "Failed to alloc msix vector");
> > > +               return ret;
> > > +       }
> > > +
> > > +       snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
> > > +                OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
> > > +
> > > +       irq = pci_irq_vector(pdev, 0);
> > > +       ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
> > > +                              oct_hw->vqs->msix_name, oct_hw);
> > > +       if (ret) {
> > > +               dev_err(&pdev->dev, "Failed to register interrupt handler\n");
> > > +               goto free_irq_vec;
> > > +       }
> > > +       oct_hw->irq = irq;
> > > +
> > > +       return 0;
> > > +
> > > +free_irq_vec:
> > > +       pci_free_irq_vectors(pdev);
> > > +       return ret;
> > > +}
> > > +
> > > +static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return oct_hw->features;
> > > +}
> > > +
> > > +static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev,
> > u64 features)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +       int ret;
> > > +
> > > +       pr_debug("Driver Features: %llx\n", features);
> > > +       ret = verify_features(features);
> > > +       if (ret)
> > > +               return ret;
> > > +
> > > +       octep_hw_set_drv_features(oct_hw, features);
> > > +       oct_hw->drv_features = features;
> >
> > It is not guaranteed that the device will accept all those features or
> > it would be painful to maintain per fw feature white/blacklist.
> >
> > More below.
> >
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return oct_hw->features & oct_hw->drv_features;
> >
> > So here, the drv_features needs to be read from the device then
> > everything is fine.
> Sure, we will change it to read from the device.
>
> >
> > > +}
> > > +
> > > +static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return octep_hw_get_status(oct_hw);
> > > +}
> > > +
> > > +static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8
> > status)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +       u8 status_old;
> > > +
> > > +       status_old = octep_hw_get_status(oct_hw);
> > > +
> > > +       if (status_old == status)
> > > +               return;
> > > +
> > > +       if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
> > > +           !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
> > > +               if (octep_request_irqs(oct_hw))
> > > +                       status = status_old | VIRTIO_CONFIG_S_FAILED;
> > > +       }
> > > +       octep_hw_set_status(oct_hw, status);
> > > +}
> > > +
> > > +static int octep_vdpa_reset(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +       u8 status = octep_hw_get_status(oct_hw);
> > > +       u16 qid;
> > > +
> > > +       if (status == 0)
> > > +               return 0;
> > > +
> > > +       for (qid = 0; qid < oct_hw->nr_vring; qid++) {
> > > +               oct_hw->vqs[qid].cb.callback = NULL;
> > > +               oct_hw->vqs[qid].cb.private = NULL;
> > > +               oct_hw->config_cb.callback = NULL;
> > > +               oct_hw->config_cb.private = NULL;
> > > +       }
> > > +       octep_hw_reset(oct_hw);
> > > +
> > > +       if (status & VIRTIO_CONFIG_S_DRIVER_OK)
> > > +               octep_free_irqs(oct_hw);
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return octep_get_vq_size(oct_hw);
> > > +}
> > > +
> > > +static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> > > +                                  struct vdpa_vq_state *state)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return octep_get_vq_state(oct_hw, qid, state);
> > > +}
> > > +
> > > +static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> > > +                                  const struct vdpa_vq_state *state)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return octep_set_vq_state(oct_hw, qid, state);
> > > +}
> > > +
> > > +static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid,
> > struct vdpa_callback *cb)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       oct_hw->vqs[qid].cb = *cb;
> > > +}
> > > +
> > > +static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16
> > qid, bool ready)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       octep_set_vq_ready(oct_hw, qid, ready);
> > > +}
> > > +
> > > +static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16
> > qid)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return octep_get_vq_ready(oct_hw, qid);
> > > +}
> > > +
> > > +static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16
> > qid, u32 num)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       octep_set_vq_num(oct_hw, qid, num);
> > > +}
> > > +
> > > +static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16
> > qid, u64 desc_area,
> > > +                                    u64 driver_area, u64 device_area)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
> > > +       pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
> > > +       pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
> > > +
> > > +       return octep_set_vq_address(oct_hw, qid, desc_area, driver_area,
> > device_area);
> > > +}
> > > +
> > > +static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       octep_notify_queue(oct_hw, qid);
> > > +}
> > > +
> > > +static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev,
> > u32 data)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +       u16 idx = data & 0xFFFF;
> > > +
> > > +       vp_iowrite32(data, oct_hw->vqs[idx].notify_addr);
> > > +}
> > > +
> > > +static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return vp_ioread8(&oct_hw->common_cfg->config_generation);
> > > +}
> > > +
> > > +static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       return VIRTIO_ID_NET;
> > > +}
> > > +
> > > +static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       return PCI_VENDOR_ID_CAVIUM;
> > > +}
> > > +
> > > +static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       return PAGE_SIZE;
> > > +}
> > > +
> > > +static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       return oct_hw->config_size;
> > > +}
> > > +
> > > +static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned
> > int offset, void *buf,
> > > +                                 unsigned int len)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       octep_read_dev_config(oct_hw, offset, buf, len);
> > > +}
> > > +
> > > +static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned
> > int offset,
> > > +                                 const void *buf, unsigned int len)
> > > +{
> > > +       /* Not supported */
> >
> > We probably need to filter out VIRTIO_NET_F_ANNOUNCE and other
> > features that depend on the config writing.
> As per modern virtio spec(v1.2), device config is read only for the driver.
> Still do we need to filter out these features?
>
> >
> > > +}
> > > +
> > > +static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct
> > vdpa_callback *cb)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +
> > > +       oct_hw->config_cb.callback = cb->callback;
> > > +       oct_hw->config_cb.private = cb->private;
> > > +}
> > > +
> > > +static struct vdpa_notification_area octep_get_vq_notification(struct
> > vdpa_device *vdpa_dev,
> > > +                                                              u16 idx)
> > > +{
> > > +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> > > +       struct vdpa_notification_area area;
> > > +
> > > +       area.addr = oct_hw->vqs[idx].notify_pa;
> > > +       area.size = PAGE_SIZE;
> > > +
> > > +       return area;
> > > +}
> > > +
> > > +static int octep_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
> > > +                             struct vhost_iotlb *iotlb)
> > > +{
> > > +       return 0;
> > > +}
> > > +
> > > +static struct vdpa_config_ops octep_vdpa_ops = {
> > > +       .get_device_features = octep_vdpa_get_device_features,
> > > +       .set_driver_features = octep_vdpa_set_driver_features,
> > > +       .get_driver_features = octep_vdpa_get_driver_features,
> > > +       .get_status     = octep_vdpa_get_status,
> > > +       .set_status     = octep_vdpa_set_status,
> > > +       .reset          = octep_vdpa_reset,
> > > +       .get_vq_num_max = octep_vdpa_get_vq_num_max,
> > > +       .get_vq_state   = octep_vdpa_get_vq_state,
> > > +       .set_vq_state   = octep_vdpa_set_vq_state,
> > > +       .set_vq_cb      = octep_vdpa_set_vq_cb,
> > > +       .set_vq_ready   = octep_vdpa_set_vq_ready,
> > > +       .get_vq_ready   = octep_vdpa_get_vq_ready,
> > > +       .set_vq_num     = octep_vdpa_set_vq_num,
> > > +       .set_vq_address = octep_vdpa_set_vq_address,
> > > +       .get_vq_irq     = NULL,
> > > +       .kick_vq        = octep_vdpa_kick_vq,
> > > +       .kick_vq_with_data      = octep_vdpa_kick_vq_with_data,
> > > +       .get_generation = octep_vdpa_get_generation,
> > > +       .get_device_id  = octep_vdpa_get_device_id,
> > > +       .get_vendor_id  = octep_vdpa_get_vendor_id,
> > > +       .get_vq_align   = octep_vdpa_get_vq_align,
> > > +       .get_config_size        = octep_vdpa_get_config_size,
> > > +       .get_config     = octep_vdpa_get_config,
> > > +       .set_config     = octep_vdpa_set_config,
> > > +       .set_config_cb  = octep_vdpa_set_config_cb,
> > > +       .get_vq_notification = octep_get_vq_notification,
> > > +};
> > > +
> > > +static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8
> > bar)
> > > +{
> > > +       int ret;
> > > +
> > > +       ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
> > > +       if (ret) {
> > > +               dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar);
> > > +               return ret;
> > > +       }
> > > +
> > > +       tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
> > > +       if (!tbl[bar]) {
> > > +               dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
> > > +               pci_release_region(pdev, bar);
> > > +               ret = -ENOMEM;
> > > +       }
> > > +
> > > +       return ret;
> > > +}
> > > +
> > > +static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl,
> > u8 bar)
> > > +{
> > > +       pci_iounmap(pdev, tbl[bar]);
> > > +       pci_release_region(pdev, bar);
> > > +}
> > > +
> > > +static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf)
> > > +{
> > > +       struct pci_dev *pf_dev = octpf->pdev;
> > > +       struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> > > +       struct pci_bus_region bus_region;
> > > +
> > > +       octpf->res.start = res->start;
> > > +       octpf->res.end = res->end;
> > > +       octpf->vf_base = res->start;
> > > +
> > > +       bus_region.start = res->start;
> > > +       bus_region.end = res->start - 1;
> >
> > Is this inteneded?
> Yes,  it is required for emulating virtio config space.
>
> >
> > > +
> > > +       pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> > > +}
> > > +
> > > +static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf)
> > > +{
> > > +       struct pci_dev *pf_dev = octpf->pdev;
> > > +       struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> > > +       struct pci_bus_region bus_region;
> > > +
> > > +       bus_region.start = octpf->res.start;
> > > +       bus_region.end = octpf->res.end;
> > > +
> > > +       pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> > > +}
> > > +
> > > +static void octep_vdpa_remove_pf(struct pci_dev *pdev)
> > > +{
> > > +       struct octep_pf *octpf = pci_get_drvdata(pdev);
> > > +
> > > +       pci_disable_sriov(pdev);
> > > +
> > > +       if (octpf->base[OCTEP_HW_CAPS_BAR])
> > > +               octep_iounmap_region(pdev, octpf->base,
> > OCTEP_HW_CAPS_BAR);
> > > +
> > > +       if (octpf->base[OCTEP_HW_MBOX_BAR])
> > > +               octep_iounmap_region(pdev, octpf->base,
> > OCTEP_HW_MBOX_BAR);
> > > +
> > > +       octep_vdpa_pf_bar_expand(octpf);
> > > +}
> > > +
> > > +static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev)
> > > +{
> > > +       struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
> > > +
> > > +       memset(vf_res, 0, sizeof(*vf_res));
> > > +}
> > > +
> > > +static void octep_vdpa_remove_vf(struct pci_dev *pdev)
> > > +{
> > > +       struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
> > > +       struct octep_hw *oct_hw;
> > > +       int status;
> > > +
> > > +       oct_hw = &mgmt_dev->oct_hw;
> > > +       status = atomic_read(&mgmt_dev->status);
> > > +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT);
> > > +
> > > +       if (status == OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT) {
> > > +               cancel_work_sync(&mgmt_dev->setup_task);
> >
> > This seems to be racy, can we call cancel_work_sync() unconditionally?
> >
> > > +       } else if (status == OCTEP_VDPA_DEV_STATUS_READY) {
> > > +               vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
> > > +               kfree(mgmt_dev->oct_hw.vqs);
> > > +       }
> > > +
> > > +       if (oct_hw->base[OCTEP_HW_CAPS_BAR])
> > > +               octep_iounmap_region(pdev, oct_hw->base,
> > OCTEP_HW_CAPS_BAR);
> > > +
> > > +       if (oct_hw->base[OCTEP_HW_MBOX_BAR])
> > > +               octep_iounmap_region(pdev, oct_hw->base,
> > OCTEP_HW_MBOX_BAR);
> > > +
> > > +       octep_vdpa_vf_bar_shrink(pdev);
> > > +}
> > > +
> > > +static void octep_vdpa_remove(struct pci_dev *pdev)
> > > +{
> > > +       if (pdev->is_virtfn)
> > > +               octep_vdpa_remove_vf(pdev);
> > > +       else
> > > +               octep_vdpa_remove_pf(pdev);
> > > +}
> > > +
> > > +static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char
> > *name,
> > > +                             const struct vdpa_dev_set_config *config)
> > > +{
> > > +       struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct
> > octep_vdpa_mgmt_dev, mdev);
> > > +       struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
> > > +       struct pci_dev *pdev = oct_hw->pdev;
> > > +       struct vdpa_device *vdpa_dev;
> > > +       struct octep_vdpa *oct_vdpa;
> > > +       u64 device_features;
> > > +       u16 notify_off;
> > > +       int i, ret;
> > > +
> > > +       oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev,
> > &octep_vdpa_ops, 1, 1,
> > > +                                    NULL, false);
> > > +       if (IS_ERR(oct_vdpa)) {
> > > +               dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep
> > vdpa device");
> > > +               return PTR_ERR(oct_vdpa);
> > > +       }
> > > +
> > > +       oct_vdpa->pdev = pdev;
> > > +       oct_vdpa->vdpa.dma_dev = &pdev->dev;
> > > +       oct_vdpa->vdpa.mdev = mdev;
> > > +       oct_vdpa->oct_hw = oct_hw;
> > > +       vdpa_dev = &oct_vdpa->vdpa;
> > > +
> > > +       device_features = oct_hw->features;
> > > +       if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> > > +               if (config->device_features & ~device_features) {
> > > +                       dev_err(&pdev->dev, "The provisioned features 0x%llx are not
> > supported by this device with features 0x%llx\n",
> > > +                               config->device_features, device_features);
> > > +                       return -EINVAL;
> > > +               }
> > > +               device_features &= config->device_features;
> > > +       }
> > > +
> > > +       oct_hw->features = device_features;
> > > +
> > > +       if (verify_features(device_features)) {
> > > +               dev_warn(mdev->device,
> > > +                        "Must provision minimum features 0x%llx for this device",
> > > +                        BIT_ULL(VIRTIO_F_VERSION_1) |
> > BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
> > > +                        BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) |
> > BIT_ULL(VIRTIO_F_IN_ORDER));
> > > +               return -EOPNOTSUPP;
> > > +       }
> > > +
> > > +       oct_hw->vqs = kcalloc(oct_hw->nr_vring, sizeof(*oct_hw->vqs),
> > GFP_KERNEL);
> > > +       if (!oct_hw->vqs)
> > > +               return -ENOMEM;
> > > +
> > > +       oct_hw->irq = -1;
> > > +
> > > +       dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
> > > +       dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
> > > +
> > > +       for (i = 0; i < oct_hw->nr_vring; i++) {
> > > +               octep_write_queue_select(i, oct_hw);
> > > +               notify_off = vp_ioread16(&oct_hw->common_cfg-
> > >queue_notify_off);
> > > +               oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
> > > +                       notify_off * oct_hw->notify_off_multiplier;
> > > +               oct_hw->vqs[i].cb_notify_addr = (u32 *)oct_hw-
> > >vqs[i].notify_addr + 1;
> > > +               oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
> > > +                       notify_off * oct_hw->notify_off_multiplier;
> > > +       }
> >
> > I think the reason we can't do the above in the probe is that we need
> > to wait for the device ready then we can get those information?
> Yes, your understanding is correct.

Comment please.

>
> >
> > > +
> > > +       if (name)
> > > +               ret = dev_set_name(&vdpa_dev->dev, "%s", name);
> > > +       else
> > > +               ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev-
> > >index);
> > > +
> > > +       ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
> > > +       if (ret) {
> > > +               dev_err(&pdev->dev, "Failed to register to vDPA bus");
> > > +               goto free_vqs;
> > > +       }
> > > +       return 0;
> > > +
> > > +free_vqs:
> > > +       put_device(&oct_vdpa->vdpa.dev);
> > > +       kfree(oct_hw->vqs);
> > > +       return ret;
> > > +}
> > > +
> > > +static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct
> > vdpa_device *dev)
> > > +{
> > > +       _vdpa_unregister_device(dev);
> > > +}
> > > +
> > > +static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
> > > +       .dev_add = octep_vdpa_dev_add,
> > > +       .dev_del = octep_vdpa_dev_del
> > > +};
> > > +
> > > +static bool get_device_ready_status(u8 __iomem *addr)
> > > +{
> > > +       u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
> > > +
> > > +       if (signature == OCTEP_DEV_READY_SIGNATURE) {
> > > +               writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
> > > +               return true;
> > > +       }
> > > +
> > > +       return false;
> > > +}
> > > +
> > > +static struct virtio_device_id id_table[] = {
> > > +       { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
> > > +       { 0 },
> > > +};
> > > +
> > > +static void octep_vdpa_setup_task(struct work_struct *work)
> > > +{
> > > +       struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct
> > octep_vdpa_mgmt_dev,
> > > +                                                           setup_task);
> > > +       struct pci_dev *pdev = mgmt_dev->pdev;
> > > +       struct device *dev = &pdev->dev;
> > > +       struct octep_hw *oct_hw;
> > > +       unsigned long timeout;
> > > +       int ret;
> > > +
> > > +       oct_hw = &mgmt_dev->oct_hw;
> > > +
> > > +       atomic_set(&mgmt_dev->status,
> > OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
> > > +
> > > +       /* Wait for a maximum of 5 sec */
> > > +       timeout = jiffies + msecs_to_jiffies(5000);
> > > +       while (!time_after(jiffies, timeout)) {
> > > +               if (get_device_ready_status(oct_hw-
> > >base[OCTEP_HW_MBOX_BAR])) {
> > > +                       atomic_set(&mgmt_dev->status,
> > OCTEP_VDPA_DEV_STATUS_INIT);
> > > +                       break;
> > > +               }
> > > +
> > > +               if (atomic_read(&mgmt_dev->status) >=
> > OCTEP_VDPA_DEV_STATUS_READY) {
> > > +                       dev_info(dev, "Stopping vDPA setup task.\n");
> > > +                       return;
> > > +               }
> > > +
> > > +               usleep_range(1000, 1500);
> > > +       }
> > > +
> > > +       if (atomic_read(&mgmt_dev->status) !=
> > OCTEP_VDPA_DEV_STATUS_INIT) {
> > > +               dev_err(dev, "BAR initialization is timed out\n");
> > > +               return;
> > > +       }
> > > +
> > > +       ret = octep_iomap_region(pdev, oct_hw->base,
> > OCTEP_HW_CAPS_BAR);
> > > +       if (ret)
> > > +               return;
> > > +
> > > +       ret = octep_hw_caps_read(oct_hw, pdev);
> > > +       if (ret < 0)
> > > +               goto unmap_region;
> > > +
> > > +       oct_hw->features = octep_hw_get_dev_features(oct_hw);
> > > +       ret = verify_features(oct_hw->features);
> > > +       if (ret) {
> > > +               dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
> > > +               goto unmap_region;
> > > +       }
> > > +       oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg-
> > >num_queues);
> > > +
> > > +       mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
> > > +       mgmt_dev->mdev.id_table = id_table;
> > > +       mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
> > > +       mgmt_dev->mdev.supported_features = oct_hw->features;
> > > +       mgmt_dev->mdev.config_attr_mask = (1 <<
> > VDPA_ATTR_DEV_FEATURES);
> > > +       mgmt_dev->mdev.device = dev;
> > > +
> > > +       ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
> > > +       if (ret) {
> > > +               dev_err(dev, "Failed to register vdpa management interface\n");
> > > +               goto unmap_region;
> > > +       }
> > > +
> > > +       atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY);
> > > +
> > > +       return;
> > > +
> > > +unmap_region:
> > > +       octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> > > +       oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL;
> > > +}
> > > +
> > > +static int octep_vdpa_probe_vf(struct pci_dev *pdev)
> > > +{
> > > +       struct octep_vdpa_mgmt_dev *mgmt_dev;
> > > +       struct device *dev = &pdev->dev;
> > > +       struct iommu_domain *domain;
> > > +       int ret;
> > > +
> > > +       ret = pcim_enable_device(pdev);
> > > +       if (ret) {
> > > +               dev_err(dev, "Failed to enable device\n");
> > > +               return ret;
> > > +       }
> > > +
> > > +       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> > > +       if (ret) {
> > > +               dev_err(dev, "No usable DMA configuration\n");
> > > +               return ret;
> > > +       }
> > > +       pci_set_master(pdev);
> > > +
> > > +       domain = iommu_get_domain_for_dev(dev);
> > > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> > > +               dev_info(dev, "NO-IOMMU\n");
> > > +               octep_vdpa_ops.set_map = octep_vdpa_set_map;
> >
> > Is this a shortcut to have get better performance? DMA API should have
> > those greacefully I think.
> When IOMMU is disabled on host and set_map/dma_map is not set,
> vhost-vdpa is reporting an error "Failed to allocate domain, device is not IOMMU cache coherent capable\n".
> Hence we are doing this way to get better performance.

The problem is, assuming the device does not have any internal IOMMU.

1) If we allow it running without IOMMU, it opens a window for guest
to attack the host.
2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY, let's
report it to DMA/IOMMU maintiner to fix that

Thanks


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-07  3:34     ` Jason Wang
@ 2024-04-10 10:15       ` Srujana Challa
  2024-04-10 11:19         ` Michael S. Tsirkin
  2024-04-11  5:59         ` Jason Wang
  0 siblings, 2 replies; 28+ messages in thread
From: Srujana Challa @ 2024-04-10 10:15 UTC (permalink / raw)
  To: Jason Wang
  Cc: virtualization, mst, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

> > > > +
> > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > +               octep_vdpa_ops.set_map = octep_vdpa_set_map;
> > >
> > > Is this a shortcut to have get better performance? DMA API should have
> > > those greacefully I think.
> > When IOMMU is disabled on host and set_map/dma_map is not set,
> > vhost-vdpa is reporting an error "Failed to allocate domain, device is not
> IOMMU cache coherent capable\n".
> > Hence we are doing this way to get better performance.
> 
> The problem is, assuming the device does not have any internal IOMMU.
> 
> 1) If we allow it running without IOMMU, it opens a window for guest
> to attack the host.
> 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY, let's
> report it to DMA/IOMMU maintiner to fix that
It will be helpful for host networking case when iommu is disabled. 
Can we take the vfio pci driver approach as a reference where user explicitly set 
"enable_unsafe_noiommu_mode" using module param?

> 
> Thanks

Thanks.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-10 10:15       ` Srujana Challa
@ 2024-04-10 11:19         ` Michael S. Tsirkin
  2024-04-10 12:34           ` Srujana Challa
  2024-04-11  5:59         ` Jason Wang
  1 sibling, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2024-04-10 11:19 UTC (permalink / raw)
  To: Srujana Challa
  Cc: Jason Wang, virtualization, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > +
> > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > +               octep_vdpa_ops.set_map = octep_vdpa_set_map;
> > > >
> > > > Is this a shortcut to have get better performance? DMA API should have
> > > > those greacefully I think.
> > > When IOMMU is disabled on host and set_map/dma_map is not set,
> > > vhost-vdpa is reporting an error "Failed to allocate domain, device is not
> > IOMMU cache coherent capable\n".
> > > Hence we are doing this way to get better performance.
> > 
> > The problem is, assuming the device does not have any internal IOMMU.
> > 
> > 1) If we allow it running without IOMMU, it opens a window for guest
> > to attack the host.
> > 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY, let's
> > report it to DMA/IOMMU maintiner to fix that
> It will be helpful for host networking case when iommu is disabled. 
> Can we take the vfio pci driver approach as a reference where user explicitly set 
> "enable_unsafe_noiommu_mode" using module param?

vfio is a userspace driver so it's userspace's responsibility.
what exactly ensures correctness here? does the device have
an on-chip iommu?

> > 
> > Thanks
> 
> Thanks.
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-10 11:19         ` Michael S. Tsirkin
@ 2024-04-10 12:34           ` Srujana Challa
  2024-04-11  6:01             ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Srujana Challa @ 2024-04-10 12:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, virtualization, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

> Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> OCTEON DPU devices
> 
> On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > +
> > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > +               octep_vdpa_ops.set_map = octep_vdpa_set_map;
> > > > >
> > > > > Is this a shortcut to have get better performance? DMA API
> > > > > should have those greacefully I think.
> > > > When IOMMU is disabled on host and set_map/dma_map is not set,
> > > > vhost-vdpa is reporting an error "Failed to allocate domain,
> > > > device is not
> > > IOMMU cache coherent capable\n".
> > > > Hence we are doing this way to get better performance.
> > >
> > > The problem is, assuming the device does not have any internal IOMMU.
> > >
> > > 1) If we allow it running without IOMMU, it opens a window for guest
> > > to attack the host.
> > > 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY, let's
> > > report it to DMA/IOMMU maintiner to fix that
> > It will be helpful for host networking case when iommu is disabled.
> > Can we take the vfio pci driver approach as a reference where user
> > explicitly set "enable_unsafe_noiommu_mode" using module param?
> 
> vfio is a userspace driver so it's userspace's responsibility.
> what exactly ensures correctness here? does the device have an on-chip
> iommu?
>
Our device features an on-chip IOMMU, although it is not utilized for host-side
targeted DMA operations. We included no-iommu mode in our driver to ensure
that host applications, such as DPDK Virtio user PMD, continue to function even
when operating in a no-IOMMU mode. We observed performance impacts on
certain low-end x86 machines when IOMMU mode was enabled.
I think, correctness is Host userspace application's responsibility, in this case
when vhost-vdpa is used with Host application such as DPDK Virtio user PMD.
 
> > >
> > > Thanks
> >
> > Thanks.
> >


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-29 13:19     ` Stefano Garzarella
@ 2024-04-10 12:40       ` Srujana Challa
  0 siblings, 0 replies; 28+ messages in thread
From: Srujana Challa @ 2024-04-10 12:40 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: virtualization, mst, jasowang, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob

> Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> OCTEON DPU devices
> 
> On Fri, Mar 29, 2024 at 01:02:28PM +0000, Srujana Challa wrote:
> >> Subject: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> >> OCTEON DPU devices
> >>
> >> Prioritize security for external emails: Confirm sender and content
> >> safety before clicking links or opening attachments
> >>
> >> ---------------------------------------------------------------------
> >> - On Wed, Mar 27, 2024 at 04:51:36PM +0530, Srujana Challa wrote:
> >> >This commit introduces a new vDPA driver specifically designed for
> >> >managing the virtio control plane over the vDPA bus for OCTEON DPU
> >> >devices. The driver consists of two layers:
> >> >
> >> >1. Octep HW Layer (Octeon Endpoint): Responsible for handling
> >> >hardware operations and configurations related to the DPU device.
> >> >
> >> >2. Octep Main Layer: Compliant with the vDPA bus framework, this
> >> >layer implements device operations for the vDPA bus. It handles
> >> >device probing, bus attachment, vring operations, and other relevant
> tasks.
> >> >
> >> >Signed-off-by: Srujana Challa <schalla@marvell.com>
> >> >Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >> >Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> >> >Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
> >> >---
> >> > MAINTAINERS                              |   5 +
> >> > drivers/vdpa/Kconfig                     |   9 +
> >> > drivers/vdpa/Makefile                    |   1 +
> >> > drivers/vdpa/octeon_ep/Makefile          |   4 +
> >> > drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
> >> > drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
> >> > drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903
> >> +++++++++++++++++++++++
> >> > 7 files changed, 1472 insertions(+) create mode 100644
> >> > drivers/vdpa/octeon_ep/Makefile  create mode
> >> 100644
> >> >drivers/vdpa/octeon_ep/octep_vdpa.h
> >> > create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> >> > create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c
> 
> [...]
> 
> >> >diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index
> >> >656c1cb541de..775149fad476 100644
> >> >--- a/drivers/vdpa/Kconfig
> >> >+++ b/drivers/vdpa/Kconfig
> >> >@@ -126,4 +126,13 @@ config PDS_VDPA
> >> > 	  With this driver, the VirtIO dataplane can be
> >> > 	  offloaded to an AMD/Pensando DSC device.
> >> >
> >> >+config OCTEONEP_VDPA
> >> >+	tristate "vDPA driver for Octeon DPU devices"
> >> >+	depends on m
> >>
> >> Just for curiosity, why restrict it to module-only?
> >
> >There is an etherent driver for the same class of devices. So, we made
> >this vDPA driver as module.
> 
> Sorry, I didn`t get it.
> 
> What would be the problem if we don`t put `depends on m` here?
Our device's firmware needs to boot up on the card before loading
the vdpa driver.
> 
> >
> >>
> >> >+	depends on PCI_MSI
> >> >+	help
> >> >+	  vDPA driver for Marvell's Octeon DPU devices.
> >> >+	  With this driver, the VirtIO dataplane can be
> >> >+	  offloaded to a Octeon DPU device.
> >> >+
> >> > endif # VDPA
> 
> [...]
> 
> >> >+
> >> >+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid) {
> >> >+	iowrite16(qid, oct_hw->vqs[qid].notify_addr); }
> >> >+
> >> >+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset,
> >> >+void *dst, int length) {
> >> >+	u8 old_gen, new_gen, *p;
> >> >+	int i;
> >> >+
> >> >+	WARN_ON(offset + length > oct_hw->config_size);
> >>                 ^
> >> Should we return early in this case?
> >Yes, I will move this to caller of this function.
> 
> Otherwise you can do this here:
> 
> 	if (WARN_ON(offset + length > oct_hw->config_size))
> 		return;
> 
> Thanks,
> Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-10 10:15       ` Srujana Challa
  2024-04-10 11:19         ` Michael S. Tsirkin
@ 2024-04-11  5:59         ` Jason Wang
  1 sibling, 0 replies; 28+ messages in thread
From: Jason Wang @ 2024-04-11  5:59 UTC (permalink / raw)
  To: Srujana Challa
  Cc: virtualization, mst, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

On Wed, Apr 10, 2024 at 6:16 PM Srujana Challa <schalla@marvell.com> wrote:
>
> > > > > +
> > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > +               octep_vdpa_ops.set_map = octep_vdpa_set_map;
> > > >
> > > > Is this a shortcut to have get better performance? DMA API should have
> > > > those greacefully I think.
> > > When IOMMU is disabled on host and set_map/dma_map is not set,
> > > vhost-vdpa is reporting an error "Failed to allocate domain, device is not
> > IOMMU cache coherent capable\n".
> > > Hence we are doing this way to get better performance.
> >
> > The problem is, assuming the device does not have any internal IOMMU.
> >
> > 1) If we allow it running without IOMMU, it opens a window for guest
> > to attack the host.
> > 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY, let's
> > report it to DMA/IOMMU maintiner to fix that
> It will be helpful for host networking case when iommu is disabled.
> Can we take the vfio pci driver approach as a reference where user explicitly set
> "enable_unsafe_noiommu_mode" using module param?

I prefer not, the "unsafe" is a hint that it should not be used in
production environment due to security implications.

How much degradation have you seen for IOMMU_DOMAIN_IDENTITY or IOMMU
is disabled? Is that because of the DMA API?

Thanks

>
> >
> > Thanks
>
> Thanks.
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-10 12:34           ` Srujana Challa
@ 2024-04-11  6:01             ` Jason Wang
  2024-04-12  5:12               ` Srujana Challa
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2024-04-11  6:01 UTC (permalink / raw)
  To: Srujana Challa
  Cc: Michael S. Tsirkin, virtualization, xuanzhuo,
	Vamsi Krishna Attunuru, Shijith Thotton, Nithin Kumar Dabilpuram,
	Jerin Jacob, eperezma

On Wed, Apr 10, 2024 at 8:35 PM Srujana Challa <schalla@marvell.com> wrote:
>
> > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> > OCTEON DPU devices
> >
> > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > > +
> > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > +               octep_vdpa_ops.set_map = octep_vdpa_set_map;
> > > > > >
> > > > > > Is this a shortcut to have get better performance? DMA API
> > > > > > should have those greacefully I think.
> > > > > When IOMMU is disabled on host and set_map/dma_map is not set,
> > > > > vhost-vdpa is reporting an error "Failed to allocate domain,
> > > > > device is not
> > > > IOMMU cache coherent capable\n".
> > > > > Hence we are doing this way to get better performance.
> > > >
> > > > The problem is, assuming the device does not have any internal IOMMU.
> > > >
> > > > 1) If we allow it running without IOMMU, it opens a window for guest
> > > > to attack the host.
> > > > 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY, let's
> > > > report it to DMA/IOMMU maintiner to fix that
> > > It will be helpful for host networking case when iommu is disabled.
> > > Can we take the vfio pci driver approach as a reference where user
> > > explicitly set "enable_unsafe_noiommu_mode" using module param?
> >
> > vfio is a userspace driver so it's userspace's responsibility.
> > what exactly ensures correctness here? does the device have an on-chip
> > iommu?
> >
> Our device features an on-chip IOMMU, although it is not utilized for host-side
> targeted DMA operations. We included no-iommu mode in our driver to ensure
> that host applications, such as DPDK Virtio user PMD, continue to function even
> when operating in a no-IOMMU mode.

I may miss something but set_map() is empty in this driver. How could
such isolation be done?

> We observed performance impacts on
> certain low-end x86 machines when IOMMU mode was enabled.
> I think, correctness is Host userspace application's responsibility, in this case
> when vhost-vdpa is used with Host application such as DPDK Virtio user PMD.

Thanks

>
> > > >
> > > > Thanks
> > >
> > > Thanks.
> > >
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-11  6:01             ` Jason Wang
@ 2024-04-12  5:12               ` Srujana Challa
  2024-04-12  6:41                 ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Srujana Challa @ 2024-04-12  5:12 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, xuanzhuo,
	Vamsi Krishna Attunuru, Shijith Thotton, Nithin Kumar Dabilpuram,
	Jerin Jacob, eperezma



> -----Original Message-----
> From: Jason Wang <jasowang@redhat.com>
> Sent: Thursday, April 11, 2024 11:32 AM
> To: Srujana Challa <schalla@marvell.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>; virtualization@lists.linux.dev;
> xuanzhuo@linux.alibaba.com; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>; Shijith Thotton <sthotton@marvell.com>; Nithin
> Kumar Dabilpuram <ndabilpuram@marvell.com>; Jerin Jacob
> <jerinj@marvell.com>; eperezma <eperezma@redhat.com>
> Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> OCTEON DPU devices
> 
> On Wed, Apr 10, 2024 at 8:35 PM Srujana Challa <schalla@marvell.com>
> wrote:
> >
> > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for
> > > Marvell OCTEON DPU devices
> > >
> > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > > > +
> > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
> {
> > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > + octep_vdpa_set_map;
> > > > > > >
> > > > > > > Is this a shortcut to have get better performance? DMA API
> > > > > > > should have those greacefully I think.
> > > > > > When IOMMU is disabled on host and set_map/dma_map is not set,
> > > > > > vhost-vdpa is reporting an error "Failed to allocate domain,
> > > > > > device is not
> > > > > IOMMU cache coherent capable\n".
> > > > > > Hence we are doing this way to get better performance.
> > > > >
> > > > > The problem is, assuming the device does not have any internal
> IOMMU.
> > > > >
> > > > > 1) If we allow it running without IOMMU, it opens a window for
> > > > > guest to attack the host.
> > > > > 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY,
> > > > > let's report it to DMA/IOMMU maintiner to fix that
> > > > It will be helpful for host networking case when iommu is disabled.
> > > > Can we take the vfio pci driver approach as a reference where user
> > > > explicitly set "enable_unsafe_noiommu_mode" using module param?
> > >
> > > vfio is a userspace driver so it's userspace's responsibility.
> > > what exactly ensures correctness here? does the device have an
> > > on-chip iommu?
> > >
> > Our device features an on-chip IOMMU, although it is not utilized for
> > host-side targeted DMA operations. We included no-iommu mode in our
> > driver to ensure that host applications, such as DPDK Virtio user PMD,
> > continue to function even when operating in a no-IOMMU mode.
> 
> I may miss something but set_map() is empty in this driver. How could such
> isolation be done?

In no-iommu case, there would be no domain right, and the user of vhost-vdpa(DPDK virtio user pmd),
would create the mapping and pass the PA (= IOVA) to the device directly. So that, device can directly
DMA to the PA.

> 
> > We observed performance impacts on
> > certain low-end x86 machines when IOMMU mode was enabled.
> > I think, correctness is Host userspace application's responsibility,
> > in this case when vhost-vdpa is used with Host application such as DPDK
> Virtio user PMD.
> 
> Thanks
> 
> >
> > > > >
> > > > > Thanks
> > > >
> > > > Thanks.
> > > >
> >


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-12  5:12               ` Srujana Challa
@ 2024-04-12  6:41                 ` Jason Wang
  2024-04-12  9:48                   ` Srujana Challa
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2024-04-12  6:41 UTC (permalink / raw)
  To: Srujana Challa
  Cc: Michael S. Tsirkin, virtualization, xuanzhuo,
	Vamsi Krishna Attunuru, Shijith Thotton, Nithin Kumar Dabilpuram,
	Jerin Jacob, eperezma

On Fri, Apr 12, 2024 at 1:13 PM Srujana Challa <schalla@marvell.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Jason Wang <jasowang@redhat.com>
> > Sent: Thursday, April 11, 2024 11:32 AM
> > To: Srujana Challa <schalla@marvell.com>
> > Cc: Michael S. Tsirkin <mst@redhat.com>; virtualization@lists.linux.dev;
> > xuanzhuo@linux.alibaba.com; Vamsi Krishna Attunuru
> > <vattunuru@marvell.com>; Shijith Thotton <sthotton@marvell.com>; Nithin
> > Kumar Dabilpuram <ndabilpuram@marvell.com>; Jerin Jacob
> > <jerinj@marvell.com>; eperezma <eperezma@redhat.com>
> > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> > OCTEON DPU devices
> >
> > On Wed, Apr 10, 2024 at 8:35 PM Srujana Challa <schalla@marvell.com>
> > wrote:
> > >
> > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for
> > > > Marvell OCTEON DPU devices
> > > >
> > > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > > > > +
> > > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > > +       if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
> > {
> > > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > > + octep_vdpa_set_map;
> > > > > > > >
> > > > > > > > Is this a shortcut to have get better performance? DMA API
> > > > > > > > should have those greacefully I think.
> > > > > > > When IOMMU is disabled on host and set_map/dma_map is not set,
> > > > > > > vhost-vdpa is reporting an error "Failed to allocate domain,
> > > > > > > device is not
> > > > > > IOMMU cache coherent capable\n".
> > > > > > > Hence we are doing this way to get better performance.
> > > > > >
> > > > > > The problem is, assuming the device does not have any internal
> > IOMMU.
> > > > > >
> > > > > > 1) If we allow it running without IOMMU, it opens a window for
> > > > > > guest to attack the host.
> > > > > > 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY,
> > > > > > let's report it to DMA/IOMMU maintiner to fix that
> > > > > It will be helpful for host networking case when iommu is disabled.
> > > > > Can we take the vfio pci driver approach as a reference where user
> > > > > explicitly set "enable_unsafe_noiommu_mode" using module param?
> > > >
> > > > vfio is a userspace driver so it's userspace's responsibility.
> > > > what exactly ensures correctness here? does the device have an
> > > > on-chip iommu?
> > > >
> > > Our device features an on-chip IOMMU, although it is not utilized for
> > > host-side targeted DMA operations. We included no-iommu mode in our
> > > driver to ensure that host applications, such as DPDK Virtio user PMD,
> > > continue to function even when operating in a no-IOMMU mode.
> >
> > I may miss something but set_map() is empty in this driver. How could such
> > isolation be done?
>
> In no-iommu case, there would be no domain right, and the user of vhost-vdpa(DPDK virtio user pmd),
> would create the mapping and pass the PA (= IOVA) to the device directly. So that, device can directly
> DMA to the PA.

Yes, but this doesn't differ too much from the case where DMA API is
used with IOMMU disabled.

Are you saying DMA API introduces overheads in this case?

Thanks

>
> >
> > > We observed performance impacts on
> > > certain low-end x86 machines when IOMMU mode was enabled.
> > > I think, correctness is Host userspace application's responsibility,
> > > in this case when vhost-vdpa is used with Host application such as DPDK
> > Virtio user PMD.
> >
> > Thanks
> >
> > >
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Thanks.
> > > > >
> > >
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-12  6:41                 ` Jason Wang
@ 2024-04-12  9:48                   ` Srujana Challa
  2024-04-15  6:49                     ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Srujana Challa @ 2024-04-12  9:48 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, xuanzhuo,
	Vamsi Krishna Attunuru, Shijith Thotton, Nithin Kumar Dabilpuram,
	Jerin Jacob, eperezma

> Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> OCTEON DPU devices
> 
> On Fri, Apr 12, 2024 at 1:13 PM Srujana Challa <schalla@marvell.com> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Jason Wang <jasowang@redhat.com>
> > > Sent: Thursday, April 11, 2024 11:32 AM
> > > To: Srujana Challa <schalla@marvell.com>
> > > Cc: Michael S. Tsirkin <mst@redhat.com>;
> > > virtualization@lists.linux.dev; xuanzhuo@linux.alibaba.com; Vamsi
> > > Krishna Attunuru <vattunuru@marvell.com>; Shijith Thotton
> > > <sthotton@marvell.com>; Nithin Kumar Dabilpuram
> > > <ndabilpuram@marvell.com>; Jerin Jacob <jerinj@marvell.com>;
> > > eperezma <eperezma@redhat.com>
> > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for
> > > Marvell OCTEON DPU devices
> > >
> > > On Wed, Apr 10, 2024 at 8:35 PM Srujana Challa <schalla@marvell.com>
> > > wrote:
> > > >
> > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver
> > > > > for Marvell OCTEON DPU devices
> > > > >
> > > > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > > > > > +
> > > > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > > > +       if (!domain || domain->type ==
> > > > > > > > > > + IOMMU_DOMAIN_IDENTITY)
> > > {
> > > > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > > > + octep_vdpa_set_map;
> > > > > > > > >
> > > > > > > > > Is this a shortcut to have get better performance? DMA
> > > > > > > > > API should have those greacefully I think.
> > > > > > > > When IOMMU is disabled on host and set_map/dma_map is not
> > > > > > > > set, vhost-vdpa is reporting an error "Failed to allocate
> > > > > > > > domain, device is not
> > > > > > > IOMMU cache coherent capable\n".
> > > > > > > > Hence we are doing this way to get better performance.
> > > > > > >
> > > > > > > The problem is, assuming the device does not have any
> > > > > > > internal
> > > IOMMU.
> > > > > > >
> > > > > > > 1) If we allow it running without IOMMU, it opens a window
> > > > > > > for guest to attack the host.
> > > > > > > 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY,
> > > > > > > let's report it to DMA/IOMMU maintiner to fix that
> > > > > > It will be helpful for host networking case when iommu is disabled.
> > > > > > Can we take the vfio pci driver approach as a reference where
> > > > > > user explicitly set "enable_unsafe_noiommu_mode" using module
> param?
> > > > >
> > > > > vfio is a userspace driver so it's userspace's responsibility.
> > > > > what exactly ensures correctness here? does the device have an
> > > > > on-chip iommu?
> > > > >
> > > > Our device features an on-chip IOMMU, although it is not utilized
> > > > for host-side targeted DMA operations. We included no-iommu mode
> > > > in our driver to ensure that host applications, such as DPDK
> > > > Virtio user PMD, continue to function even when operating in a no-
> IOMMU mode.
> > >
> > > I may miss something but set_map() is empty in this driver. How
> > > could such isolation be done?
> >
> > In no-iommu case, there would be no domain right, and the user of
> > vhost-vdpa(DPDK virtio user pmd), would create the mapping and pass
> > the PA (= IOVA) to the device directly. So that, device can directly DMA to the
> PA.
> 
> Yes, but this doesn't differ too much from the case where DMA API is used
> with IOMMU disabled.
> 
> Are you saying DMA API introduces overheads in this case?
No actually, current vhost-vdpa code is not allowing IOMMU disabled mode,
If set_map/dma_map op is not set. Hence, we are setting set_map with dummy
api to allow IOMMU disabled mode.

Following is the code snippet from drivers/vhost/vdpa.c

      /* Device want to do DMA by itself */
        if (ops->set_map || ops->dma_map)
                return 0;

        bus = dma_dev->bus;
        if (!bus)
                return -EFAULT;  

       if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY))
                return -ENOTSUPP;

Performance degradation when iommu enabled is not with DMA API but the x86 HW IOMMU
translation performance on certain low end x86 machines.

> 
> Thanks
> 
> >
> > >
> > > > We observed performance impacts on certain low-end x86 machines
> > > > when IOMMU mode was enabled.
> > > > I think, correctness is Host userspace application's
> > > > responsibility, in this case when vhost-vdpa is used with Host
> > > > application such as DPDK
> > > Virtio user PMD.
> > >
> > > Thanks
> > >
> > > >
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > >
> >


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-12  9:48                   ` Srujana Challa
@ 2024-04-15  6:49                     ` Jason Wang
  2024-04-15 12:42                       ` Srujana Challa
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2024-04-15  6:49 UTC (permalink / raw)
  To: Srujana Challa
  Cc: Michael S. Tsirkin, virtualization, xuanzhuo,
	Vamsi Krishna Attunuru, Shijith Thotton, Nithin Kumar Dabilpuram,
	Jerin Jacob, eperezma

On Fri, Apr 12, 2024 at 5:49 PM Srujana Challa <schalla@marvell.com> wrote:
>
> > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> > OCTEON DPU devices
> >
> > On Fri, Apr 12, 2024 at 1:13 PM Srujana Challa <schalla@marvell.com> wrote:
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Jason Wang <jasowang@redhat.com>
> > > > Sent: Thursday, April 11, 2024 11:32 AM
> > > > To: Srujana Challa <schalla@marvell.com>
> > > > Cc: Michael S. Tsirkin <mst@redhat.com>;
> > > > virtualization@lists.linux.dev; xuanzhuo@linux.alibaba.com; Vamsi
> > > > Krishna Attunuru <vattunuru@marvell.com>; Shijith Thotton
> > > > <sthotton@marvell.com>; Nithin Kumar Dabilpuram
> > > > <ndabilpuram@marvell.com>; Jerin Jacob <jerinj@marvell.com>;
> > > > eperezma <eperezma@redhat.com>
> > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for
> > > > Marvell OCTEON DPU devices
> > > >
> > > > On Wed, Apr 10, 2024 at 8:35 PM Srujana Challa <schalla@marvell.com>
> > > > wrote:
> > > > >
> > > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver
> > > > > > for Marvell OCTEON DPU devices
> > > > > >
> > > > > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > > > > > > +
> > > > > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > > > > +       if (!domain || domain->type ==
> > > > > > > > > > > + IOMMU_DOMAIN_IDENTITY)
> > > > {
> > > > > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > > > > + octep_vdpa_set_map;
> > > > > > > > > >
> > > > > > > > > > Is this a shortcut to have get better performance? DMA
> > > > > > > > > > API should have those greacefully I think.
> > > > > > > > > When IOMMU is disabled on host and set_map/dma_map is not
> > > > > > > > > set, vhost-vdpa is reporting an error "Failed to allocate
> > > > > > > > > domain, device is not
> > > > > > > > IOMMU cache coherent capable\n".
> > > > > > > > > Hence we are doing this way to get better performance.
> > > > > > > >
> > > > > > > > The problem is, assuming the device does not have any
> > > > > > > > internal
> > > > IOMMU.
> > > > > > > >
> > > > > > > > 1) If we allow it running without IOMMU, it opens a window
> > > > > > > > for guest to attack the host.
> > > > > > > > 2) If you see perforamnce issue with IOMMU_DOMAIN_IDENTITY,
> > > > > > > > let's report it to DMA/IOMMU maintiner to fix that
> > > > > > > It will be helpful for host networking case when iommu is disabled.
> > > > > > > Can we take the vfio pci driver approach as a reference where
> > > > > > > user explicitly set "enable_unsafe_noiommu_mode" using module
> > param?
> > > > > >
> > > > > > vfio is a userspace driver so it's userspace's responsibility.
> > > > > > what exactly ensures correctness here? does the device have an
> > > > > > on-chip iommu?
> > > > > >
> > > > > Our device features an on-chip IOMMU, although it is not utilized
> > > > > for host-side targeted DMA operations. We included no-iommu mode
> > > > > in our driver to ensure that host applications, such as DPDK
> > > > > Virtio user PMD, continue to function even when operating in a no-
> > IOMMU mode.
> > > >
> > > > I may miss something but set_map() is empty in this driver. How
> > > > could such isolation be done?
> > >
> > > In no-iommu case, there would be no domain right, and the user of
> > > vhost-vdpa(DPDK virtio user pmd), would create the mapping and pass
> > > the PA (= IOVA) to the device directly. So that, device can directly DMA to the
> > PA.
> >
> > Yes, but this doesn't differ too much from the case where DMA API is used
> > with IOMMU disabled.
> >
> > Are you saying DMA API introduces overheads in this case?
> No actually, current vhost-vdpa code is not allowing IOMMU disabled mode,
> If set_map/dma_map op is not set. Hence, we are setting set_map with dummy
> api to allow IOMMU disabled mode.
>
> Following is the code snippet from drivers/vhost/vdpa.c
>
>       /* Device want to do DMA by itself */
>         if (ops->set_map || ops->dma_map)
>                 return 0;
>
>         bus = dma_dev->bus;
>         if (!bus)
>                 return -EFAULT;
>
>        if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY))
>                 return -ENOTSUPP;

Right, so here's the question.

When IOMMU is disabled, if there's no isolation from the device
on-chip IOMMU. It might have security implications. For example if
we're using PA, userspace could attack the kernel.

So there should be some logic in the set_map() to program the on-chip
IOMMU to isolate DMA in that case but I don't see such implementation
done in set_map().

>
> Performance degradation when iommu enabled is not with DMA API but the x86 HW IOMMU
> translation performance on certain low end x86 machines.

This might be true but it's not specific to vDPA I think?

Thanks

>
> >
> > Thanks
> >
> > >
> > > >
> > > > > We observed performance impacts on certain low-end x86 machines
> > > > > when IOMMU mode was enabled.
> > > > > I think, correctness is Host userspace application's
> > > > > responsibility, in this case when vhost-vdpa is used with Host
> > > > > application such as DPDK
> > > > Virtio user PMD.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > > > >
> > > > > > > > Thanks
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > >
> > >
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-15  6:49                     ` Jason Wang
@ 2024-04-15 12:42                       ` Srujana Challa
  2024-04-16  3:17                         ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Srujana Challa @ 2024-04-15 12:42 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, xuanzhuo,
	Vamsi Krishna Attunuru, Shijith Thotton, Nithin Kumar Dabilpuram,
	Jerin Jacob, eperezma

> Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> OCTEON DPU devices
> 
> On Fri, Apr 12, 2024 at 5:49 PM Srujana Challa <schalla@marvell.com> wrote:
> >
> > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for
> > > Marvell OCTEON DPU devices
> > >
> > > On Fri, Apr 12, 2024 at 1:13 PM Srujana Challa <schalla@marvell.com>
> wrote:
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Jason Wang <jasowang@redhat.com>
> > > > > Sent: Thursday, April 11, 2024 11:32 AM
> > > > > To: Srujana Challa <schalla@marvell.com>
> > > > > Cc: Michael S. Tsirkin <mst@redhat.com>;
> > > > > virtualization@lists.linux.dev; xuanzhuo@linux.alibaba.com;
> > > > > Vamsi Krishna Attunuru <vattunuru@marvell.com>; Shijith Thotton
> > > > > <sthotton@marvell.com>; Nithin Kumar Dabilpuram
> > > > > <ndabilpuram@marvell.com>; Jerin Jacob <jerinj@marvell.com>;
> > > > > eperezma <eperezma@redhat.com>
> > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver
> > > > > for Marvell OCTEON DPU devices
> > > > >
> > > > > On Wed, Apr 10, 2024 at 8:35 PM Srujana Challa
> > > > > <schalla@marvell.com>
> > > > > wrote:
> > > > > >
> > > > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA
> > > > > > > driver for Marvell OCTEON DPU devices
> > > > > > >
> > > > > > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > > > > > > > +
> > > > > > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > > > > > +       if (!domain || domain->type ==
> > > > > > > > > > > > + IOMMU_DOMAIN_IDENTITY)
> > > > > {
> > > > > > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > > > > > + octep_vdpa_set_map;
> > > > > > > > > > >
> > > > > > > > > > > Is this a shortcut to have get better performance?
> > > > > > > > > > > DMA API should have those greacefully I think.
> > > > > > > > > > When IOMMU is disabled on host and set_map/dma_map is
> > > > > > > > > > not set, vhost-vdpa is reporting an error "Failed to
> > > > > > > > > > allocate domain, device is not
> > > > > > > > > IOMMU cache coherent capable\n".
> > > > > > > > > > Hence we are doing this way to get better performance.
> > > > > > > > >
> > > > > > > > > The problem is, assuming the device does not have any
> > > > > > > > > internal
> > > > > IOMMU.
> > > > > > > > >
> > > > > > > > > 1) If we allow it running without IOMMU, it opens a
> > > > > > > > > window for guest to attack the host.
> > > > > > > > > 2) If you see perforamnce issue with
> > > > > > > > > IOMMU_DOMAIN_IDENTITY, let's report it to DMA/IOMMU
> > > > > > > > > maintiner to fix that
> > > > > > > > It will be helpful for host networking case when iommu is disabled.
> > > > > > > > Can we take the vfio pci driver approach as a reference
> > > > > > > > where user explicitly set "enable_unsafe_noiommu_mode"
> > > > > > > > using module
> > > param?
> > > > > > >
> > > > > > > vfio is a userspace driver so it's userspace's responsibility.
> > > > > > > what exactly ensures correctness here? does the device have
> > > > > > > an on-chip iommu?
> > > > > > >
> > > > > > Our device features an on-chip IOMMU, although it is not
> > > > > > utilized for host-side targeted DMA operations. We included
> > > > > > no-iommu mode in our driver to ensure that host applications,
> > > > > > such as DPDK Virtio user PMD, continue to function even when
> > > > > > operating in a no-
> > > IOMMU mode.
> > > > >
> > > > > I may miss something but set_map() is empty in this driver. How
> > > > > could such isolation be done?
> > > >
> > > > In no-iommu case, there would be no domain right, and the user of
> > > > vhost-vdpa(DPDK virtio user pmd), would create the mapping and
> > > > pass the PA (= IOVA) to the device directly. So that, device can
> > > > directly DMA to the
> > > PA.
> > >
> > > Yes, but this doesn't differ too much from the case where DMA API is
> > > used with IOMMU disabled.
> > >
> > > Are you saying DMA API introduces overheads in this case?
> > No actually, current vhost-vdpa code is not allowing IOMMU disabled
> > mode, If set_map/dma_map op is not set. Hence, we are setting set_map
> > with dummy api to allow IOMMU disabled mode.
> >
> > Following is the code snippet from drivers/vhost/vdpa.c
> >
> >       /* Device want to do DMA by itself */
> >         if (ops->set_map || ops->dma_map)
> >                 return 0;
> >
> >         bus = dma_dev->bus;
> >         if (!bus)
> >                 return -EFAULT;
> >
> >        if (!device_iommu_capable(dma_dev,
> IOMMU_CAP_CACHE_COHERENCY))
> >                 return -ENOTSUPP;
> 
> Right, so here's the question.
> 
> When IOMMU is disabled, if there's no isolation from the device on-chip
> IOMMU. It might have security implications. For example if we're using PA,
> userspace could attack the kernel.
> 
> So there should be some logic in the set_map() to program the on-chip
> IOMMU to isolate DMA in that case but I don't see such implementation done
> in set_map().

Our chip lacks support for on-chip IOMMU for host-side targeted DMA operations.
When using the DPDK virtio user PMD, we’ve noticed a significant 80% performance
improvement when IOMMU is disabled on specific x86 machines. This performance
improvement can be leveraged by embedded platforms where applications run in
controlled environment.
May be it's a trade-off between security and performance.

We can disable the no-iommu support by default and enable it through some module
parameter and taint the kernel similar to VFIO driver(enable_unsafe_noiommu_mode) right?
> 
> >
> > Performance degradation when iommu enabled is not with DMA API but the
> > x86 HW IOMMU translation performance on certain low end x86 machines.
> 
> This might be true but it's not specific to vDPA I think?
> 
> Thanks
> 
> >
> > >
> > > Thanks
> > >
> > > >
> > > > >
> > > > > > We observed performance impacts on certain low-end x86
> > > > > > machines when IOMMU mode was enabled.
> > > > > > I think, correctness is Host userspace application's
> > > > > > responsibility, in this case when vhost-vdpa is used with Host
> > > > > > application such as DPDK
> > > > > Virtio user PMD.
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > >
> > > > > >
> > > >
> >


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-15 12:42                       ` Srujana Challa
@ 2024-04-16  3:17                         ` Jason Wang
  2024-04-22 20:42                           ` Michael S. Tsirkin
  2024-04-23  5:40                           ` Srujana Challa
  0 siblings, 2 replies; 28+ messages in thread
From: Jason Wang @ 2024-04-16  3:17 UTC (permalink / raw)
  To: Srujana Challa, Michael S. Tsirkin
  Cc: virtualization, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

On Mon, Apr 15, 2024 at 8:42 PM Srujana Challa <schalla@marvell.com> wrote:
>
> > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> > OCTEON DPU devices
> >
> > On Fri, Apr 12, 2024 at 5:49 PM Srujana Challa <schalla@marvell.com> wrote:
> > >
> > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for
> > > > Marvell OCTEON DPU devices
> > > >
> > > > On Fri, Apr 12, 2024 at 1:13 PM Srujana Challa <schalla@marvell.com>
> > wrote:
> > > > >
> > > > >
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Jason Wang <jasowang@redhat.com>
> > > > > > Sent: Thursday, April 11, 2024 11:32 AM
> > > > > > To: Srujana Challa <schalla@marvell.com>
> > > > > > Cc: Michael S. Tsirkin <mst@redhat.com>;
> > > > > > virtualization@lists.linux.dev; xuanzhuo@linux.alibaba.com;
> > > > > > Vamsi Krishna Attunuru <vattunuru@marvell.com>; Shijith Thotton
> > > > > > <sthotton@marvell.com>; Nithin Kumar Dabilpuram
> > > > > > <ndabilpuram@marvell.com>; Jerin Jacob <jerinj@marvell.com>;
> > > > > > eperezma <eperezma@redhat.com>
> > > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver
> > > > > > for Marvell OCTEON DPU devices
> > > > > >
> > > > > > On Wed, Apr 10, 2024 at 8:35 PM Srujana Challa
> > > > > > <schalla@marvell.com>
> > > > > > wrote:
> > > > > > >
> > > > > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA
> > > > > > > > driver for Marvell OCTEON DPU devices
> > > > > > > >
> > > > > > > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > > > > > > +       if (!domain || domain->type ==
> > > > > > > > > > > > > + IOMMU_DOMAIN_IDENTITY)
> > > > > > {
> > > > > > > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > > > > > > + octep_vdpa_set_map;
> > > > > > > > > > > >
> > > > > > > > > > > > Is this a shortcut to have get better performance?
> > > > > > > > > > > > DMA API should have those greacefully I think.
> > > > > > > > > > > When IOMMU is disabled on host and set_map/dma_map is
> > > > > > > > > > > not set, vhost-vdpa is reporting an error "Failed to
> > > > > > > > > > > allocate domain, device is not
> > > > > > > > > > IOMMU cache coherent capable\n".
> > > > > > > > > > > Hence we are doing this way to get better performance.
> > > > > > > > > >
> > > > > > > > > > The problem is, assuming the device does not have any
> > > > > > > > > > internal
> > > > > > IOMMU.
> > > > > > > > > >
> > > > > > > > > > 1) If we allow it running without IOMMU, it opens a
> > > > > > > > > > window for guest to attack the host.
> > > > > > > > > > 2) If you see perforamnce issue with
> > > > > > > > > > IOMMU_DOMAIN_IDENTITY, let's report it to DMA/IOMMU
> > > > > > > > > > maintiner to fix that
> > > > > > > > > It will be helpful for host networking case when iommu is disabled.
> > > > > > > > > Can we take the vfio pci driver approach as a reference
> > > > > > > > > where user explicitly set "enable_unsafe_noiommu_mode"
> > > > > > > > > using module
> > > > param?
> > > > > > > >
> > > > > > > > vfio is a userspace driver so it's userspace's responsibility.
> > > > > > > > what exactly ensures correctness here? does the device have
> > > > > > > > an on-chip iommu?
> > > > > > > >
> > > > > > > Our device features an on-chip IOMMU, although it is not
> > > > > > > utilized for host-side targeted DMA operations. We included
> > > > > > > no-iommu mode in our driver to ensure that host applications,
> > > > > > > such as DPDK Virtio user PMD, continue to function even when
> > > > > > > operating in a no-
> > > > IOMMU mode.
> > > > > >
> > > > > > I may miss something but set_map() is empty in this driver. How
> > > > > > could such isolation be done?
> > > > >
> > > > > In no-iommu case, there would be no domain right, and the user of
> > > > > vhost-vdpa(DPDK virtio user pmd), would create the mapping and
> > > > > pass the PA (= IOVA) to the device directly. So that, device can
> > > > > directly DMA to the
> > > > PA.
> > > >
> > > > Yes, but this doesn't differ too much from the case where DMA API is
> > > > used with IOMMU disabled.
> > > >
> > > > Are you saying DMA API introduces overheads in this case?
> > > No actually, current vhost-vdpa code is not allowing IOMMU disabled
> > > mode, If set_map/dma_map op is not set. Hence, we are setting set_map
> > > with dummy api to allow IOMMU disabled mode.
> > >
> > > Following is the code snippet from drivers/vhost/vdpa.c
> > >
> > >       /* Device want to do DMA by itself */
> > >         if (ops->set_map || ops->dma_map)
> > >                 return 0;
> > >
> > >         bus = dma_dev->bus;
> > >         if (!bus)
> > >                 return -EFAULT;
> > >
> > >        if (!device_iommu_capable(dma_dev,
> > IOMMU_CAP_CACHE_COHERENCY))
> > >                 return -ENOTSUPP;
> >
> > Right, so here's the question.
> >
> > When IOMMU is disabled, if there's no isolation from the device on-chip
> > IOMMU. It might have security implications. For example if we're using PA,
> > userspace could attack the kernel.
> >
> > So there should be some logic in the set_map() to program the on-chip
> > IOMMU to isolate DMA in that case but I don't see such implementation done
> > in set_map().
>
> Our chip lacks support for on-chip IOMMU for host-side targeted DMA operations.
> When using the DPDK virtio user PMD, we’ve noticed a significant 80% performance
> improvement when IOMMU is disabled on specific x86 machines. This performance
> improvement can be leveraged by embedded platforms where applications run in
> controlled environment.
> May be it's a trade-off between security and performance.
>
> We can disable the no-iommu support by default and enable it through some module
> parameter and taint the kernel similar to VFIO driver(enable_unsafe_noiommu_mode) right?

Could be one way.

Michael, any thoughts on this?

Thanks

> >
> > >
> > > Performance degradation when iommu enabled is not with DMA API but the
> > > x86 HW IOMMU translation performance on certain low end x86 machines.
> >
> > This might be true but it's not specific to vDPA I think?
> >
> > Thanks
> >
> > >
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > >
> > > > > > > We observed performance impacts on certain low-end x86
> > > > > > > machines when IOMMU mode was enabled.
> > > > > > > I think, correctness is Host userspace application's
> > > > > > > responsibility, in this case when vhost-vdpa is used with Host
> > > > > > > application such as DPDK
> > > > > > Virtio user PMD.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > Thanks.
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-16  3:17                         ` Jason Wang
@ 2024-04-22 20:42                           ` Michael S. Tsirkin
  2024-04-24 13:05                             ` Srujana Challa
  2024-04-23  5:40                           ` Srujana Challa
  1 sibling, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2024-04-22 20:42 UTC (permalink / raw)
  To: Jason Wang
  Cc: Srujana Challa, virtualization, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

On Tue, Apr 16, 2024 at 11:17:48AM +0800, Jason Wang wrote:
> On Mon, Apr 15, 2024 at 8:42 PM Srujana Challa <schalla@marvell.com> wrote:
> >
> > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell
> > > OCTEON DPU devices
> > >
> > > On Fri, Apr 12, 2024 at 5:49 PM Srujana Challa <schalla@marvell.com> wrote:
> > > >
> > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for
> > > > > Marvell OCTEON DPU devices
> > > > >
> > > > > On Fri, Apr 12, 2024 at 1:13 PM Srujana Challa <schalla@marvell.com>
> > > wrote:
> > > > > >
> > > > > >
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Jason Wang <jasowang@redhat.com>
> > > > > > > Sent: Thursday, April 11, 2024 11:32 AM
> > > > > > > To: Srujana Challa <schalla@marvell.com>
> > > > > > > Cc: Michael S. Tsirkin <mst@redhat.com>;
> > > > > > > virtualization@lists.linux.dev; xuanzhuo@linux.alibaba.com;
> > > > > > > Vamsi Krishna Attunuru <vattunuru@marvell.com>; Shijith Thotton
> > > > > > > <sthotton@marvell.com>; Nithin Kumar Dabilpuram
> > > > > > > <ndabilpuram@marvell.com>; Jerin Jacob <jerinj@marvell.com>;
> > > > > > > eperezma <eperezma@redhat.com>
> > > > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver
> > > > > > > for Marvell OCTEON DPU devices
> > > > > > >
> > > > > > > On Wed, Apr 10, 2024 at 8:35 PM Srujana Challa
> > > > > > > <schalla@marvell.com>
> > > > > > > wrote:
> > > > > > > >
> > > > > > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA
> > > > > > > > > driver for Marvell OCTEON DPU devices
> > > > > > > > >
> > > > > > > > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa wrote:
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > > > > > > > +       if (!domain || domain->type ==
> > > > > > > > > > > > > > + IOMMU_DOMAIN_IDENTITY)
> > > > > > > {
> > > > > > > > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > > > > > > > + octep_vdpa_set_map;
> > > > > > > > > > > > >
> > > > > > > > > > > > > Is this a shortcut to have get better performance?
> > > > > > > > > > > > > DMA API should have those greacefully I think.
> > > > > > > > > > > > When IOMMU is disabled on host and set_map/dma_map is
> > > > > > > > > > > > not set, vhost-vdpa is reporting an error "Failed to
> > > > > > > > > > > > allocate domain, device is not
> > > > > > > > > > > IOMMU cache coherent capable\n".
> > > > > > > > > > > > Hence we are doing this way to get better performance.
> > > > > > > > > > >
> > > > > > > > > > > The problem is, assuming the device does not have any
> > > > > > > > > > > internal
> > > > > > > IOMMU.
> > > > > > > > > > >
> > > > > > > > > > > 1) If we allow it running without IOMMU, it opens a
> > > > > > > > > > > window for guest to attack the host.
> > > > > > > > > > > 2) If you see perforamnce issue with
> > > > > > > > > > > IOMMU_DOMAIN_IDENTITY, let's report it to DMA/IOMMU
> > > > > > > > > > > maintiner to fix that
> > > > > > > > > > It will be helpful for host networking case when iommu is disabled.
> > > > > > > > > > Can we take the vfio pci driver approach as a reference
> > > > > > > > > > where user explicitly set "enable_unsafe_noiommu_mode"
> > > > > > > > > > using module
> > > > > param?
> > > > > > > > >
> > > > > > > > > vfio is a userspace driver so it's userspace's responsibility.
> > > > > > > > > what exactly ensures correctness here? does the device have
> > > > > > > > > an on-chip iommu?
> > > > > > > > >
> > > > > > > > Our device features an on-chip IOMMU, although it is not
> > > > > > > > utilized for host-side targeted DMA operations. We included
> > > > > > > > no-iommu mode in our driver to ensure that host applications,
> > > > > > > > such as DPDK Virtio user PMD, continue to function even when
> > > > > > > > operating in a no-
> > > > > IOMMU mode.
> > > > > > >
> > > > > > > I may miss something but set_map() is empty in this driver. How
> > > > > > > could such isolation be done?
> > > > > >
> > > > > > In no-iommu case, there would be no domain right, and the user of
> > > > > > vhost-vdpa(DPDK virtio user pmd), would create the mapping and
> > > > > > pass the PA (= IOVA) to the device directly. So that, device can
> > > > > > directly DMA to the
> > > > > PA.
> > > > >
> > > > > Yes, but this doesn't differ too much from the case where DMA API is
> > > > > used with IOMMU disabled.
> > > > >
> > > > > Are you saying DMA API introduces overheads in this case?
> > > > No actually, current vhost-vdpa code is not allowing IOMMU disabled
> > > > mode, If set_map/dma_map op is not set. Hence, we are setting set_map
> > > > with dummy api to allow IOMMU disabled mode.
> > > >
> > > > Following is the code snippet from drivers/vhost/vdpa.c
> > > >
> > > >       /* Device want to do DMA by itself */
> > > >         if (ops->set_map || ops->dma_map)
> > > >                 return 0;
> > > >
> > > >         bus = dma_dev->bus;
> > > >         if (!bus)
> > > >                 return -EFAULT;
> > > >
> > > >        if (!device_iommu_capable(dma_dev,
> > > IOMMU_CAP_CACHE_COHERENCY))
> > > >                 return -ENOTSUPP;
> > >
> > > Right, so here's the question.
> > >
> > > When IOMMU is disabled, if there's no isolation from the device on-chip
> > > IOMMU. It might have security implications. For example if we're using PA,
> > > userspace could attack the kernel.
> > >
> > > So there should be some logic in the set_map() to program the on-chip
> > > IOMMU to isolate DMA in that case but I don't see such implementation done
> > > in set_map().
> >
> > Our chip lacks support for on-chip IOMMU for host-side targeted DMA operations.
> > When using the DPDK virtio user PMD, we’ve noticed a significant 80% performance
> > improvement when IOMMU is disabled on specific x86 machines. This performance
> > improvement can be leveraged by embedded platforms where applications run in
> > controlled environment.
> > May be it's a trade-off between security and performance.
> >
> > We can disable the no-iommu support by default and enable it through some module
> > parameter and taint the kernel similar to VFIO driver(enable_unsafe_noiommu_mode) right?
> 
> Could be one way.
> 
> Michael, any thoughts on this?
> 
> Thanks

My thought is there's nothing special about the Marvell chip here.
Merge it normally. Then if you like work on a no-iommu mode in vdpa.


> > >
> > > >
> > > > Performance degradation when iommu enabled is not with DMA API but the
> > > > x86 HW IOMMU translation performance on certain low end x86 machines.
> > >
> > > This might be true but it's not specific to vDPA I think?
> > >
> > > Thanks
> > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > We observed performance impacts on certain low-end x86
> > > > > > > > machines when IOMMU mode was enabled.
> > > > > > > > I think, correctness is Host userspace application's
> > > > > > > > responsibility, in this case when vhost-vdpa is used with Host
> > > > > > > > application such as DPDK
> > > > > > > Virtio user PMD.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > Thanks.
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-27 11:21 [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices Srujana Challa
  2024-03-29  4:28 ` Jason Wang
  2024-03-29 11:37 ` Stefano Garzarella
@ 2024-04-22 20:44 ` Michael S. Tsirkin
  2024-04-24 19:08 ` [PATCH v2] " Srujana Challa
  3 siblings, 0 replies; 28+ messages in thread
From: Michael S. Tsirkin @ 2024-04-22 20:44 UTC (permalink / raw)
  To: Srujana Challa
  Cc: virtualization, jasowang, xuanzhuo, vattunuru, sthotton,
	ndabilpuram, jerinj

On Wed, Mar 27, 2024 at 04:51:36PM +0530, Srujana Challa wrote:
> This commit introduces a new vDPA driver specifically designed for
> managing the virtio control plane over the vDPA bus for OCTEON DPU
> devices. The driver consists of two layers:
> 
> 1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
> operations and configurations related to the DPU device.
> 
> 2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
> implements device operations for the vDPA bus. It handles device
> probing, bus attachment, vring operations, and other relevant tasks.
> 
> Signed-off-by: Srujana Challa <schalla@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>

Srujana if this is to get in the next window you should iterate.

> ---
>  MAINTAINERS                              |   5 +
>  drivers/vdpa/Kconfig                     |   9 +
>  drivers/vdpa/Makefile                    |   1 +
>  drivers/vdpa/octeon_ep/Makefile          |   4 +
>  drivers/vdpa/octeon_ep/octep_vdpa.h      |  93 +++
>  drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 457 ++++++++++++
>  drivers/vdpa/octeon_ep/octep_vdpa_main.c | 903 +++++++++++++++++++++++
>  7 files changed, 1472 insertions(+)
>  create mode 100644 drivers/vdpa/octeon_ep/Makefile
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cfe44a876d8a..539ce209a960 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13127,6 +13127,11 @@ S:	Supported
>  F:	Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
>  F:	drivers/mmc/host/sdhci-xenon*
>  
> +MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
> +R:	schalla@marvell.com
> +R:	vattunuru@marvell.com
> +F:	drivers/vdpa/octeon_ep/
> +
>  MATROX FRAMEBUFFER DRIVER
>  L:	linux-fbdev@vger.kernel.org
>  S:	Orphan
> diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
> index 656c1cb541de..775149fad476 100644
> --- a/drivers/vdpa/Kconfig
> +++ b/drivers/vdpa/Kconfig
> @@ -126,4 +126,13 @@ config PDS_VDPA
>  	  With this driver, the VirtIO dataplane can be
>  	  offloaded to an AMD/Pensando DSC device.
>  
> +config OCTEONEP_VDPA
> +	tristate "vDPA driver for Octeon DPU devices"
> +	depends on m
> +	depends on PCI_MSI
> +	help
> +	  vDPA driver for Marvell's Octeon DPU devices.
> +	  With this driver, the VirtIO dataplane can be
> +	  offloaded to a Octeon DPU device.
> +
>  endif # VDPA
> diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
> index 8f53c6f3cca7..5654d36707af 100644
> --- a/drivers/vdpa/Makefile
> +++ b/drivers/vdpa/Makefile
> @@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
>  obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
>  obj-$(CONFIG_SNET_VDPA) += solidrun/
>  obj-$(CONFIG_PDS_VDPA) += pds/
> +obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
> diff --git a/drivers/vdpa/octeon_ep/Makefile b/drivers/vdpa/octeon_ep/Makefile
> new file mode 100644
> index 000000000000..e23e2ff14f33
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/Makefile
> @@ -0,0 +1,4 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
> +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
> +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h
> new file mode 100644
> index 000000000000..60d2efc8f8b8
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
> @@ -0,0 +1,93 @@
> +/* SPDX-License-Identifier: GPL-2.0-only
> + * Copyright (C) 2024 Marvell.
> + */
> +#ifndef __OCTEP_VDPA_H__
> +#define __OCTEP_VDPA_H__
> +
> +#include <linux/pci.h>
> +#include <linux/pci_regs.h>
> +#include <linux/vdpa.h>
> +#include <linux/virtio_pci_modern.h>
> +#include <uapi/linux/virtio_net.h>
> +#include <uapi/linux/virtio_blk.h>
> +#include <uapi/linux/virtio_config.h>
> +#include <uapi/linux/virtio_pci.h>
> +#include <uapi/linux/vdpa.h>
> +
> +#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
> +#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
> +#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
> +#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
> +#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
> +#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
> +
> +#define OCTEP_HW_MBOX_BAR 0
> +#define OCTEP_HW_CAPS_BAR 4
> +
> +#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
> +
> +#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
> +#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
> +#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
> +
> +#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
> +#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
> +
> +#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
> +#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
> +
> +enum octep_vdpa_dev_status {
> +	OCTEP_VDPA_DEV_STATUS_INVALID,
> +	OCTEP_VDPA_DEV_STATUS_ALLOC,
> +	OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
> +	OCTEP_VDPA_DEV_STATUS_INIT,
> +	OCTEP_VDPA_DEV_STATUS_READY,
> +	OCTEP_VDPA_DEV_STATUS_UNINIT
> +};
> +
> +struct octep_vring_info {
> +	struct vdpa_callback cb;
> +	void __iomem *notify_addr;
> +	u32 __iomem *cb_notify_addr;
> +	phys_addr_t notify_pa;
> +	char msix_name[256];
> +};
> +
> +struct octep_hw {
> +	struct pci_dev *pdev;
> +	u8 __iomem *base[PCI_STD_NUM_BARS];
> +	struct virtio_pci_common_cfg __iomem *common_cfg;
> +	u8 __iomem *dev_cfg;
> +	u8 __iomem *isr;
> +	void __iomem *notify_base;
> +	phys_addr_t notify_base_pa;
> +	u32 notify_off_multiplier;
> +	u8 notify_bar;
> +	struct octep_vring_info *vqs;
> +	struct vdpa_callback config_cb;
> +	u64 features;
> +	u64 drv_features;
> +	u16 nr_vring;
> +	u32 config_size;
> +	int irq;
> +};
> +
> +u8 octep_hw_get_status(struct octep_hw *oct_hw);
> +void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
> +void octep_hw_reset(struct octep_hw *oct_hw);
> +void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw);
> +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
> +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length);
> +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
> +			 u64 device_area);
> +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
> +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
> +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
> +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state);
> +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state);
> +u16 octep_get_vq_size(struct octep_hw *oct_hw);
> +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
> +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
> +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
> +
> +#endif /* __OCTEP_VDPA_H__ */
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> new file mode 100644
> index 000000000000..0a2f1d09c4ab
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> @@ -0,0 +1,457 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Marvell. */
> +
> +#include <linux/iopoll.h>
> +
> +#include "octep_vdpa.h"
> +
> +enum octep_mbox_ids {
> +	OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
> +	OCTEP_MBOX_MSG_GET_VQ_STATE,
> +};
> +
> +#define OCTEP_HW_TIMEOUT       10000000
> +
> +#define MBOX_OFFSET            64
> +#define MBOX_RSP_MASK          0x00000001
> +#define MBOX_RC_MASK           0x0000FFFE
> +
> +#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
> +#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
> +#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK | MBOX_RSP_MASK))
> +
> +struct octep_mbox_hdr {
> +	u8 ver;
> +	u8 rsvd1;
> +	u16 id;
> +	u16 rsvd2;
> +#define MBOX_REQ_SIG (0xdead)
> +#define MBOX_RSP_SIG (0xbeef)
> +	u16 sig;
> +};
> +
> +struct octep_mbox_sts {
> +	u16 rsp:1;
> +	u16 rc:15;
> +	u16 rsvd;
> +};
> +
> +struct octep_mbox {
> +	struct octep_mbox_hdr hdr;
> +	struct octep_mbox_sts sts;
> +	u64 rsvd;
> +	u32 data[];
> +};
> +
> +static inline struct octep_mbox __iomem *octep_get_mbox(struct octep_hw *oct_hw)
> +{
> +	return (struct octep_mbox __iomem *)(oct_hw->dev_cfg + MBOX_OFFSET);
> +}
> +
> +static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem *mbox)
> +{
> +	u32 val;
> +
> +	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_AVAIL(val), 10,
> +				  OCTEP_HW_TIMEOUT);
> +}
> +
> +static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem *mbox)
> +{
> +	u32 val;
> +
> +	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val), 10,
> +				  OCTEP_HW_TIMEOUT);
> +}
> +
> +static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16 id, u16 sig)
> +{
> +	iowrite16(id, &mbox->hdr.id);
> +	iowrite16(sig, &mbox->hdr.sig);
> +}
> +
> +static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
> +{
> +	return ioread16(&mbox->hdr.sig);
> +}
> +
> +static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32 sts)
> +{
> +	iowrite32(sts, &mbox->sts);
> +}
> +
> +static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
> +{
> +	return ioread32(&mbox->sts);
> +}
> +
> +static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox, u16 word_idx)
> +{
> +	return ioread32(&mbox->data[word_idx]);
> +}
> +
> +static inline void octep_write32_word(struct octep_mbox __iomem *mbox, u16 word_idx, u32 word)
> +{
> +	return iowrite32(word, &mbox->data[word_idx]);
> +}
> +
> +static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *buffer,
> +			      u32 buf_size, bool write)
> +{
> +	struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
> +	struct pci_dev *pdev = oct_hw->pdev;
> +	u32 *p = (u32 *)buffer;
> +	u16 data_wds;
> +	int ret, i;
> +	u32 val;
> +
> +	if (!IS_ALIGNED(buf_size, 4))
> +		return -EINVAL;
> +
> +	/* Make sure mbox space is available */
> +	ret = octep_wait_for_mbox_avail(mbox);
> +	if (ret) {
> +		dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to be consumed\n");
> +		return ret;
> +	}
> +	data_wds = buf_size / 4;
> +
> +	if (write) {
> +		for (i = 1; i <= data_wds; i++) {
> +			octep_write32_word(mbox, i, *p);
> +			p++;
> +		}
> +	}
> +	octep_write32_word(mbox, 0, (u32)qid);
> +	octep_write_sts(mbox, 0);
> +
> +	octep_write_hdr(mbox, id, MBOX_REQ_SIG);
> +
> +	ret = octep_wait_for_mbox_rsp(mbox);
> +	if (ret) {
> +		dev_warn(&pdev->dev, "Timeout waiting for mbox : %d response\n", id);
> +		return ret;
> +	}
> +
> +	val = octep_read_sig(mbox);
> +	if ((val & 0xFFFF) != MBOX_RSP_SIG) {
> +		dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id);
> +		return ret;
> +	}
> +
> +	val = octep_read_sts(mbox);
> +	if (val & MBOX_RC_MASK) {
> +		ret = MBOX_RSP_TO_ERR(val);
> +		dev_warn(&pdev->dev, "Error while processing mbox : %d, err %d\n", id, ret);
> +		return ret;
> +	}
> +
> +	if (!write)
> +		for (i = 1; i <= data_wds; i++)
> +			*p++ = octep_read32_word(mbox, i);
> +
> +	return 0;
> +}
> +
> +static void octep_mbox_init(struct octep_mbox __iomem *mbox)
> +{
> +	iowrite32(1, &mbox->sts);
> +}
> +
> +u8 octep_hw_get_status(struct octep_hw *oct_hw)
> +{
> +	return ioread8(&oct_hw->common_cfg->device_status);
> +}
> +
> +void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
> +{
> +	iowrite8(status, &oct_hw->common_cfg->device_status);
> +}
> +
> +void octep_hw_reset(struct octep_hw *oct_hw)
> +{
> +	u8 val;
> +
> +	octep_hw_set_status(oct_hw, 0 | BIT(7));
> +	if (readx_poll_timeout(ioread8, &oct_hw->common_cfg->device_status, val, !val, 10,
> +			       OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
> +		return;
> +	}
> +}
> +
> +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
> +{
> +	u32 features_lo, features_hi;
> +	u32 val, select;
> +
> +	select = 0;
> +	iowrite32(select | BIT(15), &oct_hw->common_cfg->device_feature_select);
> +
> +	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->device_feature_select, val,
> +			       val == select, 10, OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +		return 0ULL;
> +	}
> +	features_lo = ioread32(&oct_hw->common_cfg->device_feature);
> +
> +	select = 1;
> +	iowrite32(select | BIT(15), &oct_hw->common_cfg->device_feature_select);
> +
> +	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->device_feature_select, val,
> +			       val == select, 10, OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +		return 0ULL;
> +	}
> +	features_hi = ioread32(&oct_hw->common_cfg->device_feature);
> +
> +	return ((u64)features_hi << 32) | features_lo;
> +}
> +
> +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
> +{
> +	u32 val, select;
> +
> +	select = 0;
> +	iowrite32(select | BIT(15), &oct_hw->common_cfg->guest_feature_select);
> +
> +	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->guest_feature_select, val,
> +			       val == select, 10, OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +		return;
> +	}
> +	iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg->guest_feature);
> +
> +	select = 1;
> +	iowrite32(select | BIT(15), &oct_hw->common_cfg->guest_feature_select);
> +
> +	if (readx_poll_timeout(ioread32, &oct_hw->common_cfg->guest_feature_select, val,
> +			       val == select, 10, OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +		return;
> +	}
> +	iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
> +}
> +
> +void octep_write_queue_select(u16 queue_id, struct octep_hw *oct_hw)
> +{
> +	u16 val;
> +
> +	iowrite16(queue_id | BIT(15), &oct_hw->common_cfg->queue_select);
> +
> +	if (readx_poll_timeout(ioread16, &oct_hw->common_cfg->queue_select, val, val == queue_id,
> +			       10, OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
> +		return;
> +	}
> +}
> +
> +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
> +{
> +	iowrite16(qid, oct_hw->vqs[qid].notify_addr);
> +}
> +
> +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length)
> +{
> +	u8 old_gen, new_gen, *p;
> +	int i;
> +
> +	WARN_ON(offset + length > oct_hw->config_size);
> +	do {
> +		old_gen = ioread8(&oct_hw->common_cfg->config_generation);
> +		p = dst;
> +		for (i = 0; i < length; i++)
> +			*p++ = ioread8(oct_hw->dev_cfg + offset + i);
> +
> +		new_gen = ioread8(&oct_hw->common_cfg->config_generation);
> +	} while (old_gen != new_gen);
> +}
> +
> +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
> +			 u64 device_area)
> +{
> +	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +	octep_write_queue_select(qid, oct_hw);
> +	vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> +			     &cfg->queue_desc_hi);
> +	vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> +			     &cfg->queue_avail_hi);
> +	vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> +			     &cfg->queue_used_hi);
> +
> +	return 0;
> +}
> +
> +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state)
> +{
> +	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
> +				  sizeof(*state), 0);
> +}
> +
> +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state)
> +{
> +	struct vdpa_vq_state q_state;
> +
> +	memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
> +	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
> +				  sizeof(*state), 1);
> +}
> +
> +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
> +{
> +	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +	octep_write_queue_select(qid, oct_hw);
> +	iowrite16(num, &cfg->queue_size);
> +}
> +
> +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
> +{
> +	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +	octep_write_queue_select(qid, oct_hw);
> +	iowrite16(ready, &cfg->queue_enable);
> +}
> +
> +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
> +{
> +	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +	octep_write_queue_select(qid, oct_hw);
> +	return ioread16(&cfg->queue_enable);
> +}
> +
> +u16 octep_get_vq_size(struct octep_hw *oct_hw)
> +{
> +	octep_write_queue_select(0, oct_hw);
> +	return ioread16(&oct_hw->common_cfg->queue_size);
> +}
> +
> +static u32 octep_get_config_size(struct octep_hw *oct_hw)
> +{
> +	return sizeof(struct virtio_net_config);
> +}
> +
> +static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap)
> +{
> +	struct device *dev = &oct_hw->pdev->dev;
> +	u32 length = cap->length;
> +	u32 offset = cap->offset;
> +	u8  bar    = cap->bar;
> +	u32 len;
> +
> +	if (bar != OCTEP_HW_CAPS_BAR) {
> +		dev_err(dev, "Invalid bar: %u\n", bar);
> +		return NULL;
> +	}
> +	if (offset + length < offset) {
> +		dev_err(dev, "offset(%u) + length(%u) overflows\n",
> +			offset, length);
> +		return NULL;
> +	}
> +	len = pci_resource_len(oct_hw->pdev, bar);
> +	if (offset + length > len) {
> +		dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
> +			offset + length, len);
> +		return NULL;
> +	}
> +	return oct_hw->base[bar] + offset;
> +}
> +
> +static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len, off_t offset)
> +{
> +	u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
> +	u8 *p = buf;
> +	size_t i;
> +
> +	for (i = 0; i < len; i++)
> +		*p++ = ioread8(bar + offset + i);
> +}
> +
> +static int pci_signature_verify(struct octep_hw *oct_hw)
> +{
> +	u32 signature[2];
> +
> +	pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
> +
> +	if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
> +		return -1;
> +
> +	if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
> +{
> +	struct octep_mbox __iomem *mbox;
> +	struct device *dev = &pdev->dev;
> +	struct virtio_pci_cap cap;
> +	int ret;
> +	u8 pos;
> +
> +	oct_hw->pdev = pdev;
> +	ret = pci_signature_verify(oct_hw);
> +	if (ret) {
> +		dev_err(dev, "Octeon Virtio FW is not initialized\n");
> +		return -EIO;
> +	}
> +
> +	pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
> +
> +	while (pos) {
> +		pci_caps_read(oct_hw, &cap, 2, pos);
> +
> +		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
> +			dev_err(dev, "Found invalid capability vndr id: %d\n", cap.cap_vndr);
> +			break;
> +		}
> +
> +		pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
> +
> +		dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n",
> +			 pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
> +
> +		switch (cap.cfg_type) {
> +		case VIRTIO_PCI_CAP_COMMON_CFG:
> +			oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
> +			break;
> +		case VIRTIO_PCI_CAP_NOTIFY_CFG:
> +			pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
> +				      4, pos + sizeof(cap));
> +
> +			oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
> +			oct_hw->notify_bar = cap.bar;
> +			oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) + cap.offset;
> +			break;
> +		case VIRTIO_PCI_CAP_DEVICE_CFG:
> +			oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
> +			break;
> +		case VIRTIO_PCI_CAP_ISR_CFG:
> +			oct_hw->isr = get_cap_addr(oct_hw, &cap);
> +			break;
> +		}
> +
> +		pos = cap.cap_next;
> +	}
> +	if (!oct_hw->common_cfg || !oct_hw->notify_base ||
> +	    !oct_hw->dev_cfg    || !oct_hw->isr) {
> +		dev_err(dev, "Incomplete PCI capabilities");
> +		return -EIO;
> +	}
> +	oct_hw->config_size = octep_get_config_size(oct_hw);
> +
> +	mbox = octep_get_mbox(oct_hw);
> +	octep_mbox_init(mbox);
> +
> +	dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw->common_cfg);
> +	dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw->dev_cfg);
> +	dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
> +	dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
> +		 (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
> +	dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
> +
> +	return 0;
> +}
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> new file mode 100644
> index 000000000000..845fd35368ff
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> @@ -0,0 +1,903 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Marvell. */
> +
> +#include <linux/interrupt.h>
> +#include <linux/module.h>
> +#include <linux/iommu.h>
> +#include "octep_vdpa.h"
> +
> +#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
> +
> +struct octep_pf {
> +	u8 __iomem *base[PCI_STD_NUM_BARS];
> +	struct pci_dev *pdev;
> +	struct resource res;
> +	u64 vf_base;
> +	int enabled_vfs;
> +	u32 vf_stride;
> +	u16 vf_devid;
> +};
> +
> +struct octep_vdpa {
> +	struct vdpa_device vdpa;
> +	struct octep_hw *oct_hw;
> +	struct pci_dev *pdev;
> +};
> +
> +struct octep_vdpa_mgmt_dev {
> +	struct vdpa_mgmt_dev mdev;
> +	struct octep_hw oct_hw;
> +	struct pci_dev *pdev;
> +	/* Work entry to handle device setup */
> +	struct work_struct setup_task;
> +	/* Device status */
> +	atomic_t status;
> +};
> +
> +static int verify_features(u64 features)
> +{
> +	/* Minimum features to expect */
> +	if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
> +		return -EOPNOTSUPP;
> +
> +	if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
> +		return -EOPNOTSUPP;
> +
> +	if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
> +		return -EOPNOTSUPP;
> +
> +	/* Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
> +	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
> +	 */
> +	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
> +	    BIT_ULL(VIRTIO_NET_F_MQ))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_vdpa *oct_vdpa;
> +
> +	oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
> +
> +	return oct_vdpa->oct_hw;
> +}
> +
> +static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
> +{
> +	struct octep_hw *oct_hw = data;
> +	int i;
> +
> +	for (i = 0; i < oct_hw->nr_vring; i++) {
> +		if (oct_hw->vqs[i].cb.callback && *oct_hw->vqs[i].cb_notify_addr) {
> +			*oct_hw->vqs[i].cb_notify_addr = 0;
> +			oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
> +		}
> +	}
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static void octep_free_irqs(struct octep_hw *oct_hw)
> +{
> +	struct pci_dev *pdev = oct_hw->pdev;
> +
> +	if (oct_hw->irq != -1) {
> +		devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
> +		oct_hw->irq = -1;
> +	}
> +	pci_free_irq_vectors(pdev);
> +}
> +
> +static int octep_request_irqs(struct octep_hw *oct_hw)
> +{
> +	struct pci_dev *pdev = oct_hw->pdev;
> +	int ret, irq;
> +
> +	/* Use one ring/interrupt per VF for virtio call interface. */
> +	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
> +	if (ret < 0) {
> +		dev_err(&pdev->dev, "Failed to alloc msix vector");
> +		return ret;
> +	}
> +
> +	snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
> +		 OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
> +
> +	irq = pci_irq_vector(pdev, 0);
> +	ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
> +			       oct_hw->vqs->msix_name, oct_hw);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Failed to register interrupt handler\n");
> +		goto free_irq_vec;
> +	}
> +	oct_hw->irq = irq;
> +
> +	return 0;
> +
> +free_irq_vec:
> +	pci_free_irq_vectors(pdev);
> +	return ret;
> +}
> +
> +static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return oct_hw->features;
> +}
> +
> +static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	int ret;
> +
> +	pr_debug("Driver Features: %llx\n", features);
> +	ret = verify_features(features);
> +	if (ret)
> +		return ret;
> +
> +	octep_hw_set_drv_features(oct_hw, features);
> +	oct_hw->drv_features = features;
> +
> +	return 0;
> +}
> +
> +static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return oct_hw->features & oct_hw->drv_features;
> +}
> +
> +static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_hw_get_status(oct_hw);
> +}
> +
> +static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	u8 status_old;
> +
> +	status_old = octep_hw_get_status(oct_hw);
> +
> +	if (status_old == status)
> +		return;
> +
> +	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
> +	    !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
> +		if (octep_request_irqs(oct_hw))
> +			status = status_old | VIRTIO_CONFIG_S_FAILED;
> +	}
> +	octep_hw_set_status(oct_hw, status);
> +}
> +
> +static int octep_vdpa_reset(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	u8 status = octep_hw_get_status(oct_hw);
> +	u16 qid;
> +
> +	if (status == 0)
> +		return 0;
> +
> +	for (qid = 0; qid < oct_hw->nr_vring; qid++) {
> +		oct_hw->vqs[qid].cb.callback = NULL;
> +		oct_hw->vqs[qid].cb.private = NULL;
> +		oct_hw->config_cb.callback = NULL;
> +		oct_hw->config_cb.private = NULL;
> +	}
> +	octep_hw_reset(oct_hw);
> +
> +	if (status & VIRTIO_CONFIG_S_DRIVER_OK)
> +		octep_free_irqs(oct_hw);
> +
> +	return 0;
> +}
> +
> +static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_get_vq_size(oct_hw);
> +}
> +
> +static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> +				   struct vdpa_vq_state *state)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_get_vq_state(oct_hw, qid, state);
> +}
> +
> +static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> +				   const struct vdpa_vq_state *state)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_set_vq_state(oct_hw, qid, state);
> +}
> +
> +static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, struct vdpa_callback *cb)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	oct_hw->vqs[qid].cb = *cb;
> +}
> +
> +static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	octep_set_vq_ready(oct_hw, qid, ready);
> +}
> +
> +static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_get_vq_ready(oct_hw, qid);
> +}
> +
> +static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	octep_set_vq_num(oct_hw, qid, num);
> +}
> +
> +static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, u64 desc_area,
> +				     u64 driver_area, u64 device_area)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
> +	pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
> +	pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
> +
> +	return octep_set_vq_address(oct_hw, qid, desc_area, driver_area, device_area);
> +}
> +
> +static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	octep_notify_queue(oct_hw, qid);
> +}
> +
> +static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev, u32 data)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	u16 idx = data & 0xFFFF;
> +
> +	vp_iowrite32(data, oct_hw->vqs[idx].notify_addr);
> +}
> +
> +static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return vp_ioread8(&oct_hw->common_cfg->config_generation);
> +}
> +
> +static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
> +{
> +	return VIRTIO_ID_NET;
> +}
> +
> +static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
> +{
> +	return PCI_VENDOR_ID_CAVIUM;
> +}
> +
> +static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
> +{
> +	return PAGE_SIZE;
> +}
> +
> +static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return oct_hw->config_size;
> +}
> +
> +static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf,
> +				  unsigned int len)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	octep_read_dev_config(oct_hw, offset, buf, len);
> +}
> +
> +static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned int offset,
> +				  const void *buf, unsigned int len)
> +{
> +	/* Not supported */
> +}
> +
> +static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct vdpa_callback *cb)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	oct_hw->config_cb.callback = cb->callback;
> +	oct_hw->config_cb.private = cb->private;
> +}
> +
> +static struct vdpa_notification_area octep_get_vq_notification(struct vdpa_device *vdpa_dev,
> +							       u16 idx)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	struct vdpa_notification_area area;
> +
> +	area.addr = oct_hw->vqs[idx].notify_pa;
> +	area.size = PAGE_SIZE;
> +
> +	return area;
> +}
> +
> +static int octep_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
> +			      struct vhost_iotlb *iotlb)
> +{
> +	return 0;
> +}
> +
> +static struct vdpa_config_ops octep_vdpa_ops = {
> +	.get_device_features = octep_vdpa_get_device_features,
> +	.set_driver_features = octep_vdpa_set_driver_features,
> +	.get_driver_features = octep_vdpa_get_driver_features,
> +	.get_status	= octep_vdpa_get_status,
> +	.set_status	= octep_vdpa_set_status,
> +	.reset		= octep_vdpa_reset,
> +	.get_vq_num_max	= octep_vdpa_get_vq_num_max,
> +	.get_vq_state	= octep_vdpa_get_vq_state,
> +	.set_vq_state	= octep_vdpa_set_vq_state,
> +	.set_vq_cb	= octep_vdpa_set_vq_cb,
> +	.set_vq_ready	= octep_vdpa_set_vq_ready,
> +	.get_vq_ready	= octep_vdpa_get_vq_ready,
> +	.set_vq_num	= octep_vdpa_set_vq_num,
> +	.set_vq_address	= octep_vdpa_set_vq_address,
> +	.get_vq_irq	= NULL,
> +	.kick_vq	= octep_vdpa_kick_vq,
> +	.kick_vq_with_data	= octep_vdpa_kick_vq_with_data,
> +	.get_generation	= octep_vdpa_get_generation,
> +	.get_device_id	= octep_vdpa_get_device_id,
> +	.get_vendor_id	= octep_vdpa_get_vendor_id,
> +	.get_vq_align	= octep_vdpa_get_vq_align,
> +	.get_config_size	= octep_vdpa_get_config_size,
> +	.get_config	= octep_vdpa_get_config,
> +	.set_config	= octep_vdpa_set_config,
> +	.set_config_cb  = octep_vdpa_set_config_cb,
> +	.get_vq_notification = octep_get_vq_notification,
> +};
> +
> +static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
> +{
> +	int ret;
> +
> +	ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar);
> +		return ret;
> +	}
> +
> +	tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
> +	if (!tbl[bar]) {
> +		dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
> +		pci_release_region(pdev, bar);
> +		ret = -ENOMEM;
> +	}
> +
> +	return ret;
> +}
> +
> +static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
> +{
> +	pci_iounmap(pdev, tbl[bar]);
> +	pci_release_region(pdev, bar);
> +}
> +
> +static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf)
> +{
> +	struct pci_dev *pf_dev = octpf->pdev;
> +	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +	struct pci_bus_region bus_region;
> +
> +	octpf->res.start = res->start;
> +	octpf->res.end = res->end;
> +	octpf->vf_base = res->start;
> +
> +	bus_region.start = res->start;
> +	bus_region.end = res->start - 1;
> +
> +	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> +}
> +
> +static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf)
> +{
> +	struct pci_dev *pf_dev = octpf->pdev;
> +	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +	struct pci_bus_region bus_region;
> +
> +	bus_region.start = octpf->res.start;
> +	bus_region.end = octpf->res.end;
> +
> +	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> +}
> +
> +static void octep_vdpa_remove_pf(struct pci_dev *pdev)
> +{
> +	struct octep_pf *octpf = pci_get_drvdata(pdev);
> +
> +	pci_disable_sriov(pdev);
> +
> +	if (octpf->base[OCTEP_HW_CAPS_BAR])
> +		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_CAPS_BAR);
> +
> +	if (octpf->base[OCTEP_HW_MBOX_BAR])
> +		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +
> +	octep_vdpa_pf_bar_expand(octpf);
> +}
> +
> +static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev)
> +{
> +	struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
> +
> +	memset(vf_res, 0, sizeof(*vf_res));
> +}
> +
> +static void octep_vdpa_remove_vf(struct pci_dev *pdev)
> +{
> +	struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
> +	struct octep_hw *oct_hw;
> +	int status;
> +
> +	oct_hw = &mgmt_dev->oct_hw;
> +	status = atomic_read(&mgmt_dev->status);
> +	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT);
> +
> +	if (status == OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT) {
> +		cancel_work_sync(&mgmt_dev->setup_task);
> +	} else if (status == OCTEP_VDPA_DEV_STATUS_READY) {
> +		vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
> +		kfree(mgmt_dev->oct_hw.vqs);
> +	}
> +
> +	if (oct_hw->base[OCTEP_HW_CAPS_BAR])
> +		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +
> +	if (oct_hw->base[OCTEP_HW_MBOX_BAR])
> +		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_MBOX_BAR);
> +
> +	octep_vdpa_vf_bar_shrink(pdev);
> +}
> +
> +static void octep_vdpa_remove(struct pci_dev *pdev)
> +{
> +	if (pdev->is_virtfn)
> +		octep_vdpa_remove_vf(pdev);
> +	else
> +		octep_vdpa_remove_pf(pdev);
> +}
> +
> +static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
> +			      const struct vdpa_dev_set_config *config)
> +{
> +	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct octep_vdpa_mgmt_dev, mdev);
> +	struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
> +	struct pci_dev *pdev = oct_hw->pdev;
> +	struct vdpa_device *vdpa_dev;
> +	struct octep_vdpa *oct_vdpa;
> +	u64 device_features;
> +	u16 notify_off;
> +	int i, ret;
> +
> +	oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, 1, 1,
> +				     NULL, false);
> +	if (IS_ERR(oct_vdpa)) {
> +		dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device");
> +		return PTR_ERR(oct_vdpa);
> +	}
> +
> +	oct_vdpa->pdev = pdev;
> +	oct_vdpa->vdpa.dma_dev = &pdev->dev;
> +	oct_vdpa->vdpa.mdev = mdev;
> +	oct_vdpa->oct_hw = oct_hw;
> +	vdpa_dev = &oct_vdpa->vdpa;
> +
> +	device_features = oct_hw->features;
> +	if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> +		if (config->device_features & ~device_features) {
> +			dev_err(&pdev->dev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
> +				config->device_features, device_features);
> +			return -EINVAL;
> +		}
> +		device_features &= config->device_features;
> +	}
> +
> +	oct_hw->features = device_features;
> +
> +	if (verify_features(device_features)) {
> +		dev_warn(mdev->device,
> +			 "Must provision minimum features 0x%llx for this device",
> +			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
> +			 BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) | BIT_ULL(VIRTIO_F_IN_ORDER));
> +		return -EOPNOTSUPP;
> +	}
> +
> +	oct_hw->vqs = kcalloc(oct_hw->nr_vring, sizeof(*oct_hw->vqs), GFP_KERNEL);
> +	if (!oct_hw->vqs)
> +		return -ENOMEM;
> +
> +	oct_hw->irq = -1;
> +
> +	dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
> +	dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
> +
> +	for (i = 0; i < oct_hw->nr_vring; i++) {
> +		octep_write_queue_select(i, oct_hw);
> +		notify_off = vp_ioread16(&oct_hw->common_cfg->queue_notify_off);
> +		oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
> +			notify_off * oct_hw->notify_off_multiplier;
> +		oct_hw->vqs[i].cb_notify_addr = (u32 *)oct_hw->vqs[i].notify_addr + 1;
> +		oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
> +			notify_off * oct_hw->notify_off_multiplier;
> +	}
> +
> +	if (name)
> +		ret = dev_set_name(&vdpa_dev->dev, "%s", name);
> +	else
> +		ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index);
> +
> +	ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Failed to register to vDPA bus");
> +		goto free_vqs;
> +	}
> +	return 0;
> +
> +free_vqs:
> +	put_device(&oct_vdpa->vdpa.dev);
> +	kfree(oct_hw->vqs);
> +	return ret;
> +}
> +
> +static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
> +{
> +	_vdpa_unregister_device(dev);
> +}
> +
> +static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
> +	.dev_add = octep_vdpa_dev_add,
> +	.dev_del = octep_vdpa_dev_del
> +};
> +
> +static bool get_device_ready_status(u8 __iomem *addr)
> +{
> +	u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
> +
> +	if (signature == OCTEP_DEV_READY_SIGNATURE) {
> +		writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static struct virtio_device_id id_table[] = {
> +	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
> +	{ 0 },
> +};
> +
> +static void octep_vdpa_setup_task(struct work_struct *work)
> +{
> +	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct octep_vdpa_mgmt_dev,
> +							    setup_task);
> +	struct pci_dev *pdev = mgmt_dev->pdev;
> +	struct device *dev = &pdev->dev;
> +	struct octep_hw *oct_hw;
> +	unsigned long timeout;
> +	int ret;
> +
> +	oct_hw = &mgmt_dev->oct_hw;
> +
> +	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
> +
> +	/* Wait for a maximum of 5 sec */
> +	timeout = jiffies + msecs_to_jiffies(5000);
> +	while (!time_after(jiffies, timeout)) {
> +		if (get_device_ready_status(oct_hw->base[OCTEP_HW_MBOX_BAR])) {
> +			atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_INIT);
> +			break;
> +		}
> +
> +		if (atomic_read(&mgmt_dev->status) >= OCTEP_VDPA_DEV_STATUS_READY) {
> +			dev_info(dev, "Stopping vDPA setup task.\n");
> +			return;
> +		}
> +
> +		usleep_range(1000, 1500);
> +	}
> +
> +	if (atomic_read(&mgmt_dev->status) != OCTEP_VDPA_DEV_STATUS_INIT) {
> +		dev_err(dev, "BAR initialization is timed out\n");
> +		return;
> +	}
> +
> +	ret = octep_iomap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +	if (ret)
> +		return;
> +
> +	ret = octep_hw_caps_read(oct_hw, pdev);
> +	if (ret < 0)
> +		goto unmap_region;
> +
> +	oct_hw->features = octep_hw_get_dev_features(oct_hw);
> +	ret = verify_features(oct_hw->features);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
> +		goto unmap_region;
> +	}
> +	oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg->num_queues);
> +
> +	mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
> +	mgmt_dev->mdev.id_table = id_table;
> +	mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
> +	mgmt_dev->mdev.supported_features = oct_hw->features;
> +	mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES);
> +	mgmt_dev->mdev.device = dev;
> +
> +	ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
> +	if (ret) {
> +		dev_err(dev, "Failed to register vdpa management interface\n");
> +		goto unmap_region;
> +	}
> +
> +	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY);
> +
> +	return;
> +
> +unmap_region:
> +	octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +	oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL;
> +}
> +
> +static int octep_vdpa_probe_vf(struct pci_dev *pdev)
> +{
> +	struct octep_vdpa_mgmt_dev *mgmt_dev;
> +	struct device *dev = &pdev->dev;
> +	struct iommu_domain *domain;
> +	int ret;
> +
> +	ret = pcim_enable_device(pdev);
> +	if (ret) {
> +		dev_err(dev, "Failed to enable device\n");
> +		return ret;
> +	}
> +
> +	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> +	if (ret) {
> +		dev_err(dev, "No usable DMA configuration\n");
> +		return ret;
> +	}
> +	pci_set_master(pdev);
> +
> +	domain = iommu_get_domain_for_dev(dev);
> +	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) {
> +		dev_info(dev, "NO-IOMMU\n");
> +		octep_vdpa_ops.set_map = octep_vdpa_set_map;
> +	}
> +
> +	mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev), GFP_KERNEL);
> +	if (!mgmt_dev)
> +		return -ENOMEM;
> +
> +	ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base, OCTEP_HW_MBOX_BAR);
> +	if (ret)
> +		return ret;
> +
> +	mgmt_dev->pdev = pdev;
> +	pci_set_drvdata(pdev, mgmt_dev);
> +
> +	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC);
> +	INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task);
> +	schedule_work(&mgmt_dev->setup_task);
> +	dev_info(&pdev->dev, "octep vdpa mgmt device setup task is queued\n");
> +
> +	return 0;
> +}
> +
> +static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct pci_dev *pf_dev, u8 idx)
> +{
> +	struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4;
> +	struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +	struct octep_pf *pf = pci_get_drvdata(pf_dev);
> +	struct pci_bus_region bus_region;
> +
> +	vf_res->name = pci_name(vf_dev);
> +	vf_res->flags = pf_res->flags;
> +	vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent;
> +
> +	bus_region.start = pf->vf_base + idx * pf->vf_stride;
> +	bus_region.end = bus_region.start + pf->vf_stride - 1;
> +	pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region);
> +}
> +
> +static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int num_vfs)
> +{
> +	struct octep_pf *pf = pci_get_drvdata(pdev);
> +	u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR];
> +	int ret, i;
> +
> +	if (num_vfs > 0) {
> +		struct pci_dev *vf_pdev = NULL;
> +		bool done = false;
> +		int index = 0;
> +
> +		ret = pci_enable_sriov(pdev, num_vfs);
> +		if (ret)
> +			return ret;
> +
> +		pf->enabled_vfs = num_vfs;
> +
> +		while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, vf_pdev))) {
> +			if (vf_pdev->device != pf->vf_devid)
> +				continue;
> +
> +			octep_vdpa_assign_barspace(vf_pdev, pdev, index);
> +			if (++index == num_vfs) {
> +				done = true;
> +				break;
> +			}
> +		}
> +
> +		if (done) {
> +			for (i = 0; i < pf->enabled_vfs; i++)
> +				writeq(OCTEP_DEV_READY_SIGNATURE, addr + OCTEP_PF_MBOX_DATA(i));
> +		}
> +	} else {
> +		if (!pci_num_vf(pdev))
> +			return 0;
> +
> +		pci_disable_sriov(pdev);
> +		pf->enabled_vfs = 0;
> +	}
> +
> +	return num_vfs;
> +}
> +
> +static u16 octep_get_vf_devid(struct pci_dev *pdev)
> +{
> +	u16 did;
> +
> +	switch (pdev->device) {
> +	case OCTEP_VDPA_DEVID_CN106K_PF:
> +		did = OCTEP_VDPA_DEVID_CN106K_VF;
> +		break;
> +	case OCTEP_VDPA_DEVID_CN105K_PF:
> +		did = OCTEP_VDPA_DEVID_CN105K_VF;
> +		break;
> +	case OCTEP_VDPA_DEVID_CN103K_PF:
> +		did = OCTEP_VDPA_DEVID_CN103K_VF;
> +		break;
> +	default:
> +		did = 0xFFFF;
> +		break;
> +	}
> +
> +	return did;
> +}
> +
> +static int octep_vdpa_pf_setup(struct octep_pf *octpf)
> +{
> +	u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR];
> +	struct pci_dev *pdev = octpf->pdev;
> +	int totalvfs;
> +	u64 val, len;
> +
> +	totalvfs = pci_sriov_get_totalvfs(pdev);
> +	if (unlikely(!totalvfs)) {
> +		dev_info(&pdev->dev, "Total VFs are %d in PF sriov configuration\n", totalvfs);
> +		return 0;
> +	}
> +
> +	addr = octpf->base[OCTEP_HW_MBOX_BAR];
> +	val = readq(addr + OCTEP_EPF_RINFO(0));
> +	if (val == 0) {
> +		dev_err(&pdev->dev, "Invalid device configuration\n");
> +		return -EINVAL;
> +	}
> +
> +	if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) {
> +		val &= ~GENMASK_ULL(35, 32);
> +		val |= BIT_ULL(32);
> +		writeq(val, addr + OCTEP_EPF_RINFO(0));
> +	}
> +
> +	len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR);
> +
> +	octpf->vf_stride = len / totalvfs;
> +	octpf->vf_devid = octep_get_vf_devid(pdev);
> +
> +	octep_vdpa_pf_bar_shrink(octpf);
> +
> +	return 0;
> +}
> +
> +static int octep_vdpa_probe_pf(struct pci_dev *pdev)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct octep_pf *octpf;
> +	int ret;
> +
> +	ret = pcim_enable_device(pdev);
> +	if (ret) {
> +		dev_err(dev, "Failed to enable device\n");
> +		return ret;
> +	}
> +
> +	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> +	if (ret) {
> +		dev_err(dev, "No usable DMA configuration\n");
> +		return ret;
> +	}
> +	octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL);
> +	if (!octpf)
> +		return -ENOMEM;
> +
> +	ret = octep_iomap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +	if (ret)
> +		return ret;
> +
> +	pci_set_master(pdev);
> +	pci_set_drvdata(pdev, octpf);
> +	octpf->pdev = pdev;
> +
> +	ret = octep_vdpa_pf_setup(octpf);
> +	if (ret)
> +		goto unmap_region;
> +
> +	return 0;
> +
> +unmap_region:
> +	octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +	return ret;
> +}
> +
> +static int octep_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> +{
> +	if (pdev->is_virtfn)
> +		return octep_vdpa_probe_vf(pdev);
> +	else
> +		return octep_vdpa_probe_pf(pdev);
> +}
> +
> +static struct pci_device_id octep_pci_vdpa_map[] = {
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_PF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_VF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_PF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_VF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_PF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_VF) },
> +	{ 0 },
> +};
> +
> +static struct pci_driver octep_pci_vdpa = {
> +	.name     = OCTEP_VDPA_DRIVER_NAME,
> +	.id_table = octep_pci_vdpa_map,
> +	.probe    = octep_vdpa_probe,
> +	.remove   = octep_vdpa_remove,
> +	.sriov_configure = octep_vdpa_sriov_configure
> +};
> +
> +module_pci_driver(octep_pci_vdpa);
> +
> +MODULE_AUTHOR("Marvell");
> +MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver");
> +MODULE_LICENSE("GPL");
> -- 
> 2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-16  3:17                         ` Jason Wang
  2024-04-22 20:42                           ` Michael S. Tsirkin
@ 2024-04-23  5:40                           ` Srujana Challa
  1 sibling, 0 replies; 28+ messages in thread
From: Srujana Challa @ 2024-04-23  5:40 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin
  Cc: virtualization, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

> > > > > > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA
> > > > > > > > > driver for Marvell OCTEON DPU devices
> > > > > > > > >
> > > > > > > > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa
> wrote:
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > > > > > > > +       if (!domain || domain->type ==
> > > > > > > > > > > > > > + IOMMU_DOMAIN_IDENTITY)
> > > > > > > {
> > > > > > > > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > > > > > > > + octep_vdpa_set_map;
> > > > > > > > > > > > >
> > > > > > > > > > > > > Is this a shortcut to have get better performance?
> > > > > > > > > > > > > DMA API should have those greacefully I think.
> > > > > > > > > > > > When IOMMU is disabled on host and set_map/dma_map
> > > > > > > > > > > > is not set, vhost-vdpa is reporting an error
> > > > > > > > > > > > "Failed to allocate domain, device is not
> > > > > > > > > > > IOMMU cache coherent capable\n".
> > > > > > > > > > > > Hence we are doing this way to get better performance.
> > > > > > > > > > >
> > > > > > > > > > > The problem is, assuming the device does not have
> > > > > > > > > > > any internal
> > > > > > > IOMMU.
> > > > > > > > > > >
> > > > > > > > > > > 1) If we allow it running without IOMMU, it opens a
> > > > > > > > > > > window for guest to attack the host.
> > > > > > > > > > > 2) If you see perforamnce issue with
> > > > > > > > > > > IOMMU_DOMAIN_IDENTITY, let's report it to DMA/IOMMU
> > > > > > > > > > > maintiner to fix that
> > > > > > > > > > It will be helpful for host networking case when iommu is
> disabled.
> > > > > > > > > > Can we take the vfio pci driver approach as a
> > > > > > > > > > reference where user explicitly set
> "enable_unsafe_noiommu_mode"
> > > > > > > > > > using module
> > > > > param?
> > > > > > > > >
> > > > > > > > > vfio is a userspace driver so it's userspace's responsibility.
> > > > > > > > > what exactly ensures correctness here? does the device
> > > > > > > > > have an on-chip iommu?
> > > > > > > > >
> > > > > > > > Our device features an on-chip IOMMU, although it is not
> > > > > > > > utilized for host-side targeted DMA operations. We
> > > > > > > > included no-iommu mode in our driver to ensure that host
> > > > > > > > applications, such as DPDK Virtio user PMD, continue to
> > > > > > > > function even when operating in a no-
> > > > > IOMMU mode.
> > > > > > >
> > > > > > > I may miss something but set_map() is empty in this driver.
> > > > > > > How could such isolation be done?
> > > > > >
> > > > > > In no-iommu case, there would be no domain right, and the user
> > > > > > of vhost-vdpa(DPDK virtio user pmd), would create the mapping
> > > > > > and pass the PA (= IOVA) to the device directly. So that,
> > > > > > device can directly DMA to the
> > > > > PA.
> > > > >
> > > > > Yes, but this doesn't differ too much from the case where DMA
> > > > > API is used with IOMMU disabled.
> > > > >
> > > > > Are you saying DMA API introduces overheads in this case?
> > > > No actually, current vhost-vdpa code is not allowing IOMMU
> > > > disabled mode, If set_map/dma_map op is not set. Hence, we are
> > > > setting set_map with dummy api to allow IOMMU disabled mode.
> > > >
> > > > Following is the code snippet from drivers/vhost/vdpa.c
> > > >
> > > >       /* Device want to do DMA by itself */
> > > >         if (ops->set_map || ops->dma_map)
> > > >                 return 0;
> > > >
> > > >         bus = dma_dev->bus;
> > > >         if (!bus)
> > > >                 return -EFAULT;
> > > >
> > > >        if (!device_iommu_capable(dma_dev,
> > > IOMMU_CAP_CACHE_COHERENCY))
> > > >                 return -ENOTSUPP;
> > >
> > > Right, so here's the question.
> > >
> > > When IOMMU is disabled, if there's no isolation from the device
> > > on-chip IOMMU. It might have security implications. For example if
> > > we're using PA, userspace could attack the kernel.
> > >
> > > So there should be some logic in the set_map() to program the
> > > on-chip IOMMU to isolate DMA in that case but I don't see such
> > > implementation done in set_map().
> >
> > Our chip lacks support for on-chip IOMMU for host-side targeted DMA
> operations.
> > When using the DPDK virtio user PMD, we’ve noticed a significant 80%
> > performance improvement when IOMMU is disabled on specific x86
> > machines. This performance improvement can be leveraged by embedded
> > platforms where applications run in controlled environment.
> > May be it's a trade-off between security and performance.
> >
> > We can disable the no-iommu support by default and enable it through
> > some module parameter and taint the kernel similar to VFIO
> driver(enable_unsafe_noiommu_mode) right?
> 
> Could be one way.
> 
> Michael, any thoughts on this?

@Michael S. Tsirkin Could you confirm on above. I will send the v2 if it is fine for you.

Thanks
> 
> Thanks
> 
> > >
> > > >
> > > > Performance degradation when iommu enabled is not with DMA API but
> > > > the
> > > > x86 HW IOMMU translation performance on certain low end x86
> machines.
> > >
> > > This might be true but it's not specific to vDPA I think?
> > >
> > > Thanks
> > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > We observed performance impacts on certain low-end x86
> > > > > > > > machines when IOMMU mode was enabled.
> > > > > > > > I think, correctness is Host userspace application's
> > > > > > > > responsibility, in this case when vhost-vdpa is used with
> > > > > > > > Host application such as DPDK
> > > > > > > Virtio user PMD.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > Thanks.
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-22 20:42                           ` Michael S. Tsirkin
@ 2024-04-24 13:05                             ` Srujana Challa
  0 siblings, 0 replies; 28+ messages in thread
From: Srujana Challa @ 2024-04-24 13:05 UTC (permalink / raw)
  To: Michael S. Tsirkin, Jason Wang
  Cc: virtualization, xuanzhuo, Vamsi Krishna Attunuru,
	Shijith Thotton, Nithin Kumar Dabilpuram, Jerin Jacob, eperezma

> > > > > > > > > > Subject: Re: [EXTERNAL] Re: [PATCH] virtio: vdpa: vDPA
> > > > > > > > > > driver for Marvell OCTEON DPU devices
> > > > > > > > > >
> > > > > > > > > > On Wed, Apr 10, 2024 at 10:15:37AM +0000, Srujana Challa
> wrote:
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       domain = iommu_get_domain_for_dev(dev);
> > > > > > > > > > > > > > > +       if (!domain || domain->type ==
> > > > > > > > > > > > > > > + IOMMU_DOMAIN_IDENTITY)
> > > > > > > > {
> > > > > > > > > > > > > > > +               dev_info(dev, "NO-IOMMU\n");
> > > > > > > > > > > > > > > +               octep_vdpa_ops.set_map =
> > > > > > > > > > > > > > > + octep_vdpa_set_map;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Is this a shortcut to have get better performance?
> > > > > > > > > > > > > > DMA API should have those greacefully I think.
> > > > > > > > > > > > > When IOMMU is disabled on host and
> > > > > > > > > > > > > set_map/dma_map is not set, vhost-vdpa is
> > > > > > > > > > > > > reporting an error "Failed to allocate domain,
> > > > > > > > > > > > > device is not
> > > > > > > > > > > > IOMMU cache coherent capable\n".
> > > > > > > > > > > > > Hence we are doing this way to get better performance.
> > > > > > > > > > > >
> > > > > > > > > > > > The problem is, assuming the device does not have
> > > > > > > > > > > > any internal
> > > > > > > > IOMMU.
> > > > > > > > > > > >
> > > > > > > > > > > > 1) If we allow it running without IOMMU, it opens
> > > > > > > > > > > > a window for guest to attack the host.
> > > > > > > > > > > > 2) If you see perforamnce issue with
> > > > > > > > > > > > IOMMU_DOMAIN_IDENTITY, let's report it to
> > > > > > > > > > > > DMA/IOMMU maintiner to fix that
> > > > > > > > > > > It will be helpful for host networking case when iommu is
> disabled.
> > > > > > > > > > > Can we take the vfio pci driver approach as a
> > > > > > > > > > > reference where user explicitly set
> "enable_unsafe_noiommu_mode"
> > > > > > > > > > > using module
> > > > > > param?
> > > > > > > > > >
> > > > > > > > > > vfio is a userspace driver so it's userspace's responsibility.
> > > > > > > > > > what exactly ensures correctness here? does the device
> > > > > > > > > > have an on-chip iommu?
> > > > > > > > > >
> > > > > > > > > Our device features an on-chip IOMMU, although it is not
> > > > > > > > > utilized for host-side targeted DMA operations. We
> > > > > > > > > included no-iommu mode in our driver to ensure that host
> > > > > > > > > applications, such as DPDK Virtio user PMD, continue to
> > > > > > > > > function even when operating in a no-
> > > > > > IOMMU mode.
> > > > > > > >
> > > > > > > > I may miss something but set_map() is empty in this
> > > > > > > > driver. How could such isolation be done?
> > > > > > >
> > > > > > > In no-iommu case, there would be no domain right, and the
> > > > > > > user of vhost-vdpa(DPDK virtio user pmd), would create the
> > > > > > > mapping and pass the PA (= IOVA) to the device directly. So
> > > > > > > that, device can directly DMA to the
> > > > > > PA.
> > > > > >
> > > > > > Yes, but this doesn't differ too much from the case where DMA
> > > > > > API is used with IOMMU disabled.
> > > > > >
> > > > > > Are you saying DMA API introduces overheads in this case?
> > > > > No actually, current vhost-vdpa code is not allowing IOMMU
> > > > > disabled mode, If set_map/dma_map op is not set. Hence, we are
> > > > > setting set_map with dummy api to allow IOMMU disabled mode.
> > > > >
> > > > > Following is the code snippet from drivers/vhost/vdpa.c
> > > > >
> > > > >       /* Device want to do DMA by itself */
> > > > >         if (ops->set_map || ops->dma_map)
> > > > >                 return 0;
> > > > >
> > > > >         bus = dma_dev->bus;
> > > > >         if (!bus)
> > > > >                 return -EFAULT;
> > > > >
> > > > >        if (!device_iommu_capable(dma_dev,
> > > > IOMMU_CAP_CACHE_COHERENCY))
> > > > >                 return -ENOTSUPP;
> > > >
> > > > Right, so here's the question.
> > > >
> > > > When IOMMU is disabled, if there's no isolation from the device
> > > > on-chip IOMMU. It might have security implications. For example if
> > > > we're using PA, userspace could attack the kernel.
> > > >
> > > > So there should be some logic in the set_map() to program the
> > > > on-chip IOMMU to isolate DMA in that case but I don't see such
> > > > implementation done in set_map().
> > >
> > > Our chip lacks support for on-chip IOMMU for host-side targeted DMA
> operations.
> > > When using the DPDK virtio user PMD, we’ve noticed a significant 80%
> > > performance improvement when IOMMU is disabled on specific x86
> > > machines. This performance improvement can be leveraged by embedded
> > > platforms where applications run in controlled environment.
> > > May be it's a trade-off between security and performance.
> > >
> > > We can disable the no-iommu support by default and enable it through
> > > some module parameter and taint the kernel similar to VFIO
> driver(enable_unsafe_noiommu_mode) right?
> >
> > Could be one way.
> >
> > Michael, any thoughts on this?
> >
> > Thanks
> 
> My thought is there's nothing special about the Marvell chip here.
> Merge it normally. Then if you like work on a no-iommu mode in vdpa.
For now we will remove no-iommu code from this patch and we will
work with Jason for adding no-iommu mode in vdpa.
@Jason Wang Can you confirm if it sounds fine?
> 
> 
> > > >
> > > > >
> > > > > Performance degradation when iommu enabled is not with DMA API
> > > > > but the
> > > > > x86 HW IOMMU translation performance on certain low end x86
> machines.
> > > >
> > > > This might be true but it's not specific to vDPA I think?
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > We observed performance impacts on certain low-end x86
> > > > > > > > > machines when IOMMU mode was enabled.
> > > > > > > > > I think, correctness is Host userspace application's
> > > > > > > > > responsibility, in this case when vhost-vdpa is used
> > > > > > > > > with Host application such as DPDK
> > > > > > > > Virtio user PMD.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks
> > > > > > > > > > >
> > > > > > > > > > > Thanks.
> > > > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > >
> > >


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v2] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-03-27 11:21 [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices Srujana Challa
                   ` (2 preceding siblings ...)
  2024-04-22 20:44 ` Michael S. Tsirkin
@ 2024-04-24 19:08 ` Srujana Challa
  2024-04-25  1:57   ` Jason Wang
  2024-04-25  9:36   ` Michael S. Tsirkin
  3 siblings, 2 replies; 28+ messages in thread
From: Srujana Challa @ 2024-04-24 19:08 UTC (permalink / raw)
  To: virtualization, mst, jasowang
  Cc: xuanzhuo, schalla, vattunuru, sthotton, ndabilpuram, jerinj

This commit introduces a new vDPA driver specifically designed for
managing the virtio control plane over the vDPA bus for OCTEON DPU
devices. The driver consists of two layers:

1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
operations and configurations related to the DPU device.

2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
implements device operations for the vDPA bus. It handles device
probing, bus attachment, vring operations, and other relevant tasks.

Signed-off-by: Srujana Challa <schalla@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Shijith Thotton <sthotton@marvell.com>
Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
---
v2:
* Addressed review comments

 MAINTAINERS                              |   5 +
 drivers/vdpa/Kconfig                     |   9 +
 drivers/vdpa/Makefile                    |   1 +
 drivers/vdpa/octeon_ep/Makefile          |   4 +
 drivers/vdpa/octeon_ep/octep_vdpa.h      |  94 +++
 drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 517 ++++++++++++++
 drivers/vdpa/octeon_ep/octep_vdpa_main.c | 850 +++++++++++++++++++++++
 7 files changed, 1480 insertions(+)
 create mode 100644 drivers/vdpa/octeon_ep/Makefile
 create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
 create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
 create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c

diff --git a/MAINTAINERS b/MAINTAINERS
index cfe44a876d8a..539ce209a960 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13127,6 +13127,11 @@ S:	Supported
 F:	Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
 F:	drivers/mmc/host/sdhci-xenon*
 
+MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
+R:	schalla@marvell.com
+R:	vattunuru@marvell.com
+F:	drivers/vdpa/octeon_ep/
+
 MATROX FRAMEBUFFER DRIVER
 L:	linux-fbdev@vger.kernel.org
 S:	Orphan
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 656c1cb541de..775149fad476 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -126,4 +126,13 @@ config PDS_VDPA
 	  With this driver, the VirtIO dataplane can be
 	  offloaded to an AMD/Pensando DSC device.
 
+config OCTEONEP_VDPA
+	tristate "vDPA driver for Octeon DPU devices"
+	depends on m
+	depends on PCI_MSI
+	help
+	  vDPA driver for Marvell's Octeon DPU devices.
+	  With this driver, the VirtIO dataplane can be
+	  offloaded to a Octeon DPU device.
+
 endif # VDPA
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
index 8f53c6f3cca7..5654d36707af 100644
--- a/drivers/vdpa/Makefile
+++ b/drivers/vdpa/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
 obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
 obj-$(CONFIG_SNET_VDPA) += solidrun/
 obj-$(CONFIG_PDS_VDPA) += pds/
+obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
diff --git a/drivers/vdpa/octeon_ep/Makefile b/drivers/vdpa/octeon_ep/Makefile
new file mode 100644
index 000000000000..e23e2ff14f33
--- /dev/null
+++ b/drivers/vdpa/octeon_ep/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
+octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
+octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h
new file mode 100644
index 000000000000..046710ec4d42
--- /dev/null
+++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ * Copyright (C) 2024 Marvell.
+ */
+#ifndef __OCTEP_VDPA_H__
+#define __OCTEP_VDPA_H__
+
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_pci_modern.h>
+#include <uapi/linux/virtio_net.h>
+#include <uapi/linux/virtio_blk.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_pci.h>
+#include <uapi/linux/vdpa.h>
+
+#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
+#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
+#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
+#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
+#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
+#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
+
+#define OCTEP_HW_MBOX_BAR 0
+#define OCTEP_HW_CAPS_BAR 4
+
+#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
+
+#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
+#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
+#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
+
+#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
+#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
+
+#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
+#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
+
+enum octep_vdpa_dev_status {
+	OCTEP_VDPA_DEV_STATUS_INVALID,
+	OCTEP_VDPA_DEV_STATUS_ALLOC,
+	OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
+	OCTEP_VDPA_DEV_STATUS_INIT,
+	OCTEP_VDPA_DEV_STATUS_READY,
+	OCTEP_VDPA_DEV_STATUS_UNINIT
+};
+
+struct octep_vring_info {
+	struct vdpa_callback cb;
+	void __iomem *notify_addr;
+	u32 __iomem *cb_notify_addr;
+	phys_addr_t notify_pa;
+	char msix_name[256];
+};
+
+struct octep_hw {
+	struct pci_dev *pdev;
+	u8 __iomem *base[PCI_STD_NUM_BARS];
+	struct virtio_pci_common_cfg __iomem *common_cfg;
+	u8 __iomem *dev_cfg;
+	u8 __iomem *isr;
+	void __iomem *notify_base;
+	phys_addr_t notify_base_pa;
+	u32 notify_off_multiplier;
+	u8 notify_bar;
+	struct octep_vring_info *vqs;
+	struct vdpa_callback config_cb;
+	u64 features;
+	u16 nr_vring;
+	u32 config_size;
+	int irq;
+};
+
+u8 octep_hw_get_status(struct octep_hw *oct_hw);
+void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
+void octep_hw_reset(struct octep_hw *oct_hw);
+void octep_write_queue_select(struct octep_hw *oct_hw, u16 queue_id);
+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length);
+int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
+			 u64 device_area);
+void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
+void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
+bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
+int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state);
+int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state);
+u16 octep_get_vq_size(struct octep_hw *oct_hw);
+int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
+u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
+void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
+u64 octep_hw_get_drv_features(struct octep_hw *oct_hw);
+int octep_verify_features(u64 features);
+
+#endif /* __OCTEP_VDPA_H__ */
diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
new file mode 100644
index 000000000000..6afd06ead46a
--- /dev/null
+++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Marvell. */
+
+#include <linux/iopoll.h>
+
+#include "octep_vdpa.h"
+
+enum octep_mbox_ids {
+	OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
+	OCTEP_MBOX_MSG_GET_VQ_STATE,
+};
+
+#define OCTEP_HW_TIMEOUT       10000000
+
+#define MBOX_OFFSET            64
+#define MBOX_RSP_MASK          0x00000001
+#define MBOX_RC_MASK           0x0000FFFE
+
+#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
+#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
+#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK | MBOX_RSP_MASK))
+
+#define DEV_RST_ACK_BIT        7
+#define FEATURE_SEL_ACK_BIT    15
+#define QUEUE_SEL_ACK_BIT      15
+
+struct octep_mbox_hdr {
+	u8 ver;
+	u8 rsvd1;
+	u16 id;
+	u16 rsvd2;
+#define MBOX_REQ_SIG (0xdead)
+#define MBOX_RSP_SIG (0xbeef)
+	u16 sig;
+};
+
+struct octep_mbox_sts {
+	u16 rsp:1;
+	u16 rc:15;
+	u16 rsvd;
+};
+
+struct octep_mbox {
+	struct octep_mbox_hdr hdr;
+	struct octep_mbox_sts sts;
+	u64 rsvd;
+	u32 data[];
+};
+
+static inline struct octep_mbox __iomem *octep_get_mbox(struct octep_hw *oct_hw)
+{
+	return (struct octep_mbox __iomem *)(oct_hw->dev_cfg + MBOX_OFFSET);
+}
+
+static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem *mbox)
+{
+	u32 val;
+
+	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_AVAIL(val), 10,
+				  OCTEP_HW_TIMEOUT);
+}
+
+static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem *mbox)
+{
+	u32 val;
+
+	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val), 10,
+				  OCTEP_HW_TIMEOUT);
+}
+
+static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16 id, u16 sig)
+{
+	iowrite16(id, &mbox->hdr.id);
+	iowrite16(sig, &mbox->hdr.sig);
+}
+
+static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
+{
+	return ioread16(&mbox->hdr.sig);
+}
+
+static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32 sts)
+{
+	iowrite32(sts, &mbox->sts);
+}
+
+static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
+{
+	return ioread32(&mbox->sts);
+}
+
+static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox, u16 word_idx)
+{
+	return ioread32(&mbox->data[word_idx]);
+}
+
+static inline void octep_write32_word(struct octep_mbox __iomem *mbox, u16 word_idx, u32 word)
+{
+	return iowrite32(word, &mbox->data[word_idx]);
+}
+
+static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *buffer,
+			      u32 buf_size, bool write)
+{
+	struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
+	struct pci_dev *pdev = oct_hw->pdev;
+	u32 *p = (u32 *)buffer;
+	u16 data_wds;
+	int ret, i;
+	u32 val;
+
+	if (!IS_ALIGNED(buf_size, 4))
+		return -EINVAL;
+
+	/* Make sure mbox space is available */
+	ret = octep_wait_for_mbox_avail(mbox);
+	if (ret) {
+		dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to be consumed\n");
+		return ret;
+	}
+	data_wds = buf_size / 4;
+
+	if (write) {
+		for (i = 1; i <= data_wds; i++) {
+			octep_write32_word(mbox, i, *p);
+			p++;
+		}
+	}
+	octep_write32_word(mbox, 0, (u32)qid);
+	octep_write_sts(mbox, 0);
+
+	octep_write_hdr(mbox, id, MBOX_REQ_SIG);
+
+	ret = octep_wait_for_mbox_rsp(mbox);
+	if (ret) {
+		dev_warn(&pdev->dev, "Timeout waiting for mbox : %d response\n", id);
+		return ret;
+	}
+
+	val = octep_read_sig(mbox);
+	if ((val & 0xFFFF) != MBOX_RSP_SIG) {
+		dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id);
+		return ret;
+	}
+
+	val = octep_read_sts(mbox);
+	if (val & MBOX_RC_MASK) {
+		ret = MBOX_RSP_TO_ERR(val);
+		dev_warn(&pdev->dev, "Error while processing mbox : %d, err %d\n", id, ret);
+		return ret;
+	}
+
+	if (!write)
+		for (i = 1; i <= data_wds; i++)
+			*p++ = octep_read32_word(mbox, i);
+
+	return 0;
+}
+
+static void octep_mbox_init(struct octep_mbox __iomem *mbox)
+{
+	iowrite32(1, &mbox->sts);
+}
+
+int octep_verify_features(u64 features)
+{
+	/* Minimum features to expect */
+	if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
+		return -EOPNOTSUPP;
+
+	if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
+		return -EOPNOTSUPP;
+
+	if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+u8 octep_hw_get_status(struct octep_hw *oct_hw)
+{
+	return ioread8(&oct_hw->common_cfg->device_status);
+}
+
+void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
+{
+	iowrite8(status, &oct_hw->common_cfg->device_status);
+}
+
+void octep_hw_reset(struct octep_hw *oct_hw)
+{
+	u8 val;
+
+	octep_hw_set_status(oct_hw, 0 | BIT(DEV_RST_ACK_BIT));
+	if (readx_poll_timeout(ioread8, &oct_hw->common_cfg->device_status, val, !val, 10,
+			       OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
+		return;
+	}
+}
+
+static int feature_sel_write_with_timeout(struct octep_hw *oct_hw, u32 select, void __iomem *addr)
+{
+	u32 val;
+
+	iowrite32(select | BIT(FEATURE_SEL_ACK_BIT), addr);
+
+	if (readx_poll_timeout(ioread32, addr, val, val == select, 10, OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
+		return -1;
+	}
+	return 0;
+}
+
+u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
+{
+	u32 features_lo, features_hi;
+
+	if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->device_feature_select))
+		return 0;
+
+	features_lo = ioread32(&oct_hw->common_cfg->device_feature);
+
+	if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->device_feature_select))
+		return 0;
+
+	features_hi = ioread32(&oct_hw->common_cfg->device_feature);
+
+	return ((u64)features_hi << 32) | features_lo;
+}
+
+u64 octep_hw_get_drv_features(struct octep_hw *oct_hw)
+{
+	u32 features_lo, features_hi;
+
+	if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->guest_feature_select))
+		return 0;
+
+	features_lo = ioread32(&oct_hw->common_cfg->guest_feature);
+
+	if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->guest_feature_select))
+		return 0;
+
+	features_hi = ioread32(&oct_hw->common_cfg->guest_feature);
+
+	return ((u64)features_hi << 32) | features_lo;
+}
+
+void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
+{
+	if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->guest_feature_select))
+		return;
+
+	iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg->guest_feature);
+
+	if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->guest_feature_select))
+		return;
+
+	iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
+}
+
+void octep_write_queue_select(struct octep_hw *oct_hw, u16 queue_id)
+{
+	u16 val;
+
+	iowrite16(queue_id | BIT(QUEUE_SEL_ACK_BIT), &oct_hw->common_cfg->queue_select);
+
+	if (readx_poll_timeout(ioread16, &oct_hw->common_cfg->queue_select, val, val == queue_id,
+			       10, OCTEP_HW_TIMEOUT)) {
+		dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
+		return;
+	}
+}
+
+void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
+{
+	iowrite16(qid, oct_hw->vqs[qid].notify_addr);
+}
+
+void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length)
+{
+	u8 old_gen, new_gen, *p;
+	int i;
+
+	if (WARN_ON(offset + length > oct_hw->config_size))
+		return;
+
+	do {
+		old_gen = ioread8(&oct_hw->common_cfg->config_generation);
+		p = dst;
+		for (i = 0; i < length; i++)
+			*p++ = ioread8(oct_hw->dev_cfg + offset + i);
+
+		new_gen = ioread8(&oct_hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
+			 u64 device_area)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
+
+	octep_write_queue_select(oct_hw, qid);
+	vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
+			     &cfg->queue_desc_hi);
+	vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
+			     &cfg->queue_avail_hi);
+	vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
+			     &cfg->queue_used_hi);
+
+	return 0;
+}
+
+int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state)
+{
+	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
+				  sizeof(*state), 0);
+}
+
+int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state)
+{
+	struct vdpa_vq_state q_state;
+
+	memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
+	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
+				  sizeof(*state), 1);
+}
+
+void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
+
+	octep_write_queue_select(oct_hw, qid);
+	iowrite16(num, &cfg->queue_size);
+}
+
+void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
+
+	octep_write_queue_select(oct_hw, qid);
+	iowrite16(ready, &cfg->queue_enable);
+}
+
+bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
+
+	octep_write_queue_select(oct_hw, qid);
+	return ioread16(&cfg->queue_enable);
+}
+
+u16 octep_get_vq_size(struct octep_hw *oct_hw)
+{
+	octep_write_queue_select(oct_hw, 0);
+	return ioread16(&oct_hw->common_cfg->queue_size);
+}
+
+static u32 octep_get_config_size(struct octep_hw *oct_hw)
+{
+	return sizeof(struct virtio_net_config);
+}
+
+static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap)
+{
+	struct device *dev = &oct_hw->pdev->dev;
+	u32 length = le32_to_cpu(cap->length);
+	u32 offset = le32_to_cpu(cap->offset);
+	u8  bar    = cap->bar;
+	u32 len;
+
+	if (bar != OCTEP_HW_CAPS_BAR) {
+		dev_err(dev, "Invalid bar: %u\n", bar);
+		return NULL;
+	}
+	if (offset + length < offset) {
+		dev_err(dev, "offset(%u) + length(%u) overflows\n",
+			offset, length);
+		return NULL;
+	}
+	len = pci_resource_len(oct_hw->pdev, bar);
+	if (offset + length > len) {
+		dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
+			offset + length, len);
+		return NULL;
+	}
+	return oct_hw->base[bar] + offset;
+}
+
+/* In Octeon DPU device, the virtio config space completely
+ * emulated by the device's firmware. So, the standard pci config
+ * read apis can't be used for reading the virtio capabilities.
+ */
+static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len, off_t offset)
+{
+	u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
+	u8 *p = buf;
+	size_t i;
+
+	for (i = 0; i < len; i++)
+		*p++ = ioread8(bar + offset + i);
+}
+
+static int pci_signature_verify(struct octep_hw *oct_hw)
+{
+	u32 signature[2];
+
+	pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
+
+	if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
+		return -1;
+
+	if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
+		return -1;
+
+	return 0;
+}
+
+int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
+{
+	struct octep_mbox __iomem *mbox;
+	struct device *dev = &pdev->dev;
+	struct virtio_pci_cap cap;
+	u16 notify_off;
+	int i, ret;
+	u8 pos;
+
+	oct_hw->pdev = pdev;
+	ret = pci_signature_verify(oct_hw);
+	if (ret) {
+		dev_err(dev, "Octeon Virtio FW is not initialized\n");
+		return -EIO;
+	}
+
+	pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
+
+	while (pos) {
+		pci_caps_read(oct_hw, &cap, 2, pos);
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			dev_err(dev, "Found invalid capability vndr id: %d\n", cap.cap_vndr);
+			break;
+		}
+
+		pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
+
+		dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n",
+			 pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
+				      4, pos + sizeof(cap));
+
+			oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
+			oct_hw->notify_bar = cap.bar;
+			oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) +
+						 le32_to_cpu(cap.offset);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			oct_hw->isr = get_cap_addr(oct_hw, &cap);
+			break;
+		}
+
+		pos = cap.cap_next;
+	}
+	if (!oct_hw->common_cfg || !oct_hw->notify_base ||
+	    !oct_hw->dev_cfg    || !oct_hw->isr) {
+		dev_err(dev, "Incomplete PCI capabilities");
+		return -EIO;
+	}
+	dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw->common_cfg);
+	dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw->dev_cfg);
+	dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
+	dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
+		 (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
+
+	oct_hw->config_size = octep_get_config_size(oct_hw);
+	oct_hw->features = octep_hw_get_dev_features(oct_hw);
+
+	ret = octep_verify_features(oct_hw->features);
+	if (ret) {
+		dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
+		return ret;
+	}
+	oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg->num_queues);
+
+	oct_hw->vqs = devm_kcalloc(&pdev->dev, oct_hw->nr_vring, sizeof(*oct_hw->vqs), GFP_KERNEL);
+	if (!oct_hw->vqs)
+		return -ENOMEM;
+
+	oct_hw->irq = -1;
+
+	dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
+	dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
+
+	for (i = 0; i < oct_hw->nr_vring; i++) {
+		octep_write_queue_select(oct_hw, i);
+		notify_off = vp_ioread16(&oct_hw->common_cfg->queue_notify_off);
+		oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
+			notify_off * oct_hw->notify_off_multiplier;
+		oct_hw->vqs[i].cb_notify_addr = (u32 __iomem *)oct_hw->vqs[i].notify_addr + 1;
+		oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
+			notify_off * oct_hw->notify_off_multiplier;
+	}
+	mbox = octep_get_mbox(oct_hw);
+	octep_mbox_init(mbox);
+	dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
+
+	return 0;
+}
diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
new file mode 100644
index 000000000000..89860a443981
--- /dev/null
+++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
@@ -0,0 +1,850 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Marvell. */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/iommu.h>
+#include "octep_vdpa.h"
+
+#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
+
+struct octep_pf {
+	u8 __iomem *base[PCI_STD_NUM_BARS];
+	struct pci_dev *pdev;
+	struct resource res;
+	u64 vf_base;
+	int enabled_vfs;
+	u32 vf_stride;
+	u16 vf_devid;
+};
+
+struct octep_vdpa {
+	struct vdpa_device vdpa;
+	struct octep_hw *oct_hw;
+	struct pci_dev *pdev;
+};
+
+struct octep_vdpa_mgmt_dev {
+	struct vdpa_mgmt_dev mdev;
+	struct octep_hw oct_hw;
+	struct pci_dev *pdev;
+	/* Work entry to handle device setup */
+	struct work_struct setup_task;
+	/* Device status */
+	atomic_t status;
+};
+
+static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
+{
+	struct octep_vdpa *oct_vdpa;
+
+	oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
+
+	return oct_vdpa->oct_hw;
+}
+
+static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
+{
+	struct octep_hw *oct_hw = data;
+	int i;
+
+	for (i = 0; i < oct_hw->nr_vring; i++) {
+		if (oct_hw->vqs[i].cb.callback && ioread32(oct_hw->vqs[i].cb_notify_addr)) {
+			/* Acknowledge the per queue notification to the device */
+			iowrite32(0, oct_hw->vqs[i].cb_notify_addr);
+			oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void octep_free_irqs(struct octep_hw *oct_hw)
+{
+	struct pci_dev *pdev = oct_hw->pdev;
+
+	if (oct_hw->irq != -1) {
+		devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
+		oct_hw->irq = -1;
+	}
+	pci_free_irq_vectors(pdev);
+}
+
+static int octep_request_irqs(struct octep_hw *oct_hw)
+{
+	struct pci_dev *pdev = oct_hw->pdev;
+	int ret, irq;
+
+	/* Currently HW device provisions one IRQ per VF, hence
+	 * allocate one IRQ for all virtqueues call interface.
+	 */
+	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to alloc msix vector");
+		return ret;
+	}
+
+	snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
+		 OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
+
+	irq = pci_irq_vector(pdev, 0);
+	ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
+			       oct_hw->vqs->msix_name, oct_hw);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to register interrupt handler\n");
+		goto free_irq_vec;
+	}
+	oct_hw->irq = irq;
+
+	return 0;
+
+free_irq_vec:
+	pci_free_irq_vectors(pdev);
+	return ret;
+}
+
+static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return oct_hw->features;
+}
+
+static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	int ret;
+
+	pr_debug("Driver Features: %llx\n", features);
+
+	ret = octep_verify_features(features);
+	if (ret)
+		return ret;
+	octep_hw_set_drv_features(oct_hw, features);
+
+	return 0;
+}
+
+static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_hw_get_drv_features(oct_hw);
+}
+
+static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_hw_get_status(oct_hw);
+}
+
+static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	u8 status_old;
+
+	status_old = octep_hw_get_status(oct_hw);
+
+	if (status_old == status)
+		return;
+
+	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+	    !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
+		if (octep_request_irqs(oct_hw))
+			status = status_old | VIRTIO_CONFIG_S_FAILED;
+	}
+	octep_hw_set_status(oct_hw, status);
+}
+
+static int octep_vdpa_reset(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	u8 status = octep_hw_get_status(oct_hw);
+	u16 qid;
+
+	if (status == 0)
+		return 0;
+
+	for (qid = 0; qid < oct_hw->nr_vring; qid++) {
+		oct_hw->vqs[qid].cb.callback = NULL;
+		oct_hw->vqs[qid].cb.private = NULL;
+		oct_hw->config_cb.callback = NULL;
+		oct_hw->config_cb.private = NULL;
+	}
+	octep_hw_reset(oct_hw);
+
+	if (status & VIRTIO_CONFIG_S_DRIVER_OK)
+		octep_free_irqs(oct_hw);
+
+	return 0;
+}
+
+static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_get_vq_size(oct_hw);
+}
+
+static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				   struct vdpa_vq_state *state)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_get_vq_state(oct_hw, qid, state);
+}
+
+static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				   const struct vdpa_vq_state *state)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_set_vq_state(oct_hw, qid, state);
+}
+
+static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, struct vdpa_callback *cb)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	oct_hw->vqs[qid].cb = *cb;
+}
+
+static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	octep_set_vq_ready(oct_hw, qid, ready);
+}
+
+static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return octep_get_vq_ready(oct_hw, qid);
+}
+
+static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	octep_set_vq_num(oct_hw, qid, num);
+}
+
+static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, u64 desc_area,
+				     u64 driver_area, u64 device_area)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
+	pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
+	pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
+
+	return octep_set_vq_address(oct_hw, qid, desc_area, driver_area, device_area);
+}
+
+static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	/* Not supported */
+}
+
+static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev, u32 data)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	u16 idx = data & 0xFFFF;
+
+	vp_iowrite32(data, oct_hw->vqs[idx].notify_addr);
+}
+
+static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return vp_ioread8(&oct_hw->common_cfg->config_generation);
+}
+
+static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
+{
+	return VIRTIO_ID_NET;
+}
+
+static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
+{
+	return PCI_VENDOR_ID_CAVIUM;
+}
+
+static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
+{
+	return PAGE_SIZE;
+}
+
+static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	return oct_hw->config_size;
+}
+
+static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf,
+				  unsigned int len)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	octep_read_dev_config(oct_hw, offset, buf, len);
+}
+
+static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned int offset,
+				  const void *buf, unsigned int len)
+{
+	/* Not supported */
+}
+
+static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct vdpa_callback *cb)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+
+	oct_hw->config_cb.callback = cb->callback;
+	oct_hw->config_cb.private = cb->private;
+}
+
+static struct vdpa_notification_area octep_get_vq_notification(struct vdpa_device *vdpa_dev,
+							       u16 idx)
+{
+	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
+	struct vdpa_notification_area area;
+
+	area.addr = oct_hw->vqs[idx].notify_pa;
+	area.size = PAGE_SIZE;
+
+	return area;
+}
+
+static struct vdpa_config_ops octep_vdpa_ops = {
+	.get_device_features = octep_vdpa_get_device_features,
+	.set_driver_features = octep_vdpa_set_driver_features,
+	.get_driver_features = octep_vdpa_get_driver_features,
+	.get_status	= octep_vdpa_get_status,
+	.set_status	= octep_vdpa_set_status,
+	.reset		= octep_vdpa_reset,
+	.get_vq_num_max	= octep_vdpa_get_vq_num_max,
+	.get_vq_state	= octep_vdpa_get_vq_state,
+	.set_vq_state	= octep_vdpa_set_vq_state,
+	.set_vq_cb	= octep_vdpa_set_vq_cb,
+	.set_vq_ready	= octep_vdpa_set_vq_ready,
+	.get_vq_ready	= octep_vdpa_get_vq_ready,
+	.set_vq_num	= octep_vdpa_set_vq_num,
+	.set_vq_address	= octep_vdpa_set_vq_address,
+	.get_vq_irq	= NULL,
+	.kick_vq	= octep_vdpa_kick_vq,
+	.kick_vq_with_data	= octep_vdpa_kick_vq_with_data,
+	.get_generation	= octep_vdpa_get_generation,
+	.get_device_id	= octep_vdpa_get_device_id,
+	.get_vendor_id	= octep_vdpa_get_vendor_id,
+	.get_vq_align	= octep_vdpa_get_vq_align,
+	.get_config_size	= octep_vdpa_get_config_size,
+	.get_config	= octep_vdpa_get_config,
+	.set_config	= octep_vdpa_set_config,
+	.set_config_cb  = octep_vdpa_set_config_cb,
+	.get_vq_notification = octep_get_vq_notification,
+};
+
+static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
+{
+	int ret;
+
+	ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar);
+		return ret;
+	}
+
+	tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
+	if (!tbl[bar]) {
+		dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
+		pci_release_region(pdev, bar);
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
+{
+	pci_iounmap(pdev, tbl[bar]);
+	pci_release_region(pdev, bar);
+}
+
+static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf)
+{
+	struct pci_dev *pf_dev = octpf->pdev;
+	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
+	struct pci_bus_region bus_region;
+
+	octpf->res.start = res->start;
+	octpf->res.end = res->end;
+	octpf->vf_base = res->start;
+
+	bus_region.start = res->start;
+	bus_region.end = res->start - 1;
+
+	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
+}
+
+static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf)
+{
+	struct pci_dev *pf_dev = octpf->pdev;
+	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
+	struct pci_bus_region bus_region;
+
+	bus_region.start = octpf->res.start;
+	bus_region.end = octpf->res.end;
+
+	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
+}
+
+static void octep_vdpa_remove_pf(struct pci_dev *pdev)
+{
+	struct octep_pf *octpf = pci_get_drvdata(pdev);
+
+	pci_disable_sriov(pdev);
+
+	if (octpf->base[OCTEP_HW_CAPS_BAR])
+		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_CAPS_BAR);
+
+	if (octpf->base[OCTEP_HW_MBOX_BAR])
+		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
+
+	octep_vdpa_pf_bar_expand(octpf);
+}
+
+static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev)
+{
+	struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
+
+	memset(vf_res, 0, sizeof(*vf_res));
+}
+
+static void octep_vdpa_remove_vf(struct pci_dev *pdev)
+{
+	struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
+	struct octep_hw *oct_hw;
+	int status;
+
+	oct_hw = &mgmt_dev->oct_hw;
+	status = atomic_read(&mgmt_dev->status);
+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT);
+
+	cancel_work_sync(&mgmt_dev->setup_task);
+	if (status == OCTEP_VDPA_DEV_STATUS_READY)
+		vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
+
+	if (oct_hw->base[OCTEP_HW_CAPS_BAR])
+		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
+
+	if (oct_hw->base[OCTEP_HW_MBOX_BAR])
+		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_MBOX_BAR);
+
+	octep_vdpa_vf_bar_shrink(pdev);
+}
+
+static void octep_vdpa_remove(struct pci_dev *pdev)
+{
+	if (pdev->is_virtfn)
+		octep_vdpa_remove_vf(pdev);
+	else
+		octep_vdpa_remove_pf(pdev);
+}
+
+static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			      const struct vdpa_dev_set_config *config)
+{
+	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct octep_vdpa_mgmt_dev, mdev);
+	struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
+	struct pci_dev *pdev = oct_hw->pdev;
+	struct vdpa_device *vdpa_dev;
+	struct octep_vdpa *oct_vdpa;
+	u64 device_features;
+	int ret;
+
+	oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, 1, 1,
+				     NULL, false);
+	if (IS_ERR(oct_vdpa)) {
+		dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device");
+		return PTR_ERR(oct_vdpa);
+	}
+
+	oct_vdpa->pdev = pdev;
+	oct_vdpa->vdpa.dma_dev = &pdev->dev;
+	oct_vdpa->vdpa.mdev = mdev;
+	oct_vdpa->oct_hw = oct_hw;
+	vdpa_dev = &oct_vdpa->vdpa;
+
+	device_features = oct_hw->features;
+	if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
+		if (config->device_features & ~device_features) {
+			dev_err(&pdev->dev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
+				config->device_features, device_features);
+			ret = -EINVAL;
+			goto vdpa_dev_put;
+		}
+		device_features &= config->device_features;
+	}
+
+	oct_hw->features = device_features;
+	dev_info(&pdev->dev, "Vdpa management device features : %llx\n", device_features);
+
+	ret = octep_verify_features(device_features);
+	if (ret) {
+		dev_warn(mdev->device,
+			 "Must provision minimum features 0x%llx for this device",
+			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
+			 BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) | BIT_ULL(VIRTIO_F_IN_ORDER));
+		goto vdpa_dev_put;
+	}
+	if (name)
+		ret = dev_set_name(&vdpa_dev->dev, "%s", name);
+	else
+		ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index);
+
+	ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to register to vDPA bus");
+		goto vdpa_dev_put;
+	}
+	return 0;
+
+vdpa_dev_put:
+	put_device(&oct_vdpa->vdpa.dev);
+	return ret;
+}
+
+static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *vdpa_dev)
+{
+	_vdpa_unregister_device(vdpa_dev);
+}
+
+static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
+	.dev_add = octep_vdpa_dev_add,
+	.dev_del = octep_vdpa_dev_del
+};
+
+static bool get_device_ready_status(u8 __iomem *addr)
+{
+	u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
+
+	if (signature == OCTEP_DEV_READY_SIGNATURE) {
+		writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
+		return true;
+	}
+
+	return false;
+}
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static void octep_vdpa_setup_task(struct work_struct *work)
+{
+	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct octep_vdpa_mgmt_dev,
+							    setup_task);
+	struct pci_dev *pdev = mgmt_dev->pdev;
+	struct device *dev = &pdev->dev;
+	struct octep_hw *oct_hw;
+	unsigned long timeout;
+	int ret;
+
+	oct_hw = &mgmt_dev->oct_hw;
+
+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
+
+	/* Wait for a maximum of 5 sec */
+	timeout = jiffies + msecs_to_jiffies(5000);
+	while (!time_after(jiffies, timeout)) {
+		if (get_device_ready_status(oct_hw->base[OCTEP_HW_MBOX_BAR])) {
+			atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_INIT);
+			break;
+		}
+
+		if (atomic_read(&mgmt_dev->status) >= OCTEP_VDPA_DEV_STATUS_READY) {
+			dev_info(dev, "Stopping vDPA setup task.\n");
+			return;
+		}
+
+		usleep_range(1000, 1500);
+	}
+
+	if (atomic_read(&mgmt_dev->status) != OCTEP_VDPA_DEV_STATUS_INIT) {
+		dev_err(dev, "BAR initialization is timed out\n");
+		return;
+	}
+
+	ret = octep_iomap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
+	if (ret)
+		return;
+
+	ret = octep_hw_caps_read(oct_hw, pdev);
+	if (ret < 0)
+		goto unmap_region;
+
+	mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
+	mgmt_dev->mdev.id_table = id_table;
+	mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
+	mgmt_dev->mdev.supported_features = oct_hw->features;
+	mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES);
+	mgmt_dev->mdev.device = dev;
+
+	ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
+	if (ret) {
+		dev_err(dev, "Failed to register vdpa management interface\n");
+		goto unmap_region;
+	}
+
+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY);
+
+	return;
+
+unmap_region:
+	octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
+	oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL;
+}
+
+static int octep_vdpa_probe_vf(struct pci_dev *pdev)
+{
+	struct octep_vdpa_mgmt_dev *mgmt_dev;
+	struct device *dev = &pdev->dev;
+	int ret;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		dev_err(dev, "Failed to enable device\n");
+		return ret;
+	}
+
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+	if (ret) {
+		dev_err(dev, "No usable DMA configuration\n");
+		return ret;
+	}
+	pci_set_master(pdev);
+
+	mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev), GFP_KERNEL);
+	if (!mgmt_dev)
+		return -ENOMEM;
+
+	ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base, OCTEP_HW_MBOX_BAR);
+	if (ret)
+		return ret;
+
+	mgmt_dev->pdev = pdev;
+	pci_set_drvdata(pdev, mgmt_dev);
+
+	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC);
+	INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task);
+	schedule_work(&mgmt_dev->setup_task);
+	dev_info(&pdev->dev, "octep vdpa mgmt device setup task is queued\n");
+
+	return 0;
+}
+
+static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct pci_dev *pf_dev, u8 idx)
+{
+	struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4;
+	struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES + 4;
+	struct octep_pf *pf = pci_get_drvdata(pf_dev);
+	struct pci_bus_region bus_region;
+
+	vf_res->name = pci_name(vf_dev);
+	vf_res->flags = pf_res->flags;
+	vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent;
+
+	bus_region.start = pf->vf_base + idx * pf->vf_stride;
+	bus_region.end = bus_region.start + pf->vf_stride - 1;
+	pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region);
+}
+
+static int octep_sriov_enable(struct pci_dev *pdev, int num_vfs)
+{
+	struct octep_pf *pf = pci_get_drvdata(pdev);
+	u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR];
+	struct pci_dev *vf_pdev = NULL;
+	bool done = false;
+	int index = 0;
+	int ret, i;
+
+	ret = pci_enable_sriov(pdev, num_vfs);
+	if (ret)
+		return ret;
+
+	pf->enabled_vfs = num_vfs;
+
+	while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, vf_pdev))) {
+		if (vf_pdev->device != pf->vf_devid)
+			continue;
+
+		octep_vdpa_assign_barspace(vf_pdev, pdev, index);
+		if (++index == num_vfs) {
+			done = true;
+			break;
+		}
+	}
+
+	if (done) {
+		for (i = 0; i < pf->enabled_vfs; i++)
+			writeq(OCTEP_DEV_READY_SIGNATURE, addr + OCTEP_PF_MBOX_DATA(i));
+	}
+
+	return num_vfs;
+}
+
+static int octep_sriov_disable(struct pci_dev *pdev)
+{
+	struct octep_pf *pf = pci_get_drvdata(pdev);
+
+	if (!pci_num_vf(pdev))
+		return 0;
+
+	pci_disable_sriov(pdev);
+	pf->enabled_vfs = 0;
+
+	return 0;
+}
+
+static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int num_vfs)
+{
+	if (num_vfs > 0)
+		return octep_sriov_enable(pdev, num_vfs);
+	else
+		return octep_sriov_disable(pdev);
+}
+
+static u16 octep_get_vf_devid(struct pci_dev *pdev)
+{
+	u16 did;
+
+	switch (pdev->device) {
+	case OCTEP_VDPA_DEVID_CN106K_PF:
+		did = OCTEP_VDPA_DEVID_CN106K_VF;
+		break;
+	case OCTEP_VDPA_DEVID_CN105K_PF:
+		did = OCTEP_VDPA_DEVID_CN105K_VF;
+		break;
+	case OCTEP_VDPA_DEVID_CN103K_PF:
+		did = OCTEP_VDPA_DEVID_CN103K_VF;
+		break;
+	default:
+		did = 0xFFFF;
+		break;
+	}
+
+	return did;
+}
+
+static int octep_vdpa_pf_setup(struct octep_pf *octpf)
+{
+	u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR];
+	struct pci_dev *pdev = octpf->pdev;
+	int totalvfs;
+	u64 val, len;
+
+	totalvfs = pci_sriov_get_totalvfs(pdev);
+	if (unlikely(!totalvfs)) {
+		dev_info(&pdev->dev, "Total VFs are %d in PF sriov configuration\n", totalvfs);
+		return 0;
+	}
+
+	addr = octpf->base[OCTEP_HW_MBOX_BAR];
+	val = readq(addr + OCTEP_EPF_RINFO(0));
+	if (val == 0) {
+		dev_err(&pdev->dev, "Invalid device configuration\n");
+		return -EINVAL;
+	}
+
+	if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) {
+		val &= ~GENMASK_ULL(35, 32);
+		val |= BIT_ULL(32);
+		writeq(val, addr + OCTEP_EPF_RINFO(0));
+	}
+
+	len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR);
+
+	octpf->vf_stride = len / totalvfs;
+	octpf->vf_devid = octep_get_vf_devid(pdev);
+
+	octep_vdpa_pf_bar_shrink(octpf);
+
+	return 0;
+}
+
+static int octep_vdpa_probe_pf(struct pci_dev *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct octep_pf *octpf;
+	int ret;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		dev_err(dev, "Failed to enable device\n");
+		return ret;
+	}
+
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+	if (ret) {
+		dev_err(dev, "No usable DMA configuration\n");
+		return ret;
+	}
+	octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL);
+	if (!octpf)
+		return -ENOMEM;
+
+	ret = octep_iomap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
+	if (ret)
+		return ret;
+
+	pci_set_master(pdev);
+	pci_set_drvdata(pdev, octpf);
+	octpf->pdev = pdev;
+
+	ret = octep_vdpa_pf_setup(octpf);
+	if (ret)
+		goto unmap_region;
+
+	return 0;
+
+unmap_region:
+	octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
+	return ret;
+}
+
+static int octep_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	if (pdev->is_virtfn)
+		return octep_vdpa_probe_vf(pdev);
+	else
+		return octep_vdpa_probe_pf(pdev);
+}
+
+static struct pci_device_id octep_pci_vdpa_map[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_VF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_VF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_PF) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_VF) },
+	{ 0 },
+};
+
+static struct pci_driver octep_pci_vdpa = {
+	.name     = OCTEP_VDPA_DRIVER_NAME,
+	.id_table = octep_pci_vdpa_map,
+	.probe    = octep_vdpa_probe,
+	.remove   = octep_vdpa_remove,
+	.sriov_configure = octep_vdpa_sriov_configure
+};
+
+module_pci_driver(octep_pci_vdpa);
+
+MODULE_AUTHOR("Marvell");
+MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver");
+MODULE_LICENSE("GPL");
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-24 19:08 ` [PATCH v2] " Srujana Challa
@ 2024-04-25  1:57   ` Jason Wang
  2024-04-25  9:36   ` Michael S. Tsirkin
  1 sibling, 0 replies; 28+ messages in thread
From: Jason Wang @ 2024-04-25  1:57 UTC (permalink / raw)
  To: Srujana Challa
  Cc: virtualization, mst, xuanzhuo, vattunuru, sthotton, ndabilpuram, jerinj

On Thu, Apr 25, 2024 at 3:08 AM Srujana Challa <schalla@marvell.com> wrote:
>
> This commit introduces a new vDPA driver specifically designed for
> managing the virtio control plane over the vDPA bus for OCTEON DPU
> devices. The driver consists of two layers:
>
> 1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
> operations and configurations related to the DPU device.
>
> 2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
> implements device operations for the vDPA bus. It handles device
> probing, bus attachment, vring operations, and other relevant tasks.
>
> Signed-off-by: Srujana Challa <schalla@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
> ---
> v2:
> * Addressed review comments
>
>  MAINTAINERS                              |   5 +
>  drivers/vdpa/Kconfig                     |   9 +
>  drivers/vdpa/Makefile                    |   1 +
>  drivers/vdpa/octeon_ep/Makefile          |   4 +
>  drivers/vdpa/octeon_ep/octep_vdpa.h      |  94 +++
>  drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 517 ++++++++++++++
>  drivers/vdpa/octeon_ep/octep_vdpa_main.c | 850 +++++++++++++++++++++++
>  7 files changed, 1480 insertions(+)
>  create mode 100644 drivers/vdpa/octeon_ep/Makefile
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cfe44a876d8a..539ce209a960 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13127,6 +13127,11 @@ S:     Supported
>  F:     Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
>  F:     drivers/mmc/host/sdhci-xenon*
>
> +MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
> +R:     schalla@marvell.com
> +R:     vattunuru@marvell.com
> +F:     drivers/vdpa/octeon_ep/
> +
>  MATROX FRAMEBUFFER DRIVER
>  L:     linux-fbdev@vger.kernel.org
>  S:     Orphan
> diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
> index 656c1cb541de..775149fad476 100644
> --- a/drivers/vdpa/Kconfig
> +++ b/drivers/vdpa/Kconfig
> @@ -126,4 +126,13 @@ config PDS_VDPA
>           With this driver, the VirtIO dataplane can be
>           offloaded to an AMD/Pensando DSC device.
>
> +config OCTEONEP_VDPA
> +       tristate "vDPA driver for Octeon DPU devices"
> +       depends on m
> +       depends on PCI_MSI
> +       help
> +         vDPA driver for Marvell's Octeon DPU devices.
> +         With this driver, the VirtIO dataplane can be
> +         offloaded to a Octeon DPU device.
> +
>  endif # VDPA
> diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
> index 8f53c6f3cca7..5654d36707af 100644
> --- a/drivers/vdpa/Makefile
> +++ b/drivers/vdpa/Makefile
> @@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
>  obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
>  obj-$(CONFIG_SNET_VDPA) += solidrun/
>  obj-$(CONFIG_PDS_VDPA) += pds/
> +obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
> diff --git a/drivers/vdpa/octeon_ep/Makefile b/drivers/vdpa/octeon_ep/Makefile
> new file mode 100644
> index 000000000000..e23e2ff14f33
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/Makefile
> @@ -0,0 +1,4 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
> +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
> +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h
> new file mode 100644
> index 000000000000..046710ec4d42
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
> @@ -0,0 +1,94 @@
> +/* SPDX-License-Identifier: GPL-2.0-only
> + * Copyright (C) 2024 Marvell.
> + */
> +#ifndef __OCTEP_VDPA_H__
> +#define __OCTEP_VDPA_H__
> +
> +#include <linux/pci.h>
> +#include <linux/pci_regs.h>
> +#include <linux/vdpa.h>
> +#include <linux/virtio_pci_modern.h>
> +#include <uapi/linux/virtio_net.h>
> +#include <uapi/linux/virtio_blk.h>
> +#include <uapi/linux/virtio_config.h>
> +#include <uapi/linux/virtio_pci.h>
> +#include <uapi/linux/vdpa.h>
> +
> +#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
> +#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
> +#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
> +#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
> +#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
> +#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
> +
> +#define OCTEP_HW_MBOX_BAR 0
> +#define OCTEP_HW_CAPS_BAR 4
> +
> +#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
> +
> +#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
> +#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
> +#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
> +
> +#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
> +#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
> +
> +#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
> +#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
> +
> +enum octep_vdpa_dev_status {
> +       OCTEP_VDPA_DEV_STATUS_INVALID,
> +       OCTEP_VDPA_DEV_STATUS_ALLOC,
> +       OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
> +       OCTEP_VDPA_DEV_STATUS_INIT,
> +       OCTEP_VDPA_DEV_STATUS_READY,
> +       OCTEP_VDPA_DEV_STATUS_UNINIT
> +};
> +
> +struct octep_vring_info {
> +       struct vdpa_callback cb;
> +       void __iomem *notify_addr;
> +       u32 __iomem *cb_notify_addr;
> +       phys_addr_t notify_pa;
> +       char msix_name[256];
> +};
> +
> +struct octep_hw {
> +       struct pci_dev *pdev;
> +       u8 __iomem *base[PCI_STD_NUM_BARS];
> +       struct virtio_pci_common_cfg __iomem *common_cfg;
> +       u8 __iomem *dev_cfg;
> +       u8 __iomem *isr;
> +       void __iomem *notify_base;
> +       phys_addr_t notify_base_pa;
> +       u32 notify_off_multiplier;
> +       u8 notify_bar;
> +       struct octep_vring_info *vqs;
> +       struct vdpa_callback config_cb;
> +       u64 features;
> +       u16 nr_vring;
> +       u32 config_size;
> +       int irq;
> +};
> +
> +u8 octep_hw_get_status(struct octep_hw *oct_hw);
> +void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
> +void octep_hw_reset(struct octep_hw *oct_hw);
> +void octep_write_queue_select(struct octep_hw *oct_hw, u16 queue_id);
> +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
> +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length);
> +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
> +                        u64 device_area);
> +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
> +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
> +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
> +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state);
> +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state);
> +u16 octep_get_vq_size(struct octep_hw *oct_hw);
> +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
> +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
> +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
> +u64 octep_hw_get_drv_features(struct octep_hw *oct_hw);
> +int octep_verify_features(u64 features);
> +
> +#endif /* __OCTEP_VDPA_H__ */
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> new file mode 100644
> index 000000000000..6afd06ead46a
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> @@ -0,0 +1,517 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Marvell. */
> +
> +#include <linux/iopoll.h>
> +
> +#include "octep_vdpa.h"
> +
> +enum octep_mbox_ids {
> +       OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
> +       OCTEP_MBOX_MSG_GET_VQ_STATE,
> +};
> +
> +#define OCTEP_HW_TIMEOUT       10000000
> +
> +#define MBOX_OFFSET            64
> +#define MBOX_RSP_MASK          0x00000001
> +#define MBOX_RC_MASK           0x0000FFFE
> +
> +#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
> +#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
> +#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK | MBOX_RSP_MASK))
> +
> +#define DEV_RST_ACK_BIT        7
> +#define FEATURE_SEL_ACK_BIT    15
> +#define QUEUE_SEL_ACK_BIT      15
> +
> +struct octep_mbox_hdr {
> +       u8 ver;
> +       u8 rsvd1;
> +       u16 id;
> +       u16 rsvd2;
> +#define MBOX_REQ_SIG (0xdead)
> +#define MBOX_RSP_SIG (0xbeef)
> +       u16 sig;
> +};
> +
> +struct octep_mbox_sts {
> +       u16 rsp:1;
> +       u16 rc:15;
> +       u16 rsvd;
> +};
> +
> +struct octep_mbox {
> +       struct octep_mbox_hdr hdr;
> +       struct octep_mbox_sts sts;
> +       u64 rsvd;
> +       u32 data[];
> +};
> +
> +static inline struct octep_mbox __iomem *octep_get_mbox(struct octep_hw *oct_hw)
> +{
> +       return (struct octep_mbox __iomem *)(oct_hw->dev_cfg + MBOX_OFFSET);
> +}
> +
> +static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem *mbox)
> +{
> +       u32 val;
> +
> +       return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_AVAIL(val), 10,
> +                                 OCTEP_HW_TIMEOUT);
> +}
> +
> +static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem *mbox)
> +{
> +       u32 val;
> +
> +       return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val), 10,
> +                                 OCTEP_HW_TIMEOUT);
> +}
> +
> +static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16 id, u16 sig)
> +{
> +       iowrite16(id, &mbox->hdr.id);
> +       iowrite16(sig, &mbox->hdr.sig);
> +}
> +
> +static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
> +{
> +       return ioread16(&mbox->hdr.sig);
> +}
> +
> +static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32 sts)
> +{
> +       iowrite32(sts, &mbox->sts);
> +}
> +
> +static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
> +{
> +       return ioread32(&mbox->sts);
> +}
> +
> +static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox, u16 word_idx)
> +{
> +       return ioread32(&mbox->data[word_idx]);
> +}
> +
> +static inline void octep_write32_word(struct octep_mbox __iomem *mbox, u16 word_idx, u32 word)
> +{
> +       return iowrite32(word, &mbox->data[word_idx]);
> +}
> +
> +static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *buffer,
> +                             u32 buf_size, bool write)
> +{
> +       struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
> +       struct pci_dev *pdev = oct_hw->pdev;
> +       u32 *p = (u32 *)buffer;
> +       u16 data_wds;
> +       int ret, i;
> +       u32 val;
> +
> +       if (!IS_ALIGNED(buf_size, 4))
> +               return -EINVAL;
> +
> +       /* Make sure mbox space is available */
> +       ret = octep_wait_for_mbox_avail(mbox);
> +       if (ret) {
> +               dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to be consumed\n");
> +               return ret;
> +       }
> +       data_wds = buf_size / 4;
> +
> +       if (write) {
> +               for (i = 1; i <= data_wds; i++) {
> +                       octep_write32_word(mbox, i, *p);
> +                       p++;
> +               }
> +       }
> +       octep_write32_word(mbox, 0, (u32)qid);
> +       octep_write_sts(mbox, 0);
> +
> +       octep_write_hdr(mbox, id, MBOX_REQ_SIG);
> +
> +       ret = octep_wait_for_mbox_rsp(mbox);
> +       if (ret) {
> +               dev_warn(&pdev->dev, "Timeout waiting for mbox : %d response\n", id);
> +               return ret;
> +       }
> +
> +       val = octep_read_sig(mbox);
> +       if ((val & 0xFFFF) != MBOX_RSP_SIG) {
> +               dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id);
> +               return ret;
> +       }
> +
> +       val = octep_read_sts(mbox);
> +       if (val & MBOX_RC_MASK) {
> +               ret = MBOX_RSP_TO_ERR(val);
> +               dev_warn(&pdev->dev, "Error while processing mbox : %d, err %d\n", id, ret);
> +               return ret;
> +       }
> +
> +       if (!write)
> +               for (i = 1; i <= data_wds; i++)
> +                       *p++ = octep_read32_word(mbox, i);
> +
> +       return 0;
> +}
> +
> +static void octep_mbox_init(struct octep_mbox __iomem *mbox)
> +{
> +       iowrite32(1, &mbox->sts);
> +}
> +
> +int octep_verify_features(u64 features)
> +{
> +       /* Minimum features to expect */
> +       if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
> +               return -EOPNOTSUPP;
> +
> +       if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
> +               return -EOPNOTSUPP;
> +
> +       if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
> +               return -EOPNOTSUPP;
> +
> +       return 0;
> +}
> +
> +u8 octep_hw_get_status(struct octep_hw *oct_hw)
> +{
> +       return ioread8(&oct_hw->common_cfg->device_status);
> +}
> +
> +void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
> +{
> +       iowrite8(status, &oct_hw->common_cfg->device_status);
> +}
> +
> +void octep_hw_reset(struct octep_hw *oct_hw)
> +{
> +       u8 val;
> +
> +       octep_hw_set_status(oct_hw, 0 | BIT(DEV_RST_ACK_BIT));
> +       if (readx_poll_timeout(ioread8, &oct_hw->common_cfg->device_status, val, !val, 10,
> +                              OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
> +               return;
> +       }
> +}
> +
> +static int feature_sel_write_with_timeout(struct octep_hw *oct_hw, u32 select, void __iomem *addr)
> +{
> +       u32 val;
> +
> +       iowrite32(select | BIT(FEATURE_SEL_ACK_BIT), addr);
> +
> +       if (readx_poll_timeout(ioread32, addr, val, val == select, 10, OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +               return -1;
> +       }
> +       return 0;
> +}
> +
> +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
> +{
> +       u32 features_lo, features_hi;
> +
> +       if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->device_feature_select))
> +               return 0;
> +
> +       features_lo = ioread32(&oct_hw->common_cfg->device_feature);
> +
> +       if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->device_feature_select))
> +               return 0;
> +
> +       features_hi = ioread32(&oct_hw->common_cfg->device_feature);
> +
> +       return ((u64)features_hi << 32) | features_lo;
> +}
> +
> +u64 octep_hw_get_drv_features(struct octep_hw *oct_hw)
> +{
> +       u32 features_lo, features_hi;
> +
> +       if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->guest_feature_select))
> +               return 0;
> +
> +       features_lo = ioread32(&oct_hw->common_cfg->guest_feature);
> +
> +       if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->guest_feature_select))
> +               return 0;
> +
> +       features_hi = ioread32(&oct_hw->common_cfg->guest_feature);
> +
> +       return ((u64)features_hi << 32) | features_lo;
> +}
> +
> +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
> +{
> +       if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->guest_feature_select))
> +               return;
> +
> +       iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg->guest_feature);
> +
> +       if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->guest_feature_select))
> +               return;
> +
> +       iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
> +}
> +
> +void octep_write_queue_select(struct octep_hw *oct_hw, u16 queue_id)
> +{
> +       u16 val;
> +
> +       iowrite16(queue_id | BIT(QUEUE_SEL_ACK_BIT), &oct_hw->common_cfg->queue_select);
> +
> +       if (readx_poll_timeout(ioread16, &oct_hw->common_cfg->queue_select, val, val == queue_id,
> +                              10, OCTEP_HW_TIMEOUT)) {
> +               dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
> +               return;
> +       }
> +}
> +
> +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
> +{
> +       iowrite16(qid, oct_hw->vqs[qid].notify_addr);
> +}
> +
> +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length)
> +{
> +       u8 old_gen, new_gen, *p;
> +       int i;
> +
> +       if (WARN_ON(offset + length > oct_hw->config_size))
> +               return;
> +
> +       do {
> +               old_gen = ioread8(&oct_hw->common_cfg->config_generation);
> +               p = dst;
> +               for (i = 0; i < length; i++)
> +                       *p++ = ioread8(oct_hw->dev_cfg + offset + i);
> +
> +               new_gen = ioread8(&oct_hw->common_cfg->config_generation);
> +       } while (old_gen != new_gen);
> +}
> +
> +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
> +                        u64 device_area)
> +{
> +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +       octep_write_queue_select(oct_hw, qid);
> +       vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> +                            &cfg->queue_desc_hi);
> +       vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> +                            &cfg->queue_avail_hi);
> +       vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> +                            &cfg->queue_used_hi);
> +
> +       return 0;
> +}
> +
> +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state)
> +{
> +       return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
> +                                 sizeof(*state), 0);
> +}
> +
> +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state)
> +{
> +       struct vdpa_vq_state q_state;
> +
> +       memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
> +       return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
> +                                 sizeof(*state), 1);
> +}
> +
> +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
> +{
> +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +       octep_write_queue_select(oct_hw, qid);
> +       iowrite16(num, &cfg->queue_size);
> +}
> +
> +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
> +{
> +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +       octep_write_queue_select(oct_hw, qid);
> +       iowrite16(ready, &cfg->queue_enable);
> +}
> +
> +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
> +{
> +       struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +       octep_write_queue_select(oct_hw, qid);
> +       return ioread16(&cfg->queue_enable);
> +}
> +
> +u16 octep_get_vq_size(struct octep_hw *oct_hw)
> +{
> +       octep_write_queue_select(oct_hw, 0);
> +       return ioread16(&oct_hw->common_cfg->queue_size);
> +}
> +
> +static u32 octep_get_config_size(struct octep_hw *oct_hw)
> +{
> +       return sizeof(struct virtio_net_config);
> +}
> +
> +static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap)
> +{
> +       struct device *dev = &oct_hw->pdev->dev;
> +       u32 length = le32_to_cpu(cap->length);
> +       u32 offset = le32_to_cpu(cap->offset);
> +       u8  bar    = cap->bar;
> +       u32 len;
> +
> +       if (bar != OCTEP_HW_CAPS_BAR) {
> +               dev_err(dev, "Invalid bar: %u\n", bar);
> +               return NULL;
> +       }
> +       if (offset + length < offset) {
> +               dev_err(dev, "offset(%u) + length(%u) overflows\n",
> +                       offset, length);
> +               return NULL;
> +       }
> +       len = pci_resource_len(oct_hw->pdev, bar);
> +       if (offset + length > len) {
> +               dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
> +                       offset + length, len);
> +               return NULL;
> +       }
> +       return oct_hw->base[bar] + offset;
> +}
> +
> +/* In Octeon DPU device, the virtio config space completely
> + * emulated by the device's firmware. So, the standard pci config
> + * read apis can't be used for reading the virtio capabilities.
> + */
> +static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len, off_t offset)
> +{
> +       u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
> +       u8 *p = buf;
> +       size_t i;
> +
> +       for (i = 0; i < len; i++)
> +               *p++ = ioread8(bar + offset + i);
> +}
> +
> +static int pci_signature_verify(struct octep_hw *oct_hw)
> +{
> +       u32 signature[2];
> +
> +       pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
> +
> +       if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
> +               return -1;
> +
> +       if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
> +               return -1;
> +
> +       return 0;
> +}
> +
> +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
> +{
> +       struct octep_mbox __iomem *mbox;
> +       struct device *dev = &pdev->dev;
> +       struct virtio_pci_cap cap;
> +       u16 notify_off;
> +       int i, ret;
> +       u8 pos;
> +
> +       oct_hw->pdev = pdev;
> +       ret = pci_signature_verify(oct_hw);
> +       if (ret) {
> +               dev_err(dev, "Octeon Virtio FW is not initialized\n");
> +               return -EIO;
> +       }
> +
> +       pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
> +
> +       while (pos) {
> +               pci_caps_read(oct_hw, &cap, 2, pos);
> +
> +               if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
> +                       dev_err(dev, "Found invalid capability vndr id: %d\n", cap.cap_vndr);
> +                       break;
> +               }
> +
> +               pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
> +
> +               dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n",
> +                        pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
> +
> +               switch (cap.cfg_type) {
> +               case VIRTIO_PCI_CAP_COMMON_CFG:
> +                       oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
> +                       break;
> +               case VIRTIO_PCI_CAP_NOTIFY_CFG:
> +                       pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
> +                                     4, pos + sizeof(cap));
> +
> +                       oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
> +                       oct_hw->notify_bar = cap.bar;
> +                       oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) +
> +                                                le32_to_cpu(cap.offset);
> +                       break;
> +               case VIRTIO_PCI_CAP_DEVICE_CFG:
> +                       oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
> +                       break;
> +               case VIRTIO_PCI_CAP_ISR_CFG:
> +                       oct_hw->isr = get_cap_addr(oct_hw, &cap);
> +                       break;
> +               }
> +
> +               pos = cap.cap_next;
> +       }
> +       if (!oct_hw->common_cfg || !oct_hw->notify_base ||
> +           !oct_hw->dev_cfg    || !oct_hw->isr) {
> +               dev_err(dev, "Incomplete PCI capabilities");
> +               return -EIO;
> +       }
> +       dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw->common_cfg);
> +       dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw->dev_cfg);
> +       dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
> +       dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
> +                (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
> +
> +       oct_hw->config_size = octep_get_config_size(oct_hw);
> +       oct_hw->features = octep_hw_get_dev_features(oct_hw);
> +
> +       ret = octep_verify_features(oct_hw->features);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");

This error message seems to be misleading, I guess we should say
something like missing features in FW?

> +               return ret;
> +       }
> +       oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg->num_queues);
> +
> +       oct_hw->vqs = devm_kcalloc(&pdev->dev, oct_hw->nr_vring, sizeof(*oct_hw->vqs), GFP_KERNEL);
> +       if (!oct_hw->vqs)
> +               return -ENOMEM;
> +
> +       oct_hw->irq = -1;
> +
> +       dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
> +       dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
> +
> +       for (i = 0; i < oct_hw->nr_vring; i++) {
> +               octep_write_queue_select(oct_hw, i);
> +               notify_off = vp_ioread16(&oct_hw->common_cfg->queue_notify_off);
> +               oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
> +                       notify_off * oct_hw->notify_off_multiplier;
> +               oct_hw->vqs[i].cb_notify_addr = (u32 __iomem *)oct_hw->vqs[i].notify_addr + 1;
> +               oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
> +                       notify_off * oct_hw->notify_off_multiplier;
> +       }
> +       mbox = octep_get_mbox(oct_hw);
> +       octep_mbox_init(mbox);
> +       dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
> +
> +       return 0;
> +}
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> new file mode 100644
> index 000000000000..89860a443981
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> @@ -0,0 +1,850 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Marvell. */
> +
> +#include <linux/interrupt.h>
> +#include <linux/module.h>
> +#include <linux/iommu.h>
> +#include "octep_vdpa.h"
> +
> +#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
> +
> +struct octep_pf {
> +       u8 __iomem *base[PCI_STD_NUM_BARS];
> +       struct pci_dev *pdev;
> +       struct resource res;
> +       u64 vf_base;
> +       int enabled_vfs;
> +       u32 vf_stride;
> +       u16 vf_devid;
> +};
> +
> +struct octep_vdpa {
> +       struct vdpa_device vdpa;
> +       struct octep_hw *oct_hw;
> +       struct pci_dev *pdev;
> +};
> +
> +struct octep_vdpa_mgmt_dev {
> +       struct vdpa_mgmt_dev mdev;
> +       struct octep_hw oct_hw;
> +       struct pci_dev *pdev;
> +       /* Work entry to handle device setup */
> +       struct work_struct setup_task;
> +       /* Device status */
> +       atomic_t status;
> +};
> +
> +static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_vdpa *oct_vdpa;
> +
> +       oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
> +
> +       return oct_vdpa->oct_hw;
> +}
> +
> +static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
> +{
> +       struct octep_hw *oct_hw = data;
> +       int i;
> +
> +       for (i = 0; i < oct_hw->nr_vring; i++) {
> +               if (oct_hw->vqs[i].cb.callback && ioread32(oct_hw->vqs[i].cb_notify_addr)) {
> +                       /* Acknowledge the per queue notification to the device */
> +                       iowrite32(0, oct_hw->vqs[i].cb_notify_addr);
> +                       oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
> +               }
> +       }
> +
> +       return IRQ_HANDLED;
> +}
> +
> +static void octep_free_irqs(struct octep_hw *oct_hw)
> +{
> +       struct pci_dev *pdev = oct_hw->pdev;
> +
> +       if (oct_hw->irq != -1) {
> +               devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
> +               oct_hw->irq = -1;
> +       }
> +       pci_free_irq_vectors(pdev);
> +}
> +
> +static int octep_request_irqs(struct octep_hw *oct_hw)
> +{
> +       struct pci_dev *pdev = oct_hw->pdev;
> +       int ret, irq;
> +
> +       /* Currently HW device provisions one IRQ per VF, hence
> +        * allocate one IRQ for all virtqueues call interface.
> +        */
> +       ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
> +       if (ret < 0) {
> +               dev_err(&pdev->dev, "Failed to alloc msix vector");
> +               return ret;
> +       }
> +
> +       snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
> +                OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
> +
> +       irq = pci_irq_vector(pdev, 0);
> +       ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
> +                              oct_hw->vqs->msix_name, oct_hw);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Failed to register interrupt handler\n");
> +               goto free_irq_vec;
> +       }
> +       oct_hw->irq = irq;
> +
> +       return 0;
> +
> +free_irq_vec:
> +       pci_free_irq_vectors(pdev);
> +       return ret;
> +}
> +
> +static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +       return oct_hw->features;
> +}
> +
> +static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
> +{
> +       struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +       int ret;
> +
> +       pr_debug("Driver Features: %llx\n", features);
> +
> +       ret = octep_verify_features(features);
> +       if (ret)
> +               return ret;

Nit: I think we need a warning or something here.

Other part looks good.

With those fixed.

Acked-by: Jason Wang <jasowang@redhat.com>

Thanks


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices
  2024-04-24 19:08 ` [PATCH v2] " Srujana Challa
  2024-04-25  1:57   ` Jason Wang
@ 2024-04-25  9:36   ` Michael S. Tsirkin
  1 sibling, 0 replies; 28+ messages in thread
From: Michael S. Tsirkin @ 2024-04-25  9:36 UTC (permalink / raw)
  To: Srujana Challa
  Cc: virtualization, jasowang, xuanzhuo, vattunuru, sthotton,
	ndabilpuram, jerinj

On Thu, Apr 25, 2024 at 12:38:14AM +0530, Srujana Challa wrote:
> This commit introduces a new vDPA driver specifically designed for
> managing the virtio control plane over the vDPA bus for OCTEON DPU
> devices. The driver consists of two layers:
> 
> 1. Octep HW Layer (Octeon Endpoint): Responsible for handling hardware
> operations and configurations related to the DPU device.
> 
> 2. Octep Main Layer: Compliant with the vDPA bus framework, this layer
> implements device operations for the vDPA bus. It handles device
> probing, bus attachment, vring operations, and other relevant tasks.
> 
> Signed-off-by: Srujana Challa <schalla@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>

Thanks!
In the future pls do not send new versions as reply to old ones,
start a new thread please.

> ---
> v2:
> * Addressed review comments
> 
>  MAINTAINERS                              |   5 +
>  drivers/vdpa/Kconfig                     |   9 +
>  drivers/vdpa/Makefile                    |   1 +
>  drivers/vdpa/octeon_ep/Makefile          |   4 +
>  drivers/vdpa/octeon_ep/octep_vdpa.h      |  94 +++
>  drivers/vdpa/octeon_ep/octep_vdpa_hw.c   | 517 ++++++++++++++
>  drivers/vdpa/octeon_ep/octep_vdpa_main.c | 850 +++++++++++++++++++++++
>  7 files changed, 1480 insertions(+)
>  create mode 100644 drivers/vdpa/octeon_ep/Makefile
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa.h
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_hw.c
>  create mode 100644 drivers/vdpa/octeon_ep/octep_vdpa_main.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cfe44a876d8a..539ce209a960 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13127,6 +13127,11 @@ S:	Supported
>  F:	Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
>  F:	drivers/mmc/host/sdhci-xenon*
>  
> +MARVELL OCTEON ENDPOINT VIRTIO DATA PATH ACCELERATOR
> +R:	schalla@marvell.com
> +R:	vattunuru@marvell.com
> +F:	drivers/vdpa/octeon_ep/
> +
>  MATROX FRAMEBUFFER DRIVER
>  L:	linux-fbdev@vger.kernel.org
>  S:	Orphan
> diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
> index 656c1cb541de..775149fad476 100644
> --- a/drivers/vdpa/Kconfig
> +++ b/drivers/vdpa/Kconfig
> @@ -126,4 +126,13 @@ config PDS_VDPA
>  	  With this driver, the VirtIO dataplane can be
>  	  offloaded to an AMD/Pensando DSC device.
>  
> +config OCTEONEP_VDPA
> +	tristate "vDPA driver for Octeon DPU devices"
> +	depends on m
> +	depends on PCI_MSI
> +	help
> +	  vDPA driver for Marvell's Octeon DPU devices.
> +	  With this driver, the VirtIO dataplane can be
> +	  offloaded to a Octeon DPU device.
> +
>  endif # VDPA
> diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
> index 8f53c6f3cca7..5654d36707af 100644
> --- a/drivers/vdpa/Makefile
> +++ b/drivers/vdpa/Makefile
> @@ -8,3 +8,4 @@ obj-$(CONFIG_VP_VDPA)    += virtio_pci/
>  obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
>  obj-$(CONFIG_SNET_VDPA) += solidrun/
>  obj-$(CONFIG_PDS_VDPA) += pds/
> +obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/
> diff --git a/drivers/vdpa/octeon_ep/Makefile b/drivers/vdpa/octeon_ep/Makefile
> new file mode 100644
> index 000000000000..e23e2ff14f33
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/Makefile
> @@ -0,0 +1,4 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o
> +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o
> +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h
> new file mode 100644
> index 000000000000..046710ec4d42
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa.h
> @@ -0,0 +1,94 @@
> +/* SPDX-License-Identifier: GPL-2.0-only
> + * Copyright (C) 2024 Marvell.
> + */
> +#ifndef __OCTEP_VDPA_H__
> +#define __OCTEP_VDPA_H__
> +
> +#include <linux/pci.h>
> +#include <linux/pci_regs.h>
> +#include <linux/vdpa.h>
> +#include <linux/virtio_pci_modern.h>
> +#include <uapi/linux/virtio_net.h>
> +#include <uapi/linux/virtio_blk.h>
> +#include <uapi/linux/virtio_config.h>
> +#include <uapi/linux/virtio_pci.h>
> +#include <uapi/linux/vdpa.h>
> +
> +#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900
> +#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903
> +#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00
> +#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03
> +#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00
> +#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03
> +
> +#define OCTEP_HW_MBOX_BAR 0
> +#define OCTEP_HW_CAPS_BAR 4
> +
> +#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA
> +
> +#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25))
> +#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17))
> +#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4))
> +
> +#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF)
> +#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F)
> +
> +#define OCTEP_FW_READY_SIGNATURE0  0xFEEDFEED
> +#define OCTEP_FW_READY_SIGNATURE1  0x3355ffaa
> +
> +enum octep_vdpa_dev_status {
> +	OCTEP_VDPA_DEV_STATUS_INVALID,
> +	OCTEP_VDPA_DEV_STATUS_ALLOC,
> +	OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT,
> +	OCTEP_VDPA_DEV_STATUS_INIT,
> +	OCTEP_VDPA_DEV_STATUS_READY,
> +	OCTEP_VDPA_DEV_STATUS_UNINIT
> +};
> +
> +struct octep_vring_info {
> +	struct vdpa_callback cb;
> +	void __iomem *notify_addr;
> +	u32 __iomem *cb_notify_addr;
> +	phys_addr_t notify_pa;
> +	char msix_name[256];
> +};
> +
> +struct octep_hw {
> +	struct pci_dev *pdev;
> +	u8 __iomem *base[PCI_STD_NUM_BARS];
> +	struct virtio_pci_common_cfg __iomem *common_cfg;
> +	u8 __iomem *dev_cfg;
> +	u8 __iomem *isr;
> +	void __iomem *notify_base;
> +	phys_addr_t notify_base_pa;
> +	u32 notify_off_multiplier;
> +	u8 notify_bar;
> +	struct octep_vring_info *vqs;
> +	struct vdpa_callback config_cb;
> +	u64 features;
> +	u16 nr_vring;
> +	u32 config_size;
> +	int irq;
> +};
> +
> +u8 octep_hw_get_status(struct octep_hw *oct_hw);
> +void octep_hw_set_status(struct octep_hw *dev, uint8_t status);
> +void octep_hw_reset(struct octep_hw *oct_hw);
> +void octep_write_queue_select(struct octep_hw *oct_hw, u16 queue_id);
> +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid);
> +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length);
> +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
> +			 u64 device_area);
> +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num);
> +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready);
> +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid);
> +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state);
> +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state);
> +u16 octep_get_vq_size(struct octep_hw *oct_hw);
> +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev);
> +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw);
> +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features);
> +u64 octep_hw_get_drv_features(struct octep_hw *oct_hw);
> +int octep_verify_features(u64 features);
> +
> +#endif /* __OCTEP_VDPA_H__ */
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> new file mode 100644
> index 000000000000..6afd06ead46a
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c
> @@ -0,0 +1,517 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Marvell. */
> +
> +#include <linux/iopoll.h>
> +
> +#include "octep_vdpa.h"
> +
> +enum octep_mbox_ids {
> +	OCTEP_MBOX_MSG_SET_VQ_STATE = 1,
> +	OCTEP_MBOX_MSG_GET_VQ_STATE,
> +};
> +
> +#define OCTEP_HW_TIMEOUT       10000000
> +
> +#define MBOX_OFFSET            64
> +#define MBOX_RSP_MASK          0x00000001
> +#define MBOX_RC_MASK           0x0000FFFE
> +
> +#define MBOX_RSP_TO_ERR(val)   (-(((val) & MBOX_RC_MASK) >> 2))
> +#define MBOX_AVAIL(val)        (((val) & MBOX_RSP_MASK))
> +#define MBOX_RSP(val)          ((val) & (MBOX_RC_MASK | MBOX_RSP_MASK))
> +
> +#define DEV_RST_ACK_BIT        7
> +#define FEATURE_SEL_ACK_BIT    15
> +#define QUEUE_SEL_ACK_BIT      15
> +
> +struct octep_mbox_hdr {
> +	u8 ver;
> +	u8 rsvd1;
> +	u16 id;
> +	u16 rsvd2;
> +#define MBOX_REQ_SIG (0xdead)
> +#define MBOX_RSP_SIG (0xbeef)
> +	u16 sig;
> +};
> +
> +struct octep_mbox_sts {
> +	u16 rsp:1;
> +	u16 rc:15;
> +	u16 rsvd;
> +};
> +
> +struct octep_mbox {
> +	struct octep_mbox_hdr hdr;
> +	struct octep_mbox_sts sts;
> +	u64 rsvd;
> +	u32 data[];
> +};
> +
> +static inline struct octep_mbox __iomem *octep_get_mbox(struct octep_hw *oct_hw)
> +{
> +	return (struct octep_mbox __iomem *)(oct_hw->dev_cfg + MBOX_OFFSET);
> +}
> +
> +static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem *mbox)
> +{
> +	u32 val;
> +
> +	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_AVAIL(val), 10,
> +				  OCTEP_HW_TIMEOUT);
> +}
> +
> +static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem *mbox)
> +{
> +	u32 val;
> +
> +	return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val), 10,
> +				  OCTEP_HW_TIMEOUT);
> +}
> +
> +static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16 id, u16 sig)
> +{
> +	iowrite16(id, &mbox->hdr.id);
> +	iowrite16(sig, &mbox->hdr.sig);
> +}
> +
> +static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox)
> +{
> +	return ioread16(&mbox->hdr.sig);
> +}
> +
> +static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32 sts)
> +{
> +	iowrite32(sts, &mbox->sts);
> +}
> +
> +static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox)
> +{
> +	return ioread32(&mbox->sts);
> +}
> +
> +static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox, u16 word_idx)
> +{
> +	return ioread32(&mbox->data[word_idx]);
> +}
> +
> +static inline void octep_write32_word(struct octep_mbox __iomem *mbox, u16 word_idx, u32 word)
> +{
> +	return iowrite32(word, &mbox->data[word_idx]);
> +}
> +
> +static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *buffer,
> +			      u32 buf_size, bool write)
> +{
> +	struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw);
> +	struct pci_dev *pdev = oct_hw->pdev;
> +	u32 *p = (u32 *)buffer;
> +	u16 data_wds;
> +	int ret, i;
> +	u32 val;
> +
> +	if (!IS_ALIGNED(buf_size, 4))
> +		return -EINVAL;
> +
> +	/* Make sure mbox space is available */
> +	ret = octep_wait_for_mbox_avail(mbox);
> +	if (ret) {
> +		dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to be consumed\n");
> +		return ret;
> +	}
> +	data_wds = buf_size / 4;
> +
> +	if (write) {
> +		for (i = 1; i <= data_wds; i++) {
> +			octep_write32_word(mbox, i, *p);
> +			p++;
> +		}
> +	}
> +	octep_write32_word(mbox, 0, (u32)qid);
> +	octep_write_sts(mbox, 0);
> +
> +	octep_write_hdr(mbox, id, MBOX_REQ_SIG);
> +
> +	ret = octep_wait_for_mbox_rsp(mbox);
> +	if (ret) {
> +		dev_warn(&pdev->dev, "Timeout waiting for mbox : %d response\n", id);
> +		return ret;
> +	}
> +
> +	val = octep_read_sig(mbox);
> +	if ((val & 0xFFFF) != MBOX_RSP_SIG) {
> +		dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id);
> +		return ret;
> +	}
> +
> +	val = octep_read_sts(mbox);
> +	if (val & MBOX_RC_MASK) {
> +		ret = MBOX_RSP_TO_ERR(val);
> +		dev_warn(&pdev->dev, "Error while processing mbox : %d, err %d\n", id, ret);
> +		return ret;
> +	}
> +
> +	if (!write)
> +		for (i = 1; i <= data_wds; i++)
> +			*p++ = octep_read32_word(mbox, i);
> +
> +	return 0;
> +}
> +
> +static void octep_mbox_init(struct octep_mbox __iomem *mbox)
> +{
> +	iowrite32(1, &mbox->sts);
> +}
> +
> +int octep_verify_features(u64 features)
> +{
> +	/* Minimum features to expect */
> +	if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
> +		return -EOPNOTSUPP;
> +
> +	if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA)))
> +		return -EOPNOTSUPP;
> +
> +	if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED)))
> +		return -EOPNOTSUPP;
> +
> +	return 0;
> +}
> +
> +u8 octep_hw_get_status(struct octep_hw *oct_hw)
> +{
> +	return ioread8(&oct_hw->common_cfg->device_status);
> +}
> +
> +void octep_hw_set_status(struct octep_hw *oct_hw, u8 status)
> +{
> +	iowrite8(status, &oct_hw->common_cfg->device_status);
> +}
> +
> +void octep_hw_reset(struct octep_hw *oct_hw)
> +{
> +	u8 val;
> +
> +	octep_hw_set_status(oct_hw, 0 | BIT(DEV_RST_ACK_BIT));
> +	if (readx_poll_timeout(ioread8, &oct_hw->common_cfg->device_status, val, !val, 10,
> +			       OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n");
> +		return;
> +	}
> +}
> +
> +static int feature_sel_write_with_timeout(struct octep_hw *oct_hw, u32 select, void __iomem *addr)
> +{
> +	u32 val;
> +
> +	iowrite32(select | BIT(FEATURE_SEL_ACK_BIT), addr);
> +
> +	if (readx_poll_timeout(ioread32, addr, val, val == select, 10, OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select);
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw)
> +{
> +	u32 features_lo, features_hi;
> +
> +	if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->device_feature_select))
> +		return 0;
> +
> +	features_lo = ioread32(&oct_hw->common_cfg->device_feature);
> +
> +	if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->device_feature_select))
> +		return 0;
> +
> +	features_hi = ioread32(&oct_hw->common_cfg->device_feature);
> +
> +	return ((u64)features_hi << 32) | features_lo;
> +}
> +
> +u64 octep_hw_get_drv_features(struct octep_hw *oct_hw)
> +{
> +	u32 features_lo, features_hi;
> +
> +	if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->guest_feature_select))
> +		return 0;
> +
> +	features_lo = ioread32(&oct_hw->common_cfg->guest_feature);
> +
> +	if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->guest_feature_select))
> +		return 0;
> +
> +	features_hi = ioread32(&oct_hw->common_cfg->guest_feature);
> +
> +	return ((u64)features_hi << 32) | features_lo;
> +}
> +
> +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features)
> +{
> +	if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->guest_feature_select))
> +		return;
> +
> +	iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg->guest_feature);
> +
> +	if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->guest_feature_select))
> +		return;
> +
> +	iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature);
> +}
> +
> +void octep_write_queue_select(struct octep_hw *oct_hw, u16 queue_id)
> +{
> +	u16 val;
> +
> +	iowrite16(queue_id | BIT(QUEUE_SEL_ACK_BIT), &oct_hw->common_cfg->queue_select);
> +
> +	if (readx_poll_timeout(ioread16, &oct_hw->common_cfg->queue_select, val, val == queue_id,
> +			       10, OCTEP_HW_TIMEOUT)) {
> +		dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n");
> +		return;
> +	}
> +}
> +
> +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid)
> +{
> +	iowrite16(qid, oct_hw->vqs[qid].notify_addr);
> +}
> +
> +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length)
> +{
> +	u8 old_gen, new_gen, *p;
> +	int i;
> +
> +	if (WARN_ON(offset + length > oct_hw->config_size))
> +		return;
> +
> +	do {
> +		old_gen = ioread8(&oct_hw->common_cfg->config_generation);
> +		p = dst;
> +		for (i = 0; i < length; i++)
> +			*p++ = ioread8(oct_hw->dev_cfg + offset + i);
> +
> +		new_gen = ioread8(&oct_hw->common_cfg->config_generation);
> +	} while (old_gen != new_gen);
> +}
> +
> +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area,
> +			 u64 device_area)
> +{
> +	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +	octep_write_queue_select(oct_hw, qid);
> +	vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> +			     &cfg->queue_desc_hi);
> +	vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> +			     &cfg->queue_avail_hi);
> +	vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> +			     &cfg->queue_used_hi);
> +
> +	return 0;
> +}
> +
> +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state)
> +{
> +	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state,
> +				  sizeof(*state), 0);
> +}
> +
> +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state)
> +{
> +	struct vdpa_vq_state q_state;
> +
> +	memcpy(&q_state, state, sizeof(struct vdpa_vq_state));
> +	return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state,
> +				  sizeof(*state), 1);
> +}
> +
> +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num)
> +{
> +	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +	octep_write_queue_select(oct_hw, qid);
> +	iowrite16(num, &cfg->queue_size);
> +}
> +
> +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready)
> +{
> +	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +	octep_write_queue_select(oct_hw, qid);
> +	iowrite16(ready, &cfg->queue_enable);
> +}
> +
> +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid)
> +{
> +	struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg;
> +
> +	octep_write_queue_select(oct_hw, qid);
> +	return ioread16(&cfg->queue_enable);
> +}
> +
> +u16 octep_get_vq_size(struct octep_hw *oct_hw)
> +{
> +	octep_write_queue_select(oct_hw, 0);
> +	return ioread16(&oct_hw->common_cfg->queue_size);
> +}
> +
> +static u32 octep_get_config_size(struct octep_hw *oct_hw)
> +{
> +	return sizeof(struct virtio_net_config);
> +}
> +
> +static void __iomem *get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap)
> +{
> +	struct device *dev = &oct_hw->pdev->dev;
> +	u32 length = le32_to_cpu(cap->length);
> +	u32 offset = le32_to_cpu(cap->offset);
> +	u8  bar    = cap->bar;
> +	u32 len;
> +
> +	if (bar != OCTEP_HW_CAPS_BAR) {
> +		dev_err(dev, "Invalid bar: %u\n", bar);
> +		return NULL;
> +	}
> +	if (offset + length < offset) {
> +		dev_err(dev, "offset(%u) + length(%u) overflows\n",
> +			offset, length);
> +		return NULL;
> +	}
> +	len = pci_resource_len(oct_hw->pdev, bar);
> +	if (offset + length > len) {
> +		dev_err(dev, "invalid cap: overflows bar space: %u > %u\n",
> +			offset + length, len);
> +		return NULL;
> +	}
> +	return oct_hw->base[bar] + offset;
> +}
> +
> +/* In Octeon DPU device, the virtio config space completely
> + * emulated by the device's firmware. So, the standard pci config
> + * read apis can't be used for reading the virtio capabilities.
> + */
> +static void pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len, off_t offset)
> +{
> +	u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR];
> +	u8 *p = buf;
> +	size_t i;
> +
> +	for (i = 0; i < len; i++)
> +		*p++ = ioread8(bar + offset + i);
> +}
> +
> +static int pci_signature_verify(struct octep_hw *oct_hw)
> +{
> +	u32 signature[2];
> +
> +	pci_caps_read(oct_hw, &signature, sizeof(signature), 0);
> +
> +	if (signature[0] != OCTEP_FW_READY_SIGNATURE0)
> +		return -1;
> +
> +	if (signature[1] != OCTEP_FW_READY_SIGNATURE1)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev)
> +{
> +	struct octep_mbox __iomem *mbox;
> +	struct device *dev = &pdev->dev;
> +	struct virtio_pci_cap cap;
> +	u16 notify_off;
> +	int i, ret;
> +	u8 pos;
> +
> +	oct_hw->pdev = pdev;
> +	ret = pci_signature_verify(oct_hw);
> +	if (ret) {
> +		dev_err(dev, "Octeon Virtio FW is not initialized\n");
> +		return -EIO;
> +	}
> +
> +	pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST);
> +
> +	while (pos) {
> +		pci_caps_read(oct_hw, &cap, 2, pos);
> +
> +		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
> +			dev_err(dev, "Found invalid capability vndr id: %d\n", cap.cap_vndr);
> +			break;
> +		}
> +
> +		pci_caps_read(oct_hw, &cap, sizeof(cap), pos);
> +
> +		dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n",
> +			 pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
> +
> +		switch (cap.cfg_type) {
> +		case VIRTIO_PCI_CAP_COMMON_CFG:
> +			oct_hw->common_cfg = get_cap_addr(oct_hw, &cap);
> +			break;
> +		case VIRTIO_PCI_CAP_NOTIFY_CFG:
> +			pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier,
> +				      4, pos + sizeof(cap));
> +
> +			oct_hw->notify_base = get_cap_addr(oct_hw, &cap);
> +			oct_hw->notify_bar = cap.bar;
> +			oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) +
> +						 le32_to_cpu(cap.offset);
> +			break;
> +		case VIRTIO_PCI_CAP_DEVICE_CFG:
> +			oct_hw->dev_cfg = get_cap_addr(oct_hw, &cap);
> +			break;
> +		case VIRTIO_PCI_CAP_ISR_CFG:
> +			oct_hw->isr = get_cap_addr(oct_hw, &cap);
> +			break;
> +		}
> +
> +		pos = cap.cap_next;
> +	}
> +	if (!oct_hw->common_cfg || !oct_hw->notify_base ||
> +	    !oct_hw->dev_cfg    || !oct_hw->isr) {
> +		dev_err(dev, "Incomplete PCI capabilities");
> +		return -EIO;
> +	}
> +	dev_info(dev, "common cfg mapped at: 0x%016llx\n", (u64)oct_hw->common_cfg);
> +	dev_info(dev, "device cfg mapped at: 0x%016llx\n", (u64)oct_hw->dev_cfg);
> +	dev_info(dev, "isr cfg mapped at: 0x%016llx\n", (u64)oct_hw->isr);
> +	dev_info(dev, "notify base: 0x%016llx, notify off multiplier: %u\n",
> +		 (u64)oct_hw->notify_base, oct_hw->notify_off_multiplier);
> +
> +	oct_hw->config_size = octep_get_config_size(oct_hw);
> +	oct_hw->features = octep_hw_get_dev_features(oct_hw);
> +
> +	ret = octep_verify_features(oct_hw->features);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Octeon Virtio FW is not initialized\n");
> +		return ret;
> +	}
> +	oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg->num_queues);
> +
> +	oct_hw->vqs = devm_kcalloc(&pdev->dev, oct_hw->nr_vring, sizeof(*oct_hw->vqs), GFP_KERNEL);
> +	if (!oct_hw->vqs)
> +		return -ENOMEM;
> +
> +	oct_hw->irq = -1;
> +
> +	dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features);
> +	dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring);
> +
> +	for (i = 0; i < oct_hw->nr_vring; i++) {
> +		octep_write_queue_select(oct_hw, i);
> +		notify_off = vp_ioread16(&oct_hw->common_cfg->queue_notify_off);
> +		oct_hw->vqs[i].notify_addr = oct_hw->notify_base +
> +			notify_off * oct_hw->notify_off_multiplier;
> +		oct_hw->vqs[i].cb_notify_addr = (u32 __iomem *)oct_hw->vqs[i].notify_addr + 1;
> +		oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa +
> +			notify_off * oct_hw->notify_off_multiplier;
> +	}
> +	mbox = octep_get_mbox(oct_hw);
> +	octep_mbox_init(mbox);
> +	dev_info(dev, "mbox mapped at: 0x%016llx\n", (u64)mbox);
> +
> +	return 0;
> +}
> diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> new file mode 100644
> index 000000000000..89860a443981
> --- /dev/null
> +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
> @@ -0,0 +1,850 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Marvell. */
> +
> +#include <linux/interrupt.h>
> +#include <linux/module.h>
> +#include <linux/iommu.h>
> +#include "octep_vdpa.h"
> +
> +#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa"
> +
> +struct octep_pf {
> +	u8 __iomem *base[PCI_STD_NUM_BARS];
> +	struct pci_dev *pdev;
> +	struct resource res;
> +	u64 vf_base;
> +	int enabled_vfs;
> +	u32 vf_stride;
> +	u16 vf_devid;
> +};
> +
> +struct octep_vdpa {
> +	struct vdpa_device vdpa;
> +	struct octep_hw *oct_hw;
> +	struct pci_dev *pdev;
> +};
> +
> +struct octep_vdpa_mgmt_dev {
> +	struct vdpa_mgmt_dev mdev;
> +	struct octep_hw oct_hw;
> +	struct pci_dev *pdev;
> +	/* Work entry to handle device setup */
> +	struct work_struct setup_task;
> +	/* Device status */
> +	atomic_t status;
> +};
> +
> +static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_vdpa *oct_vdpa;
> +
> +	oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa);
> +
> +	return oct_vdpa->oct_hw;
> +}
> +
> +static irqreturn_t octep_vdpa_intr_handler(int irq, void *data)
> +{
> +	struct octep_hw *oct_hw = data;
> +	int i;
> +
> +	for (i = 0; i < oct_hw->nr_vring; i++) {
> +		if (oct_hw->vqs[i].cb.callback && ioread32(oct_hw->vqs[i].cb_notify_addr)) {
> +			/* Acknowledge the per queue notification to the device */
> +			iowrite32(0, oct_hw->vqs[i].cb_notify_addr);
> +			oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private);
> +		}
> +	}
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static void octep_free_irqs(struct octep_hw *oct_hw)
> +{
> +	struct pci_dev *pdev = oct_hw->pdev;
> +
> +	if (oct_hw->irq != -1) {
> +		devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw);
> +		oct_hw->irq = -1;
> +	}
> +	pci_free_irq_vectors(pdev);
> +}
> +
> +static int octep_request_irqs(struct octep_hw *oct_hw)
> +{
> +	struct pci_dev *pdev = oct_hw->pdev;
> +	int ret, irq;
> +
> +	/* Currently HW device provisions one IRQ per VF, hence
> +	 * allocate one IRQ for all virtqueues call interface.
> +	 */
> +	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
> +	if (ret < 0) {
> +		dev_err(&pdev->dev, "Failed to alloc msix vector");
> +		return ret;
> +	}
> +
> +	snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name),
> +		 OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev));
> +
> +	irq = pci_irq_vector(pdev, 0);
> +	ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0,
> +			       oct_hw->vqs->msix_name, oct_hw);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Failed to register interrupt handler\n");
> +		goto free_irq_vec;
> +	}
> +	oct_hw->irq = irq;
> +
> +	return 0;
> +
> +free_irq_vec:
> +	pci_free_irq_vectors(pdev);
> +	return ret;
> +}
> +
> +static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return oct_hw->features;
> +}
> +
> +static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	int ret;
> +
> +	pr_debug("Driver Features: %llx\n", features);
> +
> +	ret = octep_verify_features(features);
> +	if (ret)
> +		return ret;
> +	octep_hw_set_drv_features(oct_hw, features);
> +
> +	return 0;
> +}
> +
> +static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_hw_get_drv_features(oct_hw);
> +}
> +
> +static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_hw_get_status(oct_hw);
> +}
> +
> +static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	u8 status_old;
> +
> +	status_old = octep_hw_get_status(oct_hw);
> +
> +	if (status_old == status)
> +		return;
> +
> +	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
> +	    !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
> +		if (octep_request_irqs(oct_hw))
> +			status = status_old | VIRTIO_CONFIG_S_FAILED;
> +	}
> +	octep_hw_set_status(oct_hw, status);
> +}
> +
> +static int octep_vdpa_reset(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	u8 status = octep_hw_get_status(oct_hw);
> +	u16 qid;
> +
> +	if (status == 0)
> +		return 0;
> +
> +	for (qid = 0; qid < oct_hw->nr_vring; qid++) {
> +		oct_hw->vqs[qid].cb.callback = NULL;
> +		oct_hw->vqs[qid].cb.private = NULL;
> +		oct_hw->config_cb.callback = NULL;
> +		oct_hw->config_cb.private = NULL;
> +	}
> +	octep_hw_reset(oct_hw);
> +
> +	if (status & VIRTIO_CONFIG_S_DRIVER_OK)
> +		octep_free_irqs(oct_hw);
> +
> +	return 0;
> +}
> +
> +static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_get_vq_size(oct_hw);
> +}
> +
> +static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> +				   struct vdpa_vq_state *state)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_get_vq_state(oct_hw, qid, state);
> +}
> +
> +static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> +				   const struct vdpa_vq_state *state)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_set_vq_state(oct_hw, qid, state);
> +}
> +
> +static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, struct vdpa_callback *cb)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	oct_hw->vqs[qid].cb = *cb;
> +}
> +
> +static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	octep_set_vq_ready(oct_hw, qid, ready);
> +}
> +
> +static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return octep_get_vq_ready(oct_hw, qid);
> +}
> +
> +static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	octep_set_vq_num(oct_hw, qid, num);
> +}
> +
> +static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, u64 desc_area,
> +				     u64 driver_area, u64 device_area)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area);
> +	pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area);
> +	pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area);
> +
> +	return octep_set_vq_address(oct_hw, qid, desc_area, driver_area, device_area);
> +}
> +
> +static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
> +{
> +	/* Not supported */
> +}
> +
> +static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev, u32 data)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	u16 idx = data & 0xFFFF;
> +
> +	vp_iowrite32(data, oct_hw->vqs[idx].notify_addr);
> +}
> +
> +static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return vp_ioread8(&oct_hw->common_cfg->config_generation);
> +}
> +
> +static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
> +{
> +	return VIRTIO_ID_NET;
> +}
> +
> +static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
> +{
> +	return PCI_VENDOR_ID_CAVIUM;
> +}
> +
> +static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
> +{
> +	return PAGE_SIZE;
> +}
> +
> +static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	return oct_hw->config_size;
> +}
> +
> +static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf,
> +				  unsigned int len)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	octep_read_dev_config(oct_hw, offset, buf, len);
> +}
> +
> +static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned int offset,
> +				  const void *buf, unsigned int len)
> +{
> +	/* Not supported */
> +}
> +
> +static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct vdpa_callback *cb)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +
> +	oct_hw->config_cb.callback = cb->callback;
> +	oct_hw->config_cb.private = cb->private;
> +}
> +
> +static struct vdpa_notification_area octep_get_vq_notification(struct vdpa_device *vdpa_dev,
> +							       u16 idx)
> +{
> +	struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev);
> +	struct vdpa_notification_area area;
> +
> +	area.addr = oct_hw->vqs[idx].notify_pa;
> +	area.size = PAGE_SIZE;
> +
> +	return area;
> +}
> +
> +static struct vdpa_config_ops octep_vdpa_ops = {
> +	.get_device_features = octep_vdpa_get_device_features,
> +	.set_driver_features = octep_vdpa_set_driver_features,
> +	.get_driver_features = octep_vdpa_get_driver_features,
> +	.get_status	= octep_vdpa_get_status,
> +	.set_status	= octep_vdpa_set_status,
> +	.reset		= octep_vdpa_reset,
> +	.get_vq_num_max	= octep_vdpa_get_vq_num_max,
> +	.get_vq_state	= octep_vdpa_get_vq_state,
> +	.set_vq_state	= octep_vdpa_set_vq_state,
> +	.set_vq_cb	= octep_vdpa_set_vq_cb,
> +	.set_vq_ready	= octep_vdpa_set_vq_ready,
> +	.get_vq_ready	= octep_vdpa_get_vq_ready,
> +	.set_vq_num	= octep_vdpa_set_vq_num,
> +	.set_vq_address	= octep_vdpa_set_vq_address,
> +	.get_vq_irq	= NULL,
> +	.kick_vq	= octep_vdpa_kick_vq,
> +	.kick_vq_with_data	= octep_vdpa_kick_vq_with_data,
> +	.get_generation	= octep_vdpa_get_generation,
> +	.get_device_id	= octep_vdpa_get_device_id,
> +	.get_vendor_id	= octep_vdpa_get_vendor_id,
> +	.get_vq_align	= octep_vdpa_get_vq_align,
> +	.get_config_size	= octep_vdpa_get_config_size,
> +	.get_config	= octep_vdpa_get_config,
> +	.set_config	= octep_vdpa_set_config,
> +	.set_config_cb  = octep_vdpa_set_config_cb,
> +	.get_vq_notification = octep_get_vq_notification,
> +};
> +
> +static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
> +{
> +	int ret;
> +
> +	ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar);
> +		return ret;
> +	}
> +
> +	tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar));
> +	if (!tbl[bar]) {
> +		dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar);
> +		pci_release_region(pdev, bar);
> +		ret = -ENOMEM;
> +	}
> +
> +	return ret;
> +}
> +
> +static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar)
> +{
> +	pci_iounmap(pdev, tbl[bar]);
> +	pci_release_region(pdev, bar);
> +}
> +
> +static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf)
> +{
> +	struct pci_dev *pf_dev = octpf->pdev;
> +	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +	struct pci_bus_region bus_region;
> +
> +	octpf->res.start = res->start;
> +	octpf->res.end = res->end;
> +	octpf->vf_base = res->start;
> +
> +	bus_region.start = res->start;
> +	bus_region.end = res->start - 1;
> +
> +	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> +}
> +
> +static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf)
> +{
> +	struct pci_dev *pf_dev = octpf->pdev;
> +	struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +	struct pci_bus_region bus_region;
> +
> +	bus_region.start = octpf->res.start;
> +	bus_region.end = octpf->res.end;
> +
> +	pcibios_bus_to_resource(pf_dev->bus, res, &bus_region);
> +}
> +
> +static void octep_vdpa_remove_pf(struct pci_dev *pdev)
> +{
> +	struct octep_pf *octpf = pci_get_drvdata(pdev);
> +
> +	pci_disable_sriov(pdev);
> +
> +	if (octpf->base[OCTEP_HW_CAPS_BAR])
> +		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_CAPS_BAR);
> +
> +	if (octpf->base[OCTEP_HW_MBOX_BAR])
> +		octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +
> +	octep_vdpa_pf_bar_expand(octpf);
> +}
> +
> +static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev)
> +{
> +	struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4;
> +
> +	memset(vf_res, 0, sizeof(*vf_res));
> +}
> +
> +static void octep_vdpa_remove_vf(struct pci_dev *pdev)
> +{
> +	struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev);
> +	struct octep_hw *oct_hw;
> +	int status;
> +
> +	oct_hw = &mgmt_dev->oct_hw;
> +	status = atomic_read(&mgmt_dev->status);
> +	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT);
> +
> +	cancel_work_sync(&mgmt_dev->setup_task);
> +	if (status == OCTEP_VDPA_DEV_STATUS_READY)
> +		vdpa_mgmtdev_unregister(&mgmt_dev->mdev);
> +
> +	if (oct_hw->base[OCTEP_HW_CAPS_BAR])
> +		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +
> +	if (oct_hw->base[OCTEP_HW_MBOX_BAR])
> +		octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_MBOX_BAR);
> +
> +	octep_vdpa_vf_bar_shrink(pdev);
> +}
> +
> +static void octep_vdpa_remove(struct pci_dev *pdev)
> +{
> +	if (pdev->is_virtfn)
> +		octep_vdpa_remove_vf(pdev);
> +	else
> +		octep_vdpa_remove_pf(pdev);
> +}
> +
> +static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
> +			      const struct vdpa_dev_set_config *config)
> +{
> +	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct octep_vdpa_mgmt_dev, mdev);
> +	struct octep_hw *oct_hw = &mgmt_dev->oct_hw;
> +	struct pci_dev *pdev = oct_hw->pdev;
> +	struct vdpa_device *vdpa_dev;
> +	struct octep_vdpa *oct_vdpa;
> +	u64 device_features;
> +	int ret;
> +
> +	oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, 1, 1,
> +				     NULL, false);
> +	if (IS_ERR(oct_vdpa)) {
> +		dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device");
> +		return PTR_ERR(oct_vdpa);
> +	}
> +
> +	oct_vdpa->pdev = pdev;
> +	oct_vdpa->vdpa.dma_dev = &pdev->dev;
> +	oct_vdpa->vdpa.mdev = mdev;
> +	oct_vdpa->oct_hw = oct_hw;
> +	vdpa_dev = &oct_vdpa->vdpa;
> +
> +	device_features = oct_hw->features;
> +	if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> +		if (config->device_features & ~device_features) {
> +			dev_err(&pdev->dev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
> +				config->device_features, device_features);
> +			ret = -EINVAL;
> +			goto vdpa_dev_put;
> +		}
> +		device_features &= config->device_features;
> +	}
> +
> +	oct_hw->features = device_features;
> +	dev_info(&pdev->dev, "Vdpa management device features : %llx\n", device_features);
> +
> +	ret = octep_verify_features(device_features);
> +	if (ret) {
> +		dev_warn(mdev->device,
> +			 "Must provision minimum features 0x%llx for this device",
> +			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |
> +			 BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) | BIT_ULL(VIRTIO_F_IN_ORDER));
> +		goto vdpa_dev_put;
> +	}
> +	if (name)
> +		ret = dev_set_name(&vdpa_dev->dev, "%s", name);
> +	else
> +		ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index);
> +
> +	ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Failed to register to vDPA bus");
> +		goto vdpa_dev_put;
> +	}
> +	return 0;
> +
> +vdpa_dev_put:
> +	put_device(&oct_vdpa->vdpa.dev);
> +	return ret;
> +}
> +
> +static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *vdpa_dev)
> +{
> +	_vdpa_unregister_device(vdpa_dev);
> +}
> +
> +static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = {
> +	.dev_add = octep_vdpa_dev_add,
> +	.dev_del = octep_vdpa_dev_del
> +};
> +
> +static bool get_device_ready_status(u8 __iomem *addr)
> +{
> +	u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0));
> +
> +	if (signature == OCTEP_DEV_READY_SIGNATURE) {
> +		writeq(0, addr + OCTEP_VF_MBOX_DATA(0));
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static struct virtio_device_id id_table[] = {
> +	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
> +	{ 0 },
> +};
> +
> +static void octep_vdpa_setup_task(struct work_struct *work)
> +{
> +	struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct octep_vdpa_mgmt_dev,
> +							    setup_task);
> +	struct pci_dev *pdev = mgmt_dev->pdev;
> +	struct device *dev = &pdev->dev;
> +	struct octep_hw *oct_hw;
> +	unsigned long timeout;
> +	int ret;
> +
> +	oct_hw = &mgmt_dev->oct_hw;
> +
> +	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT);
> +
> +	/* Wait for a maximum of 5 sec */
> +	timeout = jiffies + msecs_to_jiffies(5000);
> +	while (!time_after(jiffies, timeout)) {
> +		if (get_device_ready_status(oct_hw->base[OCTEP_HW_MBOX_BAR])) {
> +			atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_INIT);
> +			break;
> +		}
> +
> +		if (atomic_read(&mgmt_dev->status) >= OCTEP_VDPA_DEV_STATUS_READY) {
> +			dev_info(dev, "Stopping vDPA setup task.\n");
> +			return;
> +		}
> +
> +		usleep_range(1000, 1500);
> +	}
> +
> +	if (atomic_read(&mgmt_dev->status) != OCTEP_VDPA_DEV_STATUS_INIT) {
> +		dev_err(dev, "BAR initialization is timed out\n");
> +		return;
> +	}
> +
> +	ret = octep_iomap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +	if (ret)
> +		return;
> +
> +	ret = octep_hw_caps_read(oct_hw, pdev);
> +	if (ret < 0)
> +		goto unmap_region;
> +
> +	mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops;
> +	mgmt_dev->mdev.id_table = id_table;
> +	mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring;
> +	mgmt_dev->mdev.supported_features = oct_hw->features;
> +	mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES);
> +	mgmt_dev->mdev.device = dev;
> +
> +	ret = vdpa_mgmtdev_register(&mgmt_dev->mdev);
> +	if (ret) {
> +		dev_err(dev, "Failed to register vdpa management interface\n");
> +		goto unmap_region;
> +	}
> +
> +	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY);
> +
> +	return;
> +
> +unmap_region:
> +	octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR);
> +	oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL;
> +}
> +
> +static int octep_vdpa_probe_vf(struct pci_dev *pdev)
> +{
> +	struct octep_vdpa_mgmt_dev *mgmt_dev;
> +	struct device *dev = &pdev->dev;
> +	int ret;
> +
> +	ret = pcim_enable_device(pdev);
> +	if (ret) {
> +		dev_err(dev, "Failed to enable device\n");
> +		return ret;
> +	}
> +
> +	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> +	if (ret) {
> +		dev_err(dev, "No usable DMA configuration\n");
> +		return ret;
> +	}
> +	pci_set_master(pdev);
> +
> +	mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev), GFP_KERNEL);
> +	if (!mgmt_dev)
> +		return -ENOMEM;
> +
> +	ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base, OCTEP_HW_MBOX_BAR);
> +	if (ret)
> +		return ret;
> +
> +	mgmt_dev->pdev = pdev;
> +	pci_set_drvdata(pdev, mgmt_dev);
> +
> +	atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC);
> +	INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task);
> +	schedule_work(&mgmt_dev->setup_task);
> +	dev_info(&pdev->dev, "octep vdpa mgmt device setup task is queued\n");
> +
> +	return 0;
> +}
> +
> +static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct pci_dev *pf_dev, u8 idx)
> +{
> +	struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4;
> +	struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES + 4;
> +	struct octep_pf *pf = pci_get_drvdata(pf_dev);
> +	struct pci_bus_region bus_region;
> +
> +	vf_res->name = pci_name(vf_dev);
> +	vf_res->flags = pf_res->flags;
> +	vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent;
> +
> +	bus_region.start = pf->vf_base + idx * pf->vf_stride;
> +	bus_region.end = bus_region.start + pf->vf_stride - 1;
> +	pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region);
> +}
> +
> +static int octep_sriov_enable(struct pci_dev *pdev, int num_vfs)
> +{
> +	struct octep_pf *pf = pci_get_drvdata(pdev);
> +	u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR];
> +	struct pci_dev *vf_pdev = NULL;
> +	bool done = false;
> +	int index = 0;
> +	int ret, i;
> +
> +	ret = pci_enable_sriov(pdev, num_vfs);
> +	if (ret)
> +		return ret;
> +
> +	pf->enabled_vfs = num_vfs;
> +
> +	while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, vf_pdev))) {
> +		if (vf_pdev->device != pf->vf_devid)
> +			continue;
> +
> +		octep_vdpa_assign_barspace(vf_pdev, pdev, index);
> +		if (++index == num_vfs) {
> +			done = true;
> +			break;
> +		}
> +	}
> +
> +	if (done) {
> +		for (i = 0; i < pf->enabled_vfs; i++)
> +			writeq(OCTEP_DEV_READY_SIGNATURE, addr + OCTEP_PF_MBOX_DATA(i));
> +	}
> +
> +	return num_vfs;
> +}
> +
> +static int octep_sriov_disable(struct pci_dev *pdev)
> +{
> +	struct octep_pf *pf = pci_get_drvdata(pdev);
> +
> +	if (!pci_num_vf(pdev))
> +		return 0;
> +
> +	pci_disable_sriov(pdev);
> +	pf->enabled_vfs = 0;
> +
> +	return 0;
> +}
> +
> +static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int num_vfs)
> +{
> +	if (num_vfs > 0)
> +		return octep_sriov_enable(pdev, num_vfs);
> +	else
> +		return octep_sriov_disable(pdev);
> +}
> +
> +static u16 octep_get_vf_devid(struct pci_dev *pdev)
> +{
> +	u16 did;
> +
> +	switch (pdev->device) {
> +	case OCTEP_VDPA_DEVID_CN106K_PF:
> +		did = OCTEP_VDPA_DEVID_CN106K_VF;
> +		break;
> +	case OCTEP_VDPA_DEVID_CN105K_PF:
> +		did = OCTEP_VDPA_DEVID_CN105K_VF;
> +		break;
> +	case OCTEP_VDPA_DEVID_CN103K_PF:
> +		did = OCTEP_VDPA_DEVID_CN103K_VF;
> +		break;
> +	default:
> +		did = 0xFFFF;
> +		break;
> +	}
> +
> +	return did;
> +}
> +
> +static int octep_vdpa_pf_setup(struct octep_pf *octpf)
> +{
> +	u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR];
> +	struct pci_dev *pdev = octpf->pdev;
> +	int totalvfs;
> +	u64 val, len;
> +
> +	totalvfs = pci_sriov_get_totalvfs(pdev);
> +	if (unlikely(!totalvfs)) {
> +		dev_info(&pdev->dev, "Total VFs are %d in PF sriov configuration\n", totalvfs);
> +		return 0;
> +	}
> +
> +	addr = octpf->base[OCTEP_HW_MBOX_BAR];
> +	val = readq(addr + OCTEP_EPF_RINFO(0));
> +	if (val == 0) {
> +		dev_err(&pdev->dev, "Invalid device configuration\n");
> +		return -EINVAL;
> +	}
> +
> +	if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) {
> +		val &= ~GENMASK_ULL(35, 32);
> +		val |= BIT_ULL(32);
> +		writeq(val, addr + OCTEP_EPF_RINFO(0));
> +	}
> +
> +	len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR);
> +
> +	octpf->vf_stride = len / totalvfs;
> +	octpf->vf_devid = octep_get_vf_devid(pdev);
> +
> +	octep_vdpa_pf_bar_shrink(octpf);
> +
> +	return 0;
> +}
> +
> +static int octep_vdpa_probe_pf(struct pci_dev *pdev)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct octep_pf *octpf;
> +	int ret;
> +
> +	ret = pcim_enable_device(pdev);
> +	if (ret) {
> +		dev_err(dev, "Failed to enable device\n");
> +		return ret;
> +	}
> +
> +	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
> +	if (ret) {
> +		dev_err(dev, "No usable DMA configuration\n");
> +		return ret;
> +	}
> +	octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL);
> +	if (!octpf)
> +		return -ENOMEM;
> +
> +	ret = octep_iomap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +	if (ret)
> +		return ret;
> +
> +	pci_set_master(pdev);
> +	pci_set_drvdata(pdev, octpf);
> +	octpf->pdev = pdev;
> +
> +	ret = octep_vdpa_pf_setup(octpf);
> +	if (ret)
> +		goto unmap_region;
> +
> +	return 0;
> +
> +unmap_region:
> +	octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR);
> +	return ret;
> +}
> +
> +static int octep_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> +{
> +	if (pdev->is_virtfn)
> +		return octep_vdpa_probe_vf(pdev);
> +	else
> +		return octep_vdpa_probe_pf(pdev);
> +}
> +
> +static struct pci_device_id octep_pci_vdpa_map[] = {
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_PF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_VF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_PF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_VF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_PF) },
> +	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_VF) },
> +	{ 0 },
> +};
> +
> +static struct pci_driver octep_pci_vdpa = {
> +	.name     = OCTEP_VDPA_DRIVER_NAME,
> +	.id_table = octep_pci_vdpa_map,
> +	.probe    = octep_vdpa_probe,
> +	.remove   = octep_vdpa_remove,
> +	.sriov_configure = octep_vdpa_sriov_configure
> +};
> +
> +module_pci_driver(octep_pci_vdpa);
> +
> +MODULE_AUTHOR("Marvell");
> +MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver");
> +MODULE_LICENSE("GPL");
> -- 
> 2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2024-04-25  9:36 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-27 11:21 [PATCH] virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices Srujana Challa
2024-03-29  4:28 ` Jason Wang
2024-03-29 12:34   ` [EXTERNAL] " Srujana Challa
2024-03-31 11:31     ` Michael S. Tsirkin
2024-04-07  3:24       ` Jason Wang
2024-04-07  3:34     ` Jason Wang
2024-04-10 10:15       ` Srujana Challa
2024-04-10 11:19         ` Michael S. Tsirkin
2024-04-10 12:34           ` Srujana Challa
2024-04-11  6:01             ` Jason Wang
2024-04-12  5:12               ` Srujana Challa
2024-04-12  6:41                 ` Jason Wang
2024-04-12  9:48                   ` Srujana Challa
2024-04-15  6:49                     ` Jason Wang
2024-04-15 12:42                       ` Srujana Challa
2024-04-16  3:17                         ` Jason Wang
2024-04-22 20:42                           ` Michael S. Tsirkin
2024-04-24 13:05                             ` Srujana Challa
2024-04-23  5:40                           ` Srujana Challa
2024-04-11  5:59         ` Jason Wang
2024-03-29 11:37 ` Stefano Garzarella
2024-03-29 13:02   ` [EXTERNAL] " Srujana Challa
2024-03-29 13:19     ` Stefano Garzarella
2024-04-10 12:40       ` Srujana Challa
2024-04-22 20:44 ` Michael S. Tsirkin
2024-04-24 19:08 ` [PATCH v2] " Srujana Challa
2024-04-25  1:57   ` Jason Wang
2024-04-25  9:36   ` Michael S. Tsirkin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).