linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] nvme: Enable acceleration feature of A64FX processor
@ 2019-02-01 12:46 Takao Indoh
  2019-02-01 14:54 ` Keith Busch
  2019-02-01 15:51 ` Christoph Hellwig
  0 siblings, 2 replies; 13+ messages in thread
From: Takao Indoh @ 2019-02-01 12:46 UTC (permalink / raw)
  To: keith.busch, axboe, hch, sagi; +Cc: linux-nvme, linux-kernel, Takao Indoh

From: Takao Indoh <indou.takao@fujitsu.com>

Fujitsu A64FX processor has a feature to accelerate data transfer of
internal bus by relaxed ordering. It is enabled when the bit 56 of dma
address is set to 1.

This patch introduces this acceleration feature to the NVMe driver to
enhance NVMe device performance.

Signed-off-by: Takao Indoh <indou.takao@fujitsu.com>
---
 drivers/nvme/host/core.c |  6 ++++
 drivers/nvme/host/nvme.h |  7 +++++
 drivers/nvme/host/pci.c  | 65 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 150e49723c15..8167c3756b05 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -37,6 +37,9 @@
 
 #define NVME_MINORS		(1U << MINORBITS)
 
+DEFINE_STATIC_KEY_FALSE(nvme_quirk_a64fx_force_relax_key);
+EXPORT_SYMBOL_GPL(nvme_quirk_a64fx_force_relax_key);
+
 unsigned int admin_timeout = 60;
 module_param(admin_timeout, uint, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
@@ -2493,6 +2496,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
 	}
 
+	if (ctrl->quirks & NVME_QUIRK_A64FX_FORCE_RELAX)
+		static_branch_enable(&nvme_quirk_a64fx_force_relax_key);
+
 	ctrl->crdt[0] = le16_to_cpu(id->crdt1);
 	ctrl->crdt[1] = le16_to_cpu(id->crdt2);
 	ctrl->crdt[2] = le16_to_cpu(id->crdt3);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ab961bdeea89..fe02d021ee9c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -23,6 +23,7 @@
 #include <linux/sed-opal.h>
 #include <linux/fault-inject.h>
 #include <linux/rcupdate.h>
+#include <linux/jump_label.h>
 
 extern unsigned int nvme_io_timeout;
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
@@ -37,6 +38,8 @@ extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;
 
+DECLARE_STATIC_KEY_FALSE(nvme_quirk_a64fx_force_relax_key);
+
 enum {
 	NVME_NS_LBA		= 0,
 	NVME_NS_LIGHTNVM	= 1,
@@ -95,6 +98,10 @@ enum nvme_quirks {
 	 * Ignore device provided subnqn.
 	 */
 	NVME_QUIRK_IGNORE_DEV_SUBNQN		= (1 << 8),
+	/*
+	 * Force relaxed ordering for A64FX controller
+	 */
+	NVME_QUIRK_A64FX_FORCE_RELAX		= (1 << 9),
 };
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9bc585415d9b..cffe390d4c41 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -835,6 +835,45 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 	return BLK_STS_OK;
 }
 
+static inline void nvme_pci_enable_a64fx_relax_bit(struct nvme_sgl_desc *sge)
+{
+	sge->addr |= (1ULL << 56);
+}
+
+/*
+ * A64FX's controller allow relaxed order by setting 1 on bit 56 of dma address
+ * for performance enhancement.
+ *
+ * This traverses the sgl list and set the bit on ever dma address for
+ * data read.
+ */
+static void nvme_pci_quirk_a64fx_force_relax(struct request *req,
+		struct nvme_rw_command *cmd, int entries)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct nvme_sgl_desc *sg_list;
+	int i, j;
+
+	/* do nothing if sgl is not used or command is not read */
+	if (!iod->use_sgl || cmd->opcode != nvme_cmd_read)
+		return;
+
+	if (entries == 1) {
+		nvme_pci_enable_a64fx_relax_bit(&cmd->dptr.sgl);
+		return;
+	}
+
+	i = 0; j = 0;
+	sg_list = nvme_pci_iod_list(req)[j];
+	do {
+		if (i == SGES_PER_PAGE) {
+			i = 0;
+			sg_list = nvme_pci_iod_list(req)[++j];
+		}
+		nvme_pci_enable_a64fx_relax_bit(&sg_list[i++]);
+	} while (--entries > 0);
+}
+
 static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		struct nvme_command *cmnd)
 {
@@ -869,6 +908,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 	if (ret != BLK_STS_OK)
 		goto out_unmap;
 
+	if (static_branch_unlikely(&nvme_quirk_a64fx_force_relax_key))
+		nvme_pci_quirk_a64fx_force_relax(req, &cmnd->rw, nr_mapped);
+
 	ret = BLK_STS_IOERR;
 	if (blk_integrity_rq(req)) {
 		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
@@ -2748,6 +2790,27 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 	return 0;
 }
 
+/*
+ * PCI vendor id of Fujitsu and device id for root port in the A64FX processor
+ */
+#define PCI_VENDOR_ID_FUJITSU			0x10cf
+#define PCI_DEVICE_ID_FUJITSU_A64FX_ROOTPORT	0x1952
+
+static unsigned long check_system_vendor_acceleration(void)
+{
+	struct pci_dev *pdev_root;
+	/*
+	 * When Fujitsu A64FX Root Port is found, acceleration feature
+	 * can be enabled.
+	 */
+	pdev_root = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0));
+	if (pdev_root && (pdev_root->vendor == PCI_VENDOR_ID_FUJITSU) &&
+	    (pdev_root->device == PCI_DEVICE_ID_FUJITSU_A64FX_ROOTPORT))
+		return NVME_QUIRK_A64FX_FORCE_RELAX;
+
+	return 0;
+}
+
 static void nvme_async_probe(void *data, async_cookie_t cookie)
 {
 	struct nvme_dev *dev = data;
@@ -2794,6 +2857,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	quirks |= check_vendor_combination_bug(pdev);
 
+	quirks |= check_system_vendor_acceleration();
+
 	/*
 	 * Double check that our mempool alloc size will cover the biggest
 	 * command we support.
-- 
2.20.1



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-01 12:46 [PATCH] nvme: Enable acceleration feature of A64FX processor Takao Indoh
@ 2019-02-01 14:54 ` Keith Busch
  2019-02-05 12:56   ` Takao Indoh
  2019-02-01 15:51 ` Christoph Hellwig
  1 sibling, 1 reply; 13+ messages in thread
From: Keith Busch @ 2019-02-01 14:54 UTC (permalink / raw)
  To: Takao Indoh; +Cc: axboe, hch, sagi, linux-nvme, linux-kernel, Takao Indoh

On Fri, Feb 01, 2019 at 09:46:15PM +0900, Takao Indoh wrote:
> From: Takao Indoh <indou.takao@fujitsu.com>
> 
> Fujitsu A64FX processor has a feature to accelerate data transfer of
> internal bus by relaxed ordering. It is enabled when the bit 56 of dma
> address is set to 1.

Wait, what? RO is a standard PCIe TLP attribute. Why would we need this?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-01 12:46 [PATCH] nvme: Enable acceleration feature of A64FX processor Takao Indoh
  2019-02-01 14:54 ` Keith Busch
@ 2019-02-01 15:51 ` Christoph Hellwig
  2019-02-05 12:56   ` Takao Indoh
  1 sibling, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2019-02-01 15:51 UTC (permalink / raw)
  To: Takao Indoh
  Cc: keith.busch, axboe, hch, sagi, linux-nvme, linux-kernel, Takao Indoh

On Fri, Feb 01, 2019 at 09:46:15PM +0900, Takao Indoh wrote:
> From: Takao Indoh <indou.takao@fujitsu.com>
> 
> Fujitsu A64FX processor has a feature to accelerate data transfer of
> internal bus by relaxed ordering. It is enabled when the bit 56 of dma
> address is set to 1.
> 
> This patch introduces this acceleration feature to the NVMe driver to
> enhance NVMe device performance.

This has absolutely no business in a PCIe driver, sorry.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-01 14:54 ` Keith Busch
@ 2019-02-05 12:56   ` Takao Indoh
  2019-02-05 14:39     ` Keith Busch
  0 siblings, 1 reply; 13+ messages in thread
From: Takao Indoh @ 2019-02-05 12:56 UTC (permalink / raw)
  To: Keith Busch; +Cc: Takao Indoh, axboe, hch, sagi, linux-nvme, linux-kernel

On Fri, Feb 01, 2019 at 07:54:14AM -0700, Keith Busch wrote:
> On Fri, Feb 01, 2019 at 09:46:15PM +0900, Takao Indoh wrote:
> > From: Takao Indoh <indou.takao@fujitsu.com>
> > 
> > Fujitsu A64FX processor has a feature to accelerate data transfer of
> > internal bus by relaxed ordering. It is enabled when the bit 56 of dma
> > address is set to 1.
> 
> Wait, what? RO is a standard PCIe TLP attribute. Why would we need this?

I should have explained this patch more carefully.

Standard PCIe devices can use Relaxed Ordering (RO) by setting Attr
field in the TLP header, however, this mechanism cannot be utilized if
the device does not support RO feature. Fujitsu A64FX processor has an
alternate feature to enable RO in its Root Port by setting the bit 56 of
DMA address. This mechanism enables to utilize RO feature even if the
device does not support standard PCIe RO.

The data packet with its DMA address bit 56 is set, is transferred from
the device to the PCI root port with Strong Ordering (SO), and then it
is transferred with RO to the host memory.

This patch adds new code into NVMe driver to set bit 56 of DMA address
to utilize this feature. The reason why I do this in NVMe driver is that
here is an only place where we can traverses a sgl list to update the
DMA addresses. We can transfer data buffers with RO, but we cannot use
RO as for writes to the admin completion queue and the I/O completion
queue from the NVMe controller to the host. These writes need to be done
with SO to avoid data corruption. This patch scans data buffers queued
in the sgl list and update their DMA addresses to send data buffers with
RO.

Thanks,
Takao Indoh


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-01 15:51 ` Christoph Hellwig
@ 2019-02-05 12:56   ` Takao Indoh
  0 siblings, 0 replies; 13+ messages in thread
From: Takao Indoh @ 2019-02-05 12:56 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Takao Indoh, keith.busch, axboe, sagi, linux-nvme, linux-kernel

On Fri, Feb 01, 2019 at 04:51:20PM +0100, Christoph Hellwig wrote:
> On Fri, Feb 01, 2019 at 09:46:15PM +0900, Takao Indoh wrote:
> > From: Takao Indoh <indou.takao@fujitsu.com>
> > 
> > Fujitsu A64FX processor has a feature to accelerate data transfer of
> > internal bus by relaxed ordering. It is enabled when the bit 56 of dma
> > address is set to 1.
> > 
> > This patch introduces this acceleration feature to the NVMe driver to
> > enhance NVMe device performance.
> 
> This has absolutely no business in a PCIe driver, sorry.
> 

At first let me explain detail of this feature. I wrote the same
explanation in the reply to Keith, but I write here again just in case.

Standard PCIe devices can use Relaxed Ordering (RO) by setting Attr
field in the TLP header, however, this mechanism cannot be utilized if
the device does not support RO feature. Fujitsu A64FX processor has an
alternate feature to enable RO in its Root Port by setting the bit 56 of
DMA address. This mechanism enables to utilize RO feature even if the
device does not support standard PCIe RO.

The data packet with its DMA address bit 56 is set, is transferred from
the device to the PCI root port with Strong Ordering (SO), and then it
is transferred with RO to the host memory.

This patch adds new code into NVMe driver to set bit 56 of DMA address
to utilize this feature. The reason why I do this in NVMe driver is that
here is an only place where we can traverses a sgl list to update the
DMA addresses. We can transfer data buffers with RO, but we cannot use
RO as for writes to the admin completion queue and the I/O completion
queue from the NVMe controller to the host. These writes need to be done
with SO to avoid data corruption. This patch scans data buffers queued
in the sgl list and update their DMA addresses to send data buffers with
RO.

Thanks,
Takao Indoh



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-05 12:56   ` Takao Indoh
@ 2019-02-05 14:39     ` Keith Busch
  2019-02-05 16:13       ` Christoph Hellwig
  2019-02-14 20:44       ` Elliott, Robert (Persistent Memory)
  0 siblings, 2 replies; 13+ messages in thread
From: Keith Busch @ 2019-02-05 14:39 UTC (permalink / raw)
  To: Takao Indoh; +Cc: Takao Indoh, axboe, hch, sagi, linux-nvme, linux-kernel

On Tue, Feb 05, 2019 at 09:56:05PM +0900, Takao Indoh wrote:
> On Fri, Feb 01, 2019 at 07:54:14AM -0700, Keith Busch wrote:
> > On Fri, Feb 01, 2019 at 09:46:15PM +0900, Takao Indoh wrote:
> > > From: Takao Indoh <indou.takao@fujitsu.com>
> > > 
> > > Fujitsu A64FX processor has a feature to accelerate data transfer of
> > > internal bus by relaxed ordering. It is enabled when the bit 56 of dma
> > > address is set to 1.
> > 
> > Wait, what? RO is a standard PCIe TLP attribute. Why would we need this?
> 
> I should have explained this patch more carefully.
> 
> Standard PCIe devices can use Relaxed Ordering (RO) by setting Attr
> field in the TLP header, however, this mechanism cannot be utilized if
> the device does not support RO feature. Fujitsu A64FX processor has an
> alternate feature to enable RO in its Root Port by setting the bit 56 of
> DMA address. This mechanism enables to utilize RO feature even if the
> device does not support standard PCIe RO.

I think you're better of just purchasing devices that support the
capability per spec rather than with a non-standard work around.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-05 14:39     ` Keith Busch
@ 2019-02-05 16:13       ` Christoph Hellwig
  2019-02-13 12:03         ` Takao Indoh
  2019-02-14 20:44       ` Elliott, Robert (Persistent Memory)
  1 sibling, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2019-02-05 16:13 UTC (permalink / raw)
  To: Keith Busch
  Cc: Takao Indoh, Takao Indoh, axboe, hch, sagi, linux-nvme, linux-kernel

On Tue, Feb 05, 2019 at 07:39:06AM -0700, Keith Busch wrote:
> > Standard PCIe devices can use Relaxed Ordering (RO) by setting Attr
> > field in the TLP header, however, this mechanism cannot be utilized if
> > the device does not support RO feature. Fujitsu A64FX processor has an
> > alternate feature to enable RO in its Root Port by setting the bit 56 of
> > DMA address. This mechanism enables to utilize RO feature even if the
> > device does not support standard PCIe RO.
> 
> I think you're better of just purchasing devices that support the
> capability per spec rather than with a non-standard work around.

Agreed, this seems like a pretty gross hack.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-05 16:13       ` Christoph Hellwig
@ 2019-02-13 12:03         ` Takao Indoh
  2019-02-14 17:11           ` Christoph Hellwig
  0 siblings, 1 reply; 13+ messages in thread
From: Takao Indoh @ 2019-02-13 12:03 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, Takao Indoh, axboe, sagi, linux-nvme, linux-kernel

On Tue, Feb 05, 2019 at 05:13:47PM +0100, Christoph Hellwig wrote:
> On Tue, Feb 05, 2019 at 07:39:06AM -0700, Keith Busch wrote:
> > > Standard PCIe devices can use Relaxed Ordering (RO) by setting Attr
> > > field in the TLP header, however, this mechanism cannot be utilized if
> > > the device does not support RO feature. Fujitsu A64FX processor has an
> > > alternate feature to enable RO in its Root Port by setting the bit 56 of
> > > DMA address. This mechanism enables to utilize RO feature even if the
> > > device does not support standard PCIe RO.
> > 
> > I think you're better of just purchasing devices that support the
> > capability per spec rather than with a non-standard work around.
> 
> Agreed, this seems like a pretty gross hack.

Ok, let me think about how I should change this patch.
I'm thinking that the problem of this patch is adding processor specific
code into NVMe common driver, is this correct? Or another problem? It
would be great if you could give me a hint to improve this patch.

Thanks,
Takao Indoh


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-13 12:03         ` Takao Indoh
@ 2019-02-14 17:11           ` Christoph Hellwig
  0 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2019-02-14 17:11 UTC (permalink / raw)
  To: Takao Indoh
  Cc: Christoph Hellwig, Keith Busch, Takao Indoh, axboe, sagi,
	linux-nvme, linux-kernel

On Wed, Feb 13, 2019 at 09:03:58PM +0900, Takao Indoh wrote:
> Ok, let me think about how I should change this patch.

Just drop it.

> I'm thinking that the problem of this patch is adding processor specific
> code into NVMe common driver, is this correct? Or another problem? It
> would be great if you could give me a hint to improve this patch.

For one it is processor / root port specific.  But even more importantly
it is incorrect.  If the device wants to issue a relaxed order
transaction it should do so all by itself, not by hacking around it
from the kernel.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-05 14:39     ` Keith Busch
  2019-02-05 16:13       ` Christoph Hellwig
@ 2019-02-14 20:44       ` Elliott, Robert (Persistent Memory)
  2019-02-14 21:17         ` Keith Busch
  2019-02-20  9:46         ` Takao Indoh
  1 sibling, 2 replies; 13+ messages in thread
From: Elliott, Robert (Persistent Memory) @ 2019-02-14 20:44 UTC (permalink / raw)
  To: Keith Busch, Takao Indoh
  Cc: Takao Indoh, sagi, linux-kernel, linux-nvme, axboe, hch



> -----Original Message-----
> From: Linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Keith Busch
> Sent: Tuesday, February 5, 2019 8:39 AM
> To: Takao Indoh <indou.takao@fujitsu.com>
> Cc: Takao Indoh <indou.takao@jp.fujitsu.com>; sagi@grimberg.me; linux-kernel@vger.kernel.org; linux-
> nvme@lists.infradead.org; axboe@fb.com; hch@lst.de
> Subject: Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
> 
> On Tue, Feb 05, 2019 at 09:56:05PM +0900, Takao Indoh wrote:
> > On Fri, Feb 01, 2019 at 07:54:14AM -0700, Keith Busch wrote:
> > > On Fri, Feb 01, 2019 at 09:46:15PM +0900, Takao Indoh wrote:
> > > > From: Takao Indoh <indou.takao@fujitsu.com>
> > > >
> > > > Fujitsu A64FX processor has a feature to accelerate data transfer of
> > > > internal bus by relaxed ordering. It is enabled when the bit 56 of dma
> > > > address is set to 1.
> > >
> > > Wait, what? RO is a standard PCIe TLP attribute. Why would we need this?
> >
> > I should have explained this patch more carefully.
> >
> > Standard PCIe devices can use Relaxed Ordering (RO) by setting Attr
> > field in the TLP header, however, this mechanism cannot be utilized if
> > the device does not support RO feature. Fujitsu A64FX processor has an
> > alternate feature to enable RO in its Root Port by setting the bit 56 of
> > DMA address. This mechanism enables to utilize RO feature even if the
> > device does not support standard PCIe RO.
> 
> I think you're better of just purchasing devices that support the
> capability per spec rather than with a non-standard work around.
> 

The PCIe and NVMe specifications dosn't standardize a way to tell the device
when to use RO, which leads to system workarounds like this.

The Enable Relaxed Ordering bit defined by PCIe tells the device when it
cannot use RO, but doesn't advise when it should or shall use RO.

For SCSI Express (SOP+PQI), we were going to allow specifying these
on a per-command basis:
* TLP attributes (No Snoop, Relaxed Ordering, ID-based Ordering)
* TLP processing hints (Processing Hints and Steering Tags)

to be used by the data transfers for the command. In some systems, one
setting per queue or per device might suffice. Transactions to the
queues and doorbells require stronger ordering.

For this workaround:
* making an extra pass through the SGL to set the address bit is 
inefficient; it should be done as the SGL is created.
* why doesn't it support PRP Lists?
* how does this interact with an iommu, if there is one? Must the 
address with bit 56 also be granted permission, or is that
stripped off before any iommu comparisons?







^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-14 20:44       ` Elliott, Robert (Persistent Memory)
@ 2019-02-14 21:17         ` Keith Busch
  2019-02-20  9:46         ` Takao Indoh
  1 sibling, 0 replies; 13+ messages in thread
From: Keith Busch @ 2019-02-14 21:17 UTC (permalink / raw)
  To: Elliott, Robert (Persistent Memory)
  Cc: Takao Indoh, Takao Indoh, sagi, linux-kernel, linux-nvme, axboe, hch

On Thu, Feb 14, 2019 at 12:44:48PM -0800, Elliott, Robert (Persistent Memory) wrote:
> 
> The PCIe and NVMe specifications dosn't standardize a way to tell the device
> when to use RO, which leads to system workarounds like this.
> 
> The Enable Relaxed Ordering bit defined by PCIe tells the device when it
> cannot use RO, but doesn't advise when it should or shall use RO.

In general, it is always safe to use RO for any memory writes that have
no order dependency on other RO writes. It's impossible for the PCIe spec
to standardize what packets may or may not have a such a dependency:
that is specific to the higher protocol of device, so RO behavior has
to be out of scope for PCI spec. It only says to don't use it when it
isn't safe to do so, like for MSI.

For NVMe, there is no order dependency on PRP/SGL data since it is
perfectly valid for the controller to transfer these out-of-order
already. Letting the memory controller re-order them would also have to
be spec compliant.

The host is not allowed to assume the data is there until it observes
the CQE for that command, so the CQE is the only NVMe protocol dev->host
transfer that has a strict order depenency and not valid for RO (you
risk data corruption if you do this wrong).

The NVMe spec does't spell this out, but some controller implementations
do this today anyway. If it is really that confusing to hardware vendors,
though, I don't think it'd be harmful to propose an ECN to clarify
appropriate RO usage, and also a plus if it would get more vendors to
take notice of this optimization.
 
> For SCSI Express (SOP+PQI), we were going to allow specifying these
> on a per-command basis:
> * TLP attributes (No Snoop, Relaxed Ordering, ID-based Ordering)
> * TLP processing hints (Processing Hints and Steering Tags)
>
> to be used by the data transfers for the command. In some systems, one
> setting per queue or per device might suffice. Transactions to the
> queues and doorbells require stronger ordering.
> 
> For this workaround:
> * making an extra pass through the SGL to set the address bit is
> inefficient; it should be done as the SGL is created.
> * why doesn't it support PRP Lists?
> * how does this interact with an iommu, if there is one? Must the
> address with bit 56 also be granted permission, or is that
> stripped off before any iommu comparisons?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-14 20:44       ` Elliott, Robert (Persistent Memory)
  2019-02-14 21:17         ` Keith Busch
@ 2019-02-20  9:46         ` Takao Indoh
  2019-02-22 17:07           ` Keith Busch
  1 sibling, 1 reply; 13+ messages in thread
From: Takao Indoh @ 2019-02-20  9:46 UTC (permalink / raw)
  To: Elliott, Robert (Persistent Memory)
  Cc: Keith Busch, Takao Indoh, sagi, linux-kernel, linux-nvme, axboe, hch

On Thu, Feb 14, 2019 at 08:44:48PM +0000, Elliott, Robert (Persistent Memory) wrote:
> 
> 
> > -----Original Message-----
> > From: Linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Keith Busch
> > Sent: Tuesday, February 5, 2019 8:39 AM
> > To: Takao Indoh <indou.takao@fujitsu.com>
> > Cc: Takao Indoh <indou.takao@jp.fujitsu.com>; sagi@grimberg.me; linux-kernel@vger.kernel.org; linux-
> > nvme@lists.infradead.org; axboe@fb.com; hch@lst.de
> > Subject: Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
> > 
> > On Tue, Feb 05, 2019 at 09:56:05PM +0900, Takao Indoh wrote:
> > > On Fri, Feb 01, 2019 at 07:54:14AM -0700, Keith Busch wrote:
> > > > On Fri, Feb 01, 2019 at 09:46:15PM +0900, Takao Indoh wrote:
> > > > > From: Takao Indoh <indou.takao@fujitsu.com>
> > > > >
> > > > > Fujitsu A64FX processor has a feature to accelerate data transfer of
> > > > > internal bus by relaxed ordering. It is enabled when the bit 56 of dma
> > > > > address is set to 1.
> > > >
> > > > Wait, what? RO is a standard PCIe TLP attribute. Why would we need this?
> > >
> > > I should have explained this patch more carefully.
> > >
> > > Standard PCIe devices can use Relaxed Ordering (RO) by setting Attr
> > > field in the TLP header, however, this mechanism cannot be utilized if
> > > the device does not support RO feature. Fujitsu A64FX processor has an
> > > alternate feature to enable RO in its Root Port by setting the bit 56 of
> > > DMA address. This mechanism enables to utilize RO feature even if the
> > > device does not support standard PCIe RO.
> > 
> > I think you're better of just purchasing devices that support the
> > capability per spec rather than with a non-standard work around.
> > 
> 
> The PCIe and NVMe specifications dosn't standardize a way to tell the device
> when to use RO, which leads to system workarounds like this.
> 
> The Enable Relaxed Ordering bit defined by PCIe tells the device when it
> cannot use RO, but doesn't advise when it should or shall use RO.
> 
> For SCSI Express (SOP+PQI), we were going to allow specifying these
> on a per-command basis:
> * TLP attributes (No Snoop, Relaxed Ordering, ID-based Ordering)
> * TLP processing hints (Processing Hints and Steering Tags)
> 
> to be used by the data transfers for the command. In some systems, one
> setting per queue or per device might suffice. Transactions to the
> queues and doorbells require stronger ordering.
> 
> For this workaround:
> * making an extra pass through the SGL to set the address bit is 
> inefficient; it should be done as the SGL is created.

Thanks for your comment, do you mean this should be done in
nvme_pci_setup_sgls()/nvme_pci_setup_prps()?

> * why doesn't it support PRP Lists?

This patch does not support PRP because PRP is used for small data and
we cannot get enough performance improvement by this feature. But I can
support PRP to improve performance of the device which is compliant with
NVMe Spec 1.0 or does not support SGL.

> * how does this interact with an iommu, if there is one? Must the 
> address with bit 56 also be granted permission, or is that
> stripped off before any iommu comparisons?

The latter. A bit 56 is cleared in Root Port before pass it to iommu.

Thanks,
Takao Indoh

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] nvme: Enable acceleration feature of A64FX processor
  2019-02-20  9:46         ` Takao Indoh
@ 2019-02-22 17:07           ` Keith Busch
  0 siblings, 0 replies; 13+ messages in thread
From: Keith Busch @ 2019-02-22 17:07 UTC (permalink / raw)
  To: Takao Indoh
  Cc: Elliott, Robert (Persistent Memory),
	Takao Indoh, sagi, linux-kernel, linux-nvme, axboe, hch

On Wed, Feb 20, 2019 at 06:46:11PM +0900, Takao Indoh wrote:
> On Thu, Feb 14, 2019 at 08:44:48PM +0000, Elliott, Robert (Persistent Memory) wrote:
> > * how does this interact with an iommu, if there is one? Must the 
> > address with bit 56 also be granted permission, or is that
> > stripped off before any iommu comparisons?
> 
> The latter. A bit 56 is cleared in Root Port before pass it to iommu.

What if the intendend destination is a peer and never hits the root port?

Really, though, PCI device vendors need to just use the existing
capability as intended and not have arch specific work-arounds. I'm sure
nvme can't be the only device class you'd want this behavior.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2019-02-22 17:07 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-01 12:46 [PATCH] nvme: Enable acceleration feature of A64FX processor Takao Indoh
2019-02-01 14:54 ` Keith Busch
2019-02-05 12:56   ` Takao Indoh
2019-02-05 14:39     ` Keith Busch
2019-02-05 16:13       ` Christoph Hellwig
2019-02-13 12:03         ` Takao Indoh
2019-02-14 17:11           ` Christoph Hellwig
2019-02-14 20:44       ` Elliott, Robert (Persistent Memory)
2019-02-14 21:17         ` Keith Busch
2019-02-20  9:46         ` Takao Indoh
2019-02-22 17:07           ` Keith Busch
2019-02-01 15:51 ` Christoph Hellwig
2019-02-05 12:56   ` Takao Indoh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).