* [PATCH v1 0/3] iommu/amd: AMD IOMMU performance updates 2017-06-05
@ 2017-06-05 19:52 Tom Lendacky
       [not found] ` <20170605195203.11512.20579.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Tom Lendacky @ 2017-06-05 19:52 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA; +Cc: Arindam Nath

This patch series addresses some performance issues in the AMD IOMMU
driver:

- Reduce the amount of MMIO performed during command submission
- When the command queue is (near) full, only wait until there is enough
  room for the command rather than waiting for the whole queue to be empty
- Limit the flushing of protection domain TLBs to only the protection
  domains associated with the iova data being freed

This patch series is based on the master branch of the iommu tree.
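
For context, the first two patches revolve around a circular command
buffer whose head offset is advanced by the hardware and whose tail
offset is advanced by the driver. A minimal, self-contained C sketch of
the free-space calculation (illustrative only; the sizes and names below
are not the driver's):

#include <stdint.h>
#include <stdio.h>

#define CMD_BUF_SIZE   8192u   /* illustrative ring size in bytes */
#define CMD_ENTRY_SIZE 16u     /* illustrative command size in bytes */

/*
 * Bytes of space left after writing one more command.  'head' is the
 * consumer (hardware) offset, 'tail' the producer (driver) offset.
 * Unsigned modular arithmetic handles wrap-around because the ring
 * size is a power of two.
 */
static uint32_t cmd_buf_space(uint32_t head, uint32_t tail)
{
	uint32_t next_tail = (tail + CMD_ENTRY_SIZE) % CMD_BUF_SIZE;

	return (head - next_tail) % CMD_BUF_SIZE;
}

int main(void)
{
	/* hardware consumed up to offset 32, driver has written up to 4080 */
	printf("free: %u bytes\n", cmd_buf_space(32, 4080));	/* 4128 */
	return 0;
}

When this value drops to a low-water mark the driver has to wait, which
is what the first two patches make cheaper.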

---

Tom Lendacky (3):
      iommu/amd: Reduce amount of MMIO when submitting commands
      iommu/amd: Reduce delay waiting for command buffer space
      iommu/amd: Optimize the IOMMU queue flush


 drivers/iommu/amd_iommu.c       |  100 ++++++++++++++++++++++++++++-----------
 drivers/iommu/amd_iommu_init.c  |    2 +
 drivers/iommu/amd_iommu_types.h |    2 +
 3 files changed, 77 insertions(+), 27 deletions(-)

-- 
Tom Lendacky


* [PATCH v1 1/3] iommu/amd: Reduce amount of MMIO when submitting commands
       [not found] ` <20170605195203.11512.20579.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
@ 2017-06-05 19:52   ` Tom Lendacky
  2017-06-05 19:52   ` [PATCH v1 2/3] iommu/amd: Reduce delay waiting for command buffer space Tom Lendacky
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 24+ messages in thread
From: Tom Lendacky @ 2017-06-05 19:52 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA; +Cc: Arindam Nath

As newer, higher-speed devices are developed, perf data shows that the
amount of MMIO performed when submitting commands to the IOMMU causes
performance issues. Currently, the command submission path reads the
command buffer head and tail pointers and then writes the tail pointer
once the command is ready.

The tail pointer is only ever updated by the driver, so it can be tracked
by the driver without having to read it from the hardware.

The head pointer is updated by the hardware, but it can be read
opportunistically. Reading the head pointer only when it appears that
there might not be room in the command buffer, and then re-checking the
available space, reduces the number of times the head pointer has to be
read.
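
Conceptually, the submission path after this patch looks like the sketch
below. This is a hedged illustration, not the driver code: the struct
and the simulated MMIO fields are made up, and only the shape of the
logic matches the patch.

#include <stdint.h>
#include <string.h>
#include <stdbool.h>

#define RING_SIZE 8192u			/* illustrative ring size */
#define MIN_FREE  0x20u			/* low-water mark, mirroring the driver's check */

struct cmd { uint32_t data[4]; };	/* one 16-byte command */

struct ring {
	uint8_t  buf[RING_SIZE];
	uint32_t head;			/* cached copy of the hardware head */
	uint32_t tail;			/* owned by the driver, never read back */
	volatile uint32_t hw_head;	/* stand-in for the MMIO head register */
	volatile uint32_t hw_tail;	/* stand-in for the MMIO tail register */
};

/* Returns false if the ring is still full even after refreshing the head. */
static bool ring_submit(struct ring *r, const struct cmd *c)
{
	uint32_t next_tail = (r->tail + sizeof(*c)) % RING_SIZE;
	bool rechecked = false;

	while (((r->head - next_tail) % RING_SIZE) <= MIN_FREE) {
		if (rechecked)
			return false;	/* still full; the caller must wait */
		r->head = r->hw_head;	/* the only, and conditional, head read */
		rechecked = true;
	}

	memcpy(r->buf + r->tail, c, sizeof(*c));
	r->tail = next_tail;
	r->hw_tail = r->tail;		/* the single unavoidable register write */
	return true;
}

In the fast path the only register access left is the tail write; the
head register is touched only when the cached head suggests the ring may
be nearly full.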

Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
---
 drivers/iommu/amd_iommu.c       |   35 ++++++++++++++++++++++-------------
 drivers/iommu/amd_iommu_init.c  |    2 ++
 drivers/iommu/amd_iommu_types.h |    2 ++
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 63cacf5..faf0ddf 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -848,19 +848,20 @@ static int wait_on_sem(volatile u64 *sem)
 }
 
 static void copy_cmd_to_buffer(struct amd_iommu *iommu,
-			       struct iommu_cmd *cmd,
-			       u32 tail)
+			       struct iommu_cmd *cmd)
 {
 	u8 *target;
 
-	target = iommu->cmd_buf + tail;
-	tail   = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
+	target = iommu->cmd_buf + iommu->cmd_buf_tail;
+
+	iommu->cmd_buf_tail += sizeof(*cmd);
+	iommu->cmd_buf_tail %= CMD_BUFFER_SIZE;
 
 	/* Copy command to buffer */
 	memcpy(target, cmd, sizeof(*cmd));
 
 	/* Tell the IOMMU about it */
-	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+	writel(iommu->cmd_buf_tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 }
 
 static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
@@ -1018,23 +1019,31 @@ static int __iommu_queue_command_sync(struct amd_iommu *iommu,
 				      struct iommu_cmd *cmd,
 				      bool sync)
 {
-	u32 left, tail, head, next_tail;
+	bool read_head = true;
+	u32 left, next_tail;
 
+	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
 again:
-
-	head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-	tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-	next_tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
-	left      = (head - next_tail) % CMD_BUFFER_SIZE;
+	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
 
 	if (left <= 0x20) {
 		struct iommu_cmd sync_cmd;
 		int ret;
 
+		if (read_head) {
+			/* Update head and recheck remaining space */
+			iommu->cmd_buf_head = readl(iommu->mmio_base +
+						    MMIO_CMD_HEAD_OFFSET);
+			read_head = false;
+			goto again;
+		}
+
+		read_head = true;
+
 		iommu->cmd_sem = 0;
 
 		build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem);
-		copy_cmd_to_buffer(iommu, &sync_cmd, tail);
+		copy_cmd_to_buffer(iommu, &sync_cmd);
 
 		if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0)
 			return ret;
@@ -1042,7 +1051,7 @@ static int __iommu_queue_command_sync(struct amd_iommu *iommu,
 		goto again;
 	}
 
-	copy_cmd_to_buffer(iommu, cmd, tail);
+	copy_cmd_to_buffer(iommu, cmd);
 
 	/* We need to sync now to make sure all commands are processed */
 	iommu->need_sync = sync;
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 5a11328..3fa7e3b 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -588,6 +588,8 @@ void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
 
 	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
 	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+	iommu->cmd_buf_head = 0;
+	iommu->cmd_buf_tail = 0;
 
 	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
 }
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 4de8f41..6960d7d 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -516,6 +516,8 @@ struct amd_iommu {
 
 	/* command buffer virtual address */
 	u8 *cmd_buf;
+	u32 cmd_buf_head;
+	u32 cmd_buf_tail;
 
 	/* event buffer virtual address */
 	u8 *evt_buf;


* [PATCH v1 2/3] iommu/amd: Reduce delay waiting for command buffer space
       [not found] ` <20170605195203.11512.20579.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
  2017-06-05 19:52   ` [PATCH v1 1/3] iommu/amd: Reduce amount of MMIO when submitting commands Tom Lendacky
@ 2017-06-05 19:52   ` Tom Lendacky
  2017-06-05 19:52   ` [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush Tom Lendacky
  2017-06-08 12:43   ` [PATCH v1 0/3] iommu/amd: AMD IOMMU performance updates 2017-06-05 Joerg Roedel
  3 siblings, 0 replies; 24+ messages in thread
From: Tom Lendacky @ 2017-06-05 19:52 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA; +Cc: Arindam Nath

Currently, if there is no room to add a command to the command buffer, the
driver performs a "completion wait" which only returns when all commands
on the queue have been processed. There is no need to wait for the entire
command queue to be executed before adding the next command.

Update the driver to perform the same udelay() loop that the "completion
wait" performs, but re-read the head pointer to determine if sufficient
space is available instead. The very first time insufficient space is
found, the udelay() is skipped and the head pointer is read immediately;
if there is still not enough room, the udelay() is performed on each
subsequent check.
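
The resulting wait loop can be sketched as follows (illustrative only:
LOOP_LIMIT, ring_has_space() and delay_1us() are placeholders rather
than the driver's identifiers, and the simulated state simply pretends
that space frees up after a few polls of the head pointer):

#include <stdio.h>

#define LOOP_LIMIT 100000u	/* illustrative bound, in the spirit of LOOP_TIMEOUT */

/* Simulated hardware: pretend space appears on the 4th poll of the head. */
static unsigned int polls;

static int ring_has_space(void)
{
	return ++polls > 3;
}

static void delay_1us(void)
{
	/* stand-in for udelay(1) */
}

/*
 * Wait for command-buffer space: the first re-check happens immediately,
 * with no delay; later re-checks are separated by a 1us delay until
 * either space appears or the loop bound is reached.
 */
static int wait_for_space(void)
{
	unsigned int count = 0;

	while (!ring_has_space()) {
		if (count++) {			/* skip the delay on the first pass */
			if (count == LOOP_LIMIT)
				return -1;	/* the driver returns -EIO here */
			delay_1us();
		}
	}

	return 0;
}

int main(void)
{
	printf("wait_for_space() = %d after %u polls\n", wait_for_space(), polls);
	return 0;
}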

Signed-off-by: Leo Duran <leo.duran-5C7GfCeVMHo@public.gmane.org>
Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
---
 drivers/iommu/amd_iommu.c |   33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index faf0ddf..856103b 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1019,7 +1019,7 @@ static int __iommu_queue_command_sync(struct amd_iommu *iommu,
 				      struct iommu_cmd *cmd,
 				      bool sync)
 {
-	bool read_head = true;
+	unsigned int count = 0;
 	u32 left, next_tail;
 
 	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
@@ -1027,33 +1027,26 @@ static int __iommu_queue_command_sync(struct amd_iommu *iommu,
 	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
 
 	if (left <= 0x20) {
-		struct iommu_cmd sync_cmd;
-		int ret;
-
-		if (read_head) {
-			/* Update head and recheck remaining space */
-			iommu->cmd_buf_head = readl(iommu->mmio_base +
-						    MMIO_CMD_HEAD_OFFSET);
-			read_head = false;
-			goto again;
-		}
-
-		read_head = true;
-
-		iommu->cmd_sem = 0;
+		/* Skip udelay() the first time around */
+		if (count++) {
+			if (count == LOOP_TIMEOUT) {
+				pr_err("AMD-Vi: Command buffer timeout\n");
+				return -EIO;
+			}
 
-		build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem);
-		copy_cmd_to_buffer(iommu, &sync_cmd);
+			udelay(1);
+		}
 
-		if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0)
-			return ret;
+		/* Update head and recheck remaining space */
+		iommu->cmd_buf_head = readl(iommu->mmio_base +
+					    MMIO_CMD_HEAD_OFFSET);
 
 		goto again;
 	}
 
 	copy_cmd_to_buffer(iommu, cmd);
 
-	/* We need to sync now to make sure all commands are processed */
+	/* Do we need to make sure all commands are processed? */
 	iommu->need_sync = sync;
 
 	return 0;


* [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found] ` <20170605195203.11512.20579.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
  2017-06-05 19:52   ` [PATCH v1 1/3] iommu/amd: Reduce amount of MMIO when submitting commands Tom Lendacky
  2017-06-05 19:52   ` [PATCH v1 2/3] iommu/amd: Reduce delay waiting for command buffer space Tom Lendacky
@ 2017-06-05 19:52   ` Tom Lendacky
       [not found]     ` <20170605195235.11512.52995.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
  2017-06-08 12:43   ` [PATCH v1 0/3] iommu/amd: AMD IOMMU performance updates 2017-06-05 Joerg Roedel
  3 siblings, 1 reply; 24+ messages in thread
From: Tom Lendacky @ 2017-06-05 19:52 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA; +Cc: Arindam Nath

After reducing the amount of MMIO performed by the IOMMU during operation,
perf data shows that flushing the TLB for all protection domains during
DMA unmapping is a performance issue. It is not necessary to flush the
TLBs for all protection domains, only the protection domains associated
with iovas on the flush queue.

Create a separate queue that tracks the protection domains associated with
the iovas on the flush queue. This new queue limits TLB flushing to only
the required protection domains.
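
The bookkeeping can be illustrated with the sketch below (a hedged
example, not the driver code: the array size and names are made up, and
the per-CPU and locking aspects are omitted). Each flush queue gains a
companion list that records every distinct protection domain at most
once, so a flush only walks those domains:

#include <stdio.h>

#define PD_QUEUE_SIZE 256	/* illustrative; the driver sizes it like the flush queue */

struct protection_domain { int id; };

struct flush_pd_queue {
	unsigned int next;
	struct protection_domain *entries[PD_QUEUE_SIZE];
};

/* Record a domain for the next flush, but only once per flush interval. */
static void pd_queue_track(struct flush_pd_queue *q, struct protection_domain *pd)
{
	unsigned int idx;

	for (idx = 0; idx < q->next; idx++)
		if (q->entries[idx] == pd)
			return;			/* already queued for flushing */

	if (q->next < PD_QUEUE_SIZE)
		q->entries[q->next++] = pd;	/* new domain, remember it */
}

/* Flush only the tracked domains, then reset the list. */
static void pd_queue_flush(struct flush_pd_queue *q)
{
	unsigned int idx;

	for (idx = 0; idx < q->next; idx++)
		printf("flush TLB of domain %d\n", q->entries[idx]->id);

	q->next = 0;
}

int main(void)
{
	struct flush_pd_queue q = { 0 };
	struct protection_domain a = { 1 }, b = { 2 };

	pd_queue_track(&q, &a);
	pd_queue_track(&q, &b);
	pd_queue_track(&q, &a);		/* duplicate, not added again */
	pd_queue_flush(&q);		/* flushes domains 1 and 2 only */
	return 0;
}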

Reviewed-by: Arindam Nath <arindam.nath-5C7GfCeVMHo@public.gmane.org>
Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
---
 drivers/iommu/amd_iommu.c |   56 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 856103b..a5e77f0 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -103,7 +103,18 @@ struct flush_queue {
 	struct flush_queue_entry *entries;
 };
 
+struct flush_pd_queue_entry {
+	struct protection_domain *pd;
+};
+
+struct flush_pd_queue {
+	/* No lock needed, protected by flush_queue lock */
+	unsigned next;
+	struct flush_pd_queue_entry *entries;
+};
+
 static DEFINE_PER_CPU(struct flush_queue, flush_queue);
+static DEFINE_PER_CPU(struct flush_pd_queue, flush_pd_queue);
 
 static atomic_t queue_timer_on;
 static struct timer_list queue_timer;
@@ -2227,16 +2238,20 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev)
  *
  *****************************************************************************/
 
-static void __queue_flush(struct flush_queue *queue)
+static void __queue_flush(struct flush_queue *queue,
+			  struct flush_pd_queue *pd_queue)
 {
-	struct protection_domain *domain;
 	unsigned long flags;
 	int idx;
 
 	/* First flush TLB of all known domains */
 	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-	list_for_each_entry(domain, &amd_iommu_pd_list, list)
-		domain_flush_tlb(domain);
+	for (idx = 0; idx < pd_queue->next; ++idx) {
+		struct flush_pd_queue_entry *entry;
+
+		entry = pd_queue->entries + idx;
+		domain_flush_tlb(entry->pd);
+	}
 	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
 
 	/* Wait until flushes have completed */
@@ -2255,6 +2270,7 @@ static void __queue_flush(struct flush_queue *queue)
 		entry->dma_dom = NULL;
 	}
 
+	pd_queue->next = 0;
 	queue->next = 0;
 }
 
@@ -2263,13 +2279,15 @@ static void queue_flush_all(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
+		struct flush_pd_queue *pd_queue;
 		struct flush_queue *queue;
 		unsigned long flags;
 
 		queue = per_cpu_ptr(&flush_queue, cpu);
+		pd_queue = per_cpu_ptr(&flush_pd_queue, cpu);
 		spin_lock_irqsave(&queue->lock, flags);
 		if (queue->next > 0)
-			__queue_flush(queue);
+			__queue_flush(queue, pd_queue);
 		spin_unlock_irqrestore(&queue->lock, flags);
 	}
 }
@@ -2283,6 +2301,8 @@ static void queue_flush_timeout(unsigned long unsused)
 static void queue_add(struct dma_ops_domain *dma_dom,
 		      unsigned long address, unsigned long pages)
 {
+	struct flush_pd_queue_entry *pd_entry;
+	struct flush_pd_queue *pd_queue;
 	struct flush_queue_entry *entry;
 	struct flush_queue *queue;
 	unsigned long flags;
@@ -2292,10 +2312,22 @@ static void queue_add(struct dma_ops_domain *dma_dom,
 	address >>= PAGE_SHIFT;
 
 	queue = get_cpu_ptr(&flush_queue);
+	pd_queue = get_cpu_ptr(&flush_pd_queue);
 	spin_lock_irqsave(&queue->lock, flags);
 
 	if (queue->next == FLUSH_QUEUE_SIZE)
-		__queue_flush(queue);
+		__queue_flush(queue, pd_queue);
+
+	for (idx = 0; idx < pd_queue->next; ++idx) {
+		pd_entry = pd_queue->entries + idx;
+		if (pd_entry->pd == &dma_dom->domain)
+			break;
+	}
+	if (idx == pd_queue->next) {
+		/* New protection domain, add it to the list */
+		pd_entry = pd_queue->entries + pd_queue->next++;
+		pd_entry->pd = &dma_dom->domain;
+	}
 
 	idx   = queue->next++;
 	entry = queue->entries + idx;
@@ -2309,6 +2341,7 @@ static void queue_add(struct dma_ops_domain *dma_dom,
 	if (atomic_cmpxchg(&queue_timer_on, 0, 1) == 0)
 		mod_timer(&queue_timer, jiffies + msecs_to_jiffies(10));
 
+	put_cpu_ptr(&flush_pd_queue);
 	put_cpu_ptr(&flush_queue);
 }
 
@@ -2810,6 +2843,8 @@ int __init amd_iommu_init_api(void)
 		return ret;
 
 	for_each_possible_cpu(cpu) {
+		struct flush_pd_queue *pd_queue = per_cpu_ptr(&flush_pd_queue,
+							      cpu);
 		struct flush_queue *queue = per_cpu_ptr(&flush_queue, cpu);
 
 		queue->entries = kzalloc(FLUSH_QUEUE_SIZE *
@@ -2819,6 +2854,12 @@ int __init amd_iommu_init_api(void)
 			goto out_put_iova;
 
 		spin_lock_init(&queue->lock);
+
+		pd_queue->entries = kzalloc(FLUSH_QUEUE_SIZE *
+					    sizeof(*pd_queue->entries),
+					    GFP_KERNEL);
+		if (!pd_queue->entries)
+			goto out_put_iova;
 	}
 
 	err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
@@ -2836,9 +2877,12 @@ int __init amd_iommu_init_api(void)
 
 out_put_iova:
 	for_each_possible_cpu(cpu) {
+		struct flush_pd_queue *pd_queue = per_cpu_ptr(&flush_pd_queue,
+							      cpu);
 		struct flush_queue *queue = per_cpu_ptr(&flush_queue, cpu);
 
 		kfree(queue->entries);
+		kfree(pd_queue->entries);
 	}
 
 	return -ENOMEM;


* RE: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]     ` <20170605195235.11512.52995.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
@ 2017-06-06 10:02       ` Nath, Arindam
       [not found]         ` <MWHPR12MB15181A6A020ACA2F53DF70339CCB0-Gy0DoCVfaSXKu+HfpMNLNQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2017-06-06 12:05       ` Joerg Roedel
  1 sibling, 1 reply; 24+ messages in thread
From: Nath, Arindam @ 2017-06-06 10:02 UTC (permalink / raw)
  To: Craig Stein, Jan Vesely, Lendacky, Thomas,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

>-----Original Message-----
>From: Lendacky, Thomas
>Sent: Tuesday, June 06, 2017 1:23 AM
>To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
>Cc: Nath, Arindam <Arindam.Nath-5C7GfCeVMHo@public.gmane.org>; Joerg Roedel
><joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>; Duran, Leo <leo.duran-5C7GfCeVMHo@public.gmane.org>; Suthikulpanit,
>Suravee <Suravee.Suthikulpanit-5C7GfCeVMHo@public.gmane.org>
>Subject: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
>
>After reducing the amount of MMIO performed by the IOMMU during
>operation,
>perf data shows that flushing the TLB for all protection domains during
>DMA unmapping is a performance issue. It is not necessary to flush the
>TLBs for all protection domains, only the protection domains associated
>with iova's on the flush queue.
>
>Create a separate queue that tracks the protection domains associated with
>the iova's on the flush queue. This new queue optimizes the flushing of
>TLBs to the required protection domains.
>
>Reviewed-by: Arindam Nath <arindam.nath-5C7GfCeVMHo@public.gmane.org>
>Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
>
>[full patch diff snipped]

Craig and Jan, can you please confirm whether this patch fixes the IOMMU timeout errors you encountered before? If it does, then this is a better implementation of the fix I provided a few weeks back.

Thanks,
Arindam


* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]     ` <20170605195235.11512.52995.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
  2017-06-06 10:02       ` Nath, Arindam
@ 2017-06-06 12:05       ` Joerg Roedel
       [not found]         ` <20170606120516.GD30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
  1 sibling, 1 reply; 24+ messages in thread
From: Joerg Roedel @ 2017-06-06 12:05 UTC (permalink / raw)
  To: Tom Lendacky
  Cc: Arindam Nath, iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Hey Tom,

On Mon, Jun 05, 2017 at 02:52:35PM -0500, Tom Lendacky wrote:
> After reducing the amount of MMIO performed by the IOMMU during operation,
> perf data shows that flushing the TLB for all protection domains during
> DMA unmapping is a performance issue. It is not necessary to flush the
> TLBs for all protection domains, only the protection domains associated
> with iova's on the flush queue.
> 
> Create a separate queue that tracks the protection domains associated with
> the iova's on the flush queue. This new queue optimizes the flushing of
> TLBs to the required protection domains.
> 
> Reviewed-by: Arindam Nath <arindam.nath-5C7GfCeVMHo@public.gmane.org>
> Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
> ---
>  drivers/iommu/amd_iommu.c |   56 ++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 50 insertions(+), 6 deletions(-)

I also did a major rewrite of the AMD IOMMU queue handling and flushing
code last week. It is functionally complete and I am currently testing,
documenting, and cleaning it up. I pushed the current state of it to

	git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux.git amd-iommu

It's quite intrusive, as it implements a per-domain flush-queue and uses
a ring-buffer instead of a real queue. But you can see the details in the
code.
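
For readers unfamiliar with the idea, a per-domain flush queue kept as a
ring buffer could look roughly like the generic sketch below. This is
only an illustration of the concept, not the implementation in the tree
above; the names and sizes are invented.

#include <stdio.h>

#define FLUSH_RING_SIZE 32		/* illustrative */

/* One deferred unmap range waiting for a TLB flush. */
struct flush_entry {
	unsigned long iova;
	unsigned long pages;
};

/* One ring per protection domain instead of one queue per CPU. */
struct domain_flush_ring {
	unsigned int head, tail;	/* ring indices */
	struct flush_entry ring[FLUSH_RING_SIZE];
};

static void domain_ring_flush(struct domain_flush_ring *r)
{
	/* flush this domain's IOTLB, then release the deferred ranges */
	printf("flush domain, release %u deferred ranges\n",
	       (r->tail - r->head) % FLUSH_RING_SIZE);
	r->head = r->tail;
}

static void domain_ring_add(struct domain_flush_ring *r,
			    unsigned long iova, unsigned long pages)
{
	unsigned int next = (r->tail + 1) % FLUSH_RING_SIZE;

	if (next == r->head)		/* ring full: flush and drain first */
		domain_ring_flush(r);

	r->ring[r->tail].iova  = iova;
	r->ring[r->tail].pages = pages;
	r->tail = next;
}

int main(void)
{
	struct domain_flush_ring r = { 0 };

	for (unsigned long iova = 0x100000; iova < 0x140000; iova += 0x1000)
		domain_ring_add(&r, iova, 1);	/* fills the ring and triggers a flush */

	domain_ring_flush(&r);			/* final drain */
	return 0;
}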

Can you please have a look and give it a test in your setup?


Thanks,

	Joerg


* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]         ` <20170606120516.GD30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
@ 2017-06-06 13:36           ` Tom Lendacky
       [not found]             ` <85356483-1d5e-251f-57e3-d9f761239100-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Tom Lendacky @ 2017-06-06 13:36 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: Arindam Nath, iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On 6/6/2017 7:05 AM, Joerg Roedel wrote:
> Hey Tom,

Hi Joerg,

> 
> On Mon, Jun 05, 2017 at 02:52:35PM -0500, Tom Lendacky wrote:
>> After reducing the amount of MMIO performed by the IOMMU during operation,
>> perf data shows that flushing the TLB for all protection domains during
>> DMA unmapping is a performance issue. It is not necessary to flush the
>> TLBs for all protection domains, only the protection domains associated
>> with iova's on the flush queue.
>>
>> Create a separate queue that tracks the protection domains associated with
>> the iova's on the flush queue. This new queue optimizes the flushing of
>> TLBs to the required protection domains.
>>
>> Reviewed-by: Arindam Nath <arindam.nath-5C7GfCeVMHo@public.gmane.org>
>> Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
>> ---
>>   drivers/iommu/amd_iommu.c |   56 ++++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 50 insertions(+), 6 deletions(-)
> 
> I also did a major rewrite of the AMD IOMMU queue handling and flushing
> code last week. It is functionally complete and I am currently testing,
> documenting it, and cleaning it up. I pushed the current state of it to
> 
> 	git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux.git amd-iommu
> 
> Its quite intrusive as it implements a per-domain flush-queue, and uses
> a ring-buffer instead of a real queue. But you see the details in the
> code.
> 
> Can you please have a look and give it a test in your setup?

I'll try and look at this as soon as I can... I'm sharing the test
setup and I might not be able to get access again for a day or two.

Thanks,
Tom

> 
> 
> Thanks,
> 
> 	Joerg
> 


* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]             ` <85356483-1d5e-251f-57e3-d9f761239100-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-07 14:03               ` Tom Lendacky
       [not found]                 ` <32599b14-c138-3c89-6834-0335fec0b3f6-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Tom Lendacky @ 2017-06-07 14:03 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: Arindam Nath, iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On 6/6/2017 8:36 AM, Tom Lendacky wrote:
> On 6/6/2017 7:05 AM, Joerg Roedel wrote:
>> Hey Tom,
> 
> Hi Joerg,
> 
>>
>> On Mon, Jun 05, 2017 at 02:52:35PM -0500, Tom Lendacky wrote:
>>> After reducing the amount of MMIO performed by the IOMMU during 
>>> operation,
>>> perf data shows that flushing the TLB for all protection domains during
>>> DMA unmapping is a performance issue. It is not necessary to flush the
>>> TLBs for all protection domains, only the protection domains associated
>>> with iova's on the flush queue.
>>>
>>> Create a separate queue that tracks the protection domains associated 
>>> with
>>> the iova's on the flush queue. This new queue optimizes the flushing of
>>> TLBs to the required protection domains.
>>>
>>> Reviewed-by: Arindam Nath <arindam.nath-5C7GfCeVMHo@public.gmane.org>
>>> Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
>>> ---
>>>   drivers/iommu/amd_iommu.c |   56 
>>> ++++++++++++++++++++++++++++++++++++++++-----
>>>   1 file changed, 50 insertions(+), 6 deletions(-)
>>
>> I also did a major rewrite of the AMD IOMMU queue handling and flushing
>> code last week. It is functionally complete and I am currently testing,
>> documenting it, and cleaning it up. I pushed the current state of it to
>>
>>     git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux.git 
>> amd-iommu
>>
>> Its quite intrusive as it implements a per-domain flush-queue, and uses
>> a ring-buffer instead of a real queue. But you see the details in the
>> code.
>>
>> Can you please have a look and give it a test in your setup?
> 
> I'll try and look at this as soon as I can... I'm sharing the test
> setup and I might not be able to get access again for a day or two.
> 

I was able to run your patches in combination with the first two patches
that I submitted and the results look good.  Let me know if you'd like
me to resubmit the series minus the third patch.

Thanks,
Tom

> Thanks,
> Tom
> 
>>
>>
>> Thanks,
>>
>>     Joerg
>>


* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                 ` <32599b14-c138-3c89-6834-0335fec0b3f6-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-07 14:17                   ` Joerg Roedel
  0 siblings, 0 replies; 24+ messages in thread
From: Joerg Roedel @ 2017-06-07 14:17 UTC (permalink / raw)
  To: Tom Lendacky
  Cc: Arindam Nath, iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Hey Tom,

On Wed, Jun 07, 2017 at 09:03:15AM -0500, Tom Lendacky wrote:
> I was able to run your patches in combination with the first two patches
> that I submitted and the results look good.  Let me know if you'd like
> me to resubmit the series minus the third patch.

Thanks a lot for testing the patches! You don't need to resubmit them; I
will take the first two of this set and apply them along with my patches
to the iommu tree.


Thanks,

	Joerg


* Re: [PATCH v1 0/3] iommu/amd: AMD IOMMU performance updates 2017-06-05
       [not found] ` <20170605195203.11512.20579.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
                     ` (2 preceding siblings ...)
  2017-06-05 19:52   ` [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush Tom Lendacky
@ 2017-06-08 12:43   ` Joerg Roedel
  3 siblings, 0 replies; 24+ messages in thread
From: Joerg Roedel @ 2017-06-08 12:43 UTC (permalink / raw)
  To: Tom Lendacky
  Cc: Arindam Nath, iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On Mon, Jun 05, 2017 at 02:52:03PM -0500, Tom Lendacky wrote:
> This patch series addresses some performance issues in the AMD IOMMU
> driver:
> 
> - Reduce the amount of MMIO performed during command submission
> - When the command queue is (near) full, only wait till there is enough
>   room for the command rather than wait for the whole queue to be empty
> - Limit the flushing of protection domain TLBs to only the protection
>   domains associated with the iova data being freed
> 
> This patch series is based on the master branch of the iommu tree.
> 
> ---
> 
> Tom Lendacky (3):
>       iommu/amd: Reduce amount of MMIO when submitting commands
>       iommu/amd: Reduce delay waiting for command buffer space

Okay, applied these two and my queue-ring series atop.


* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]         ` <MWHPR12MB15181A6A020ACA2F53DF70339CCB0-Gy0DoCVfaSXKu+HfpMNLNQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-06-08 20:33           ` Jan Vesely
       [not found]             ` <1496954035.4188.1.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Vesely @ 2017-06-08 20:33 UTC (permalink / raw)
  To: Nath, Arindam, Craig Stein, Lendacky, Thomas,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA



On Tue, 2017-06-06 at 10:02 +0000, Nath, Arindam wrote:
> > [Tom Lendacky's patch 3/3 quoted in full, snipped]
> 
> Craig and Jan, can you please confirm whether this patch fixes the
> IOMMU timeout errors you encountered before? If it does, then this is
> a better implementation of the fix I provided few weeks back.

I have only remote access to the machine, so I won't be able to test
until June 22nd.

Jan

> 
> Thanks,
> Arindam

-- 
Jan Vesely <jan.vesely-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>


* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]             ` <1496954035.4188.1.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
@ 2017-06-08 23:31               ` Craig Stein
  2017-06-21 16:20               ` Jan Vesely
  1 sibling, 0 replies; 24+ messages in thread
From: Craig Stein @ 2017-06-08 23:31 UTC (permalink / raw)
  To: Jan Vesely
  Cc: Lendacky, Thomas, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA



I will test as soon as I can, just getting ready for summer vacation right
now.

On Jun 8, 2017 14:34, "Jan Vesely" <jan.vesely-kgbqMDwikbSVc3sceRu5cw@public.gmane.org> wrote:

> On Tue, 2017-06-06 at 10:02 +0000, Nath, Arindam wrote:
> > > -----Original Message-----
> > > From: Lendacky, Thomas
> > > Sent: Tuesday, June 06, 2017 1:23 AM
> > > To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> > > Cc: Nath, Arindam <Arindam.Nath-5C7GfCeVMHo@public.gmane.org>; Joerg Roedel
> > > <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>; Duran, Leo <leo.duran-5C7GfCeVMHo@public.gmane.org>; Suthikulpanit,
> > > Suravee <Suravee.Suthikulpanit-5C7GfCeVMHo@public.gmane.org>
> > > Subject: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
> > >
> > > After reducing the amount of MMIO performed by the IOMMU during
> > > operation,
> > > perf data shows that flushing the TLB for all protection domains during
> > > DMA unmapping is a performance issue. It is not necessary to flush the
> > > TLBs for all protection domains, only the protection domains associated
> > > with iova's on the flush queue.
> > >
> > > Create a separate queue that tracks the protection domains associated
> with
> > > the iova's on the flush queue. This new queue optimizes the flushing of
> > > TLBs to the required protection domains.
> > >
> > > Reviewed-by: Arindam Nath <arindam.nath-5C7GfCeVMHo@public.gmane.org>
> > > Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
> > > ---
> > > drivers/iommu/amd_iommu.c |   56
> > > ++++++++++++++++++++++++++++++++++++++++-----
> > > 1 file changed, 50 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> > > index 856103b..a5e77f0 100644
> > > --- a/drivers/iommu/amd_iommu.c
> > > +++ b/drivers/iommu/amd_iommu.c
> > > @@ -103,7 +103,18 @@ struct flush_queue {
> > >     struct flush_queue_entry *entries;
> > > };
> > >
> > > +struct flush_pd_queue_entry {
> > > +   struct protection_domain *pd;
> > > +};
> > > +
> > > +struct flush_pd_queue {
> > > +   /* No lock needed, protected by flush_queue lock */
> > > +   unsigned next;
> > > +   struct flush_pd_queue_entry *entries;
> > > +};
> > > +
> > > static DEFINE_PER_CPU(struct flush_queue, flush_queue);
> > > +static DEFINE_PER_CPU(struct flush_pd_queue, flush_pd_queue);
> > >
> > > static atomic_t queue_timer_on;
> > > static struct timer_list queue_timer;
> > > @@ -2227,16 +2238,20 @@ static struct iommu_group
> > > *amd_iommu_device_group(struct device *dev)
> > >  *
> > >
> > > ***********************************************************
> > > ******************/
> > >
> > > -static void __queue_flush(struct flush_queue *queue)
> > > +static void __queue_flush(struct flush_queue *queue,
> > > +                     struct flush_pd_queue *pd_queue)
> > > {
> > > -   struct protection_domain *domain;
> > >     unsigned long flags;
> > >     int idx;
> > >
> > >     /* First flush TLB of all known domains */
> > >     spin_lock_irqsave(&amd_iommu_pd_lock, flags);
> > > -   list_for_each_entry(domain, &amd_iommu_pd_list, list)
> > > -           domain_flush_tlb(domain);
> > > +   for (idx = 0; idx < pd_queue->next; ++idx) {
> > > +           struct flush_pd_queue_entry *entry;
> > > +
> > > +           entry = pd_queue->entries + idx;
> > > +           domain_flush_tlb(entry->pd);
> > > +   }
> > >     spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
> > >
> > >     /* Wait until flushes have completed */
> > > @@ -2255,6 +2270,7 @@ static void __queue_flush(struct flush_queue
> > > *queue)
> > >             entry->dma_dom = NULL;
> > >     }
> > >
> > > +   pd_queue->next = 0;
> > >     queue->next = 0;
> > > }
> > >
> > > @@ -2263,13 +2279,15 @@ static void queue_flush_all(void)
> > >     int cpu;
> > >
> > >     for_each_possible_cpu(cpu) {
> > > +           struct flush_pd_queue *pd_queue;
> > >             struct flush_queue *queue;
> > >             unsigned long flags;
> > >
> > >             queue = per_cpu_ptr(&flush_queue, cpu);
> > > +           pd_queue = per_cpu_ptr(&flush_pd_queue, cpu);
> > >             spin_lock_irqsave(&queue->lock, flags);
> > >             if (queue->next > 0)
> > > -                   __queue_flush(queue);
> > > +                   __queue_flush(queue, pd_queue);
> > >             spin_unlock_irqrestore(&queue->lock, flags);
> > >     }
> > > }
> > > @@ -2283,6 +2301,8 @@ static void queue_flush_timeout(unsigned long
> > > unsused)
> > > static void queue_add(struct dma_ops_domain *dma_dom,
> > >                   unsigned long address, unsigned long pages)
> > > {
> > > +   struct flush_pd_queue_entry *pd_entry;
> > > +   struct flush_pd_queue *pd_queue;
> > >     struct flush_queue_entry *entry;
> > >     struct flush_queue *queue;
> > >     unsigned long flags;
> > > @@ -2292,10 +2312,22 @@ static void queue_add(struct dma_ops_domain
> > > *dma_dom,
> > >     address >>= PAGE_SHIFT;
> > >
> > >     queue = get_cpu_ptr(&flush_queue);
> > > +   pd_queue = get_cpu_ptr(&flush_pd_queue);
> > >     spin_lock_irqsave(&queue->lock, flags);
> > >
> > >     if (queue->next == FLUSH_QUEUE_SIZE)
> > > -           __queue_flush(queue);
> > > +           __queue_flush(queue, pd_queue);
> > > +
> > > +   for (idx = 0; idx < pd_queue->next; ++idx) {
> > > +           pd_entry = pd_queue->entries + idx;
> > > +           if (pd_entry->pd == &dma_dom->domain)
> > > +                   break;
> > > +   }
> > > +   if (idx == pd_queue->next) {
> > > +           /* New protection domain, add it to the list */
> > > +           pd_entry = pd_queue->entries + pd_queue->next++;
> > > +           pd_entry->pd = &dma_dom->domain;
> > > +   }
> > >
> > >     idx   = queue->next++;
> > >     entry = queue->entries + idx;
> > > @@ -2309,6 +2341,7 @@ static void queue_add(struct dma_ops_domain
> > > *dma_dom,
> > >     if (atomic_cmpxchg(&queue_timer_on, 0, 1) == 0)
> > >             mod_timer(&queue_timer, jiffies + msecs_to_jiffies(10));
> > >
> > > +   put_cpu_ptr(&flush_pd_queue);
> > >     put_cpu_ptr(&flush_queue);
> > > }
> > >
> > > @@ -2810,6 +2843,8 @@ int __init amd_iommu_init_api(void)
> > >             return ret;
> > >
> > >     for_each_possible_cpu(cpu) {
> > > +           struct flush_pd_queue *pd_queue =
> > > per_cpu_ptr(&flush_pd_queue,
> > > +                                                         cpu);
> > >             struct flush_queue *queue = per_cpu_ptr(&flush_queue,
> > > cpu);
> > >
> > >             queue->entries = kzalloc(FLUSH_QUEUE_SIZE *
> > > @@ -2819,6 +2854,12 @@ int __init amd_iommu_init_api(void)
> > >                     goto out_put_iova;
> > >
> > >             spin_lock_init(&queue->lock);
> > > +
> > > +           pd_queue->entries = kzalloc(FLUSH_QUEUE_SIZE *
> > > +                                       sizeof(*pd_queue->entries),
> > > +                                       GFP_KERNEL);
> > > +           if (!pd_queue->entries)
> > > +                   goto out_put_iova;
> > >     }
> > >
> > >     err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
> > > @@ -2836,9 +2877,12 @@ int __init amd_iommu_init_api(void)
> > >
> > > out_put_iova:
> > >     for_each_possible_cpu(cpu) {
> > > +           struct flush_pd_queue *pd_queue =
> > > per_cpu_ptr(&flush_pd_queue,
> > > +                                                         cpu);
> > >             struct flush_queue *queue = per_cpu_ptr(&flush_queue,
> > > cpu);
> > >
> > >             kfree(queue->entries);
> > > +           kfree(pd_queue->entries);
> > >     }
> > >
> > >     return -ENOMEM;
> >
> > Craig and Jan, can you please confirm whether this patch fixes the
> > IOMMU timeout errors you encountered before? If it does, then this is
> > a better implementation of the fix I provided few weeks back.
>
> I have only remote access to the machine, so I won't be able to test
> until June 22nd.
>
> Jan
>
> >
> > Thanks,
> > Arindam
>
> --
> Jan Vesely <jan.vesely-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>


* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]             ` <1496954035.4188.1.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
  2017-06-08 23:31               ` Craig Stein
@ 2017-06-21 16:20               ` Jan Vesely
       [not found]                 ` <1498062018.17007.6.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
  1 sibling, 1 reply; 24+ messages in thread
From: Jan Vesely @ 2017-06-21 16:20 UTC (permalink / raw)
  To: Nath, Arindam, Craig Stein, Lendacky, Thomas,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA



Hi Arindam,

has this patch been replaced by Joerg's "[PATCH 0/7] iommu/amd:
Optimize iova queue flushing" series?

Jan

On Thu, 2017-06-08 at 22:33 +0200, Jan Vesely wrote:
> On Tue, 2017-06-06 at 10:02 +0000, Nath, Arindam wrote:
> > > [Tom Lendacky's patch 3/3 quoted in full, snipped]
> > 
> > Craig and Jan, can you please confirm whether this patch fixes the
> > IOMMU timeout errors you encountered before? If it does, then this is
> > a better implementation of the fix I provided few weeks back.
> 
> I have only remote access to the machine, so I won't be able to test
> until June 22nd.
> 
> Jan
> 
> > 
> > Thanks,
> > Arindam
> 
> 


* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                 ` <1498062018.17007.6.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
@ 2017-06-21 17:01                   ` Tom Lendacky
       [not found]                     ` <bf685f44-019c-4c21-25d4-6a6ea647b7cc-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Tom Lendacky @ 2017-06-21 17:01 UTC (permalink / raw)
  To: Jan Vesely, Nath, Arindam, Craig Stein,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On 6/21/2017 11:20 AM, Jan Vesely wrote:
> Hi Arindam,
> 
> has this patch been replaced by Joerg's "[PATCH 0/7] iommu/amd:
> Optimize iova queue flushing" series?

Yes, Joerg's patches replaced this patch.  He applied just the first two
patches of this series.

Thanks,
Tom

> 
> Jan
> 
> On Thu, 2017-06-08 at 22:33 +0200, Jan Vesely wrote:
>> On Tue, 2017-06-06 at 10:02 +0000, Nath, Arindam wrote:
>>>> -----Original Message-----
>>>> From: Lendacky, Thomas
>>>> Sent: Tuesday, June 06, 2017 1:23 AM
>>>> To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
>>>> Cc: Nath, Arindam <Arindam.Nath-5C7GfCeVMHo@public.gmane.org>; Joerg Roedel
>>>> <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>; Duran, Leo <leo.duran-5C7GfCeVMHo@public.gmane.org>; Suthikulpanit,
>>>> Suravee <Suravee.Suthikulpanit-5C7GfCeVMHo@public.gmane.org>
>>>> Subject: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
>>>>
>>>> After reducing the amount of MMIO performed by the IOMMU during
>>>> operation,
>>>> perf data shows that flushing the TLB for all protection domains during
>>>> DMA unmapping is a performance issue. It is not necessary to flush the
>>>> TLBs for all protection domains, only the protection domains associated
>>>> with iova's on the flush queue.
>>>>
>>>> Create a separate queue that tracks the protection domains associated with
>>>> the iova's on the flush queue. This new queue optimizes the flushing of
>>>> TLBs to the required protection domains.
>>>>
>>>> Reviewed-by: Arindam Nath <arindam.nath-5C7GfCeVMHo@public.gmane.org>
>>>> Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
>>>> ---
>>>> drivers/iommu/amd_iommu.c |   56
>>>> ++++++++++++++++++++++++++++++++++++++++-----
>>>> 1 file changed, 50 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
>>>> index 856103b..a5e77f0 100644
>>>> --- a/drivers/iommu/amd_iommu.c
>>>> +++ b/drivers/iommu/amd_iommu.c
>>>> @@ -103,7 +103,18 @@ struct flush_queue {
>>>> 	struct flush_queue_entry *entries;
>>>> };
>>>>
>>>> +struct flush_pd_queue_entry {
>>>> +	struct protection_domain *pd;
>>>> +};
>>>> +
>>>> +struct flush_pd_queue {
>>>> +	/* No lock needed, protected by flush_queue lock */
>>>> +	unsigned next;
>>>> +	struct flush_pd_queue_entry *entries;
>>>> +};
>>>> +
>>>> static DEFINE_PER_CPU(struct flush_queue, flush_queue);
>>>> +static DEFINE_PER_CPU(struct flush_pd_queue, flush_pd_queue);
>>>>
>>>> static atomic_t queue_timer_on;
>>>> static struct timer_list queue_timer;
>>>> @@ -2227,16 +2238,20 @@ static struct iommu_group
>>>> *amd_iommu_device_group(struct device *dev)
>>>>   *
>>>>
>>>> ***********************************************************
>>>> ******************/
>>>>
>>>> -static void __queue_flush(struct flush_queue *queue)
>>>> +static void __queue_flush(struct flush_queue *queue,
>>>> +			  struct flush_pd_queue *pd_queue)
>>>> {
>>>> -	struct protection_domain *domain;
>>>> 	unsigned long flags;
>>>> 	int idx;
>>>>
>>>> 	/* First flush TLB of all known domains */
>>>> 	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
>>>> -	list_for_each_entry(domain, &amd_iommu_pd_list, list)
>>>> -		domain_flush_tlb(domain);
>>>> +	for (idx = 0; idx < pd_queue->next; ++idx) {
>>>> +		struct flush_pd_queue_entry *entry;
>>>> +
>>>> +		entry = pd_queue->entries + idx;
>>>> +		domain_flush_tlb(entry->pd);
>>>> +	}
>>>> 	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
>>>>
>>>> 	/* Wait until flushes have completed */
>>>> @@ -2255,6 +2270,7 @@ static void __queue_flush(struct flush_queue
>>>> *queue)
>>>> 		entry->dma_dom = NULL;
>>>> 	}
>>>>
>>>> +	pd_queue->next = 0;
>>>> 	queue->next = 0;
>>>> }
>>>>
>>>> @@ -2263,13 +2279,15 @@ static void queue_flush_all(void)
>>>> 	int cpu;
>>>>
>>>> 	for_each_possible_cpu(cpu) {
>>>> +		struct flush_pd_queue *pd_queue;
>>>> 		struct flush_queue *queue;
>>>> 		unsigned long flags;
>>>>
>>>> 		queue = per_cpu_ptr(&flush_queue, cpu);
>>>> +		pd_queue = per_cpu_ptr(&flush_pd_queue, cpu);
>>>> 		spin_lock_irqsave(&queue->lock, flags);
>>>> 		if (queue->next > 0)
>>>> -			__queue_flush(queue);
>>>> +			__queue_flush(queue, pd_queue);
>>>> 		spin_unlock_irqrestore(&queue->lock, flags);
>>>> 	}
>>>> }
>>>> @@ -2283,6 +2301,8 @@ static void queue_flush_timeout(unsigned long
>>>> unsused)
>>>> static void queue_add(struct dma_ops_domain *dma_dom,
>>>> 		      unsigned long address, unsigned long pages)
>>>> {
>>>> +	struct flush_pd_queue_entry *pd_entry;
>>>> +	struct flush_pd_queue *pd_queue;
>>>> 	struct flush_queue_entry *entry;
>>>> 	struct flush_queue *queue;
>>>> 	unsigned long flags;
>>>> @@ -2292,10 +2312,22 @@ static void queue_add(struct dma_ops_domain
>>>> *dma_dom,
>>>> 	address >>= PAGE_SHIFT;
>>>>
>>>> 	queue = get_cpu_ptr(&flush_queue);
>>>> +	pd_queue = get_cpu_ptr(&flush_pd_queue);
>>>> 	spin_lock_irqsave(&queue->lock, flags);
>>>>
>>>> 	if (queue->next == FLUSH_QUEUE_SIZE)
>>>> -		__queue_flush(queue);
>>>> +		__queue_flush(queue, pd_queue);
>>>> +
>>>> +	for (idx = 0; idx < pd_queue->next; ++idx) {
>>>> +		pd_entry = pd_queue->entries + idx;
>>>> +		if (pd_entry->pd == &dma_dom->domain)
>>>> +			break;
>>>> +	}
>>>> +	if (idx == pd_queue->next) {
>>>> +		/* New protection domain, add it to the list */
>>>> +		pd_entry = pd_queue->entries + pd_queue->next++;
>>>> +		pd_entry->pd = &dma_dom->domain;
>>>> +	}
>>>>
>>>> 	idx   = queue->next++;
>>>> 	entry = queue->entries + idx;
>>>> @@ -2309,6 +2341,7 @@ static void queue_add(struct dma_ops_domain
>>>> *dma_dom,
>>>> 	if (atomic_cmpxchg(&queue_timer_on, 0, 1) == 0)
>>>> 		mod_timer(&queue_timer, jiffies + msecs_to_jiffies(10));
>>>>
>>>> +	put_cpu_ptr(&flush_pd_queue);
>>>> 	put_cpu_ptr(&flush_queue);
>>>> }
>>>>
>>>> @@ -2810,6 +2843,8 @@ int __init amd_iommu_init_api(void)
>>>> 		return ret;
>>>>
>>>> 	for_each_possible_cpu(cpu) {
>>>> +		struct flush_pd_queue *pd_queue =
>>>> per_cpu_ptr(&flush_pd_queue,
>>>> +							      cpu);
>>>> 		struct flush_queue *queue = per_cpu_ptr(&flush_queue,
>>>> cpu);
>>>>
>>>> 		queue->entries = kzalloc(FLUSH_QUEUE_SIZE *
>>>> @@ -2819,6 +2854,12 @@ int __init amd_iommu_init_api(void)
>>>> 			goto out_put_iova;
>>>>
>>>> 		spin_lock_init(&queue->lock);
>>>> +
>>>> +		pd_queue->entries = kzalloc(FLUSH_QUEUE_SIZE *
>>>> +					    sizeof(*pd_queue->entries),
>>>> +					    GFP_KERNEL);
>>>> +		if (!pd_queue->entries)
>>>> +			goto out_put_iova;
>>>> 	}
>>>>
>>>> 	err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
>>>> @@ -2836,9 +2877,12 @@ int __init amd_iommu_init_api(void)
>>>>
>>>> out_put_iova:
>>>> 	for_each_possible_cpu(cpu) {
>>>> +		struct flush_pd_queue *pd_queue =
>>>> per_cpu_ptr(&flush_pd_queue,
>>>> +							      cpu);
>>>> 		struct flush_queue *queue = per_cpu_ptr(&flush_queue,
>>>> cpu);
>>>>
>>>> 		kfree(queue->entries);
>>>> +		kfree(pd_queue->entries);
>>>> 	}
>>>>
>>>> 	return -ENOMEM;
>>>
>>> Craig and Jan, can you please confirm whether this patch fixes the
>>> IOMMU timeout errors you encountered before? If it does, then this is
>>> a better implementation of the fix I provided few weeks back.
>>
>> I have only remote access to the machine, so I won't be able to test
>> until June 22nd.
>>
>> Jan
>>
>>>
>>> Thanks,
>>> Arindam
>>
>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                     ` <bf685f44-019c-4c21-25d4-6a6ea647b7cc-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-21 21:09                       ` Jan Vesely
       [not found]                         ` <1498079371.17007.18.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Vesely @ 2017-06-21 21:09 UTC (permalink / raw)
  To: Tom Lendacky, Nath, Arindam, Craig Stein,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA


[-- Attachment #1.1: Type: text/plain, Size: 8932 bytes --]

On Wed, 2017-06-21 at 12:01 -0500, Tom Lendacky wrote:
> On 6/21/2017 11:20 AM, Jan Vesely wrote:
> > Hi Arindam,
> > 
> > has this patch been replaced by Joerg's "[PATCH 0/7] iommu/amd:
> > Optimize iova queue flushing" series?
> 
> Yes, Joerg's patches replaced this patch.  He applied just the first two
> patches of this series.

Joerg's patches applied on top of 4.10.17 do not solve my issue (do I
need the first two patches of this series?). The machine still hangs on
boot with a flood of IOMMU wait loop timed out messages.

On the other hand, patch 3/3 v1 applied on top of 4.10.17 fixes the
problem and the machine boots successfully.

regards,
Jan


> 
> Thanks,
> Tom
> 
> > 
> > Jan
> > 
> > On Thu, 2017-06-08 at 22:33 +0200, Jan Vesely wrote:
> > > On Tue, 2017-06-06 at 10:02 +0000, Nath, Arindam wrote:
> > > > > -----Original Message-----
> > > > > From: Lendacky, Thomas
> > > > > Sent: Tuesday, June 06, 2017 1:23 AM
> > > > > To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> > > > > Cc: Nath, Arindam <Arindam.Nath-5C7GfCeVMHo@public.gmane.org>; Joerg Roedel
> > > > > <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>; Duran, Leo <leo.duran-5C7GfCeVMHo@public.gmane.org>; Suthikulpanit,
> > > > > Suravee <Suravee.Suthikulpanit-5C7GfCeVMHo@public.gmane.org>
> > > > > Subject: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
> > > > > 
> > > > > After reducing the amount of MMIO performed by the IOMMU during
> > > > > operation,
> > > > > perf data shows that flushing the TLB for all protection domains during
> > > > > DMA unmapping is a performance issue. It is not necessary to flush the
> > > > > TLBs for all protection domains, only the protection domains associated
> > > > > with iova's on the flush queue.
> > > > > 
> > > > > Create a separate queue that tracks the protection domains associated with
> > > > > the iova's on the flush queue. This new queue optimizes the flushing of
> > > > > TLBs to the required protection domains.
> > > > > 
> > > > > Reviewed-by: Arindam Nath <arindam.nath-5C7GfCeVMHo@public.gmane.org>
> > > > > Signed-off-by: Tom Lendacky <thomas.lendacky-5C7GfCeVMHo@public.gmane.org>
> > > > > ---
> > > > > drivers/iommu/amd_iommu.c |   56
> > > > > ++++++++++++++++++++++++++++++++++++++++-----
> > > > > 1 file changed, 50 insertions(+), 6 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> > > > > index 856103b..a5e77f0 100644
> > > > > --- a/drivers/iommu/amd_iommu.c
> > > > > +++ b/drivers/iommu/amd_iommu.c
> > > > > @@ -103,7 +103,18 @@ struct flush_queue {
> > > > > 	struct flush_queue_entry *entries;
> > > > > };
> > > > > 
> > > > > +struct flush_pd_queue_entry {
> > > > > +	struct protection_domain *pd;
> > > > > +};
> > > > > +
> > > > > +struct flush_pd_queue {
> > > > > +	/* No lock needed, protected by flush_queue lock */
> > > > > +	unsigned next;
> > > > > +	struct flush_pd_queue_entry *entries;
> > > > > +};
> > > > > +
> > > > > static DEFINE_PER_CPU(struct flush_queue, flush_queue);
> > > > > +static DEFINE_PER_CPU(struct flush_pd_queue, flush_pd_queue);
> > > > > 
> > > > > static atomic_t queue_timer_on;
> > > > > static struct timer_list queue_timer;
> > > > > @@ -2227,16 +2238,20 @@ static struct iommu_group
> > > > > *amd_iommu_device_group(struct device *dev)
> > > > >   *
> > > > > 
> > > > > ***********************************************************
> > > > > ******************/
> > > > > 
> > > > > -static void __queue_flush(struct flush_queue *queue)
> > > > > +static void __queue_flush(struct flush_queue *queue,
> > > > > +			  struct flush_pd_queue *pd_queue)
> > > > > {
> > > > > -	struct protection_domain *domain;
> > > > > 	unsigned long flags;
> > > > > 	int idx;
> > > > > 
> > > > > 	/* First flush TLB of all known domains */
> > > > > 	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
> > > > > -	list_for_each_entry(domain, &amd_iommu_pd_list, list)
> > > > > -		domain_flush_tlb(domain);
> > > > > +	for (idx = 0; idx < pd_queue->next; ++idx) {
> > > > > +		struct flush_pd_queue_entry *entry;
> > > > > +
> > > > > +		entry = pd_queue->entries + idx;
> > > > > +		domain_flush_tlb(entry->pd);
> > > > > +	}
> > > > > 	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
> > > > > 
> > > > > 	/* Wait until flushes have completed */
> > > > > @@ -2255,6 +2270,7 @@ static void __queue_flush(struct flush_queue
> > > > > *queue)
> > > > > 		entry->dma_dom = NULL;
> > > > > 	}
> > > > > 
> > > > > +	pd_queue->next = 0;
> > > > > 	queue->next = 0;
> > > > > }
> > > > > 
> > > > > @@ -2263,13 +2279,15 @@ static void queue_flush_all(void)
> > > > > 	int cpu;
> > > > > 
> > > > > 	for_each_possible_cpu(cpu) {
> > > > > +		struct flush_pd_queue *pd_queue;
> > > > > 		struct flush_queue *queue;
> > > > > 		unsigned long flags;
> > > > > 
> > > > > 		queue = per_cpu_ptr(&flush_queue, cpu);
> > > > > +		pd_queue = per_cpu_ptr(&flush_pd_queue, cpu);
> > > > > 		spin_lock_irqsave(&queue->lock, flags);
> > > > > 		if (queue->next > 0)
> > > > > -			__queue_flush(queue);
> > > > > +			__queue_flush(queue, pd_queue);
> > > > > 		spin_unlock_irqrestore(&queue->lock, flags);
> > > > > 	}
> > > > > }
> > > > > @@ -2283,6 +2301,8 @@ static void queue_flush_timeout(unsigned long
> > > > > unsused)
> > > > > static void queue_add(struct dma_ops_domain *dma_dom,
> > > > > 		      unsigned long address, unsigned long pages)
> > > > > {
> > > > > +	struct flush_pd_queue_entry *pd_entry;
> > > > > +	struct flush_pd_queue *pd_queue;
> > > > > 	struct flush_queue_entry *entry;
> > > > > 	struct flush_queue *queue;
> > > > > 	unsigned long flags;
> > > > > @@ -2292,10 +2312,22 @@ static void queue_add(struct dma_ops_domain
> > > > > *dma_dom,
> > > > > 	address >>= PAGE_SHIFT;
> > > > > 
> > > > > 	queue = get_cpu_ptr(&flush_queue);
> > > > > +	pd_queue = get_cpu_ptr(&flush_pd_queue);
> > > > > 	spin_lock_irqsave(&queue->lock, flags);
> > > > > 
> > > > > 	if (queue->next == FLUSH_QUEUE_SIZE)
> > > > > -		__queue_flush(queue);
> > > > > +		__queue_flush(queue, pd_queue);
> > > > > +
> > > > > +	for (idx = 0; idx < pd_queue->next; ++idx) {
> > > > > +		pd_entry = pd_queue->entries + idx;
> > > > > +		if (pd_entry->pd == &dma_dom->domain)
> > > > > +			break;
> > > > > +	}
> > > > > +	if (idx == pd_queue->next) {
> > > > > +		/* New protection domain, add it to the list */
> > > > > +		pd_entry = pd_queue->entries + pd_queue->next++;
> > > > > +		pd_entry->pd = &dma_dom->domain;
> > > > > +	}
> > > > > 
> > > > > 	idx   = queue->next++;
> > > > > 	entry = queue->entries + idx;
> > > > > @@ -2309,6 +2341,7 @@ static void queue_add(struct dma_ops_domain
> > > > > *dma_dom,
> > > > > 	if (atomic_cmpxchg(&queue_timer_on, 0, 1) == 0)
> > > > > 		mod_timer(&queue_timer, jiffies + msecs_to_jiffies(10));
> > > > > 
> > > > > +	put_cpu_ptr(&flush_pd_queue);
> > > > > 	put_cpu_ptr(&flush_queue);
> > > > > }
> > > > > 
> > > > > @@ -2810,6 +2843,8 @@ int __init amd_iommu_init_api(void)
> > > > > 		return ret;
> > > > > 
> > > > > 	for_each_possible_cpu(cpu) {
> > > > > +		struct flush_pd_queue *pd_queue =
> > > > > per_cpu_ptr(&flush_pd_queue,
> > > > > +							      cpu);
> > > > > 		struct flush_queue *queue = per_cpu_ptr(&flush_queue,
> > > > > cpu);
> > > > > 
> > > > > 		queue->entries = kzalloc(FLUSH_QUEUE_SIZE *
> > > > > @@ -2819,6 +2854,12 @@ int __init amd_iommu_init_api(void)
> > > > > 			goto out_put_iova;
> > > > > 
> > > > > 		spin_lock_init(&queue->lock);
> > > > > +
> > > > > +		pd_queue->entries = kzalloc(FLUSH_QUEUE_SIZE *
> > > > > +					    sizeof(*pd_queue->entries),
> > > > > +					    GFP_KERNEL);
> > > > > +		if (!pd_queue->entries)
> > > > > +			goto out_put_iova;
> > > > > 	}
> > > > > 
> > > > > 	err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
> > > > > @@ -2836,9 +2877,12 @@ int __init amd_iommu_init_api(void)
> > > > > 
> > > > > out_put_iova:
> > > > > 	for_each_possible_cpu(cpu) {
> > > > > +		struct flush_pd_queue *pd_queue =
> > > > > per_cpu_ptr(&flush_pd_queue,
> > > > > +							      cpu);
> > > > > 		struct flush_queue *queue = per_cpu_ptr(&flush_queue,
> > > > > cpu);
> > > > > 
> > > > > 		kfree(queue->entries);
> > > > > +		kfree(pd_queue->entries);
> > > > > 	}
> > > > > 
> > > > > 	return -ENOMEM;
> > > > 
> > > > Craig and Jan, can you please confirm whether this patch fixes the
> > > > IOMMU timeout errors you encountered before? If it does, then this is
> > > > a better implementation of the fix I provided few weeks back.
> > > 
> > > I have only remote access to the machine, so I won't be able to test
> > > until June 22nd.
> > > 
> > > Jan
> > > 
> > > > 
> > > > Thanks,
> > > > Arindam
> > > 
> > > 

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                         ` <1498079371.17007.18.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
@ 2017-06-22  9:20                           ` Joerg Roedel
       [not found]                             ` <20170622092053.GV30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Joerg Roedel @ 2017-06-22  9:20 UTC (permalink / raw)
  To: Jan Vesely
  Cc: Tom Lendacky, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Craig Stein

On Wed, Jun 21, 2017 at 05:09:31PM -0400, Jan Vesely wrote:
> On Wed, 2017-06-21 at 12:01 -0500, Tom Lendacky wrote:
> > On 6/21/2017 11:20 AM, Jan Vesely wrote:
> > > Hi Arindam,
> > > 
> > > has this patch been replaced by Joerg's "[PATCH 0/7] iommu/amd:
> > > Optimize iova queue flushing" series?
> > 
> > Yes, Joerg's patches replaced this patch.  He applied just the first two
> > patches of this series.
> 
> Joerg's patches applied on top of 4.10.17 do not solve my issue (do I
> need the first two patches of this series?). the machine still hangs on
> boot with a flood of IOMMU wait loop timed out messages.
> 
> on the other hand patch 3/3 v1 applied on top of 4.10.17 fixes the
> problem and the machine boots successfully

Interesting. I did some measurements on the IOTLB flush-rate with my
network load-test. This test is designed to heavily exercise the IOMMU
map/unmap path and thus cause many IOTLB invalidations too.

Results are:

	Current upstream v4.12-rc6:	~147000 flushes/s
	With Tom's patches:		  ~5900 flushes/s
	With my patches:		  ~1200 flushes/s

So while Tom's patches also get the flush-rate down significantly, it is
even lower with my patches. This indicates that the problem is
triggerable even with low flush rates.
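
(For illustration only, one crude way to derive a flushes/s figure like
the numbers above is to bump a counter wherever the driver flushes a
domain TLB and log it once per second; the real instrumentation behind
these measurements is not shown in this thread:)

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

static atomic64_t iotlb_flush_count = ATOMIC64_INIT(0);
static unsigned long flush_stamp;

static void count_domain_flush(void)
{
	atomic64_inc(&iotlb_flush_count);

	if (time_after(jiffies, flush_stamp + HZ)) {
		flush_stamp = jiffies;
		pr_info("AMD-Vi: ~%lld IOTLB flushes in the last second\n",
			atomic64_xchg(&iotlb_flush_count, 0));
	}
}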

But I have no idea why it still triggers with my patches, but not with
Tom's. The approaches follow the same idea of only flushing domains that
have map/unmap operations on them.

I really think we need the patch to blacklist ATS on these GPUs
upstream.
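
(For illustration, such a blacklist could be wired up roughly like the
sketch below; the device ID and the helper name are placeholders, not an
actual patch:)

#include <linux/pci.h>

/* Placeholder table of devices whose ATS implementation is considered
 * broken; the single ID below is only an example, not a vetted list. */
static const struct pci_device_id ats_blacklist[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, 0x6900) },
	{ }
};

static bool device_ats_blacklisted(struct pci_dev *pdev)
{
	/* Callers would skip pci_enable_ats() when this returns true */
	return pci_match_id(ats_blacklist, pdev) != NULL;
}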

Regards,

	Joerg

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                             ` <20170622092053.GV30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
@ 2017-06-22 15:13                               ` Jan Vesely
       [not found]                                 ` <1498144389.17007.25.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Vesely @ 2017-06-22 15:13 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: Tom Lendacky, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Craig Stein


[-- Attachment #1.1: Type: text/plain, Size: 2121 bytes --]

On Thu, 2017-06-22 at 11:20 +0200, Joerg Roedel wrote:
> On Wed, Jun 21, 2017 at 05:09:31PM -0400, Jan Vesely wrote:
> > On Wed, 2017-06-21 at 12:01 -0500, Tom Lendacky wrote:
> > > On 6/21/2017 11:20 AM, Jan Vesely wrote:
> > > > Hi Arindam,
> > > > 
> > > > has this patch been replaced by Joerg's "[PATCH 0/7] iommu/amd:
> > > > Optimize iova queue flushing" series?
> > > 
> > > Yes, Joerg's patches replaced this patch.  He applied just the first two
> > > patches of this series.
> > 
> > Joerg's patches applied on top of 4.10.17 do not solve my issue (do I
> > need the first two patches of this series?). the machine still hangs on
> > boot with a flood of IOMMU wait loop timed out messages.
> > 
> > on the other hand patch 3/3 v1 applied on top of 4.10.17 fixes the
> > problem and the machine boots successfully
> 
> Interesting. I did some measurements on the IOTLB flush-rate with my
> network load-test. This test is designed to heavily excerise the IOMMU
> map/unmap path and thus cause many IOTLB invalidations too.

It looks like I tested different patches.
linux-4.10.17 with both
"iommu/amd: Optimize iova queue flushing"
and
"iommu/amd: Disable previously enabled IOMMUs at boot"
(I haven't tested the series independently)

works OK. The machine booted successfully and I was able to test Clover-based
OpenCL and simple OpenGL on both the iGPU (Carrizo) and the dGPU (Iceland).

thanks and sorry for the confusion,
Jan

> 
> Results are:
> 
> 	Current upstream v4.12-rc6:	~147000 flushes/s
> 	With Toms patches:		  ~5900 flushes/s
> 	With my patches:		  ~1200 flushes/s
> 
> So while Toms patches also get the flush-rate down significantly, it is
> even lower with my patches. This indicates that the problem is
> triggerable even with low flush rates.
> 
> But I have no idea why it still triggers with my patches, but not with
> Toms. The approaches follow the same idea of only flushing domains that
> have map/unmap operations on them.
> 
> I really think we need the patch to blacklist ATS on these GPUs
> upstream.
> 
> Regards,
> 
> 	Joerg
> 

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                                 ` <1498144389.17007.25.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
@ 2017-06-22 21:57                                   ` Joerg Roedel
       [not found]                                     ` <20170622215735.GW30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Joerg Roedel @ 2017-06-22 21:57 UTC (permalink / raw)
  To: Jan Vesely
  Cc: Tom Lendacky, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Craig Stein

On Thu, Jun 22, 2017 at 11:13:09AM -0400, Jan Vesely wrote:
> It looks like I tested different patches.
> linux-4.10.17 with both
> "iommu/amd: Optimize iova queue flushing"

This patch isn't in my tree and will not go upstream.

> and
> "iommu/amd: Disable previously enabled IOMMUs at boot"

This patch solves a different problem.

> (I haven't tested the series independently)
> 
> works OK. The machine booted successfully and I was able to test clover
> based OpenCL and simple OpenGL on both iGPU(carrizo) and dGPU(iceland).

For a conclusive test please use what is in the iommu-tree, as this is
what I plan to send upstream. You can use the 'next' branch of

	git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git

to get all patches, including my flush optimization series.


Thanks,

	Joerg

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                                     ` <20170622215735.GW30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
@ 2017-06-23 14:20                                       ` Jan Vesely
       [not found]                                         ` <1498227647.17007.31.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Vesely @ 2017-06-23 14:20 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: Tom Lendacky, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Craig Stein


[-- Attachment #1.1: Type: text/plain, Size: 5833 bytes --]

On Thu, 2017-06-22 at 23:57 +0200, Joerg Roedel wrote:
> On Thu, Jun 22, 2017 at 11:13:09AM -0400, Jan Vesely wrote:
> > It looks like I tested different patches.
> > linux-4.10.17 with both
> > "iommu/amd: Optimize iova queue flushing"
> 
> This patch isn't in my tree and will not go upstream.
> 
> > and
> > "iommu/amd: Disable previously enabled IOMMUs at boot"
> 
> This patch solves a different problem.
> 
> > (I haven't tested the series independently)
> > 
> > works OK. The machine booted successfully and I was able to test clover
> > based OpenCL and simple OpenGL on both iGPU(carrizo) and dGPU(iceland).
> 
> For a conclusive test please use what is in the iommu-tree, as this is
> what I plan to send upstream. You can use the 'next' branch of
> 
> 	git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git

Tested commit c71bf5f133056aae71e8ae7ea66240574bd44f54.

The machine boots and runs OK, although it takes a few minutes to boot up
(looks USB-related).

OpenGL and OpenCL run OK on both GPUs.

I was able to trigger "Completion-Wait loop timed out" messages in the
following situation:
Hung OpenCL task running on dGPU.
dGPU goes to sleep.
SIGTERM to the hung task.
It seems to recover OK after the dGPU is powered back on.

dmesg:
[ 1628.049683] amdgpu: [powerplay] VI should always have 2 performance levels
[ 1628.845195] amdgpu 0000:07:00.0: GPU pci config reset
[ 1667.270351] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.270437] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.270491] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.270505] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.270556] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.270607] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.270614] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.270664] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.270714] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.270721] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.270770] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.270846] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.270868] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.270922] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.270982] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.270992] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.271043] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.271096] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.271109] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.271164] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.271230] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.271245] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.271338] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.271394] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.271403] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.271458] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.271518] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.271533] amdgpu 0000:07:00.0: couldn't schedule ib on ring <sdma0>
[ 1667.271588] [drm:amdgpu_job_run [amdgpu]] *ERROR* Error scheduling IBs (-22)
[ 1667.271644] [drm:amd_sched_main [amdgpu]] *ERROR* Failed to run job!
[ 1667.426742] AMD-Vi: Completion-Wait loop timed out
[ 1667.570025] AMD-Vi: Completion-Wait loop timed out
[ 1667.713326] AMD-Vi: Completion-Wait loop timed out
[ 1667.867561] AMD-Vi: Completion-Wait loop timed out
[ 1668.010886] AMD-Vi: Completion-Wait loop timed out
[ 1668.154207] AMD-Vi: Completion-Wait loop timed out
[ 1668.283193] AMD-Vi: Event logged [
[ 1668.283201] IOTLB_INV_TIMEOUT device=07:00.0 address=0x000000040ce6e240]
[ 1668.430357] AMD-Vi: Completion-Wait loop timed out
[ 1668.581169] AMD-Vi: Completion-Wait loop timed out
[ 1668.718046] AMD-Vi: Completion-Wait loop timed out
[ 1668.854914] AMD-Vi: Completion-Wait loop timed out
[ 1668.991774] AMD-Vi: Completion-Wait loop timed out
[ 1669.128638] AMD-Vi: Completion-Wait loop timed out
[ 1669.272391] AMD-Vi: Completion-Wait loop timed out
[ 1669.285193] AMD-Vi: Event logged [
[ 1669.285200] IOTLB_INV_TIMEOUT device=07:00.0 address=0x000000040ce6e2b0]
[ 1669.285756] [drm] PCIE GART of 3072M enabled (table at 0x0000000000040000).
[ 1669.288274] amdgpu: [powerplay] can't get the mac of 5
[ 1669.302600] [drm] ring test on 0 succeeded in 16 usecs
[ 1669.302987] [drm] ring test on 1 succeeded in 17 usecs
[ 1669.303037] [drm] ring test on 2 succeeded in 21 usecs
[ 1669.303063] [drm] ring test on 3 succeeded in 10 usecs
[ 1669.303088] [drm] ring test on 4 succeeded in 10 usecs
[ 1669.303114] [drm] ring test on 5 succeeded in 10 usecs
[ 1669.303142] [drm] ring test on 6 succeeded in 11 usecs
[ 1669.303167] [drm] ring test on 7 succeeded in 10 usecs
[ 1669.303195] [drm] ring test on 8 succeeded in 11 usecs
[ 1669.303229] [drm] ring test on 9 succeeded in 3 usecs
[ 1669.303235] [drm] ring test on 10 succeeded in 3 usecs
[ 1675.029247] amdgpu: [powerplay] VI should always have 2 performance levels
[ 1675.823322] amdgpu 0000:07:00.0: GPU pci config reset

lspci:
07:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI]
Topaz XT [Radeon R7 M260/M265 / M340/M360 / M440/M445] (rev ff)

Jan

> 
> to get all patches, including my flush optimization series.
> 
> 
> Thanks,
> 
> 	Joerg
> 

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                                         ` <1498227647.17007.31.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
@ 2017-06-26 12:14                                           ` Joerg Roedel
       [not found]                                             ` <20170626121430.GX30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Joerg Roedel @ 2017-06-26 12:14 UTC (permalink / raw)
  To: Jan Vesely
  Cc: Tom Lendacky, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Craig Stein

On Fri, Jun 23, 2017 at 10:20:47AM -0400, Jan Vesely wrote:
> I was able to trigger "Completion-Wait loop timed out" messages in the
> following situation:
> Hung OpenCL task running on dGPU.
> dGPU goes to sleep.
> sigterm to hung task.
> it seems to recover OK after the dGPU is powered back on

How does that 'dGPU goes to sleep' work? Do you put it to sleep manually
via sysfs or something? Or is that something that amdgpu does on its
own?

It looks like the GPU just switches the ATS unit off when it goes to
sleep and doesn't answer the invalidation anymore, which explains the
completion-wait timeouts.



	Joerg

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                                             ` <20170626121430.GX30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
@ 2017-06-27 16:24                                               ` Jan Vesely
       [not found]                                                 ` <1498580675.10525.3.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Jan Vesely @ 2017-06-27 16:24 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: Tom Lendacky, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Craig Stein


[-- Attachment #1.1: Type: text/plain, Size: 1465 bytes --]

On Mon, 2017-06-26 at 14:14 +0200, Joerg Roedel wrote:
> On Fri, Jun 23, 2017 at 10:20:47AM -0400, Jan Vesely wrote:
> > I was able to trigger "Completion-Wait loop timed out" messages in the
> > following situation:
> > Hung OpenCL task running on dGPU.
> > dGPU goes to sleep.
> > sigterm to hung task.
> > it seems to recover OK after the dGPU is powered back on
> 
> How does that 'dGPU goes to sleep' work? Do you put it to sleep manually
> via sysfs or something? Or is that something that amdgpu does on its
> own?

AMD folks should be able to provide more details. AFAIK, the driver
uses ACPI methods to power the device on/off. Driver routines wake the
device up before accessing it, and there is a timeout to turn it off
after a few seconds of inactivity.

> 
> It looks like the GPU just switches the ATS unit off when it goes to
> sleep and doesn't answer the invalidation anymore, which explains the
> completion-wait timeouts.

Both MMIO regs and PCIe config regs are turned off, so it would not
surprise me if all PCIe requests were ignored by the device in the off
state. It should be possible to request a device wake-up before
invalidating the relevant IOMMU domain. I'll leave it to more
knowledgeable people to judge whether that's a good idea (we could also
postpone such invalidations until the device is woken by other means).
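
(Purely as a sketch of that wake-before-flush idea, assuming the IOMMU
driver had a hook that knows which struct device is being flushed; the
function name and the hook point are made up:)

#include <linux/pm_runtime.h>

static void flush_domain_with_device_awake(struct device *dev,
					   struct protection_domain *domain)
{
	/* Bring the device out of runtime suspend so it can answer the
	 * ATS invalidation, then let it autosuspend again afterwards. */
	if (pm_runtime_get_sync(dev) < 0) {
		pm_runtime_put_noidle(dev);
		return;
	}

	domain_flush_tlb(domain);
	domain_flush_complete(domain);

	pm_runtime_put(dev);
}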


Jan

> 
> 
> 
> 	Joerg
> 

-- 
Jan Vesely <jan.vesely-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                                                 ` <1498580675.10525.3.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
@ 2017-06-28  8:36                                                   ` Joerg Roedel
       [not found]                                                     ` <20170628083659.GA30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Joerg Roedel @ 2017-06-28  8:36 UTC (permalink / raw)
  To: Jan Vesely, Alexander.Deucher-5C7GfCeVMHo
  Cc: Tom Lendacky, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Craig Stein

[Adding Alex Deucher]

Hey Alex,

On Tue, Jun 27, 2017 at 12:24:35PM -0400, Jan Vesely wrote:
> On Mon, 2017-06-26 at 14:14 +0200, Joerg Roedel wrote:

> > How does that 'dGPU goes to sleep' work? Do you put it to sleep manually
> > via sysfs or something? Or is that something that amdgpu does on its
> > own?
> 
> AMD folks should be able to provide more details. afaik, the driver
> uses ACPI methods to power on/off the device. Driver routines wake the
> device up before accessing it and there is a timeout to turn it off
> after few seconds of inactivity.
> 
> > 
> > It looks like the GPU just switches the ATS unit off when it goes to
> > sleep and doesn't answer the invalidation anymore, which explains the
> > completion-wait timeouts.
> 
> Both MMIO regs and PCIe config regs are turned off so it would not
> surprise me if all PCIe requests were ignored by the device in off
> state. it should be possible to request device wake up before
> invalidating the relevant IOMMU domain. I'll leave to more
> knowledgeable ppl to judge whether it's a good idea (we can also
> postpone such invalidations until the device is woken by other means)

Can you maybe shed some light on how the sleep-mode of these GPUs works?
Is it initiated by the GPU driver or from somewhere else? In the case
discussed here it looks like the ATS unit of the GPU is switched off,
causing IOTLB invalidation timeouts on the IOMMU side.

If that is the case, we might need some sort of DMA-API extension so that
the GPU driver can tell the IOMMU driver that the device is going to be
quiet.
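
(Something along these lines, purely hypothetical -- no such API exists
today and the names are invented:)

/* Hooks the GPU driver could call around runtime suspend, so the IOMMU
 * driver flushes anything pending and then stops queueing IOTLB
 * invalidations for the device while it is powered down. */
int  iommu_dev_quiesce(struct device *dev);
void iommu_dev_wakeup(struct device *dev);

/* A GPU driver's runtime_suspend callback would then do roughly: */
static int example_gpu_runtime_suspend(struct device *dev)
{
	int ret = iommu_dev_quiesce(dev);

	if (ret)
		return ret;

	/* ... power the device down via ACPI methods or D3cold ... */
	return 0;
}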


Thanks,

       Joerg

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                                                     ` <20170628083659.GA30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
@ 2017-06-28 22:14                                                       ` Deucher, Alexander
       [not found]                                                         ` <BN6PR12MB16525D2E89F4AB61DC36EFBEF7DD0-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Deucher, Alexander @ 2017-06-28 22:14 UTC (permalink / raw)
  To: 'Joerg Roedel', Jan Vesely
  Cc: Lendacky, Thomas, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Craig Stein

> -----Original Message-----
> From: Joerg Roedel [mailto:joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org]
> Sent: Wednesday, June 28, 2017 4:37 AM
> To: Jan Vesely; Deucher, Alexander
> Cc: Lendacky, Thomas; Nath, Arindam; Craig Stein; iommu-cunTk1MwBs/ROKNJybVBZg@public.gmane.org
> foundation.org; Duran, Leo; Suthikulpanit, Suravee
> Subject: Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
> 
> [Adding Alex Deucher]
> 
> Hey Alex,
> 
> On Tue, Jun 27, 2017 at 12:24:35PM -0400, Jan Vesely wrote:
> > On Mon, 2017-06-26 at 14:14 +0200, Joerg Roedel wrote:
> 
> > > How does that 'dGPU goes to sleep' work? Do you put it to sleep
> manually
> > > via sysfs or something? Or is that something that amdgpu does on its
> > > own?
> >
> > AMD folks should be able to provide more details. afaik, the driver
> > uses ACPI methods to power on/off the device. Driver routines wake the
> > device up before accessing it and there is a timeout to turn it off
> > after few seconds of inactivity.
> >
> > >
> > > It looks like the GPU just switches the ATS unit off when it goes to
> > > sleep and doesn't answer the invalidation anymore, which explains the
> > > completion-wait timeouts.
> >
> > Both MMIO regs and PCIe config regs are turned off so it would not
> > surprise me if all PCIe requests were ignored by the device in off
> > state. it should be possible to request device wake up before
> > invalidating the relevant IOMMU domain. I'll leave to more
> > knowledgeable ppl to judge whether it's a good idea (we can also
> > postpone such invalidations until the device is woken by other means)
> 
> Can you maybe sched some light on how the sleep-mode of the GPUs work?
> Is it initiated by the GPU driver or from somewhere else? In the case
> discussed here it looks like the ATS unit of the GPU is switched of,
> causing IOTLB invalidation timeouts on the IOMMU side.
> 
> If that is the case we might need some sort of dma-api extension so that
> the GPU driver can tell the iommu driver that the device is going to be
> quiet.

I assume you are talking about Hybrid/PowerXpress laptops where the dGPU
can be powered down dynamically?  That is done via the runtime PM subsystem
in the kernel.  We register several callbacks with that, and then it takes
care of the power-down auto timers and such.  The actual mechanism to power
down the GPU varies from platform to platform (platform-specific ACPI
methods on early systems, D3cold on newer ones).
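
(A generic sketch of that runtime-PM wiring, with made-up names; this is
not the actual amdgpu code:)

#include <linux/pm_runtime.h>

static int example_runtime_suspend(struct device *dev)
{
	/* platform-specific power-off (ACPI method or D3cold) goes here */
	return 0;
}

static int example_runtime_resume(struct device *dev)
{
	/* power the device back up and restore its state */
	return 0;
}

static const struct dev_pm_ops example_pm_ops = {
	SET_RUNTIME_PM_OPS(example_runtime_suspend,
			   example_runtime_resume, NULL)
};

static void example_enable_runtime_pm(struct device *dev)
{
	pm_runtime_set_autosuspend_delay(dev, 5000);	/* ~5 s idle timeout */
	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_active(dev);
	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);	/* balance probe-time reference */
}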

Alex

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
       [not found]                                                         ` <BN6PR12MB16525D2E89F4AB61DC36EFBEF7DD0-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-07-04 16:29                                                           ` Craig Stein
  0 siblings, 0 replies; 24+ messages in thread
From: Craig Stein @ 2017-07-04 16:29 UTC (permalink / raw)
  To: Deucher, Alexander
  Cc: Lendacky, Thomas, Jan Vesely, Nath, Arindam,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA


[-- Attachment #1.1: Type: text/plain, Size: 2794 bytes --]

Just to be clear, which patch should I test, and could you provide me with
a link to its location?

Thanks,
Craig

On Jun 28, 2017 16:14, "Deucher, Alexander" <Alexander.Deucher-5C7GfCeVMHo@public.gmane.org>
wrote:

> > -----Original Message-----
> > From: Joerg Roedel [mailto:joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org]
> > Sent: Wednesday, June 28, 2017 4:37 AM
> > To: Jan Vesely; Deucher, Alexander
> > Cc: Lendacky, Thomas; Nath, Arindam; Craig Stein; iommu-cunTk1MwBs/ROKNJybVBZg@public.gmane.org
> > foundation.org; Duran, Leo; Suthikulpanit, Suravee
> > Subject: Re: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
> >
> > [Adding Alex Deucher]
> >
> > Hey Alex,
> >
> > On Tue, Jun 27, 2017 at 12:24:35PM -0400, Jan Vesely wrote:
> > > On Mon, 2017-06-26 at 14:14 +0200, Joerg Roedel wrote:
> >
> > > > How does that 'dGPU goes to sleep' work? Do you put it to sleep
> > manually
> > > > via sysfs or something? Or is that something that amdgpu does on its
> > > > own?
> > >
> > > AMD folks should be able to provide more details. afaik, the driver
> > > uses ACPI methods to power on/off the device. Driver routines wake the
> > > device up before accessing it and there is a timeout to turn it off
> > > after few seconds of inactivity.
> > >
> > > >
> > > > It looks like the GPU just switches the ATS unit off when it goes to
> > > > sleep and doesn't answer the invalidation anymore, which explains the
> > > > completion-wait timeouts.
> > >
> > > Both MMIO regs and PCIe config regs are turned off so it would not
> > > surprise me if all PCIe requests were ignored by the device in off
> > > state. it should be possible to request device wake up before
> > > invalidating the relevant IOMMU domain. I'll leave to more
> > > knowledgeable ppl to judge whether it's a good idea (we can also
> > > postpone such invalidations until the device is woken by other means)
> >
> > Can you maybe sched some light on how the sleep-mode of the GPUs work?
> > Is it initiated by the GPU driver or from somewhere else? In the case
> > discussed here it looks like the ATS unit of the GPU is switched of,
> > causing IOTLB invalidation timeouts on the IOMMU side.
> >
> > If that is the case we might need some sort of dma-api extension so that
> > the GPU driver can tell the iommu driver that the device is going to be
> > quiet.
>
> I assume you are talking about Hybrid/PowerXpress laptops where the dGPU
> can be powered down dynamically?  That is done via the runtime pm subsystem
> in the kernel.  We register several callbacks with that, and then it takes
> care of the power down auto timers and such.  The actual mechanism to power
> down the GPU varies for platform to platform (platform specific ACPI
> methods on early systems, D3cold on newer ones).
>
> Alex
>
>

[-- Attachment #1.2: Type: text/html, Size: 3660 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2017-07-04 16:29 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-05 19:52 [PATCH v1 0/3] iommu/amd: AMD IOMMU performance updates 2017-06-05 Tom Lendacky
     [not found] ` <20170605195203.11512.20579.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
2017-06-05 19:52   ` [PATCH v1 1/3] iommu/amd: Reduce amount of MMIO when submitting commands Tom Lendacky
2017-06-05 19:52   ` [PATCH v1 2/3] iommu/amd: Reduce delay waiting for command buffer space Tom Lendacky
2017-06-05 19:52   ` [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush Tom Lendacky
     [not found]     ` <20170605195235.11512.52995.stgit-qCXWGYdRb2BnqfbPTmsdiZQ+2ll4COg0XqFh9Ls21Oc@public.gmane.org>
2017-06-06 10:02       ` Nath, Arindam
     [not found]         ` <MWHPR12MB15181A6A020ACA2F53DF70339CCB0-Gy0DoCVfaSXKu+HfpMNLNQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-06-08 20:33           ` Jan Vesely
     [not found]             ` <1496954035.4188.1.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
2017-06-08 23:31               ` Craig Stein
2017-06-21 16:20               ` Jan Vesely
     [not found]                 ` <1498062018.17007.6.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
2017-06-21 17:01                   ` Tom Lendacky
     [not found]                     ` <bf685f44-019c-4c21-25d4-6a6ea647b7cc-5C7GfCeVMHo@public.gmane.org>
2017-06-21 21:09                       ` Jan Vesely
     [not found]                         ` <1498079371.17007.18.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
2017-06-22  9:20                           ` Joerg Roedel
     [not found]                             ` <20170622092053.GV30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
2017-06-22 15:13                               ` Jan Vesely
     [not found]                                 ` <1498144389.17007.25.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
2017-06-22 21:57                                   ` Joerg Roedel
     [not found]                                     ` <20170622215735.GW30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
2017-06-23 14:20                                       ` Jan Vesely
     [not found]                                         ` <1498227647.17007.31.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
2017-06-26 12:14                                           ` Joerg Roedel
     [not found]                                             ` <20170626121430.GX30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
2017-06-27 16:24                                               ` Jan Vesely
     [not found]                                                 ` <1498580675.10525.3.camel-kgbqMDwikbSVc3sceRu5cw@public.gmane.org>
2017-06-28  8:36                                                   ` Joerg Roedel
     [not found]                                                     ` <20170628083659.GA30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
2017-06-28 22:14                                                       ` Deucher, Alexander
     [not found]                                                         ` <BN6PR12MB16525D2E89F4AB61DC36EFBEF7DD0-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-07-04 16:29                                                           ` Craig Stein
2017-06-06 12:05       ` Joerg Roedel
     [not found]         ` <20170606120516.GD30388-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
2017-06-06 13:36           ` Tom Lendacky
     [not found]             ` <85356483-1d5e-251f-57e3-d9f761239100-5C7GfCeVMHo@public.gmane.org>
2017-06-07 14:03               ` Tom Lendacky
     [not found]                 ` <32599b14-c138-3c89-6834-0335fec0b3f6-5C7GfCeVMHo@public.gmane.org>
2017-06-07 14:17                   ` Joerg Roedel
2017-06-08 12:43   ` [PATCH v1 0/3] iommu/amd: AMD IOMMU performance updates 2017-06-05 Joerg Roedel
