All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate
@ 2018-02-28  0:38 Alistair Popple
  2018-02-28  3:08 ` Balbir Singh
  2018-02-28 10:14 ` Michael Ellerman
  0 siblings, 2 replies; 5+ messages in thread
From: Alistair Popple @ 2018-02-28  0:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Balbir Singh, Mark Hairgrove, mpe, Alistair Popple

When sending TLB invalidates to the NPU we need to send extra flushes due
to a hardware issue. The original implementation would lock the all the
ATSD MMIO registers sequentially before unlocking and relocking each of
them sequentially to do the extra flush.

This introduced a deadlock as it is possible for one thread to hold one
ATSD register whilst waiting for another register to be freed while the
other thread is holding that register waiting for the one in the first
thread to be freed.

For example if there are two threads and two ATSD registers:

Thread A	Thread B
Acquire 1
Acquire 2
Release 1	Acquire 1
Wait 1		Wait 2

Both threads will be stuck waiting to acquire a register resulting in an
RCU stall warning or soft lockup.

This patch solves the deadlock by refactoring the code to ensure registers
are not released between flushes and to ensure all registers are either
acquired or released together and in order.

Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD")
Signed-off-by: Alistair Popple <alistair@popple.id.au>

---

v2: Added memory barriers around ->npdev[] and ATSD register aquisition/release

 arch/powerpc/platforms/powernv/npu-dma.c | 203 +++++++++++++++++--------------
 1 file changed, 113 insertions(+), 90 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 0a253b64ac5f..2fed4b116e19 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -410,6 +410,11 @@ struct npu_context {
 	void *priv;
 };
 
+struct mmio_atsd_reg {
+	struct npu *npu;
+	int reg;
+};
+
 /*
  * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
  * if none are available.
@@ -419,7 +424,7 @@ static int get_mmio_atsd_reg(struct npu *npu)
 	int i;
 
 	for (i = 0; i < npu->mmio_atsd_count; i++) {
-		if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
+		if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
 			return i;
 	}
 
@@ -428,86 +433,90 @@ static int get_mmio_atsd_reg(struct npu *npu)
 
 static void put_mmio_atsd_reg(struct npu *npu, int reg)
 {
-	clear_bit(reg, &npu->mmio_atsd_usage);
+	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
 }
 
 /* MMIO ATSD register offsets */
 #define XTS_ATSD_AVA  1
 #define XTS_ATSD_STAT 2
 
-static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
-				unsigned long va)
+static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
+				unsigned long launch, unsigned long va)
 {
-	int mmio_atsd_reg;
-
-	do {
-		mmio_atsd_reg = get_mmio_atsd_reg(npu);
-		cpu_relax();
-	} while (mmio_atsd_reg < 0);
+	struct npu *npu = mmio_atsd_reg->npu;
+	int reg = mmio_atsd_reg->reg;
 
 	__raw_writeq(cpu_to_be64(va),
-		npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
+		npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
 	eieio();
-	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
-
-	return mmio_atsd_reg;
+	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[reg]);
 }
 
-static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush)
+static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+				unsigned long pid, bool flush)
 {
+	int i;
 	unsigned long launch;
 
-	/* IS set to invalidate matching PID */
-	launch = PPC_BIT(12);
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* IS set to invalidate matching PID */
+		launch = PPC_BIT(12);
 
-	/* PRS set to process-scoped */
-	launch |= PPC_BIT(13);
+		/* PRS set to process-scoped */
+		launch |= PPC_BIT(13);
 
-	/* AP */
-	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+		/* AP */
+		launch |= (u64)
+			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
 
-	/* PID */
-	launch |= pid << PPC_BITLSHIFT(38);
+		/* PID */
+		launch |= pid << PPC_BITLSHIFT(38);
 
-	/* No flush */
-	launch |= !flush << PPC_BITLSHIFT(39);
+		/* No flush */
+		launch |= !flush << PPC_BITLSHIFT(39);
 
-	/* Invalidating the entire process doesn't use a va */
-	return mmio_launch_invalidate(npu, launch, 0);
+		/* Invalidating the entire process doesn't use a va */
+		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
+	}
 }
 
-static int mmio_invalidate_va(struct npu *npu, unsigned long va,
-			unsigned long pid, bool flush)
+static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+			unsigned long va, unsigned long pid, bool flush)
 {
+	int i;
 	unsigned long launch;
 
-	/* IS set to invalidate target VA */
-	launch = 0;
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* IS set to invalidate target VA */
+		launch = 0;
 
-	/* PRS set to process scoped */
-	launch |= PPC_BIT(13);
+		/* PRS set to process scoped */
+		launch |= PPC_BIT(13);
 
-	/* AP */
-	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+		/* AP */
+		launch |= (u64)
+			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
 
-	/* PID */
-	launch |= pid << PPC_BITLSHIFT(38);
+		/* PID */
+		launch |= pid << PPC_BITLSHIFT(38);
 
-	/* No flush */
-	launch |= !flush << PPC_BITLSHIFT(39);
+		/* No flush */
+		launch |= !flush << PPC_BITLSHIFT(39);
 
-	return mmio_launch_invalidate(npu, launch, va);
+		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
+	}
 }
 
 #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
 
-struct mmio_atsd_reg {
-	struct npu *npu;
-	int reg;
-};
-
 static void mmio_invalidate_wait(
-	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush)
+	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
 {
 	struct npu *npu;
 	int i, reg;
@@ -522,16 +531,46 @@ static void mmio_invalidate_wait(
 		reg = mmio_atsd_reg[i].reg;
 		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
 			cpu_relax();
+	}
+}
 
-		put_mmio_atsd_reg(npu, reg);
+static void acquire_atsd_reg(struct npu_context *npu_context,
+			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+	int i, j;
+	struct npu *npu;
+	struct pci_dev *npdev;
+	struct pnv_phb *nphb;
 
-		/*
-		 * The GPU requires two flush ATSDs to ensure all entries have
-		 * been flushed. We use PID 0 as it will never be used for a
-		 * process on the GPU.
-		 */
-		if (flush)
-			mmio_invalidate_pid(npu, 0, true);
+	for (i = 0; i <= max_npu2_index; i++) {
+		mmio_atsd_reg[i].reg = -1;
+		for (j = 0; j < NV_MAX_LINKS; j++) {
+			npdev = READ_ONCE(npu_context->npdev[i][j]);
+			if (!npdev)
+				continue;
+
+			nphb = pci_bus_to_host(npdev->bus)->private_data;
+			npu = &nphb->npu;
+			mmio_atsd_reg[i].npu = npu;
+			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
+			while (mmio_atsd_reg[i].reg < 0) {
+				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
+				cpu_relax();
+			}
+			break;
+		}
+	}
+}
+
+static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+	int i;
+
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
 	}
 }
 
@@ -542,10 +581,6 @@ static void mmio_invalidate_wait(
 static void mmio_invalidate(struct npu_context *npu_context, int va,
 			unsigned long address, bool flush)
 {
-	int i, j;
-	struct npu *npu;
-	struct pnv_phb *nphb;
-	struct pci_dev *npdev;
 	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
 	unsigned long pid = npu_context->mm->context.id;
 
@@ -561,37 +596,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
 	 * Loop over all the NPUs this process is active on and launch
 	 * an invalidate.
 	 */
-	for (i = 0; i <= max_npu2_index; i++) {
-		mmio_atsd_reg[i].reg = -1;
-		for (j = 0; j < NV_MAX_LINKS; j++) {
-			npdev = npu_context->npdev[i][j];
-			if (!npdev)
-				continue;
-
-			nphb = pci_bus_to_host(npdev->bus)->private_data;
-			npu = &nphb->npu;
-			mmio_atsd_reg[i].npu = npu;
-
-			if (va)
-				mmio_atsd_reg[i].reg =
-					mmio_invalidate_va(npu, address, pid,
-							flush);
-			else
-				mmio_atsd_reg[i].reg =
-					mmio_invalidate_pid(npu, pid, flush);
-
-			/*
-			 * The NPU hardware forwards the shootdown to all GPUs
-			 * so we only have to launch one shootdown per NPU.
-			 */
-			break;
-		}
+	acquire_atsd_reg(npu_context, mmio_atsd_reg);
+	if (va)
+		mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
+	else
+		mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
+
+	mmio_invalidate_wait(mmio_atsd_reg);
+	if (flush) {
+		/*
+		 * The GPU requires two flush ATSDs to ensure all entries have
+		 * been flushed. We use PID 0 as it will never be used for a
+		 * process on the GPU.
+		 */
+		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
+		mmio_invalidate_wait(mmio_atsd_reg);
+		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
+		mmio_invalidate_wait(mmio_atsd_reg);
 	}
-
-	mmio_invalidate_wait(mmio_atsd_reg, flush);
-	if (flush)
-		/* Wait for the flush to complete */
-		mmio_invalidate_wait(mmio_atsd_reg, false);
+	release_atsd_reg(mmio_atsd_reg);
 }
 
 static void pnv_npu2_mn_release(struct mmu_notifier *mn,
@@ -726,7 +749,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 							&nvlink_index)))
 		return ERR_PTR(-ENODEV);
-	npu_context->npdev[npu->index][nvlink_index] = npdev;
+	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
 
 	if (!nphb->npu.nmmu_flush) {
 		/*
@@ -778,7 +801,7 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
 	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 							&nvlink_index)))
 		return;
-	npu_context->npdev[npu->index][nvlink_index] = NULL;
+	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
 	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
 				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
 	kref_put(&npu_context->kref, pnv_npu2_release_context);
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate
  2018-02-28  0:38 [PATCH v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate Alistair Popple
@ 2018-02-28  3:08 ` Balbir Singh
  2018-02-28 10:14 ` Michael Ellerman
  1 sibling, 0 replies; 5+ messages in thread
From: Balbir Singh @ 2018-02-28  3:08 UTC (permalink / raw)
  To: Alistair Popple; +Cc: linuxppc-dev, Mark Hairgrove, mpe

On Wed, 28 Feb 2018 11:38:14 +1100
Alistair Popple <alistair@popple.id.au> wrote:

> When sending TLB invalidates to the NPU we need to send extra flushes due
> to a hardware issue. The original implementation would lock the all the
> ATSD MMIO registers sequentially before unlocking and relocking each of
> them sequentially to do the extra flush.
> 
> This introduced a deadlock as it is possible for one thread to hold one
> ATSD register whilst waiting for another register to be freed while the
> other thread is holding that register waiting for the one in the first
> thread to be freed.
> 
> For example if there are two threads and two ATSD registers:
> 
> Thread A	Thread B
> Acquire 1
> Acquire 2
> Release 1	Acquire 1
> Wait 1		Wait 2
> 
> Both threads will be stuck waiting to acquire a register resulting in an
> RCU stall warning or soft lockup.
> 
> This patch solves the deadlock by refactoring the code to ensure registers
> are not released between flushes and to ensure all registers are either
> acquired or released together and in order.
> 
> Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD")
> Signed-off-by: Alistair Popple <alistair@popple.id.au>
> 
> ---
> 
> v2: Added memory barriers around ->npdev[] and ATSD register aquisition/release
> 
>  arch/powerpc/platforms/powernv/npu-dma.c | 203 +++++++++++++++++--------------
>  1 file changed, 113 insertions(+), 90 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 0a253b64ac5f..2fed4b116e19 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -410,6 +410,11 @@ struct npu_context {
>  	void *priv;
>  };
>  
> +struct mmio_atsd_reg {
> +	struct npu *npu;
> +	int reg;
> +};
> +
>  /*
>   * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
>   * if none are available.
> @@ -419,7 +424,7 @@ static int get_mmio_atsd_reg(struct npu *npu)
>  	int i;
>  
>  	for (i = 0; i < npu->mmio_atsd_count; i++) {
> -		if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
> +		if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
>  			return i;
>  	}
>  
> @@ -428,86 +433,90 @@ static int get_mmio_atsd_reg(struct npu *npu)
>  
>  static void put_mmio_atsd_reg(struct npu *npu, int reg)
>  {
> -	clear_bit(reg, &npu->mmio_atsd_usage);
> +	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
>  }


I think we need to document in the code that we have a hard reliance on
the order of locks being incremental sequential and that any optimizations
otherwise will result in probable deadlocks. 

>  
>  /* MMIO ATSD register offsets */
>  #define XTS_ATSD_AVA  1
>  #define XTS_ATSD_STAT 2
>  
> -static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
> -				unsigned long va)
> +static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
> +				unsigned long launch, unsigned long va)
>  {
> -	int mmio_atsd_reg;
> -
> -	do {
> -		mmio_atsd_reg = get_mmio_atsd_reg(npu);
> -		cpu_relax();
> -	} while (mmio_atsd_reg < 0);
> +	struct npu *npu = mmio_atsd_reg->npu;
> +	int reg = mmio_atsd_reg->reg;
>  
>  	__raw_writeq(cpu_to_be64(va),
> -		npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
> +		npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
>  	eieio();
> -	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
> -
> -	return mmio_atsd_reg;
> +	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[reg]);
>  }
>  
> -static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush)
> +static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> +				unsigned long pid, bool flush)
>  {
> +	int i;
>  	unsigned long launch;
>  
> -	/* IS set to invalidate matching PID */
> -	launch = PPC_BIT(12);
> +	for (i = 0; i <= max_npu2_index; i++) {
> +		if (mmio_atsd_reg[i].reg < 0)
> +			continue;
> +
> +		/* IS set to invalidate matching PID */
> +		launch = PPC_BIT(12);
>  
> -	/* PRS set to process-scoped */
> -	launch |= PPC_BIT(13);
> +		/* PRS set to process-scoped */
> +		launch |= PPC_BIT(13);
>  
> -	/* AP */
> -	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
> +		/* AP */
> +		launch |= (u64)
> +			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
>  
> -	/* PID */
> -	launch |= pid << PPC_BITLSHIFT(38);
> +		/* PID */
> +		launch |= pid << PPC_BITLSHIFT(38);
>  
> -	/* No flush */
> -	launch |= !flush << PPC_BITLSHIFT(39);
> +		/* No flush */
> +		launch |= !flush << PPC_BITLSHIFT(39);
>  
> -	/* Invalidating the entire process doesn't use a va */
> -	return mmio_launch_invalidate(npu, launch, 0);
> +		/* Invalidating the entire process doesn't use a va */
> +		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
> +	}
>  }
>  
> -static int mmio_invalidate_va(struct npu *npu, unsigned long va,
> -			unsigned long pid, bool flush)
> +static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> +			unsigned long va, unsigned long pid, bool flush)
>  {
> +	int i;
>  	unsigned long launch;
>  
> -	/* IS set to invalidate target VA */
> -	launch = 0;
> +	for (i = 0; i <= max_npu2_index; i++) {
> +		if (mmio_atsd_reg[i].reg < 0)
> +			continue;
> +
> +		/* IS set to invalidate target VA */
> +		launch = 0;
>  
> -	/* PRS set to process scoped */
> -	launch |= PPC_BIT(13);
> +		/* PRS set to process scoped */
> +		launch |= PPC_BIT(13);
>  
> -	/* AP */
> -	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
> +		/* AP */
> +		launch |= (u64)
> +			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
>  
> -	/* PID */
> -	launch |= pid << PPC_BITLSHIFT(38);
> +		/* PID */
> +		launch |= pid << PPC_BITLSHIFT(38);
>  
> -	/* No flush */
> -	launch |= !flush << PPC_BITLSHIFT(39);
> +		/* No flush */
> +		launch |= !flush << PPC_BITLSHIFT(39);
>  
> -	return mmio_launch_invalidate(npu, launch, va);
> +		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
> +	}
>  }
>  
>  #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
>  
> -struct mmio_atsd_reg {
> -	struct npu *npu;
> -	int reg;
> -};
> -
>  static void mmio_invalidate_wait(
> -	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush)
> +	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
>  {
>  	struct npu *npu;
>  	int i, reg;
> @@ -522,16 +531,46 @@ static void mmio_invalidate_wait(
>  		reg = mmio_atsd_reg[i].reg;
>  		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
>  			cpu_relax();
> +	}
> +}
>  
> -		put_mmio_atsd_reg(npu, reg);
> +static void acquire_atsd_reg(struct npu_context *npu_context,
> +			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
> +{
> +	int i, j;
> +	struct npu *npu;
> +	struct pci_dev *npdev;
> +	struct pnv_phb *nphb;
>  
> -		/*
> -		 * The GPU requires two flush ATSDs to ensure all entries have
> -		 * been flushed. We use PID 0 as it will never be used for a
> -		 * process on the GPU.
> -		 */
> -		if (flush)
> -			mmio_invalidate_pid(npu, 0, true);
> +	for (i = 0; i <= max_npu2_index; i++) {
> +		mmio_atsd_reg[i].reg = -1;
> +		for (j = 0; j < NV_MAX_LINKS; j++) {
> +			npdev = READ_ONCE(npu_context->npdev[i][j]);
> +			if (!npdev)
> +				continue;
> +
> +			nphb = pci_bus_to_host(npdev->bus)->private_data;
> +			npu = &nphb->npu;
> +			mmio_atsd_reg[i].npu = npu;
> +			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
> +			while (mmio_atsd_reg[i].reg < 0) {
> +				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
> +				cpu_relax();

I guess a softlockup will occur if we don't get the atsd resource in time
and that might aid debugging.

Looks good to me otherwise
Acked-by: Balbir Singh <bsingharora@gmail.com>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate
  2018-02-28  0:38 [PATCH v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate Alistair Popple
  2018-02-28  3:08 ` Balbir Singh
@ 2018-02-28 10:14 ` Michael Ellerman
  2018-03-01  2:55   ` Alistair Popple
  1 sibling, 1 reply; 5+ messages in thread
From: Michael Ellerman @ 2018-02-28 10:14 UTC (permalink / raw)
  To: Alistair Popple, linuxppc-dev
  Cc: Balbir Singh, Mark Hairgrove, Alistair Popple

Alistair Popple <alistair@popple.id.au> writes:

> When sending TLB invalidates to the NPU we need to send extra flushes due
> to a hardware issue. The original implementation would lock the all the
> ATSD MMIO registers sequentially before unlocking and relocking each of
> them sequentially to do the extra flush.
>
> This introduced a deadlock as it is possible for one thread to hold one
> ATSD register whilst waiting for another register to be freed while the
> other thread is holding that register waiting for the one in the first
> thread to be freed.
>
> For example if there are two threads and two ATSD registers:
>
> Thread A	Thread B
> Acquire 1
> Acquire 2
> Release 1	Acquire 1
> Wait 1		Wait 2
>
> Both threads will be stuck waiting to acquire a register resulting in an
> RCU stall warning or soft lockup.
>
> This patch solves the deadlock by refactoring the code to ensure registers
> are not released between flushes and to ensure all registers are either
> acquired or released together and in order.
>
> Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD")
> Signed-off-by: Alistair Popple <alistair@popple.id.au>
>
> ---
>
> v2: Added memory barriers around ->npdev[] and ATSD register aquisition/release

Apologies to Balbir who was standing nearby when I read this patch this
afternoon and copped a bit of a rant as a result ...

But READ_ONCE/WRITE_ONCE are not memory barriers, they are only compiler
barriers.

So the READ_ONCE/WRITE_ONCE usage in here may be necessary, but it's
probably not sufficient, we probably *also* need actual memory barriers.

But I could be wrong, my main gripe is that the locking/ordering in here
is not very obvious.

> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 0a253b64ac5f..2fed4b116e19 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -726,7 +749,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
>  	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
>  							&nvlink_index)))
>  		return ERR_PTR(-ENODEV);
> -	npu_context->npdev[npu->index][nvlink_index] = npdev;
> +	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
  
When you're publishing a struct via a pointer you would typically have a
barrier between the stores that set the fields of the struct, and the
store that publishes the struct. Otherwise the reader can see a
partially setup struct.

I think here the npdev was setup somewhere else, and maybe there has
been an intervening barrier, but it's not clear. A comment at least
would be good.

In general I feel like a spinlock or two could significantly simply the
locking/ordering in this code, and given we're doing MMIOs anyway would
not affect performance.

</rant>

cheers

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate
  2018-02-28 10:14 ` Michael Ellerman
@ 2018-03-01  2:55   ` Alistair Popple
  2018-03-01 13:16     ` Michael Ellerman
  0 siblings, 1 reply; 5+ messages in thread
From: Alistair Popple @ 2018-03-01  2:55 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Balbir Singh, Mark Hairgrove

> > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> > index 0a253b64ac5f..2fed4b116e19 100644
> > --- a/arch/powerpc/platforms/powernv/npu-dma.c
> > +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> > @@ -726,7 +749,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
> >  	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
> >  							&nvlink_index)))
> >  		return ERR_PTR(-ENODEV);
> > -	npu_context->npdev[npu->index][nvlink_index] = npdev;
> > +	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
>   
> When you're publishing a struct via a pointer you would typically have a
> barrier between the stores that set the fields of the struct, and the
> store that publishes the struct. Otherwise the reader can see a
> partially setup struct.

npdev is just a pointer to a pci_dev setup by the PCI code. We assign it to
npdev[][] to indicate to the mmu notifiers that an invalidation should also
be sent to this nvlink. However I don't think there is any ordering
required wrt to the rest of the npu_context setup - so long as when the
notifiers deference npu_context->npdev[i][j] it either sees a valid
non-NULL pointer or NULL assigned to npdev[][] everything should be ok.

> I think here the npdev was setup somewhere else, and maybe there has
> been an intervening barrier, but it's not clear. A comment at least
> would be good.

Yes it has been. I will submit a v3 with some more comments incorporating
the above. Unless you think my argument is wrong?

> In general I feel like a spinlock or two could significantly simply the
> locking/ordering in this code, and given we're doing MMIOs anyway would
> not affect performance.

Indeed, I am working on a patch to add a spinlock around the allocation of
the npu_context as pnv_npu2_destroy_context() needs to be serialised with
respect to pnv_npu2_init_context() on the same mm_struct. I'd incorrectly
assumed the driver would do this serialisation but apparently it can't so
we need to ensure kref_get(npu_context) can't race with it's destruction
(concurrent init_context() is protected by mmap_sem). I could fold that
(unposted) patch into this one if you think that would be better?

Thanks!

- Alistair

> </rant>
> 
> cheers
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate
  2018-03-01  2:55   ` Alistair Popple
@ 2018-03-01 13:16     ` Michael Ellerman
  0 siblings, 0 replies; 5+ messages in thread
From: Michael Ellerman @ 2018-03-01 13:16 UTC (permalink / raw)
  To: Alistair Popple; +Cc: linuxppc-dev, Balbir Singh, Mark Hairgrove

Alistair Popple <alistair@popple.id.au> writes:

>> > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
>> > index 0a253b64ac5f..2fed4b116e19 100644
>> > --- a/arch/powerpc/platforms/powernv/npu-dma.c
>> > +++ b/arch/powerpc/platforms/powernv/npu-dma.c
>> > @@ -726,7 +749,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
>> >  	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
>> >  							&nvlink_index)))
>> >  		return ERR_PTR(-ENODEV);
>> > -	npu_context->npdev[npu->index][nvlink_index] = npdev;
>> > +	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
>>   
>> When you're publishing a struct via a pointer you would typically have a
>> barrier between the stores that set the fields of the struct, and the
>> store that publishes the struct. Otherwise the reader can see a
>> partially setup struct.
>
> npdev is just a pointer to a pci_dev setup by the PCI code. We assign it to
> npdev[][] to indicate to the mmu notifiers that an invalidation should also
> be sent to this nvlink. However I don't think there is any ordering
> required wrt to the rest of the npu_context setup - so long as when the
> notifiers deference npu_context->npdev[i][j] it either sees a valid
> non-NULL pointer or NULL assigned to npdev[][] everything should be ok.

Yeah OK. You do then dereference npdev->bus, so you depend on that being
setup prior to the store that makes the npdev visible. But I guess we're
saying that happened eons ago somewhere in the PCI code and will have
happened by now.

>> I think here the npdev was setup somewhere else, and maybe there has
>> been an intervening barrier, but it's not clear. A comment at least
>> would be good.
>
> Yes it has been. I will submit a v3 with some more comments incorporating
> the above. Unless you think my argument is wrong?

If you can comment it then I think I'm happy with it.

>> In general I feel like a spinlock or two could significantly simply the
>> locking/ordering in this code, and given we're doing MMIOs anyway would
>> not affect performance.
>
> Indeed, I am working on a patch to add a spinlock around the allocation of
> the npu_context as pnv_npu2_destroy_context() needs to be serialised with
> respect to pnv_npu2_init_context() on the same mm_struct. I'd incorrectly
> assumed the driver would do this serialisation but apparently it can't so
> we need to ensure kref_get(npu_context) can't race with it's destruction
> (concurrent init_context() is protected by mmap_sem). I could fold that
> (unposted) patch into this one if you think that would be better?

No this is big enough already, I almost asked you to split it to make
the diff more readable :)

So just send the spinlock patch when it's ready as a separate patch.

cheers

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-03-01 13:16 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-28  0:38 [PATCH v2] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate Alistair Popple
2018-02-28  3:08 ` Balbir Singh
2018-02-28 10:14 ` Michael Ellerman
2018-03-01  2:55   ` Alistair Popple
2018-03-01 13:16     ` Michael Ellerman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.