All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
@ 2018-10-21 13:45 Markus Dobel
  2018-12-05 11:07 ` Mauro Carvalho Chehab
  0 siblings, 1 reply; 26+ messages in thread
From: Markus Dobel @ 2018-10-21 13:45 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, linux-media, Brad Love

The original commit (the one reverted in this patch) introduced a 
regression,
making a previously flawless adapter unresponsive after running a few 
hours
to days. Since I never experienced the problems that the original commit 
is
supposed to fix, I propose to revert the change until a regression-free
variant is found.

Before submitting this, I've been running a system 24x7 with this revert 
for
several weeks now, and it's running stable again.

It's not a pure revert, as the original commit does not revert cleanly
anymore due to other changes, but content-wise it is.

Signed-off-by: Markus Dobel <markus.dobel@gmx.de>
---
  drivers/media/pci/cx23885/cx23885-core.c | 60 ------------------------
  drivers/media/pci/cx23885/cx23885-reg.h  | 14 ------
  2 files changed, 74 deletions(-)

diff --git a/drivers/media/pci/cx23885/cx23885-core.c 
b/drivers/media/pci/cx23885/cx23885-core.c
index 39804d830305..606f6fc0e68b 100644
--- a/drivers/media/pci/cx23885/cx23885-core.c
+++ b/drivers/media/pci/cx23885/cx23885-core.c
@@ -601,25 +601,6 @@ static void cx23885_risc_disasm(struct 
cx23885_tsport *port,
  	}
  }

-static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
-{
-	uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
-	uint32_t reg2_val = cx_read(TC_REQ_SET);
-
-	if (reg1_val && reg2_val) {
-		cx_write(TC_REQ, reg1_val);
-		cx_write(TC_REQ_SET, reg2_val);
-		cx_read(VID_B_DMA);
-		cx_read(VBI_B_DMA);
-		cx_read(VID_C_DMA);
-		cx_read(VBI_C_DMA);
-
-		dev_info(&dev->pci->dev,
-			"dma in progress detected 0x%08x 0x%08x, clearing\n",
-			reg1_val, reg2_val);
-	}
-}
-
  static void cx23885_shutdown(struct cx23885_dev *dev)
  {
  	/* disable RISC controller */
@@ -665,8 +646,6 @@ static void cx23885_reset(struct cx23885_dev *dev)
  	cx_write(CLK_DELAY, cx_read(CLK_DELAY) & 0x80000000);
  	cx_write(PAD_CTRL, 0x00500300);

-	/* clear dma in progress */
-	cx23885_clear_bridge_error(dev);
  	msleep(100);

  	cx23885_sram_channel_setup(dev, &dev->sram_channels[SRAM_CH01],
@@ -683,11 +662,6 @@ static void cx23885_reset(struct cx23885_dev *dev)
  	cx23885_sram_channel_setup(dev, &dev->sram_channels[SRAM_CH09], 128, 
0);

  	cx23885_gpio_setup(dev);
-
-	cx23885_irq_get_mask(dev);
-
-	/* clear dma in progress */
-	cx23885_clear_bridge_error(dev);
  }


@@ -702,8 +676,6 @@ static int cx23885_pci_quirks(struct cx23885_dev 
*dev)
  	if (dev->bridge == CX23885_BRIDGE_885)
  		cx_clear(RDR_TLCTL0, 1 << 4);

-	/* clear dma in progress */
-	cx23885_clear_bridge_error(dev);
  	return 0;
  }

@@ -1392,9 +1364,6 @@ int cx23885_start_dma(struct cx23885_tsport *port,
  	dprintk(1, "%s() w: %d, h: %d, f: %d\n", __func__,
  		dev->width, dev->height, dev->field);

-	/* clear dma in progress */
-	cx23885_clear_bridge_error(dev);
-
  	/* Stop the fifo and risc engine for this port */
  	cx_clear(port->reg_dma_ctl, port->dma_ctl_val);

@@ -1482,26 +1451,16 @@ int cx23885_start_dma(struct cx23885_tsport 
*port,
  	case CX23885_BRIDGE_888:
  		/* enable irqs */
  		dprintk(1, "%s() enabling TS int's and DMA\n", __func__);
-		/* clear dma in progress */
-		cx23885_clear_bridge_error(dev);
  		cx_set(port->reg_ts_int_msk,  port->ts_int_msk_val);
  		cx_set(port->reg_dma_ctl, port->dma_ctl_val);
-
-		/* clear dma in progress */
-		cx23885_clear_bridge_error(dev);
  		cx23885_irq_add(dev, port->pci_irqmask);
  		cx23885_irq_enable_all(dev);
-
-		/* clear dma in progress */
-		cx23885_clear_bridge_error(dev);
  		break;
  	default:
  		BUG();
  	}

  	cx_set(DEV_CNTRL2, (1<<5)); /* Enable RISC controller */
-	/* clear dma in progress */
-	cx23885_clear_bridge_error(dev);

  	if (cx23885_boards[dev->board].portb == CX23885_MPEG_ENCODER)
  		cx23885_av_clk(dev, 1);
@@ -1509,11 +1468,6 @@ int cx23885_start_dma(struct cx23885_tsport 
*port,
  	if (debug > 4)
  		cx23885_tsport_reg_dump(port);

-	cx23885_irq_get_mask(dev);
-
-	/* clear dma in progress */
-	cx23885_clear_bridge_error(dev);
-
  	return 0;
  }

@@ -1521,26 +1475,12 @@ static int cx23885_stop_dma(struct 
cx23885_tsport *port)
  {
  	struct cx23885_dev *dev = port->dev;
  	u32 reg;
-	int delay = 0;
-	uint32_t reg1_val;
-	uint32_t reg2_val;

  	dprintk(1, "%s()\n", __func__);

  	/* Stop interrupts and DMA */
  	cx_clear(port->reg_ts_int_msk, port->ts_int_msk_val);
  	cx_clear(port->reg_dma_ctl, port->dma_ctl_val);
-	/* just in case wait for any dma to complete before allowing dealloc 
*/
-	mdelay(20);
-	for (delay = 0; delay < 100; delay++) {
-		reg1_val = cx_read(TC_REQ);
-		reg2_val = cx_read(TC_REQ_SET);
-		if (reg1_val == 0 || reg2_val == 0)
-			break;
-		mdelay(1);
-	}
-	dev_dbg(&dev->pci->dev, "delay=%d reg1=0x%08x reg2=0x%08x\n",
-		delay, reg1_val, reg2_val);

  	if (cx23885_boards[dev->board].portb == CX23885_MPEG_ENCODER) {
  		reg = cx_read(PAD_CTRL);
diff --git a/drivers/media/pci/cx23885/cx23885-reg.h 
b/drivers/media/pci/cx23885/cx23885-reg.h
index 08cec8d91742..2d3cbafe2402 100644
--- a/drivers/media/pci/cx23885/cx23885-reg.h
+++ b/drivers/media/pci/cx23885/cx23885-reg.h
@@ -288,18 +288,6 @@ Channel manager Data Structure entry = 20 DWORD
  #define AUDIO_EXT_INT_MSTAT	0x00040068
  #define AUDIO_EXT_INT_SSTAT	0x0004006C

-/* Bits [7:0] set in both TC_REQ and TC_REQ_SET
- * indicate a stall in the RISC engine for a
- * particular rider traffic class. This causes
- * the 885 and 888 bridges (unknown about 887)
- * to become inoperable. Setting bits in
- * TC_REQ_SET resets the corresponding bits
- * in TC_REQ (and TC_REQ_SET) allowing
- * operation to continue.
- */
-#define TC_REQ		0x00040090
-#define TC_REQ_SET	0x00040094
-
  #define RDR_CFG0	0x00050000
  #define RDR_CFG1	0x00050004
  #define RDR_CFG2	0x00050008
@@ -398,8 +386,6 @@ Channel manager Data Structure entry = 20 DWORD
  #define VID_B_PIXEL_FRMT	0x00130184

  /* Video C Interface */
-#define VID_C_DMA		0x00130200
-#define VBI_C_DMA		0x00130208
  #define VID_C_GPCNT		0x00130220
  #define VID_C_GPCNT_CTL		0x00130230
  #define VBI_C_GPCNT_CTL		0x00130234
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-10-21 13:45 [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes Markus Dobel
@ 2018-12-05 11:07 ` Mauro Carvalho Chehab
  2018-12-06 16:37   ` Brad Love
  0 siblings, 1 reply; 26+ messages in thread
From: Mauro Carvalho Chehab @ 2018-12-05 11:07 UTC (permalink / raw)
  To: Markus Dobel, Brad Love; +Cc: linux-media

Em Sun, 21 Oct 2018 15:45:39 +0200
Markus Dobel <markus.dobel@gmx.de> escreveu:

> The original commit (the one reverted in this patch) introduced a 
> regression,
> making a previously flawless adapter unresponsive after running a few 
> hours
> to days. Since I never experienced the problems that the original commit 
> is
> supposed to fix, I propose to revert the change until a regression-free
> variant is found.
> 
> Before submitting this, I've been running a system 24x7 with this revert 
> for
> several weeks now, and it's running stable again.
> 
> It's not a pure revert, as the original commit does not revert cleanly
> anymore due to other changes, but content-wise it is.
> 
> Signed-off-by: Markus Dobel <markus.dobel@gmx.de>
> ---
>   drivers/media/pci/cx23885/cx23885-core.c | 60 ------------------------
>   drivers/media/pci/cx23885/cx23885-reg.h  | 14 ------
>   2 files changed, 74 deletions(-)
> 
> diff --git a/drivers/media/pci/cx23885/cx23885-core.c 
> b/drivers/media/pci/cx23885/cx23885-core.c
> index 39804d830305..606f6fc0e68b 100644
> --- a/drivers/media/pci/cx23885/cx23885-core.c
> +++ b/drivers/media/pci/cx23885/cx23885-core.c
> @@ -601,25 +601,6 @@ static void cx23885_risc_disasm(struct 
> cx23885_tsport *port,

Patch was mangled by your e-mailer: it broke longer lines, causing
it to not apply.

Also, before just reverting the entire thing, could you please check
if the enclosed hack would solve it?

If so, it should be easy to add a quirk at drivers/pci/quirks.c
in order to detect the Ryzen models with a bad DMA engine that
require periodic resets, and then make cx23885 to use it.

We did similar tricks before with some broken DMA engines, at
the time we had overlay support on drivers and AMD controllers
didn't support PCI2PCI DMA transfers.

Brad,

Could you please address this issue?


diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
index 39804d830305..8b012bee6b32 100644
--- a/drivers/media/pci/cx23885/cx23885-core.c
+++ b/drivers/media/pci/cx23885/cx23885-core.c
@@ -603,8 +603,14 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
 
 static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
 {
-	uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
-	uint32_t reg2_val = cx_read(TC_REQ_SET);
+	uint32_t reg1_val, reg2_val;
+
+	/* TODO: check for Ryzen quirk */
+	if (1)
+		return;
+
+	reg1_val = cx_read(TC_REQ); /* read-only */
+	reg2_val = cx_read(TC_REQ_SET);
 
 	if (reg1_val && reg2_val) {
 		cx_write(TC_REQ, reg1_val);



Thanks,
Mauro

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-05 11:07 ` Mauro Carvalho Chehab
@ 2018-12-06 16:37   ` Brad Love
  2018-12-06 17:18     ` Markus Dobel
  0 siblings, 1 reply; 26+ messages in thread
From: Brad Love @ 2018-12-06 16:37 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Markus Dobel, Brad Love; +Cc: linux-media

Hi Mauro & Markus,


On 05/12/2018 05.07, Mauro Carvalho Chehab wrote:
> Em Sun, 21 Oct 2018 15:45:39 +0200
> Markus Dobel <markus.dobel@gmx.de> escreveu:
>
>> The original commit (the one reverted in this patch) introduced a 
>> regression,
>> making a previously flawless adapter unresponsive after running a few 
>> hours
>> to days. Since I never experienced the problems that the original commit 
>> is
>> supposed to fix, I propose to revert the change until a regression-free
>> variant is found.
>>
>> Before submitting this, I've been running a system 24x7 with this revert 
>> for
>> several weeks now, and it's running stable again.
>>
>> It's not a pure revert, as the original commit does not revert cleanly
>> anymore due to other changes, but content-wise it is.
>>
>> Signed-off-by: Markus Dobel <markus.dobel@gmx.de>
>> ---
>>   drivers/media/pci/cx23885/cx23885-core.c | 60 ------------------------
>>   drivers/media/pci/cx23885/cx23885-reg.h  | 14 ------
>>   2 files changed, 74 deletions(-)
>>
>> diff --git a/drivers/media/pci/cx23885/cx23885-core.c 
>> b/drivers/media/pci/cx23885/cx23885-core.c
>> index 39804d830305..606f6fc0e68b 100644
>> --- a/drivers/media/pci/cx23885/cx23885-core.c
>> +++ b/drivers/media/pci/cx23885/cx23885-core.c
>> @@ -601,25 +601,6 @@ static void cx23885_risc_disasm(struct 
>> cx23885_tsport *port,
> Patch was mangled by your e-mailer: it broke longer lines, causing
> it to not apply.
>
> Also, before just reverting the entire thing, could you please check
> if the enclosed hack would solve it?
>
> If so, it should be easy to add a quirk at drivers/pci/quirks.c
> in order to detect the Ryzen models with a bad DMA engine that
> require periodic resets, and then make cx23885 to use it.
>
> We did similar tricks before with some broken DMA engines, at
> the time we had overlay support on drivers and AMD controllers
> didn't support PCI2PCI DMA transfers.
>
> Brad,
>
> Could you please address this issue?


I'll try to address this today or tomorrow. Since the original patch was
applied I have not received any complaints from ryzen users, but we have
accumulated a few reports from Intel users with a variety of
motherboards that do now encounter issue. Strangely system load affects
the repro; low/no load exhibits the error condition, high system load
everything is fine. I'll do my best to send in a ryzen specific patch by
the weekend.

Regards,

Brad



>
>
> diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
> index 39804d830305..8b012bee6b32 100644
> --- a/drivers/media/pci/cx23885/cx23885-core.c
> +++ b/drivers/media/pci/cx23885/cx23885-core.c
> @@ -603,8 +603,14 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
>  
>  static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
>  {
> -	uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
> -	uint32_t reg2_val = cx_read(TC_REQ_SET);
> +	uint32_t reg1_val, reg2_val;
> +
> +	/* TODO: check for Ryzen quirk */
> +	if (1)
> +		return;
> +
> +	reg1_val = cx_read(TC_REQ); /* read-only */
> +	reg2_val = cx_read(TC_REQ_SET);
>  
>  	if (reg1_val && reg2_val) {
>  		cx_write(TC_REQ, reg1_val);
>
>
>
> Thanks,
> Mauro


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-06 16:37   ` Brad Love
@ 2018-12-06 17:18     ` Markus Dobel
  2018-12-06 18:01       ` Mauro Carvalho Chehab
  0 siblings, 1 reply; 26+ messages in thread
From: Markus Dobel @ 2018-12-06 17:18 UTC (permalink / raw)
  To: Brad Love, Mauro Carvalho Chehab; +Cc: linux-media

Hi everyone,

I will try if the hack mentioned fixes the issue for me on the weekend (but I assume, as if effectively removes the function).

Just in case this is of interest, I neither have Ryzen nor Intel, but an HP Microserver G7 with an AMD Turion II Neo  N54L, so the machine is more on the slow side. 

Regards, Markus
-- 
Gesendet mit zwei Streichhölzern, einem Gummiband und etwas Draht.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-06 17:18     ` Markus Dobel
@ 2018-12-06 18:01       ` Mauro Carvalho Chehab
  2018-12-06 18:36         ` Alex Deucher
  2018-12-16 10:37         ` [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes Markus Dobel
  0 siblings, 2 replies; 26+ messages in thread
From: Mauro Carvalho Chehab @ 2018-12-06 18:01 UTC (permalink / raw)
  To: Markus Dobel; +Cc: Brad Love, linux-media

Em Thu, 06 Dec 2018 18:18:23 +0100
Markus Dobel <markus.dobel@gmx.de> escreveu:

> Hi everyone,
> 
> I will try if the hack mentioned fixes the issue for me on the weekend (but I assume, as if effectively removes the function).

It should, but it keeps a few changes. Just want to be sure that what
would be left won't cause issues. If this works, the logic that would
solve Ryzen DMA fixes will be contained into a single point, making
easier to maintain it.

> 
> Just in case this is of interest, I neither have Ryzen nor Intel, but an HP Microserver G7 with an AMD Turion II Neo  N54L, so the machine is more on the slow side. 

Good to know. It would probably worth to check if this Ryzen
bug occors with all versions of it or with just a subset.
I mean: maybe it is only at the first gen or Ryzen and doesn't
affect Ryzen 2 (or vice versa).

The PCI quirks logic will likely need to detect the PCI ID of
the memory controllers found at the buggy CPUs, in order to enable
the quirk only for the affected ones.

It could be worth talking with AMD people in order to be sure about
the differences at the DMA engine side.

Thanks,
Mauro

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-06 18:01       ` Mauro Carvalho Chehab
@ 2018-12-06 18:36         ` Alex Deucher
  2018-12-06 19:07           ` Mauro Carvalho Chehab
  2018-12-16 10:37         ` [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes Markus Dobel
  1 sibling, 1 reply; 26+ messages in thread
From: Alex Deucher @ 2018-12-06 18:36 UTC (permalink / raw)
  To: mchehab; +Cc: markus.dobel, Brad Love, linux-media

On Thu, Dec 6, 2018 at 1:05 PM Mauro Carvalho Chehab <mchehab@kernel.org> wrote:
>
> Em Thu, 06 Dec 2018 18:18:23 +0100
> Markus Dobel <markus.dobel@gmx.de> escreveu:
>
> > Hi everyone,
> >
> > I will try if the hack mentioned fixes the issue for me on the weekend (but I assume, as if effectively removes the function).
>
> It should, but it keeps a few changes. Just want to be sure that what
> would be left won't cause issues. If this works, the logic that would
> solve Ryzen DMA fixes will be contained into a single point, making
> easier to maintain it.
>
> >
> > Just in case this is of interest, I neither have Ryzen nor Intel, but an HP Microserver G7 with an AMD Turion II Neo  N54L, so the machine is more on the slow side.
>
> Good to know. It would probably worth to check if this Ryzen
> bug occors with all versions of it or with just a subset.
> I mean: maybe it is only at the first gen or Ryzen and doesn't
> affect Ryzen 2 (or vice versa).

The original commit also mentions some Xeons are affected too.  Seems
like this is potentially an issue on the device side rather than the
platform.

>
> The PCI quirks logic will likely need to detect the PCI ID of
> the memory controllers found at the buggy CPUs, in order to enable
> the quirk only for the affected ones.
>
> It could be worth talking with AMD people in order to be sure about
> the differences at the DMA engine side.
>

It's not clear to me what the pci or platform quirk would do.  The
workaround seems to be in the driver, not on the platform.

Alex

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-06 18:36         ` Alex Deucher
@ 2018-12-06 19:07           ` Mauro Carvalho Chehab
  2018-12-06 19:32             ` Mauro Carvalho Chehab
  0 siblings, 1 reply; 26+ messages in thread
From: Mauro Carvalho Chehab @ 2018-12-06 19:07 UTC (permalink / raw)
  To: Alex Deucher; +Cc: markus.dobel, Brad Love, linux-media

Em Thu, 6 Dec 2018 13:36:24 -0500
Alex Deucher <alexdeucher@gmail.com> escreveu:

> On Thu, Dec 6, 2018 at 1:05 PM Mauro Carvalho Chehab <mchehab@kernel.org> wrote:
> >
> > Em Thu, 06 Dec 2018 18:18:23 +0100
> > Markus Dobel <markus.dobel@gmx.de> escreveu:
> >  
> > > Hi everyone,
> > >
> > > I will try if the hack mentioned fixes the issue for me on the weekend (but I assume, as if effectively removes the function).  
> >
> > It should, but it keeps a few changes. Just want to be sure that what
> > would be left won't cause issues. If this works, the logic that would
> > solve Ryzen DMA fixes will be contained into a single point, making
> > easier to maintain it.
> >  
> > >
> > > Just in case this is of interest, I neither have Ryzen nor Intel, but an HP Microserver G7 with an AMD Turion II Neo  N54L, so the machine is more on the slow side.  
> >
> > Good to know. It would probably worth to check if this Ryzen
> > bug occors with all versions of it or with just a subset.
> > I mean: maybe it is only at the first gen or Ryzen and doesn't
> > affect Ryzen 2 (or vice versa).  
> 
> The original commit also mentions some Xeons are affected too.  Seems
> like this is potentially an issue on the device side rather than the
> platform.

Maybe.

> >
> > The PCI quirks logic will likely need to detect the PCI ID of
> > the memory controllers found at the buggy CPUs, in order to enable
> > the quirk only for the affected ones.
> >
> > It could be worth talking with AMD people in order to be sure about
> > the differences at the DMA engine side.
> >  
> 
> It's not clear to me what the pci or platform quirk would do.  The
> workaround seems to be in the driver, not on the platform.

Yeah, the fix should be at the driver, but pci/quirk.c would be able
to detect memory controllers that would require a hack inside the
driver, in a similar way to what the media PCI drivers already do
for DMA controllers that don't support pci2pci transfers.

There, basically the PCI core (drivers/pci/pci.c and 
drivers/pci/quirks.c) sets a flag (pci_pci_problems) indicating
potential issues.

Then, the driver compares such flag in order to enable the specific quirk.

Ok, there would be a different way to handle it. The driver could use a 
logic similar to the one I wrote for drivers/edac/i7core_edac.c. There,
the logic seeks for some specific PCI device IDs using pci_get_device(),
adjusting the code accordingly, depending on the detected PCI devices.

In other words, running something like this (untested), at probe time should
produce similar results:

	/*
	 * FIXME: It probably makes sense to also be able to identify specific
	 * versions of the same PCI ID, just in case a latter stepping got a
	 * fix for the issue.
	 */
	const static struct {
		int vendor, dev;
	} broken_dev_id[] = {
		PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_foo,
		PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_bar,
	},

	bool cx23885_does_dma_require_reset(void) 
	{
		int i;
		struct pci_dev *pdev = NULL;

		for (i = 0; i < sizeof(broken_dev_id); i++) {
			pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
			if (pdev) {
				pci_put_device(pdev);
				return true;
			}
		}
		return false;
	}

Should work. In any case, we need to know what memory controllers 
have problems, and what are their PCI IDs, and add them (if not there yet)
at include/linux/pci_ids.h

Thanks,
Mauro

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-06 19:07           ` Mauro Carvalho Chehab
@ 2018-12-06 19:32             ` Mauro Carvalho Chehab
  2018-12-18 22:59               ` [PATCH v2] cx23885: only reset DMA on problematic CPUs Brad Love
  0 siblings, 1 reply; 26+ messages in thread
From: Mauro Carvalho Chehab @ 2018-12-06 19:32 UTC (permalink / raw)
  To: Alex Deucher; +Cc: markus.dobel, Brad Love, linux-media

Em Thu, 6 Dec 2018 17:07:52 -0200
Mauro Carvalho Chehab <mchehab@kernel.org> escreveu:

> Em Thu, 6 Dec 2018 13:36:24 -0500
> Alex Deucher <alexdeucher@gmail.com> escreveu:
> 
> > On Thu, Dec 6, 2018 at 1:05 PM Mauro Carvalho Chehab <mchehab@kernel.org> wrote:  
> > >
> > > Em Thu, 06 Dec 2018 18:18:23 +0100
> > > Markus Dobel <markus.dobel@gmx.de> escreveu:
> > >    
> > > > Hi everyone,
> > > >
> > > > I will try if the hack mentioned fixes the issue for me on the weekend (but I assume, as if effectively removes the function).    
> > >
> > > It should, but it keeps a few changes. Just want to be sure that what
> > > would be left won't cause issues. If this works, the logic that would
> > > solve Ryzen DMA fixes will be contained into a single point, making
> > > easier to maintain it.
> > >    
> > > >
> > > > Just in case this is of interest, I neither have Ryzen nor Intel, but an HP Microserver G7 with an AMD Turion II Neo  N54L, so the machine is more on the slow side.    
> > >
> > > Good to know. It would probably worth to check if this Ryzen
> > > bug occors with all versions of it or with just a subset.
> > > I mean: maybe it is only at the first gen or Ryzen and doesn't
> > > affect Ryzen 2 (or vice versa).    
> > 
> > The original commit also mentions some Xeons are affected too.  Seems
> > like this is potentially an issue on the device side rather than the
> > platform.  
> 
> Maybe.
> 
> > >
> > > The PCI quirks logic will likely need to detect the PCI ID of
> > > the memory controllers found at the buggy CPUs, in order to enable
> > > the quirk only for the affected ones.
> > >
> > > It could be worth talking with AMD people in order to be sure about
> > > the differences at the DMA engine side.
> > >    
> > 
> > It's not clear to me what the pci or platform quirk would do.  The
> > workaround seems to be in the driver, not on the platform.  
> 
> Yeah, the fix should be at the driver, but pci/quirk.c would be able
> to detect memory controllers that would require a hack inside the
> driver, in a similar way to what the media PCI drivers already do
> for DMA controllers that don't support pci2pci transfers.
> 
> There, basically the PCI core (drivers/pci/pci.c and 
> drivers/pci/quirks.c) sets a flag (pci_pci_problems) indicating
> potential issues.
> 
> Then, the driver compares such flag in order to enable the specific quirk.
> 
> Ok, there would be a different way to handle it. The driver could use a 
> logic similar to the one I wrote for drivers/edac/i7core_edac.c. There,
> the logic seeks for some specific PCI device IDs using pci_get_device(),
> adjusting the code accordingly, depending on the detected PCI devices.
> 
> In other words, running something like this (untested), at probe time should
> produce similar results:
> 
> 	/*
> 	 * FIXME: It probably makes sense to also be able to identify specific
> 	 * versions of the same PCI ID, just in case a latter stepping got a
> 	 * fix for the issue.
> 	 */
> 	const static struct {
> 		int vendor, dev;
> 	} broken_dev_id[] = {
> 		PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_foo,
> 		PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_bar,
> 	},
> 
> 	bool cx23885_does_dma_require_reset(void) 
> 	{
> 		int i;
> 		struct pci_dev *pdev = NULL;
> 
> 		for (i = 0; i < sizeof(broken_dev_id); i++) {
> 			pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
> 			if (pdev) {
> 				pci_put_device(pdev);
> 				return true;
> 			}
> 		}
> 		return false;
> 	}
> 
> Should work. In any case, we need to know what memory controllers 
> have problems, and what are their PCI IDs, and add them (if not there yet)
> at include/linux/pci_ids.h
> 
> Thanks,
> Mauro

To be clearer, I'm thinking on something like the (untested)
code below (untested).

PS.: the PCI ID used there may be wrong. I just added one in
order to have a proof of concept.

Thanks,
Mauro

[PATCH] media: cx23885: only reset DMA on problematic CPUs

It is reported that changeset 95f408bbc4e4 ("media: cx23885:
Ryzen DMA related RiSC engine stall fixes") caused regresssions
with other CPUs.

Ensure that the quirk will be applied only for the CPUs that
are known to cause problems.

Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>

diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
index 39804d830305..48da7d194cc1 100644
--- a/drivers/media/pci/cx23885/cx23885-core.c
+++ b/drivers/media/pci/cx23885/cx23885-core.c
@@ -23,6 +23,7 @@
 #include <linux/moduleparam.h>
 #include <linux/kmod.h>
 #include <linux/kernel.h>
+#include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
@@ -603,8 +604,13 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
 
 static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
 {
-	uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
-	uint32_t reg2_val = cx_read(TC_REQ_SET);
+	uint32_t reg1_val, reg2_val;
+
+	if (!dev->need_dma_reset)
+		return;
+
+	reg1_val = cx_read(TC_REQ); /* read-only */
+	reg2_val = cx_read(TC_REQ_SET);
 
 	if (reg1_val && reg2_val) {
 		cx_write(TC_REQ, reg1_val);
@@ -2058,6 +2064,31 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
 	/* TODO: 23-19 */
 }
 
+static struct {
+	int vendor, dev;
+} const broken_dev_id[] = {
+	/* According with
+	 * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
+	 * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
+	 */
+	{ PCI_VENDOR_ID_AMD, 0x1451 },
+};
+
+static bool cx23885_does_need_dma_reset(void)
+{
+	int i;
+	struct pci_dev *pdev = NULL;
+
+	for (i = 0; i < sizeof(broken_dev_id); i++) {
+		pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
+		if (pdev) {
+			pci_dev_put(pdev);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int cx23885_initdev(struct pci_dev *pci_dev,
 			   const struct pci_device_id *pci_id)
 {
@@ -2069,6 +2100,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
 	if (NULL == dev)
 		return -ENOMEM;
 
+	dev->need_dma_reset = cx23885_does_need_dma_reset();
+
 	err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
 	if (err < 0)
 		goto fail_free;
diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
index d54c7ee1ab21..cf965efabe66 100644
--- a/drivers/media/pci/cx23885/cx23885.h
+++ b/drivers/media/pci/cx23885/cx23885.h
@@ -451,6 +451,8 @@ struct cx23885_dev {
 	/* Analog raw audio */
 	struct cx23885_audio_dev   *audio_dev;
 
+	/* Does the system require periodic DMA resets? */
+	unsigned int		need_dma_reset:1;
 };
 
 static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-06 18:01       ` Mauro Carvalho Chehab
  2018-12-06 18:36         ` Alex Deucher
@ 2018-12-16 10:37         ` Markus Dobel
  2018-12-16 14:23           ` Mauro Carvalho Chehab
  1 sibling, 1 reply; 26+ messages in thread
From: Markus Dobel @ 2018-12-16 10:37 UTC (permalink / raw)
  To: Mauro Carvalho Chehab; +Cc: Brad Love, linux-media, linux-media-owner

On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:
> Em Thu, 06 Dec 2018 18:18:23 +0100
> Markus Dobel <markus.dobel@gmx.de> escreveu:
> 
>> Hi everyone,
>> 
>> I will try if the hack mentioned fixes the issue for me on the weekend 
>> (but I assume, as if effectively removes the function).
> 
> It should, but it keeps a few changes. Just want to be sure that what
> would be left won't cause issues. If this works, the logic that would
> solve Ryzen DMA fixes will be contained into a single point, making
> easier to maintain it.

Hi,

I wanted to have this setup running stable for a few days before 
replying, that's why I am answering only now.

But yes, as expected, with Mauro's hack, the driver has been stable for 
me for about a week, with several
scheduled recordings in tvheadend, none of them missed.

So, adding a reliable detection for affected chipsets, where the `if 
(1)` currently is, should work.

Regards,
Markus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-16 10:37         ` [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes Markus Dobel
@ 2018-12-16 14:23           ` Mauro Carvalho Chehab
  2018-12-18  2:05             ` Alex Deucher
  0 siblings, 1 reply; 26+ messages in thread
From: Mauro Carvalho Chehab @ 2018-12-16 14:23 UTC (permalink / raw)
  To: Markus Dobel, Brad Love, Alex Deucher; +Cc: linux-media, linux-media-owner

Em Sun, 16 Dec 2018 11:37:02 +0100
Markus Dobel <markus.dobel@gmx.de> escreveu:

> On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:
> > Em Thu, 06 Dec 2018 18:18:23 +0100
> > Markus Dobel <markus.dobel@gmx.de> escreveu:
> >   
> >> Hi everyone,
> >> 
> >> I will try if the hack mentioned fixes the issue for me on the weekend 
> >> (but I assume, as if effectively removes the function).  
> > 
> > It should, but it keeps a few changes. Just want to be sure that what
> > would be left won't cause issues. If this works, the logic that would
> > solve Ryzen DMA fixes will be contained into a single point, making
> > easier to maintain it.  
> 
> Hi,
> 
> I wanted to have this setup running stable for a few days before 
> replying, that's why I am answering only now.
> 
> But yes, as expected, with Mauro's hack, the driver has been stable for 
> me for about a week, with several
> scheduled recordings in tvheadend, none of them missed.
> 
> So, adding a reliable detection for affected chipsets, where the `if 
> (1)` currently is, should work.

Markus,

Thanks for testing!

Brad/Alex,

I guess we should then stick with this patch:
	https://patchwork.linuxtv.org/patch/53351/

The past approach that we used on cx88, bttv and other old drivers
were to patch drivers/pci/quirks.c, making them to "taint" DMA
memory controllers that were known to bad affect on media devices,
and then some logic at the drivers to check for such "taint".

However, that would require to touch another subsystem, with
usually cause delays. Also, as Alex pointed, this could well
be just a matter of incompatibility between the cx23885 and
the Ryzen DMA controller, and may not affect any other drivers.

So, let's start with a logic like what I proposed, fine
tuning it to the Ryzen DMA controllers with we know have
troubles with the driver. 

We need to list the PCI ID of the memory controllers at the
device ID table on that patch, though. At the RFC patch,
I just added an IOMMU PCI ID from a randon Ryzen CPU:

	+static struct {
	+	int vendor, dev;
	+} const broken_dev_id[] = {
	+	/* According with
	+	 * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
	+	 * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
	+	 */
	+	{ PCI_VENDOR_ID_AMD, 0x1451 },
	+};
	+

Ideally, the ID for the affected Ryzen DMA engines should be there at
include/linux/pci_ids.h, instead of hard-coded inside a driver.

Also, we should, instead, add there the PCI IDs of the DMA engines
that are known to have problems with the cx23885.

There one thing that still bothers me: could this problem be due to
some BIOS setup [1]? If so, are there any ways for dynamically
disabling such features inside the driver?

[1] like this: https://www.techarp.com/bios-guide/cpu-pci-write-buffer/

Brad,

From your reports about the DMA issues, do you know what generations
of the Ryzen are affected? 

Alex,

Do you know if are there any differences at the IP block for the
DMA engine used on different Ryzen CPUs? I mean: I suspect that
the engine for Ryzen 2nd generation would likely be different than 
the one at the 1st generation, but, along the same generation, does
the Ryzen 3, 5, 7 and Threadripper use the same DMA engine?

Thanks,
Mauro

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-16 14:23           ` Mauro Carvalho Chehab
@ 2018-12-18  2:05             ` Alex Deucher
  2018-12-18  6:32               ` Markus Dobel
  2018-12-18 12:45               ` Mauro Carvalho Chehab
  0 siblings, 2 replies; 26+ messages in thread
From: Alex Deucher @ 2018-12-18  2:05 UTC (permalink / raw)
  To: mchehab, Suthikulpanit, Suravee
  Cc: Markus Dobel, Brad Love, linux-media, linux-media-owner

On Sun, Dec 16, 2018 at 9:23 AM Mauro Carvalho Chehab
<mchehab@kernel.org> wrote:
>
> Em Sun, 16 Dec 2018 11:37:02 +0100
> Markus Dobel <markus.dobel@gmx.de> escreveu:
>
> > On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:
> > > Em Thu, 06 Dec 2018 18:18:23 +0100
> > > Markus Dobel <markus.dobel@gmx.de> escreveu:
> > >
> > >> Hi everyone,
> > >>
> > >> I will try if the hack mentioned fixes the issue for me on the weekend
> > >> (but I assume, as if effectively removes the function).
> > >
> > > It should, but it keeps a few changes. Just want to be sure that what
> > > would be left won't cause issues. If this works, the logic that would
> > > solve Ryzen DMA fixes will be contained into a single point, making
> > > easier to maintain it.
> >
> > Hi,
> >
> > I wanted to have this setup running stable for a few days before
> > replying, that's why I am answering only now.
> >
> > But yes, as expected, with Mauro's hack, the driver has been stable for
> > me for about a week, with several
> > scheduled recordings in tvheadend, none of them missed.
> >
> > So, adding a reliable detection for affected chipsets, where the `if
> > (1)` currently is, should work.
>
> Markus,
>
> Thanks for testing!
>
> Brad/Alex,
>
> I guess we should then stick with this patch:
>         https://patchwork.linuxtv.org/patch/53351/
>
> The past approach that we used on cx88, bttv and other old drivers
> were to patch drivers/pci/quirks.c, making them to "taint" DMA
> memory controllers that were known to bad affect on media devices,
> and then some logic at the drivers to check for such "taint".
>
> However, that would require to touch another subsystem, with
> usually cause delays. Also, as Alex pointed, this could well
> be just a matter of incompatibility between the cx23885 and
> the Ryzen DMA controller, and may not affect any other drivers.
>
> So, let's start with a logic like what I proposed, fine
> tuning it to the Ryzen DMA controllers with we know have
> troubles with the driver.
>
> We need to list the PCI ID of the memory controllers at the
> device ID table on that patch, though. At the RFC patch,
> I just added an IOMMU PCI ID from a randon Ryzen CPU:
>
>         +static struct {
>         +       int vendor, dev;
>         +} const broken_dev_id[] = {
>         +       /* According with
>         +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
>         +        * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
>         +        */
>         +       { PCI_VENDOR_ID_AMD, 0x1451 },
>         +};
>         +
>
> Ideally, the ID for the affected Ryzen DMA engines should be there at
> include/linux/pci_ids.h, instead of hard-coded inside a driver.
>
> Also, we should, instead, add there the PCI IDs of the DMA engines
> that are known to have problems with the cx23885.

These aren't really DMA engines.  Isn't this just the pcie bridge on the CPU?


>
> There one thing that still bothers me: could this problem be due to
> some BIOS setup [1]? If so, are there any ways for dynamically
> disabling such features inside the driver?
>
> [1] like this: https://www.techarp.com/bios-guide/cpu-pci-write-buffer/
>

possibly?  It's still not clear to me that this is specific to ryzen
chips rather than a problem with the DMA setup on the cx board.  Is
there a downside to enabling the workaround in general?  The original
commit mentioned that xeon platforms were affected as well.  Is it
possible it's just particular platforms with wonky bioses?  Maybe DMI
matching would be better?

> Brad,
>
> From your reports about the DMA issues, do you know what generations
> of the Ryzen are affected?
>
> Alex,
>
> Do you know if are there any differences at the IP block for the
> DMA engine used on different Ryzen CPUs? I mean: I suspect that
> the engine for Ryzen 2nd generation would likely be different than
> the one at the 1st generation, but, along the same generation, does
> the Ryzen 3, 5, 7 and Threadripper use the same DMA engine?

+ Suravee.  I'm not really familiar with the changes, if any, that are
in the pcie bridges on various AMD CPUs.  Or if there are changes, it
would be hard to say whether this issue would affect them or not.

Alex

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-18  2:05             ` Alex Deucher
@ 2018-12-18  6:32               ` Markus Dobel
  2018-12-18 12:45               ` Mauro Carvalho Chehab
  1 sibling, 0 replies; 26+ messages in thread
From: Markus Dobel @ 2018-12-18  6:32 UTC (permalink / raw)
  To: Alex Deucher, mchehab, Suthikulpanit, Suravee
  Cc: Brad Love, linux-media, linux-media-owner



Am 18. Dezember 2018 03:05:11 MEZ schrieb Alex Deucher <alexdeucher@gmail.com>:

>possibly?  It's still not clear to me that this is specific to ryzen
>chips rather than a problem with the DMA setup on the cx board.  Is
>there a downside to enabling the workaround in general?  

Hi Alex,

yes, there is. At least for me, the resetting function breaks the driver, making the card unresponsive after a few hours of uptime. Without that function, the card is perfectly stable.

Markus

-- 
Gesendet mit zwei Streichhölzern, einem Gummiband und etwas Draht.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-18  2:05             ` Alex Deucher
  2018-12-18  6:32               ` Markus Dobel
@ 2018-12-18 12:45               ` Mauro Carvalho Chehab
  2018-12-18 23:11                 ` Brad Love
  2018-12-18 23:46                 ` Alex Deucher
  1 sibling, 2 replies; 26+ messages in thread
From: Mauro Carvalho Chehab @ 2018-12-18 12:45 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Suthikulpanit, Suravee, Markus Dobel, Brad Love, linux-media,
	linux-media-owner

Em Mon, 17 Dec 2018 21:05:11 -0500
Alex Deucher <alexdeucher@gmail.com> escreveu:

> On Sun, Dec 16, 2018 at 9:23 AM Mauro Carvalho Chehab
> <mchehab@kernel.org> wrote:
> >
> > Em Sun, 16 Dec 2018 11:37:02 +0100
> > Markus Dobel <markus.dobel@gmx.de> escreveu:
> >  
> > > On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:  
> > > > Em Thu, 06 Dec 2018 18:18:23 +0100
> > > > Markus Dobel <markus.dobel@gmx.de> escreveu:
> > > >  
> > > >> Hi everyone,
> > > >>
> > > >> I will try if the hack mentioned fixes the issue for me on the weekend
> > > >> (but I assume, as if effectively removes the function).  
> > > >
> > > > It should, but it keeps a few changes. Just want to be sure that what
> > > > would be left won't cause issues. If this works, the logic that would
> > > > solve Ryzen DMA fixes will be contained into a single point, making
> > > > easier to maintain it.  
> > >
> > > Hi,
> > >
> > > I wanted to have this setup running stable for a few days before
> > > replying, that's why I am answering only now.
> > >
> > > But yes, as expected, with Mauro's hack, the driver has been stable for
> > > me for about a week, with several
> > > scheduled recordings in tvheadend, none of them missed.
> > >
> > > So, adding a reliable detection for affected chipsets, where the `if
> > > (1)` currently is, should work.  
> >
> > Markus,
> >
> > Thanks for testing!
> >
> > Brad/Alex,
> >
> > I guess we should then stick with this patch:
> >         https://patchwork.linuxtv.org/patch/53351/
> >
> > The past approach that we used on cx88, bttv and other old drivers
> > were to patch drivers/pci/quirks.c, making them to "taint" DMA
> > memory controllers that were known to bad affect on media devices,
> > and then some logic at the drivers to check for such "taint".
> >
> > However, that would require to touch another subsystem, with
> > usually cause delays. Also, as Alex pointed, this could well
> > be just a matter of incompatibility between the cx23885 and
> > the Ryzen DMA controller, and may not affect any other drivers.
> >
> > So, let's start with a logic like what I proposed, fine
> > tuning it to the Ryzen DMA controllers with we know have
> > troubles with the driver.
> >
> > We need to list the PCI ID of the memory controllers at the
> > device ID table on that patch, though. At the RFC patch,
> > I just added an IOMMU PCI ID from a randon Ryzen CPU:
> >
> >         +static struct {
> >         +       int vendor, dev;
> >         +} const broken_dev_id[] = {
> >         +       /* According with
> >         +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
> >         +        * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
> >         +        */
> >         +       { PCI_VENDOR_ID_AMD, 0x1451 },
> >         +};
> >         +
> >
> > Ideally, the ID for the affected Ryzen DMA engines should be there at
> > include/linux/pci_ids.h, instead of hard-coded inside a driver.
> >
> > Also, we should, instead, add there the PCI IDs of the DMA engines
> > that are known to have problems with the cx23885.  
> 
> These aren't really DMA engines.  Isn't this just the pcie bridge on the CPU?

Yeah, it is not the DMA engine itself, but the CPU/chipset support for it.

Let me be a little clearer. The Conexant chipsets for PCI/PCIe engines 
have internally a RISC CPU that it is programmed, in runtime, to do
DMA scatter/gather. The actual DMA engine is there. For it to work, the
Northbridge (or the CPU chipset - as nowadays several chipsets integrated
the Northbridge inside an IP block at the CPU) has to do the counter part,
by allowing the board's DMA engine to access the mainboard's main memory,
usually via IOMMU, in a safe way[1].

[1] preventing memory corruption if two devices try to do DMA to the
same area, or if the DMA from the board tries to write at the same
time the CPU tries to access it.

Media PCI boards usually push the DMA logic to unusual conditions, as
a large amount of data is transferred, in a synchronous way,
between the PCIe card and memory.

If the video stream is recorded, the same physical memory DMA mapped area
where the data is written by the video board could be used on another DMA
transfer via the HD disk controller.

It is even possible to setup the Conexant's DMA engine to do transfers 
directly to the GPU's internal memory, causing a PCI to PCI DMA transfer,
using V4L2 API overlay mode.

There was a time where it used to be common to have Intel CPUs (or
Intel-compatible CPUs) using non-Intel North Bridges. On such time,
we've seen a lot of troubles with PCI to PCI transfers most of them
when using non-Intel north bridges. 

With some north bridges, having the same block of memory mapped
for two DMA operations (where memory writes come from the video
card and memory reads from the HD disk controller) was also
problematic, as the IOMMU had issues on managing two kinds of
transfer for the same physical memory block.

The report we have on the 95f408bb commit is:

   "media: cx23885: Ryzen DMA related RiSC engine stall fixes
    
    This bug affects all of Hauppauge QuadHD boards when used on all Ryzen
    platforms and some XEON platforms. On these platforms it is possible to
    error out the RiSC engine and cause it to stall, whereafter the only
    way to reset the board to a working state is to reboot.
...
    [  255.663598] cx23885: cx23885[0]: mpeg risc op code error"

Brad could fill more details here, but I've seen the "risc op code
error" before with bt878 and cx88 chipsets (with use a similar RISC).
We usually get such error when there's a problem with the North Bridge
that was not capable of doing their part at the DMA transfer.

As far as I know, the Hauppauge QuadHD boards can receive 4 different
HD MPEG-TS streams (either from cable or air transmissions). On cable,
one transponder can have up to ~40 Mbits/second. So, this board will
produce 4 streams of up to 40 Mbps each, happening on different times,
each filled in a synchronous way. As nobody watches 4 channels at
the same time, it is safe to assume that at least 3 channels will
be recorded (if not all 4 channels). So, we're talking about 320 MBps
of traffic that may be competing with other DMA traffic (including
some from the Kernel itself, in order to handle memory swap).

That can be recording channels for several weeks.

This usually pushes the North Bridge into their limits, and could
be revealing some North Bridge/IOMMU issues that it would otherwise 
be not noticed under normal traffic.

> >
> > There one thing that still bothers me: could this problem be due to
> > some BIOS setup [1]? If so, are there any ways for dynamically
> > disabling such features inside the driver?
> >
> > [1] like this: https://www.techarp.com/bios-guide/cpu-pci-write-buffer/
> >  
> 
> possibly?  It's still not clear to me that this is specific to ryzen
> chips rather than a problem with the DMA setup on the cx board.  Is
> there a downside to enabling the workaround in general? 

The problem here is that the code with resets the DMA engine (required
for it to work with Ryzen) causes trouble with non-Ryzen North Bridges.

So, one solution that would fit all doesn't seem to exist.

> The original commit mentioned that xeon platforms were affected as well.

Xeon uses different chipsets and a different solution for the North
Bridge functionality, with may explain why some Xeon CPUs have the
same issue.

> Is it possible it's just particular platforms with wonky bioses? 

Good point. Yeah, it could be triggered by a wonky bios or a bad setup 
(like enabling overclock or activating some chipset-specific feature
that would increase the chance for a DMA transfer to fail).

> Maybe DMI matching would be better?

Mapping via DMI could work too, but it would be a way harder to map,
as one would need to have a cx23885 board (if possible one with 4
tuners) and a series of different machines in order to test it.

Based with previous experiences with bttv and cx88, I suspect that
we'll end by needing to map all machines with the same chipset.

> 
> > Brad,
> >
> > From your reports about the DMA issues, do you know what generations
> > of the Ryzen are affected?
> >
> > Alex,
> >
> > Do you know if are there any differences at the IP block for the
> > DMA engine used on different Ryzen CPUs? I mean: I suspect that
> > the engine for Ryzen 2nd generation would likely be different than
> > the one at the 1st generation, but, along the same generation, does
> > the Ryzen 3, 5, 7 and Threadripper use the same DMA engine?  
> 
> + Suravee.  I'm not really familiar with the changes, if any, that are
> in the pcie bridges on various AMD CPUs.  Or if there are changes, it
> would be hard to say whether this issue would affect them or not.
> 
> Alex



Thanks,
Mauro

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2] cx23885: only reset DMA on problematic CPUs
  2018-12-06 19:32             ` Mauro Carvalho Chehab
@ 2018-12-18 22:59               ` Brad Love
  2018-12-18 23:49                 ` Alex Deucher
                                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Brad Love @ 2018-12-18 22:59 UTC (permalink / raw)
  To: linux-media, mchehab, markus.dobel, alexdeucher; +Cc: Brad Love

It is reported that commit 95f408bbc4e4 ("media: cx23885: Ryzen DMA
related RiSC engine stall fixes") caused regresssions with other CPUs.

Ensure that the quirk will be applied only for the CPUs that
are known to cause problems.

A module option is added for explicit control of the behaviour.

Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")

Signed-off-by: Brad Love <brad@nextdimension.cc>
---
Changes since v1:
- Added module option for three way control
- Removed '7' from pci id description, Ryzen 3 is the same id

 drivers/media/pci/cx23885/cx23885-core.c | 54 ++++++++++++++++++++++++++++++--
 drivers/media/pci/cx23885/cx23885.h      |  2 ++
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
index 39804d8..fb721c7 100644
--- a/drivers/media/pci/cx23885/cx23885-core.c
+++ b/drivers/media/pci/cx23885/cx23885-core.c
@@ -23,6 +23,7 @@
 #include <linux/moduleparam.h>
 #include <linux/kmod.h>
 #include <linux/kernel.h>
+#include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
@@ -41,6 +42,18 @@ MODULE_AUTHOR("Steven Toth <stoth@linuxtv.org>");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(CX23885_VERSION);
 
+/*
+ * Some platforms have been found to require periodic resetting of the DMA
+ * engine. Ryzen and XEON platforms are known to be affected. The symptom
+ * encountered is "mpeg risc op code error". Only Ryzen platforms employ
+ * this workaround if the option equals 1. The workaround can be explicitly
+ * disabled for all platforms by setting to 0, the workaround can be forced
+ * on for any platform by setting to 2.
+ */
+static unsigned int dma_reset_workaround = 1;
+module_param(dma_reset_workaround, int, 0644);
+MODULE_PARM_DESC(dma_reset_workaround, "periodic RiSC dma engine reset; 0-force disable, 1-driver detect (default), 2-force enable");
+
 static unsigned int debug;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "enable debug messages");
@@ -603,8 +616,13 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
 
 static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
 {
-	uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
-	uint32_t reg2_val = cx_read(TC_REQ_SET);
+	uint32_t reg1_val, reg2_val;
+
+	if (!dev->need_dma_reset)
+		return;
+
+	reg1_val = cx_read(TC_REQ); /* read-only */
+	reg2_val = cx_read(TC_REQ_SET);
 
 	if (reg1_val && reg2_val) {
 		cx_write(TC_REQ, reg1_val);
@@ -2058,6 +2076,36 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
 	/* TODO: 23-19 */
 }
 
+static struct {
+	int vendor, dev;
+} const broken_dev_id[] = {
+	/* According with
+	 * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
+	 * 0x1451 is PCI ID for the IOMMU found on Ryzen
+	 */
+	{ PCI_VENDOR_ID_AMD, 0x1451 },
+};
+
+static bool cx23885_does_need_dma_reset(void)
+{
+	int i;
+	struct pci_dev *pdev = NULL;
+
+	if (dma_reset_workaround == 0)
+		return false;
+	else if (dma_reset_workaround == 2)
+		return true;
+
+	for (i = 0; i < sizeof(broken_dev_id); i++) {
+		pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
+		if (pdev) {
+			pci_dev_put(pdev);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int cx23885_initdev(struct pci_dev *pci_dev,
 			   const struct pci_device_id *pci_id)
 {
@@ -2069,6 +2117,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
 	if (NULL == dev)
 		return -ENOMEM;
 
+	dev->need_dma_reset = cx23885_does_need_dma_reset();
+
 	err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
 	if (err < 0)
 		goto fail_free;
diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
index d54c7ee..cf965ef 100644
--- a/drivers/media/pci/cx23885/cx23885.h
+++ b/drivers/media/pci/cx23885/cx23885.h
@@ -451,6 +451,8 @@ struct cx23885_dev {
 	/* Analog raw audio */
 	struct cx23885_audio_dev   *audio_dev;
 
+	/* Does the system require periodic DMA resets? */
+	unsigned int		need_dma_reset:1;
 };
 
 static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-18 12:45               ` Mauro Carvalho Chehab
@ 2018-12-18 23:11                 ` Brad Love
  2018-12-18 23:46                 ` Alex Deucher
  1 sibling, 0 replies; 26+ messages in thread
From: Brad Love @ 2018-12-18 23:11 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Alex Deucher
  Cc: Suthikulpanit, Suravee, Markus Dobel, Brad Love, linux-media,
	linux-media-owner

Hi everyone,


On 18/12/2018 06.45, Mauro Carvalho Chehab wrote:
> Em Mon, 17 Dec 2018 21:05:11 -0500
> Alex Deucher <alexdeucher@gmail.com> escreveu:
>
>> On Sun, Dec 16, 2018 at 9:23 AM Mauro Carvalho Chehab
>> <mchehab@kernel.org> wrote:
>>> Em Sun, 16 Dec 2018 11:37:02 +0100
>>> Markus Dobel <markus.dobel@gmx.de> escreveu:
>>>  
>>>> On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:  
>>>>> Em Thu, 06 Dec 2018 18:18:23 +0100
>>>>> Markus Dobel <markus.dobel@gmx.de> escreveu:
>>>>>  
>>>>>> Hi everyone,
>>>>>>
>>>>>> I will try if the hack mentioned fixes the issue for me on the weekend
>>>>>> (but I assume, as if effectively removes the function).  
>>>>> It should, but it keeps a few changes. Just want to be sure that what
>>>>> would be left won't cause issues. If this works, the logic that would
>>>>> solve Ryzen DMA fixes will be contained into a single point, making
>>>>> easier to maintain it.  
>>>> Hi,
>>>>
>>>> I wanted to have this setup running stable for a few days before
>>>> replying, that's why I am answering only now.
>>>>
>>>> But yes, as expected, with Mauro's hack, the driver has been stable for
>>>> me for about a week, with several
>>>> scheduled recordings in tvheadend, none of them missed.
>>>>
>>>> So, adding a reliable detection for affected chipsets, where the `if
>>>> (1)` currently is, should work.  
>>> Markus,
>>>
>>> Thanks for testing!
>>>
>>> Brad/Alex,
>>>
>>> I guess we should then stick with this patch:
>>>         https://patchwork.linuxtv.org/patch/53351/
>>>
>>> The past approach that we used on cx88, bttv and other old drivers
>>> were to patch drivers/pci/quirks.c, making them to "taint" DMA
>>> memory controllers that were known to bad affect on media devices,
>>> and then some logic at the drivers to check for such "taint".
>>>
>>> However, that would require to touch another subsystem, with
>>> usually cause delays. Also, as Alex pointed, this could well
>>> be just a matter of incompatibility between the cx23885 and
>>> the Ryzen DMA controller, and may not affect any other drivers.
>>>
>>> So, let's start with a logic like what I proposed, fine
>>> tuning it to the Ryzen DMA controllers with we know have
>>> troubles with the driver.
>>>
>>> We need to list the PCI ID of the memory controllers at the
>>> device ID table on that patch, though. At the RFC patch,
>>> I just added an IOMMU PCI ID from a randon Ryzen CPU:
>>>
>>>         +static struct {
>>>         +       int vendor, dev;
>>>         +} const broken_dev_id[] = {
>>>         +       /* According with
>>>         +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
>>>         +        * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
>>>         +        */
>>>         +       { PCI_VENDOR_ID_AMD, 0x1451 },
>>>         +};
>>>         +
>>>
>>> Ideally, the ID for the affected Ryzen DMA engines should be there at
>>> include/linux/pci_ids.h, instead of hard-coded inside a driver.
>>>
>>> Also, we should, instead, add there the PCI IDs of the DMA engines
>>> that are known to have problems with the cx23885.  
>> These aren't really DMA engines.  Isn't this just the pcie bridge on the CPU?
> Yeah, it is not the DMA engine itself, but the CPU/chipset support for it.
>
> Let me be a little clearer. The Conexant chipsets for PCI/PCIe engines 
> have internally a RISC CPU that it is programmed, in runtime, to do
> DMA scatter/gather. The actual DMA engine is there. For it to work, the
> Northbridge (or the CPU chipset - as nowadays several chipsets integrated
> the Northbridge inside an IP block at the CPU) has to do the counter part,
> by allowing the board's DMA engine to access the mainboard's main memory,
> usually via IOMMU, in a safe way[1].
>
> [1] preventing memory corruption if two devices try to do DMA to the
> same area, or if the DMA from the board tries to write at the same
> time the CPU tries to access it.
>
> Media PCI boards usually push the DMA logic to unusual conditions, as
> a large amount of data is transferred, in a synchronous way,
> between the PCIe card and memory.
>
> If the video stream is recorded, the same physical memory DMA mapped area
> where the data is written by the video board could be used on another DMA
> transfer via the HD disk controller.
>
> It is even possible to setup the Conexant's DMA engine to do transfers 
> directly to the GPU's internal memory, causing a PCI to PCI DMA transfer,
> using V4L2 API overlay mode.
>
> There was a time where it used to be common to have Intel CPUs (or
> Intel-compatible CPUs) using non-Intel North Bridges. On such time,
> we've seen a lot of troubles with PCI to PCI transfers most of them
> when using non-Intel north bridges. 
>
> With some north bridges, having the same block of memory mapped
> for two DMA operations (where memory writes come from the video
> card and memory reads from the HD disk controller) was also
> problematic, as the IOMMU had issues on managing two kinds of
> transfer for the same physical memory block.
>
> The report we have on the 95f408bb commit is:
>
>    "media: cx23885: Ryzen DMA related RiSC engine stall fixes
>     
>     This bug affects all of Hauppauge QuadHD boards when used on all Ryzen
>     platforms and some XEON platforms. On these platforms it is possible to
>     error out the RiSC engine and cause it to stall, whereafter the only
>     way to reset the board to a working state is to reboot.
> ...
>     [  255.663598] cx23885: cx23885[0]: mpeg risc op code error"
>
> Brad could fill more details here, but I've seen the "risc op code
> error" before with bt878 and cx88 chipsets (with use a similar RISC).
> We usually get such error when there's a problem with the North Bridge
> that was not capable of doing their part at the DMA transfer.
>
> As far as I know, the Hauppauge QuadHD boards can receive 4 different
> HD MPEG-TS streams (either from cable or air transmissions). On cable,
> one transponder can have up to ~40 Mbits/second. So, this board will
> produce 4 streams of up to 40 Mbps each, happening on different times,
> each filled in a synchronous way. As nobody watches 4 channels at
> the same time, it is safe to assume that at least 3 channels will
> be recorded (if not all 4 channels). So, we're talking about 320 MBps
> of traffic that may be competing with other DMA traffic (including
> some from the Kernel itself, in order to handle memory swap).
>
> That can be recording channels for several weeks.
>
> This usually pushes the North Bridge into their limits, and could
> be revealing some North Bridge/IOMMU issues that it would otherwise 
> be not noticed under normal traffic.


Thanks for the detailed description Mauro. What you've said here is
pretty much my understanding.

I submitted a patch to the list and cc'd you all. I simply took Mauro's
patch and added a module option. The option is set to default enable for
Ryzen, and also have a force on and force off option. I added a comment
in the driver in case someone encounters this hereafter.

Regards,

Brad




>
>>> There one thing that still bothers me: could this problem be due to
>>> some BIOS setup [1]? If so, are there any ways for dynamically
>>> disabling such features inside the driver?
>>>
>>> [1] like this: https://www.techarp.com/bios-guide/cpu-pci-write-buffer/
>>>  
>> possibly?  It's still not clear to me that this is specific to ryzen
>> chips rather than a problem with the DMA setup on the cx board.  Is
>> there a downside to enabling the workaround in general? 
> The problem here is that the code with resets the DMA engine (required
> for it to work with Ryzen) causes trouble with non-Ryzen North Bridges.
>
> So, one solution that would fit all doesn't seem to exist.
>
>> The original commit mentioned that xeon platforms were affected as well.
> Xeon uses different chipsets and a different solution for the North
> Bridge functionality, with may explain why some Xeon CPUs have the
> same issue.
>
>> Is it possible it's just particular platforms with wonky bioses? 
> Good point. Yeah, it could be triggered by a wonky bios or a bad setup 
> (like enabling overclock or activating some chipset-specific feature
> that would increase the chance for a DMA transfer to fail).
>
>> Maybe DMI matching would be better?
> Mapping via DMI could work too, but it would be a way harder to map,
> as one would need to have a cx23885 board (if possible one with 4
> tuners) and a series of different machines in order to test it.
>
> Based with previous experiences with bttv and cx88, I suspect that
> we'll end by needing to map all machines with the same chipset.
>
>>> Brad,
>>>
>>> From your reports about the DMA issues, do you know what generations
>>> of the Ryzen are affected?
>>>
>>> Alex,
>>>
>>> Do you know if are there any differences at the IP block for the
>>> DMA engine used on different Ryzen CPUs? I mean: I suspect that
>>> the engine for Ryzen 2nd generation would likely be different than
>>> the one at the 1st generation, but, along the same generation, does
>>> the Ryzen 3, 5, 7 and Threadripper use the same DMA engine?  
>> + Suravee.  I'm not really familiar with the changes, if any, that are
>> in the pcie bridges on various AMD CPUs.  Or if there are changes, it
>> would be hard to say whether this issue would affect them or not.
>>
>> Alex
>
>
> Thanks,
> Mauro


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-18 12:45               ` Mauro Carvalho Chehab
  2018-12-18 23:11                 ` Brad Love
@ 2018-12-18 23:46                 ` Alex Deucher
  2018-12-19  0:05                   ` Brad Love
  1 sibling, 1 reply; 26+ messages in thread
From: Alex Deucher @ 2018-12-18 23:46 UTC (permalink / raw)
  To: mchehab
  Cc: Suthikulpanit, Suravee, Markus Dobel, Brad Love, linux-media,
	linux-media-owner

On Tue, Dec 18, 2018 at 7:45 AM Mauro Carvalho Chehab
<mchehab@kernel.org> wrote:
>
> Em Mon, 17 Dec 2018 21:05:11 -0500
> Alex Deucher <alexdeucher@gmail.com> escreveu:
>
> > On Sun, Dec 16, 2018 at 9:23 AM Mauro Carvalho Chehab
> > <mchehab@kernel.org> wrote:
> > >
> > > Em Sun, 16 Dec 2018 11:37:02 +0100
> > > Markus Dobel <markus.dobel@gmx.de> escreveu:
> > >
> > > > On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:
> > > > > Em Thu, 06 Dec 2018 18:18:23 +0100
> > > > > Markus Dobel <markus.dobel@gmx.de> escreveu:
> > > > >
> > > > >> Hi everyone,
> > > > >>
> > > > >> I will try if the hack mentioned fixes the issue for me on the weekend
> > > > >> (but I assume, as if effectively removes the function).
> > > > >
> > > > > It should, but it keeps a few changes. Just want to be sure that what
> > > > > would be left won't cause issues. If this works, the logic that would
> > > > > solve Ryzen DMA fixes will be contained into a single point, making
> > > > > easier to maintain it.
> > > >
> > > > Hi,
> > > >
> > > > I wanted to have this setup running stable for a few days before
> > > > replying, that's why I am answering only now.
> > > >
> > > > But yes, as expected, with Mauro's hack, the driver has been stable for
> > > > me for about a week, with several
> > > > scheduled recordings in tvheadend, none of them missed.
> > > >
> > > > So, adding a reliable detection for affected chipsets, where the `if
> > > > (1)` currently is, should work.
> > >
> > > Markus,
> > >
> > > Thanks for testing!
> > >
> > > Brad/Alex,
> > >
> > > I guess we should then stick with this patch:
> > >         https://patchwork.linuxtv.org/patch/53351/
> > >
> > > The past approach that we used on cx88, bttv and other old drivers
> > > were to patch drivers/pci/quirks.c, making them to "taint" DMA
> > > memory controllers that were known to bad affect on media devices,
> > > and then some logic at the drivers to check for such "taint".
> > >
> > > However, that would require to touch another subsystem, with
> > > usually cause delays. Also, as Alex pointed, this could well
> > > be just a matter of incompatibility between the cx23885 and
> > > the Ryzen DMA controller, and may not affect any other drivers.
> > >
> > > So, let's start with a logic like what I proposed, fine
> > > tuning it to the Ryzen DMA controllers with we know have
> > > troubles with the driver.
> > >
> > > We need to list the PCI ID of the memory controllers at the
> > > device ID table on that patch, though. At the RFC patch,
> > > I just added an IOMMU PCI ID from a randon Ryzen CPU:
> > >
> > >         +static struct {
> > >         +       int vendor, dev;
> > >         +} const broken_dev_id[] = {
> > >         +       /* According with
> > >         +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
> > >         +        * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
> > >         +        */
> > >         +       { PCI_VENDOR_ID_AMD, 0x1451 },
> > >         +};
> > >         +
> > >
> > > Ideally, the ID for the affected Ryzen DMA engines should be there at
> > > include/linux/pci_ids.h, instead of hard-coded inside a driver.
> > >
> > > Also, we should, instead, add there the PCI IDs of the DMA engines
> > > that are known to have problems with the cx23885.
> >
> > These aren't really DMA engines.  Isn't this just the pcie bridge on the CPU?
>
> Yeah, it is not the DMA engine itself, but the CPU/chipset support for it.
>
> Let me be a little clearer. The Conexant chipsets for PCI/PCIe engines
> have internally a RISC CPU that it is programmed, in runtime, to do
> DMA scatter/gather. The actual DMA engine is there. For it to work, the
> Northbridge (or the CPU chipset - as nowadays several chipsets integrated
> the Northbridge inside an IP block at the CPU) has to do the counter part,
> by allowing the board's DMA engine to access the mainboard's main memory,
> usually via IOMMU, in a safe way[1].
>
> [1] preventing memory corruption if two devices try to do DMA to the
> same area, or if the DMA from the board tries to write at the same
> time the CPU tries to access it.
>
> Media PCI boards usually push the DMA logic to unusual conditions, as
> a large amount of data is transferred, in a synchronous way,
> between the PCIe card and memory.
>
> If the video stream is recorded, the same physical memory DMA mapped area
> where the data is written by the video board could be used on another DMA
> transfer via the HD disk controller.
>
> It is even possible to setup the Conexant's DMA engine to do transfers
> directly to the GPU's internal memory, causing a PCI to PCI DMA transfer,
> using V4L2 API overlay mode.

Is this supported today?  I didn't think all of the PCI and DMA API
changes had gone upstream yet for PCIe p2p support.  I think you'd
need to disable the IOMMU to work correctly.

>
> There was a time where it used to be common to have Intel CPUs (or
> Intel-compatible CPUs) using non-Intel North Bridges. On such time,
> we've seen a lot of troubles with PCI to PCI transfers most of them
> when using non-Intel north bridges.

Well p2p was problematic in general in the early days of pcie since it
wasn't part of the spec.  Things are only recently improving in that
regard.  Linux doesn't even have good pcie p2p support yet.

>
> With some north bridges, having the same block of memory mapped
> for two DMA operations (where memory writes come from the video
> card and memory reads from the HD disk controller) was also
> problematic, as the IOMMU had issues on managing two kinds of
> transfer for the same physical memory block.
>
> The report we have on the 95f408bb commit is:
>
>    "media: cx23885: Ryzen DMA related RiSC engine stall fixes
>
>     This bug affects all of Hauppauge QuadHD boards when used on all Ryzen
>     platforms and some XEON platforms. On these platforms it is possible to
>     error out the RiSC engine and cause it to stall, whereafter the only
>     way to reset the board to a working state is to reboot.
> ...
>     [  255.663598] cx23885: cx23885[0]: mpeg risc op code error"
>
> Brad could fill more details here, but I've seen the "risc op code
> error" before with bt878 and cx88 chipsets (with use a similar RISC).
> We usually get such error when there's a problem with the North Bridge
> that was not capable of doing their part at the DMA transfer.
>

What worries me is that the cx DMA engine seems to get into this state
even on non-Ryzen boards and in that case it doesn't seem to recover.
Hence Markus seeing hangs on his board with the workaround enabled
unconditionally.  Shouldn't it not get into that state in the first
place if the bridge is not problematic?  I'm concerned the we are just
papering over the real issue and we may end up causing stability
issues for platforms that are added to the list.

> As far as I know, the Hauppauge QuadHD boards can receive 4 different
> HD MPEG-TS streams (either from cable or air transmissions). On cable,
> one transponder can have up to ~40 Mbits/second. So, this board will
> produce 4 streams of up to 40 Mbps each, happening on different times,
> each filled in a synchronous way. As nobody watches 4 channels at
> the same time, it is safe to assume that at least 3 channels will
> be recorded (if not all 4 channels). So, we're talking about 320 MBps
> of traffic that may be competing with other DMA traffic (including
> some from the Kernel itself, in order to handle memory swap).
>
> That can be recording channels for several weeks.
>
> This usually pushes the North Bridge into their limits, and could
> be revealing some North Bridge/IOMMU issues that it would otherwise
> be not noticed under normal traffic.
>

GPUs push a lot of traffic too and we aren't seeing anything like this.


> > >
> > > There one thing that still bothers me: could this problem be due to
> > > some BIOS setup [1]? If so, are there any ways for dynamically
> > > disabling such features inside the driver?
> > >
> > > [1] like this: https://www.techarp.com/bios-guide/cpu-pci-write-buffer/
> > >
> >
> > possibly?  It's still not clear to me that this is specific to ryzen
> > chips rather than a problem with the DMA setup on the cx board.  Is
> > there a downside to enabling the workaround in general?
>
> The problem here is that the code with resets the DMA engine (required
> for it to work with Ryzen) causes trouble with non-Ryzen North Bridges.
>
> So, one solution that would fit all doesn't seem to exist.
>
> > The original commit mentioned that xeon platforms were affected as well.
>
> Xeon uses different chipsets and a different solution for the North
> Bridge functionality, with may explain why some Xeon CPUs have the
> same issue.
>

Shouldn't we add them to the list as well?

> > Is it possible it's just particular platforms with wonky bioses?
>
> Good point. Yeah, it could be triggered by a wonky bios or a bad setup
> (like enabling overclock or activating some chipset-specific feature
> that would increase the chance for a DMA transfer to fail).
>
> > Maybe DMI matching would be better?
>
> Mapping via DMI could work too, but it would be a way harder to map,
> as one would need to have a cx23885 board (if possible one with 4
> tuners) and a series of different machines in order to test it.
>
> Based with previous experiences with bttv and cx88, I suspect that
> we'll end by needing to map all machines with the same chipset.
>

Does anyone have a ryzen and the cx board and can they verify that
they are actually seeing hangs without the special reset code?  So far
I've only see non-Ryzen users complaining about hangs with the
workaround.

Alex

> >
> > > Brad,
> > >
> > > From your reports about the DMA issues, do you know what generations
> > > of the Ryzen are affected?
> > >
> > > Alex,
> > >
> > > Do you know if are there any differences at the IP block for the
> > > DMA engine used on different Ryzen CPUs? I mean: I suspect that
> > > the engine for Ryzen 2nd generation would likely be different than
> > > the one at the 1st generation, but, along the same generation, does
> > > the Ryzen 3, 5, 7 and Threadripper use the same DMA engine?
> >
> > + Suravee.  I'm not really familiar with the changes, if any, that are
> > in the pcie bridges on various AMD CPUs.  Or if there are changes, it
> > would be hard to say whether this issue would affect them or not.
> >
> > Alex
>
>
>
> Thanks,
> Mauro

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] cx23885: only reset DMA on problematic CPUs
  2018-12-18 22:59               ` [PATCH v2] cx23885: only reset DMA on problematic CPUs Brad Love
@ 2018-12-18 23:49                 ` Alex Deucher
  2018-12-19 17:26                   ` Brad Love
  2018-12-19 11:08                 ` Matthias Schwarzott
  2018-12-19 17:07                 ` [PATCH v3] " Brad Love
  2 siblings, 1 reply; 26+ messages in thread
From: Alex Deucher @ 2018-12-18 23:49 UTC (permalink / raw)
  To: Brad Love; +Cc: linux-media, mchehab, Markus Dobel

On Tue, Dec 18, 2018 at 5:59 PM Brad Love <brad@nextdimension.cc> wrote:
>
> It is reported that commit 95f408bbc4e4 ("media: cx23885: Ryzen DMA
> related RiSC engine stall fixes") caused regresssions with other CPUs.
>
> Ensure that the quirk will be applied only for the CPUs that
> are known to cause problems.
>
> A module option is added for explicit control of the behaviour.
>
> Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")
>
> Signed-off-by: Brad Love <brad@nextdimension.cc>
> ---
> Changes since v1:
> - Added module option for three way control
> - Removed '7' from pci id description, Ryzen 3 is the same id
>
>  drivers/media/pci/cx23885/cx23885-core.c | 54 ++++++++++++++++++++++++++++++--
>  drivers/media/pci/cx23885/cx23885.h      |  2 ++
>  2 files changed, 54 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
> index 39804d8..fb721c7 100644
> --- a/drivers/media/pci/cx23885/cx23885-core.c
> +++ b/drivers/media/pci/cx23885/cx23885-core.c
> @@ -23,6 +23,7 @@
>  #include <linux/moduleparam.h>
>  #include <linux/kmod.h>
>  #include <linux/kernel.h>
> +#include <linux/pci.h>
>  #include <linux/slab.h>
>  #include <linux/interrupt.h>
>  #include <linux/delay.h>
> @@ -41,6 +42,18 @@ MODULE_AUTHOR("Steven Toth <stoth@linuxtv.org>");
>  MODULE_LICENSE("GPL");
>  MODULE_VERSION(CX23885_VERSION);
>
> +/*
> + * Some platforms have been found to require periodic resetting of the DMA
> + * engine. Ryzen and XEON platforms are known to be affected. The symptom
> + * encountered is "mpeg risc op code error". Only Ryzen platforms employ
> + * this workaround if the option equals 1. The workaround can be explicitly
> + * disabled for all platforms by setting to 0, the workaround can be forced
> + * on for any platform by setting to 2.
> + */
> +static unsigned int dma_reset_workaround = 1;
> +module_param(dma_reset_workaround, int, 0644);
> +MODULE_PARM_DESC(dma_reset_workaround, "periodic RiSC dma engine reset; 0-force disable, 1-driver detect (default), 2-force enable");
> +
>  static unsigned int debug;
>  module_param(debug, int, 0644);
>  MODULE_PARM_DESC(debug, "enable debug messages");
> @@ -603,8 +616,13 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
>
>  static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
>  {
> -       uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
> -       uint32_t reg2_val = cx_read(TC_REQ_SET);
> +       uint32_t reg1_val, reg2_val;
> +
> +       if (!dev->need_dma_reset)
> +               return;
> +
> +       reg1_val = cx_read(TC_REQ); /* read-only */
> +       reg2_val = cx_read(TC_REQ_SET);
>
>         if (reg1_val && reg2_val) {
>                 cx_write(TC_REQ, reg1_val);
> @@ -2058,6 +2076,36 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
>         /* TODO: 23-19 */
>  }
>
> +static struct {
> +       int vendor, dev;
> +} const broken_dev_id[] = {
> +       /* According with
> +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
> +        * 0x1451 is PCI ID for the IOMMU found on Ryzen
> +        */
> +       { PCI_VENDOR_ID_AMD, 0x1451 },

Does this issue only happen with the IOMMU is enabled?  Is it only for
p2p transfers?  Until recently the DMA and PCI subsystems didn't
actually support p2p properly when the IOMMU was enabled.  that might
explain some of the issues.  Additionally, if you match based on the
IOMMU id, you won't match if the user disables the IOMMU in the sbios.
Is this only an issue with the IOMMU enabled?

Alex

> +};
> +
> +static bool cx23885_does_need_dma_reset(void)
> +{
> +       int i;
> +       struct pci_dev *pdev = NULL;
> +
> +       if (dma_reset_workaround == 0)
> +               return false;
> +       else if (dma_reset_workaround == 2)
> +               return true;
> +
> +       for (i = 0; i < sizeof(broken_dev_id); i++) {
> +               pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
> +               if (pdev) {
> +                       pci_dev_put(pdev);
> +                       return true;
> +               }
> +       }
> +       return false;
> +}
> +
>  static int cx23885_initdev(struct pci_dev *pci_dev,
>                            const struct pci_device_id *pci_id)
>  {
> @@ -2069,6 +2117,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
>         if (NULL == dev)
>                 return -ENOMEM;
>
> +       dev->need_dma_reset = cx23885_does_need_dma_reset();
> +
>         err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
>         if (err < 0)
>                 goto fail_free;
> diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
> index d54c7ee..cf965ef 100644
> --- a/drivers/media/pci/cx23885/cx23885.h
> +++ b/drivers/media/pci/cx23885/cx23885.h
> @@ -451,6 +451,8 @@ struct cx23885_dev {
>         /* Analog raw audio */
>         struct cx23885_audio_dev   *audio_dev;
>
> +       /* Does the system require periodic DMA resets? */
> +       unsigned int            need_dma_reset:1;
>  };
>
>  static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-18 23:46                 ` Alex Deucher
@ 2018-12-19  0:05                   ` Brad Love
  2018-12-19  0:08                     ` Brad Love
  0 siblings, 1 reply; 26+ messages in thread
From: Brad Love @ 2018-12-19  0:05 UTC (permalink / raw)
  To: Alex Deucher, mchehab
  Cc: Suthikulpanit, Suravee, Markus Dobel, Brad Love, linux-media,
	linux-media-owner


On 18/12/2018 17.46, Alex Deucher wrote:
> On Tue, Dec 18, 2018 at 7:45 AM Mauro Carvalho Chehab
> <mchehab@kernel.org> wrote:
>> Em Mon, 17 Dec 2018 21:05:11 -0500
>> Alex Deucher <alexdeucher@gmail.com> escreveu:
>>
>>> On Sun, Dec 16, 2018 at 9:23 AM Mauro Carvalho Chehab
>>> <mchehab@kernel.org> wrote:
>>>> Em Sun, 16 Dec 2018 11:37:02 +0100
>>>> Markus Dobel <markus.dobel@gmx.de> escreveu:
>>>>
>>>>> On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:
>>>>>> Em Thu, 06 Dec 2018 18:18:23 +0100
>>>>>> Markus Dobel <markus.dobel@gmx.de> escreveu:
>>>>>>
>>>>>>> Hi everyone,
>>>>>>>
>>>>>>> I will try if the hack mentioned fixes the issue for me on the weekend
>>>>>>> (but I assume, as if effectively removes the function).
>>>>>> It should, but it keeps a few changes. Just want to be sure that what
>>>>>> would be left won't cause issues. If this works, the logic that would
>>>>>> solve Ryzen DMA fixes will be contained into a single point, making
>>>>>> easier to maintain it.
>>>>> Hi,
>>>>>
>>>>> I wanted to have this setup running stable for a few days before
>>>>> replying, that's why I am answering only now.
>>>>>
>>>>> But yes, as expected, with Mauro's hack, the driver has been stable for
>>>>> me for about a week, with several
>>>>> scheduled recordings in tvheadend, none of them missed.
>>>>>
>>>>> So, adding a reliable detection for affected chipsets, where the `if
>>>>> (1)` currently is, should work.
>>>> Markus,
>>>>
>>>> Thanks for testing!
>>>>
>>>> Brad/Alex,
>>>>
>>>> I guess we should then stick with this patch:
>>>>         https://patchwork.linuxtv.org/patch/53351/
>>>>
>>>> The past approach that we used on cx88, bttv and other old drivers
>>>> were to patch drivers/pci/quirks.c, making them to "taint" DMA
>>>> memory controllers that were known to bad affect on media devices,
>>>> and then some logic at the drivers to check for such "taint".
>>>>
>>>> However, that would require to touch another subsystem, with
>>>> usually cause delays. Also, as Alex pointed, this could well
>>>> be just a matter of incompatibility between the cx23885 and
>>>> the Ryzen DMA controller, and may not affect any other drivers.
>>>>
>>>> So, let's start with a logic like what I proposed, fine
>>>> tuning it to the Ryzen DMA controllers with we know have
>>>> troubles with the driver.
>>>>
>>>> We need to list the PCI ID of the memory controllers at the
>>>> device ID table on that patch, though. At the RFC patch,
>>>> I just added an IOMMU PCI ID from a randon Ryzen CPU:
>>>>
>>>>         +static struct {
>>>>         +       int vendor, dev;
>>>>         +} const broken_dev_id[] = {
>>>>         +       /* According with
>>>>         +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
>>>>         +        * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
>>>>         +        */
>>>>         +       { PCI_VENDOR_ID_AMD, 0x1451 },
>>>>         +};
>>>>         +
>>>>
>>>> Ideally, the ID for the affected Ryzen DMA engines should be there at
>>>> include/linux/pci_ids.h, instead of hard-coded inside a driver.
>>>>
>>>> Also, we should, instead, add there the PCI IDs of the DMA engines
>>>> that are known to have problems with the cx23885.
>>> These aren't really DMA engines.  Isn't this just the pcie bridge on the CPU?
>> Yeah, it is not the DMA engine itself, but the CPU/chipset support for it.
>>
>> Let me be a little clearer. The Conexant chipsets for PCI/PCIe engines
>> have internally a RISC CPU that it is programmed, in runtime, to do
>> DMA scatter/gather. The actual DMA engine is there. For it to work, the
>> Northbridge (or the CPU chipset - as nowadays several chipsets integrated
>> the Northbridge inside an IP block at the CPU) has to do the counter part,
>> by allowing the board's DMA engine to access the mainboard's main memory,
>> usually via IOMMU, in a safe way[1].
>>
>> [1] preventing memory corruption if two devices try to do DMA to the
>> same area, or if the DMA from the board tries to write at the same
>> time the CPU tries to access it.
>>
>> Media PCI boards usually push the DMA logic to unusual conditions, as
>> a large amount of data is transferred, in a synchronous way,
>> between the PCIe card and memory.
>>
>> If the video stream is recorded, the same physical memory DMA mapped area
>> where the data is written by the video board could be used on another DMA
>> transfer via the HD disk controller.
>>
>> It is even possible to setup the Conexant's DMA engine to do transfers
>> directly to the GPU's internal memory, causing a PCI to PCI DMA transfer,
>> using V4L2 API overlay mode.
> Is this supported today?  I didn't think all of the PCI and DMA API
> changes had gone upstream yet for PCIe p2p support.  I think you'd
> need to disable the IOMMU to work correctly.
>
>> There was a time where it used to be common to have Intel CPUs (or
>> Intel-compatible CPUs) using non-Intel North Bridges. On such time,
>> we've seen a lot of troubles with PCI to PCI transfers most of them
>> when using non-Intel north bridges.
> Well p2p was problematic in general in the early days of pcie since it
> wasn't part of the spec.  Things are only recently improving in that
> regard.  Linux doesn't even have good pcie p2p support yet.
>
>> With some north bridges, having the same block of memory mapped
>> for two DMA operations (where memory writes come from the video
>> card and memory reads from the HD disk controller) was also
>> problematic, as the IOMMU had issues on managing two kinds of
>> transfer for the same physical memory block.
>>
>> The report we have on the 95f408bb commit is:
>>
>>    "media: cx23885: Ryzen DMA related RiSC engine stall fixes
>>
>>     This bug affects all of Hauppauge QuadHD boards when used on all Ryzen
>>     platforms and some XEON platforms. On these platforms it is possible to
>>     error out the RiSC engine and cause it to stall, whereafter the only
>>     way to reset the board to a working state is to reboot.
>> ...
>>     [  255.663598] cx23885: cx23885[0]: mpeg risc op code error"
>>
>> Brad could fill more details here, but I've seen the "risc op code
>> error" before with bt878 and cx88 chipsets (with use a similar RISC).
>> We usually get such error when there's a problem with the North Bridge
>> that was not capable of doing their part at the DMA transfer.
>>
> What worries me is that the cx DMA engine seems to get into this state
> even on non-Ryzen boards and in that case it doesn't seem to recover.
> Hence Markus seeing hangs on his board with the workaround enabled
> unconditionally.  Shouldn't it not get into that state in the first
> place if the bridge is not problematic?  I'm concerned the we are just
> papering over the real issue and we may end up causing stability
> issues for platforms that are added to the list.


This issue was never reported to Hauppauge until the release of Ryzen
platform. The company support received lots of tickets and my github
page also got issues reported. The issue was only seen on the products
with 4 tuner/decoders. Those issues were tested and investigated on
github, and I had to source a Ryzen platform to do testing on my own. I
definitely saw this issue before the "fix". With this fix in place, the
stability that was seen on previous platforms was then seen on Ryzen
platforms. It is just after some time, now we see that the method used
on ryzen has adverse effects on other platforms.

The possibility of adversely affecting various platforms is why I added
the three-way option. What I found in all of the reports I received was
some motherboard manufacturers released BIOS updates with workarounds.
Those workarounds fixed this and other issues. The MSI B350M motherboard
I have had not received any BIOS workaround fix as of a few months ago.
In the case a BIOS update renders this fix unnecessary and causes
adverse affects I want the ability to turn it off.

If you'd like to read the history there is discussion on the open and
closed issues here:

https://github.com/b-rad-NDi/Ubuntu-media-tree-kernel-builder/issues



>
>> As far as I know, the Hauppauge QuadHD boards can receive 4 different
>> HD MPEG-TS streams (either from cable or air transmissions). On cable,
>> one transponder can have up to ~40 Mbits/second. So, this board will
>> produce 4 streams of up to 40 Mbps each, happening on different times,
>> each filled in a synchronous way. As nobody watches 4 channels at
>> the same time, it is safe to assume that at least 3 channels will
>> be recorded (if not all 4 channels). So, we're talking about 320 MBps
>> of traffic that may be competing with other DMA traffic (including
>> some from the Kernel itself, in order to handle memory swap).
>>
>> That can be recording channels for several weeks.
>>
>> This usually pushes the North Bridge into their limits, and could
>> be revealing some North Bridge/IOMMU issues that it would otherwise
>> be not noticed under normal traffic.
>>
> GPUs push a lot of traffic too and we aren't seeing anything like this.
>
>
>>>> There one thing that still bothers me: could this problem be due to
>>>> some BIOS setup [1]? If so, are there any ways for dynamically
>>>> disabling such features inside the driver?
>>>>
>>>> [1] like this: https://www.techarp.com/bios-guide/cpu-pci-write-buffer/
>>>>
>>> possibly?  It's still not clear to me that this is specific to ryzen
>>> chips rather than a problem with the DMA setup on the cx board.  Is
>>> there a downside to enabling the workaround in general?
>> The problem here is that the code with resets the DMA engine (required
>> for it to work with Ryzen) causes trouble with non-Ryzen North Bridges.
>>
>> So, one solution that would fit all doesn't seem to exist.
>>
>>> The original commit mentioned that xeon platforms were affected as well.
>> Xeon uses different chipsets and a different solution for the North
>> Bridge functionality, with may explain why some Xeon CPUs have the
>> same issue.
>>
> Shouldn't we add them to the list as well?
>
>>> Is it possible it's just particular platforms with wonky bioses?
>> Good point. Yeah, it could be triggered by a wonky bios or a bad setup
>> (like enabling overclock or activating some chipset-specific feature
>> that would increase the chance for a DMA transfer to fail).
>>
>>> Maybe DMI matching would be better?
>> Mapping via DMI could work too, but it would be a way harder to map,
>> as one would need to have a cx23885 board (if possible one with 4
>> tuners) and a series of different machines in order to test it.
>>
>> Based with previous experiences with bttv and cx88, I suspect that
>> we'll end by needing to map all machines with the same chipset.
>>
> Does anyone have a ryzen and the cx board and can they verify that
> they are actually seeing hangs without the special reset code?  So far
> I've only see non-Ryzen users complaining about hangs with the
> workaround.


See above. I'm the Hauppauge engineer that worked on this.





> Alex
>
>>>> Brad,
>>>>
>>>> From your reports about the DMA issues, do you know what generations
>>>> of the Ryzen are affected?
>>>>
>>>> Alex,
>>>>
>>>> Do you know if are there any differences at the IP block for the
>>>> DMA engine used on different Ryzen CPUs? I mean: I suspect that
>>>> the engine for Ryzen 2nd generation would likely be different than
>>>> the one at the 1st generation, but, along the same generation, does
>>>> the Ryzen 3, 5, 7 and Threadripper use the same DMA engine?
>>> + Suravee.  I'm not really familiar with the changes, if any, that are
>>> in the pcie bridges on various AMD CPUs.  Or if there are changes, it
>>> would be hard to say whether this issue would affect them or not.
>>>
>>> Alex
>>
>>
>> Thanks,
>> Mauro


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-19  0:05                   ` Brad Love
@ 2018-12-19  0:08                     ` Brad Love
  2018-12-19 19:07                       ` Alex Deucher
  0 siblings, 1 reply; 26+ messages in thread
From: Brad Love @ 2018-12-19  0:08 UTC (permalink / raw)
  To: Brad Love, Alex Deucher, mchehab
  Cc: Suthikulpanit, Suravee, Markus Dobel, linux-media, linux-media-owner


On 18/12/2018 18.05, Brad Love wrote:
> On 18/12/2018 17.46, Alex Deucher wrote:
>> On Tue, Dec 18, 2018 at 7:45 AM Mauro Carvalho Chehab
>> <mchehab@kernel.org> wrote:
>>> Em Mon, 17 Dec 2018 21:05:11 -0500
>>> Alex Deucher <alexdeucher@gmail.com> escreveu:
>>>
>>>> On Sun, Dec 16, 2018 at 9:23 AM Mauro Carvalho Chehab
>>>> <mchehab@kernel.org> wrote:
>>>>> Em Sun, 16 Dec 2018 11:37:02 +0100
>>>>> Markus Dobel <markus.dobel@gmx.de> escreveu:
>>>>>
>>>>>> On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:
>>>>>>> Em Thu, 06 Dec 2018 18:18:23 +0100
>>>>>>> Markus Dobel <markus.dobel@gmx.de> escreveu:
>>>>>>>
>>>>>>>> Hi everyone,
>>>>>>>>
>>>>>>>> I will try if the hack mentioned fixes the issue for me on the weekend
>>>>>>>> (but I assume, as if effectively removes the function).
>>>>>>> It should, but it keeps a few changes. Just want to be sure that what
>>>>>>> would be left won't cause issues. If this works, the logic that would
>>>>>>> solve Ryzen DMA fixes will be contained into a single point, making
>>>>>>> easier to maintain it.
>>>>>> Hi,
>>>>>>
>>>>>> I wanted to have this setup running stable for a few days before
>>>>>> replying, that's why I am answering only now.
>>>>>>
>>>>>> But yes, as expected, with Mauro's hack, the driver has been stable for
>>>>>> me for about a week, with several
>>>>>> scheduled recordings in tvheadend, none of them missed.
>>>>>>
>>>>>> So, adding a reliable detection for affected chipsets, where the `if
>>>>>> (1)` currently is, should work.
>>>>> Markus,
>>>>>
>>>>> Thanks for testing!
>>>>>
>>>>> Brad/Alex,
>>>>>
>>>>> I guess we should then stick with this patch:
>>>>>         https://patchwork.linuxtv.org/patch/53351/
>>>>>
>>>>> The past approach that we used on cx88, bttv and other old drivers
>>>>> were to patch drivers/pci/quirks.c, making them to "taint" DMA
>>>>> memory controllers that were known to bad affect on media devices,
>>>>> and then some logic at the drivers to check for such "taint".
>>>>>
>>>>> However, that would require to touch another subsystem, with
>>>>> usually cause delays. Also, as Alex pointed, this could well
>>>>> be just a matter of incompatibility between the cx23885 and
>>>>> the Ryzen DMA controller, and may not affect any other drivers.
>>>>>
>>>>> So, let's start with a logic like what I proposed, fine
>>>>> tuning it to the Ryzen DMA controllers with we know have
>>>>> troubles with the driver.
>>>>>
>>>>> We need to list the PCI ID of the memory controllers at the
>>>>> device ID table on that patch, though. At the RFC patch,
>>>>> I just added an IOMMU PCI ID from a randon Ryzen CPU:
>>>>>
>>>>>         +static struct {
>>>>>         +       int vendor, dev;
>>>>>         +} const broken_dev_id[] = {
>>>>>         +       /* According with
>>>>>         +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
>>>>>         +        * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
>>>>>         +        */
>>>>>         +       { PCI_VENDOR_ID_AMD, 0x1451 },
>>>>>         +};
>>>>>         +
>>>>>
>>>>> Ideally, the ID for the affected Ryzen DMA engines should be there at
>>>>> include/linux/pci_ids.h, instead of hard-coded inside a driver.
>>>>>
>>>>> Also, we should, instead, add there the PCI IDs of the DMA engines
>>>>> that are known to have problems with the cx23885.
>>>> These aren't really DMA engines.  Isn't this just the pcie bridge on the CPU?
>>> Yeah, it is not the DMA engine itself, but the CPU/chipset support for it.
>>>
>>> Let me be a little clearer. The Conexant chipsets for PCI/PCIe engines
>>> have internally a RISC CPU that it is programmed, in runtime, to do
>>> DMA scatter/gather. The actual DMA engine is there. For it to work, the
>>> Northbridge (or the CPU chipset - as nowadays several chipsets integrated
>>> the Northbridge inside an IP block at the CPU) has to do the counter part,
>>> by allowing the board's DMA engine to access the mainboard's main memory,
>>> usually via IOMMU, in a safe way[1].
>>>
>>> [1] preventing memory corruption if two devices try to do DMA to the
>>> same area, or if the DMA from the board tries to write at the same
>>> time the CPU tries to access it.
>>>
>>> Media PCI boards usually push the DMA logic to unusual conditions, as
>>> a large amount of data is transferred, in a synchronous way,
>>> between the PCIe card and memory.
>>>
>>> If the video stream is recorded, the same physical memory DMA mapped area
>>> where the data is written by the video board could be used on another DMA
>>> transfer via the HD disk controller.
>>>
>>> It is even possible to setup the Conexant's DMA engine to do transfers
>>> directly to the GPU's internal memory, causing a PCI to PCI DMA transfer,
>>> using V4L2 API overlay mode.
>> Is this supported today?  I didn't think all of the PCI and DMA API
>> changes had gone upstream yet for PCIe p2p support.  I think you'd
>> need to disable the IOMMU to work correctly.
>>
>>> There was a time where it used to be common to have Intel CPUs (or
>>> Intel-compatible CPUs) using non-Intel North Bridges. On such time,
>>> we've seen a lot of troubles with PCI to PCI transfers most of them
>>> when using non-Intel north bridges.
>> Well p2p was problematic in general in the early days of pcie since it
>> wasn't part of the spec.  Things are only recently improving in that
>> regard.  Linux doesn't even have good pcie p2p support yet.
>>
>>> With some north bridges, having the same block of memory mapped
>>> for two DMA operations (where memory writes come from the video
>>> card and memory reads from the HD disk controller) was also
>>> problematic, as the IOMMU had issues on managing two kinds of
>>> transfer for the same physical memory block.
>>>
>>> The report we have on the 95f408bb commit is:
>>>
>>>    "media: cx23885: Ryzen DMA related RiSC engine stall fixes
>>>
>>>     This bug affects all of Hauppauge QuadHD boards when used on all Ryzen
>>>     platforms and some XEON platforms. On these platforms it is possible to
>>>     error out the RiSC engine and cause it to stall, whereafter the only
>>>     way to reset the board to a working state is to reboot.
>>> ...
>>>     [  255.663598] cx23885: cx23885[0]: mpeg risc op code error"
>>>
>>> Brad could fill more details here, but I've seen the "risc op code
>>> error" before with bt878 and cx88 chipsets (with use a similar RISC).
>>> We usually get such error when there's a problem with the North Bridge
>>> that was not capable of doing their part at the DMA transfer.
>>>
>> What worries me is that the cx DMA engine seems to get into this state
>> even on non-Ryzen boards and in that case it doesn't seem to recover.
>> Hence Markus seeing hangs on his board with the workaround enabled
>> unconditionally.  Shouldn't it not get into that state in the first
>> place if the bridge is not problematic?  I'm concerned the we are just
>> papering over the real issue and we may end up causing stability
>> issues for platforms that are added to the list.
>
> This issue was never reported to Hauppauge until the release of Ryzen
> platform. The company support received lots of tickets and my github
> page also got issues reported. The issue was only seen on the products
> with 4 tuner/decoders. Those issues were tested and investigated on
> github, and I had to source a Ryzen platform to do testing on my own. I
> definitely saw this issue before the "fix". With this fix in place, the
> stability that was seen on previous platforms was then seen on Ryzen
> platforms. It is just after some time, now we see that the method used
> on ryzen has adverse effects on other platforms.
>
> The possibility of adversely affecting various platforms is why I added
> the three-way option. What I found in all of the reports I received was
> some motherboard manufacturers released BIOS updates with workarounds.
> Those workarounds fixed this and other issues. The MSI B350M motherboard
> I have had not received any BIOS workaround fix as of a few months ago.
> In the case a BIOS update renders this fix unnecessary and causes
> adverse affects I want the ability to turn it off.

I'd also just like to add, that those few platforms that did receive
bios updates that addressed this during the period this was under test,
were not adversely affected by the eventual fix.




> If you'd like to read the history there is discussion on the open and
> closed issues here:
>
> https://github.com/b-rad-NDi/Ubuntu-media-tree-kernel-builder/issues
>
>
>
>>> As far as I know, the Hauppauge QuadHD boards can receive 4 different
>>> HD MPEG-TS streams (either from cable or air transmissions). On cable,
>>> one transponder can have up to ~40 Mbits/second. So, this board will
>>> produce 4 streams of up to 40 Mbps each, happening on different times,
>>> each filled in a synchronous way. As nobody watches 4 channels at
>>> the same time, it is safe to assume that at least 3 channels will
>>> be recorded (if not all 4 channels). So, we're talking about 320 MBps
>>> of traffic that may be competing with other DMA traffic (including
>>> some from the Kernel itself, in order to handle memory swap).
>>>
>>> That can be recording channels for several weeks.
>>>
>>> This usually pushes the North Bridge into their limits, and could
>>> be revealing some North Bridge/IOMMU issues that it would otherwise
>>> be not noticed under normal traffic.
>>>
>> GPUs push a lot of traffic too and we aren't seeing anything like this.
>>
>>
>>>>> There one thing that still bothers me: could this problem be due to
>>>>> some BIOS setup [1]? If so, are there any ways for dynamically
>>>>> disabling such features inside the driver?
>>>>>
>>>>> [1] like this: https://www.techarp.com/bios-guide/cpu-pci-write-buffer/
>>>>>
>>>> possibly?  It's still not clear to me that this is specific to ryzen
>>>> chips rather than a problem with the DMA setup on the cx board.  Is
>>>> there a downside to enabling the workaround in general?
>>> The problem here is that the code with resets the DMA engine (required
>>> for it to work with Ryzen) causes trouble with non-Ryzen North Bridges.
>>>
>>> So, one solution that would fit all doesn't seem to exist.
>>>
>>>> The original commit mentioned that xeon platforms were affected as well.
>>> Xeon uses different chipsets and a different solution for the North
>>> Bridge functionality, with may explain why some Xeon CPUs have the
>>> same issue.
>>>
>> Shouldn't we add them to the list as well?
>>
>>>> Is it possible it's just particular platforms with wonky bioses?
>>> Good point. Yeah, it could be triggered by a wonky bios or a bad setup
>>> (like enabling overclock or activating some chipset-specific feature
>>> that would increase the chance for a DMA transfer to fail).
>>>
>>>> Maybe DMI matching would be better?
>>> Mapping via DMI could work too, but it would be a way harder to map,
>>> as one would need to have a cx23885 board (if possible one with 4
>>> tuners) and a series of different machines in order to test it.
>>>
>>> Based with previous experiences with bttv and cx88, I suspect that
>>> we'll end by needing to map all machines with the same chipset.
>>>
>> Does anyone have a ryzen and the cx board and can they verify that
>> they are actually seeing hangs without the special reset code?  So far
>> I've only see non-Ryzen users complaining about hangs with the
>> workaround.
>
> See above. I'm the Hauppauge engineer that worked on this.
>
>
>
>
>
>> Alex
>>
>>>>> Brad,
>>>>>
>>>>> From your reports about the DMA issues, do you know what generations
>>>>> of the Ryzen are affected?
>>>>>
>>>>> Alex,
>>>>>
>>>>> Do you know if are there any differences at the IP block for the
>>>>> DMA engine used on different Ryzen CPUs? I mean: I suspect that
>>>>> the engine for Ryzen 2nd generation would likely be different than
>>>>> the one at the 1st generation, but, along the same generation, does
>>>>> the Ryzen 3, 5, 7 and Threadripper use the same DMA engine?
>>>> + Suravee.  I'm not really familiar with the changes, if any, that are
>>>> in the pcie bridges on various AMD CPUs.  Or if there are changes, it
>>>> would be hard to say whether this issue would affect them or not.
>>>>
>>>> Alex
>>>
>>> Thanks,
>>> Mauro

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] cx23885: only reset DMA on problematic CPUs
  2018-12-18 22:59               ` [PATCH v2] cx23885: only reset DMA on problematic CPUs Brad Love
  2018-12-18 23:49                 ` Alex Deucher
@ 2018-12-19 11:08                 ` Matthias Schwarzott
  2018-12-19 17:09                   ` Brad Love
  2018-12-19 17:07                 ` [PATCH v3] " Brad Love
  2 siblings, 1 reply; 26+ messages in thread
From: Matthias Schwarzott @ 2018-12-19 11:08 UTC (permalink / raw)
  To: Brad Love, linux-media, mchehab, markus.dobel, alexdeucher

Am 18.12.18 um 23:59 schrieb Brad Love:
> It is reported that commit 95f408bbc4e4 ("media: cx23885: Ryzen DMA
> related RiSC engine stall fixes") caused regresssions with other CPUs.
> 
> Ensure that the quirk will be applied only for the CPUs that
> are known to cause problems.
> 
> A module option is added for explicit control of the behaviour.
> 
> Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")
> 
> Signed-off-by: Brad Love <brad@nextdimension.cc>

Hi Brad,
I found one issue. See below.

Regards
Matthias

> ---
> Changes since v1:
> - Added module option for three way control
> - Removed '7' from pci id description, Ryzen 3 is the same id
> 
>  drivers/media/pci/cx23885/cx23885-core.c | 54 ++++++++++++++++++++++++++++++--
>  drivers/media/pci/cx23885/cx23885.h      |  2 ++
>  2 files changed, 54 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
> index 39804d8..fb721c7 100644
> --- a/drivers/media/pci/cx23885/cx23885-core.c
> +++ b/drivers/media/pci/cx23885/cx23885-core.c
...
> @@ -2058,6 +2076,36 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
>  	/* TODO: 23-19 */
>  }
>  
> +static struct {
> +	int vendor, dev;
> +} const broken_dev_id[] = {
> +	/* According with
> +	 * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
> +	 * 0x1451 is PCI ID for the IOMMU found on Ryzen
> +	 */
> +	{ PCI_VENDOR_ID_AMD, 0x1451 },
> +};
> +
> +static bool cx23885_does_need_dma_reset(void)
> +{
> +	int i;
> +	struct pci_dev *pdev = NULL;
> +
> +	if (dma_reset_workaround == 0)
> +		return false;
> +	else if (dma_reset_workaround == 2)
> +		return true;
> +
> +	for (i = 0; i < sizeof(broken_dev_id); i++) {
This is broken. sizeof delivers the size in bytes, not in number of
array elements. ARRAY_SIZE is what you want.

> +		pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
> +		if (pdev) {
> +			pci_dev_put(pdev);
> +			return true;
> +		}
> +	}
> +	return false;
> +}
> +
>  static int cx23885_initdev(struct pci_dev *pci_dev,
>  			   const struct pci_device_id *pci_id)
>  {
> @@ -2069,6 +2117,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
>  	if (NULL == dev)
>  		return -ENOMEM;
>  
> +	dev->need_dma_reset = cx23885_does_need_dma_reset();
> +
>  	err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
>  	if (err < 0)
>  		goto fail_free;
> diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
> index d54c7ee..cf965ef 100644
> --- a/drivers/media/pci/cx23885/cx23885.h
> +++ b/drivers/media/pci/cx23885/cx23885.h
> @@ -451,6 +451,8 @@ struct cx23885_dev {
>  	/* Analog raw audio */
>  	struct cx23885_audio_dev   *audio_dev;
>  
> +	/* Does the system require periodic DMA resets? */
> +	unsigned int		need_dma_reset:1;
>  };
>  
>  static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3] cx23885: only reset DMA on problematic CPUs
  2018-12-18 22:59               ` [PATCH v2] cx23885: only reset DMA on problematic CPUs Brad Love
  2018-12-18 23:49                 ` Alex Deucher
  2018-12-19 11:08                 ` Matthias Schwarzott
@ 2018-12-19 17:07                 ` Brad Love
  2018-12-20 13:43                   ` Mauro Carvalho Chehab
  2 siblings, 1 reply; 26+ messages in thread
From: Brad Love @ 2018-12-19 17:07 UTC (permalink / raw)
  To: linux-media, mchehab, markus.dobel, alexdeucher, zzam; +Cc: Brad Love

It is reported that commit 95f408bbc4e4 ("media: cx23885: Ryzen DMA
related RiSC engine stall fixes") caused regresssions with other CPUs.

Ensure that the quirk will be applied only for the CPUs that
are known to cause problems.

A module option is added for explicit control of the behaviour.

Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")

Signed-off-by: Brad Love <brad@nextdimension.cc>
---
Since v2:
- Replaced sizeof with ARRAY_SIZE
- Fixed column 80 checkpatch complaint
Changes since v1:
- Added module option for three way control
- Removed '7' from pci id description, Ryzen 3 is the same id

 drivers/media/pci/cx23885/cx23885-core.c | 55 ++++++++++++++++++++++++++++++--
 drivers/media/pci/cx23885/cx23885.h      |  2 ++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
index 39804d8..e2e3649 100644
--- a/drivers/media/pci/cx23885/cx23885-core.c
+++ b/drivers/media/pci/cx23885/cx23885-core.c
@@ -23,6 +23,7 @@
 #include <linux/moduleparam.h>
 #include <linux/kmod.h>
 #include <linux/kernel.h>
+#include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
@@ -41,6 +42,18 @@ MODULE_AUTHOR("Steven Toth <stoth@linuxtv.org>");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(CX23885_VERSION);
 
+/*
+ * Some platforms have been found to require periodic resetting of the DMA
+ * engine. Ryzen and XEON platforms are known to be affected. The symptom
+ * encountered is "mpeg risc op code error". Only Ryzen platforms employ
+ * this workaround if the option equals 1. The workaround can be explicitly
+ * disabled for all platforms by setting to 0, the workaround can be forced
+ * on for any platform by setting to 2.
+ */
+static unsigned int dma_reset_workaround = 1;
+module_param(dma_reset_workaround, int, 0644);
+MODULE_PARM_DESC(dma_reset_workaround, "periodic RiSC dma engine reset; 0-force disable, 1-driver detect (default), 2-force enable");
+
 static unsigned int debug;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "enable debug messages");
@@ -603,8 +616,13 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
 
 static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
 {
-	uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
-	uint32_t reg2_val = cx_read(TC_REQ_SET);
+	uint32_t reg1_val, reg2_val;
+
+	if (!dev->need_dma_reset)
+		return;
+
+	reg1_val = cx_read(TC_REQ); /* read-only */
+	reg2_val = cx_read(TC_REQ_SET);
 
 	if (reg1_val && reg2_val) {
 		cx_write(TC_REQ, reg1_val);
@@ -2058,6 +2076,37 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
 	/* TODO: 23-19 */
 }
 
+static struct {
+	int vendor, dev;
+} const broken_dev_id[] = {
+	/* According with
+	 * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
+	 * 0x1451 is PCI ID for the IOMMU found on Ryzen
+	 */
+	{ PCI_VENDOR_ID_AMD, 0x1451 },
+};
+
+static bool cx23885_does_need_dma_reset(void)
+{
+	int i;
+	struct pci_dev *pdev = NULL;
+
+	if (dma_reset_workaround == 0)
+		return false;
+	else if (dma_reset_workaround == 2)
+		return true;
+
+	for (i = 0; i < ARRAY_SIZE(broken_dev_id); i++) {
+		pdev = pci_get_device(broken_dev_id[i].vendor,
+					broken_dev_id[i].dev, NULL);
+		if (pdev) {
+			pci_dev_put(pdev);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int cx23885_initdev(struct pci_dev *pci_dev,
 			   const struct pci_device_id *pci_id)
 {
@@ -2069,6 +2118,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
 	if (NULL == dev)
 		return -ENOMEM;
 
+	dev->need_dma_reset = cx23885_does_need_dma_reset();
+
 	err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
 	if (err < 0)
 		goto fail_free;
diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
index d54c7ee..cf965ef 100644
--- a/drivers/media/pci/cx23885/cx23885.h
+++ b/drivers/media/pci/cx23885/cx23885.h
@@ -451,6 +451,8 @@ struct cx23885_dev {
 	/* Analog raw audio */
 	struct cx23885_audio_dev   *audio_dev;
 
+	/* Does the system require periodic DMA resets? */
+	unsigned int		need_dma_reset:1;
 };
 
 static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] cx23885: only reset DMA on problematic CPUs
  2018-12-19 11:08                 ` Matthias Schwarzott
@ 2018-12-19 17:09                   ` Brad Love
  0 siblings, 0 replies; 26+ messages in thread
From: Brad Love @ 2018-12-19 17:09 UTC (permalink / raw)
  To: Matthias Schwarzott, Brad Love, linux-media, mchehab,
	markus.dobel, alexdeucher


On 19/12/2018 05.08, Matthias Schwarzott wrote:
> Am 18.12.18 um 23:59 schrieb Brad Love:
>> It is reported that commit 95f408bbc4e4 ("media: cx23885: Ryzen DMA
>> related RiSC engine stall fixes") caused regresssions with other CPUs.
>>
>> Ensure that the quirk will be applied only for the CPUs that
>> are known to cause problems.
>>
>> A module option is added for explicit control of the behaviour.
>>
>> Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")
>>
>> Signed-off-by: Brad Love <brad@nextdimension.cc>
> Hi Brad,
> I found one issue. See below.
>
> Regards
> Matthias


Thanks for the catch Matthias, v3 submitted.

Cheers,

Brad



>
>> ---
>> Changes since v1:
>> - Added module option for three way control
>> - Removed '7' from pci id description, Ryzen 3 is the same id
>>
>>  drivers/media/pci/cx23885/cx23885-core.c | 54 ++++++++++++++++++++++++++++++--
>>  drivers/media/pci/cx23885/cx23885.h      |  2 ++
>>  2 files changed, 54 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
>> index 39804d8..fb721c7 100644
>> --- a/drivers/media/pci/cx23885/cx23885-core.c
>> +++ b/drivers/media/pci/cx23885/cx23885-core.c
> ...
>> @@ -2058,6 +2076,36 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
>>  	/* TODO: 23-19 */
>>  }
>>  
>> +static struct {
>> +	int vendor, dev;
>> +} const broken_dev_id[] = {
>> +	/* According with
>> +	 * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
>> +	 * 0x1451 is PCI ID for the IOMMU found on Ryzen
>> +	 */
>> +	{ PCI_VENDOR_ID_AMD, 0x1451 },
>> +};
>> +
>> +static bool cx23885_does_need_dma_reset(void)
>> +{
>> +	int i;
>> +	struct pci_dev *pdev = NULL;
>> +
>> +	if (dma_reset_workaround == 0)
>> +		return false;
>> +	else if (dma_reset_workaround == 2)
>> +		return true;
>> +
>> +	for (i = 0; i < sizeof(broken_dev_id); i++) {
> This is broken. sizeof delivers the size in bytes, not in number of
> array elements. ARRAY_SIZE is what you want.
>
>> +		pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
>> +		if (pdev) {
>> +			pci_dev_put(pdev);
>> +			return true;
>> +		}
>> +	}
>> +	return false;
>> +}
>> +
>>  static int cx23885_initdev(struct pci_dev *pci_dev,
>>  			   const struct pci_device_id *pci_id)
>>  {
>> @@ -2069,6 +2117,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
>>  	if (NULL == dev)
>>  		return -ENOMEM;
>>  
>> +	dev->need_dma_reset = cx23885_does_need_dma_reset();
>> +
>>  	err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
>>  	if (err < 0)
>>  		goto fail_free;
>> diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
>> index d54c7ee..cf965ef 100644
>> --- a/drivers/media/pci/cx23885/cx23885.h
>> +++ b/drivers/media/pci/cx23885/cx23885.h
>> @@ -451,6 +451,8 @@ struct cx23885_dev {
>>  	/* Analog raw audio */
>>  	struct cx23885_audio_dev   *audio_dev;
>>  
>> +	/* Does the system require periodic DMA resets? */
>> +	unsigned int		need_dma_reset:1;
>>  };
>>  
>>  static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)
>>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] cx23885: only reset DMA on problematic CPUs
  2018-12-18 23:49                 ` Alex Deucher
@ 2018-12-19 17:26                   ` Brad Love
  2018-12-19 17:40                     ` Brad Love
  0 siblings, 1 reply; 26+ messages in thread
From: Brad Love @ 2018-12-19 17:26 UTC (permalink / raw)
  To: Alex Deucher, Brad Love; +Cc: linux-media, mchehab, Markus Dobel

Hi Alex,


On 18/12/2018 17.49, Alex Deucher wrote:
> On Tue, Dec 18, 2018 at 5:59 PM Brad Love <brad@nextdimension.cc> wrote:
>> It is reported that commit 95f408bbc4e4 ("media: cx23885: Ryzen DMA
>> related RiSC engine stall fixes") caused regresssions with other CPUs.
>>
>> Ensure that the quirk will be applied only for the CPUs that
>> are known to cause problems.
>>
>> A module option is added for explicit control of the behaviour.
>>
>> Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")
>>
>> Signed-off-by: Brad Love <brad@nextdimension.cc>
>> ---
>> Changes since v1:
>> - Added module option for three way control
>> - Removed '7' from pci id description, Ryzen 3 is the same id
>>
>>  drivers/media/pci/cx23885/cx23885-core.c | 54 ++++++++++++++++++++++++++++++--
>>  drivers/media/pci/cx23885/cx23885.h      |  2 ++
>>  2 files changed, 54 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
>> index 39804d8..fb721c7 100644
>> --- a/drivers/media/pci/cx23885/cx23885-core.c
>> +++ b/drivers/media/pci/cx23885/cx23885-core.c
>> @@ -23,6 +23,7 @@
>>  #include <linux/moduleparam.h>
>>  #include <linux/kmod.h>
>>  #include <linux/kernel.h>
>> +#include <linux/pci.h>
>>  #include <linux/slab.h>
>>  #include <linux/interrupt.h>
>>  #include <linux/delay.h>
>> @@ -41,6 +42,18 @@ MODULE_AUTHOR("Steven Toth <stoth@linuxtv.org>");
>>  MODULE_LICENSE("GPL");
>>  MODULE_VERSION(CX23885_VERSION);
>>
>> +/*
>> + * Some platforms have been found to require periodic resetting of the DMA
>> + * engine. Ryzen and XEON platforms are known to be affected. The symptom
>> + * encountered is "mpeg risc op code error". Only Ryzen platforms employ
>> + * this workaround if the option equals 1. The workaround can be explicitly
>> + * disabled for all platforms by setting to 0, the workaround can be forced
>> + * on for any platform by setting to 2.
>> + */
>> +static unsigned int dma_reset_workaround = 1;
>> +module_param(dma_reset_workaround, int, 0644);
>> +MODULE_PARM_DESC(dma_reset_workaround, "periodic RiSC dma engine reset; 0-force disable, 1-driver detect (default), 2-force enable");
>> +
>>  static unsigned int debug;
>>  module_param(debug, int, 0644);
>>  MODULE_PARM_DESC(debug, "enable debug messages");
>> @@ -603,8 +616,13 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
>>
>>  static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
>>  {
>> -       uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
>> -       uint32_t reg2_val = cx_read(TC_REQ_SET);
>> +       uint32_t reg1_val, reg2_val;
>> +
>> +       if (!dev->need_dma_reset)
>> +               return;
>> +
>> +       reg1_val = cx_read(TC_REQ); /* read-only */
>> +       reg2_val = cx_read(TC_REQ_SET);
>>
>>         if (reg1_val && reg2_val) {
>>                 cx_write(TC_REQ, reg1_val);
>> @@ -2058,6 +2076,36 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
>>         /* TODO: 23-19 */
>>  }
>>
>> +static struct {
>> +       int vendor, dev;
>> +} const broken_dev_id[] = {
>> +       /* According with
>> +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
>> +        * 0x1451 is PCI ID for the IOMMU found on Ryzen
>> +        */
>> +       { PCI_VENDOR_ID_AMD, 0x1451 },
> Does this issue only happen with the IOMMU is enabled?  Is it only for
> p2p transfers?  Until recently the DMA and PCI subsystems didn't
> actually support p2p properly when the IOMMU was enabled.  that might
> explain some of the issues.  Additionally, if you match based on the
> IOMMU id, you won't match if the user disables the IOMMU in the sbios.
> Is this only an issue with the IOMMU enabled?
>
> Alex


I'm unsure of the answers to your questions. I do still have my Ryzen3
system around, I'll see if I can disable IOMMU and do some tests.

Regards,

Brad



>> +};
>> +
>> +static bool cx23885_does_need_dma_reset(void)
>> +{
>> +       int i;
>> +       struct pci_dev *pdev = NULL;
>> +
>> +       if (dma_reset_workaround == 0)
>> +               return false;
>> +       else if (dma_reset_workaround == 2)
>> +               return true;
>> +
>> +       for (i = 0; i < sizeof(broken_dev_id); i++) {
>> +               pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
>> +               if (pdev) {
>> +                       pci_dev_put(pdev);
>> +                       return true;
>> +               }
>> +       }
>> +       return false;
>> +}
>> +
>>  static int cx23885_initdev(struct pci_dev *pci_dev,
>>                            const struct pci_device_id *pci_id)
>>  {
>> @@ -2069,6 +2117,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
>>         if (NULL == dev)
>>                 return -ENOMEM;
>>
>> +       dev->need_dma_reset = cx23885_does_need_dma_reset();
>> +
>>         err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
>>         if (err < 0)
>>                 goto fail_free;
>> diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
>> index d54c7ee..cf965ef 100644
>> --- a/drivers/media/pci/cx23885/cx23885.h
>> +++ b/drivers/media/pci/cx23885/cx23885.h
>> @@ -451,6 +451,8 @@ struct cx23885_dev {
>>         /* Analog raw audio */
>>         struct cx23885_audio_dev   *audio_dev;
>>
>> +       /* Does the system require periodic DMA resets? */
>> +       unsigned int            need_dma_reset:1;
>>  };
>>
>>  static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)
>> --
>> 2.7.4
>>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] cx23885: only reset DMA on problematic CPUs
  2018-12-19 17:26                   ` Brad Love
@ 2018-12-19 17:40                     ` Brad Love
  0 siblings, 0 replies; 26+ messages in thread
From: Brad Love @ 2018-12-19 17:40 UTC (permalink / raw)
  To: Brad Love, Alex Deucher; +Cc: linux-media, mchehab, Markus Dobel

Hi Alex,


On 19/12/2018 11.26, Brad Love wrote:
> Hi Alex,
>
>
> On 18/12/2018 17.49, Alex Deucher wrote:
>> On Tue, Dec 18, 2018 at 5:59 PM Brad Love <brad@nextdimension.cc> wrote:
>>> It is reported that commit 95f408bbc4e4 ("media: cx23885: Ryzen DMA
>>> related RiSC engine stall fixes") caused regresssions with other CPUs.
>>>
>>> Ensure that the quirk will be applied only for the CPUs that
>>> are known to cause problems.
>>>
>>> A module option is added for explicit control of the behaviour.
>>>
>>> Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")
>>>
>>> Signed-off-by: Brad Love <brad@nextdimension.cc>
>>> ---
>>> Changes since v1:
>>> - Added module option for three way control
>>> - Removed '7' from pci id description, Ryzen 3 is the same id
>>>
>>>  drivers/media/pci/cx23885/cx23885-core.c | 54 ++++++++++++++++++++++++++++++--
>>>  drivers/media/pci/cx23885/cx23885.h      |  2 ++
>>>  2 files changed, 54 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
>>> index 39804d8..fb721c7 100644
>>> --- a/drivers/media/pci/cx23885/cx23885-core.c
>>> +++ b/drivers/media/pci/cx23885/cx23885-core.c
>>> @@ -23,6 +23,7 @@
>>>  #include <linux/moduleparam.h>
>>>  #include <linux/kmod.h>
>>>  #include <linux/kernel.h>
>>> +#include <linux/pci.h>
>>>  #include <linux/slab.h>
>>>  #include <linux/interrupt.h>
>>>  #include <linux/delay.h>
>>> @@ -41,6 +42,18 @@ MODULE_AUTHOR("Steven Toth <stoth@linuxtv.org>");
>>>  MODULE_LICENSE("GPL");
>>>  MODULE_VERSION(CX23885_VERSION);
>>>
>>> +/*
>>> + * Some platforms have been found to require periodic resetting of the DMA
>>> + * engine. Ryzen and XEON platforms are known to be affected. The symptom
>>> + * encountered is "mpeg risc op code error". Only Ryzen platforms employ
>>> + * this workaround if the option equals 1. The workaround can be explicitly
>>> + * disabled for all platforms by setting to 0, the workaround can be forced
>>> + * on for any platform by setting to 2.
>>> + */
>>> +static unsigned int dma_reset_workaround = 1;
>>> +module_param(dma_reset_workaround, int, 0644);
>>> +MODULE_PARM_DESC(dma_reset_workaround, "periodic RiSC dma engine reset; 0-force disable, 1-driver detect (default), 2-force enable");
>>> +
>>>  static unsigned int debug;
>>>  module_param(debug, int, 0644);
>>>  MODULE_PARM_DESC(debug, "enable debug messages");
>>> @@ -603,8 +616,13 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
>>>
>>>  static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
>>>  {
>>> -       uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
>>> -       uint32_t reg2_val = cx_read(TC_REQ_SET);
>>> +       uint32_t reg1_val, reg2_val;
>>> +
>>> +       if (!dev->need_dma_reset)
>>> +               return;
>>> +
>>> +       reg1_val = cx_read(TC_REQ); /* read-only */
>>> +       reg2_val = cx_read(TC_REQ_SET);
>>>
>>>         if (reg1_val && reg2_val) {
>>>                 cx_write(TC_REQ, reg1_val);
>>> @@ -2058,6 +2076,36 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
>>>         /* TODO: 23-19 */
>>>  }
>>>
>>> +static struct {
>>> +       int vendor, dev;
>>> +} const broken_dev_id[] = {
>>> +       /* According with
>>> +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
>>> +        * 0x1451 is PCI ID for the IOMMU found on Ryzen
>>> +        */
>>> +       { PCI_VENDOR_ID_AMD, 0x1451 },
>> Does this issue only happen with the IOMMU is enabled?  Is it only for
>> p2p transfers?  Until recently the DMA and PCI subsystems didn't
>> actually support p2p properly when the IOMMU was enabled.  that might
>> explain some of the issues.  Additionally, if you match based on the
>> IOMMU id, you won't match if the user disables the IOMMU in the sbios.
>> Is this only an issue with the IOMMU enabled?
>>
>> Alex
>
> I'm unsure of the answers to your questions. I do still have my Ryzen3
> system around, I'll see if I can disable IOMMU and do some tests.
>
> Regards,
>
> Brad


The moment I looked this up I recalled something. During testing I had
to pass iommu=pt as a kernel command line option, that option is still
in my grub config. Without that set I would get critical AMD-VI errors
from the onboard ethernet as well as gpu. I have left that setting as is
the entire time (~8mo) I've been testing the system, because I could not
boot initially without it. I'll try other options now.

Regards,

Brad




>
>
>>> +};
>>> +
>>> +static bool cx23885_does_need_dma_reset(void)
>>> +{
>>> +       int i;
>>> +       struct pci_dev *pdev = NULL;
>>> +
>>> +       if (dma_reset_workaround == 0)
>>> +               return false;
>>> +       else if (dma_reset_workaround == 2)
>>> +               return true;
>>> +
>>> +       for (i = 0; i < sizeof(broken_dev_id); i++) {
>>> +               pdev = pci_get_device(broken_dev_id[i].vendor, broken_dev_id[i].dev, NULL);
>>> +               if (pdev) {
>>> +                       pci_dev_put(pdev);
>>> +                       return true;
>>> +               }
>>> +       }
>>> +       return false;
>>> +}
>>> +
>>>  static int cx23885_initdev(struct pci_dev *pci_dev,
>>>                            const struct pci_device_id *pci_id)
>>>  {
>>> @@ -2069,6 +2117,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
>>>         if (NULL == dev)
>>>                 return -ENOMEM;
>>>
>>> +       dev->need_dma_reset = cx23885_does_need_dma_reset();
>>> +
>>>         err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
>>>         if (err < 0)
>>>                 goto fail_free;
>>> diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
>>> index d54c7ee..cf965ef 100644
>>> --- a/drivers/media/pci/cx23885/cx23885.h
>>> +++ b/drivers/media/pci/cx23885/cx23885.h
>>> @@ -451,6 +451,8 @@ struct cx23885_dev {
>>>         /* Analog raw audio */
>>>         struct cx23885_audio_dev   *audio_dev;
>>>
>>> +       /* Does the system require periodic DMA resets? */
>>> +       unsigned int            need_dma_reset:1;
>>>  };
>>>
>>>  static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)
>>> --
>>> 2.7.4
>>>


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes
  2018-12-19  0:08                     ` Brad Love
@ 2018-12-19 19:07                       ` Alex Deucher
  0 siblings, 0 replies; 26+ messages in thread
From: Alex Deucher @ 2018-12-19 19:07 UTC (permalink / raw)
  To: Brad Love
  Cc: mchehab, Suthikulpanit, Suravee, Markus Dobel, linux-media,
	linux-media-owner

On Tue, Dec 18, 2018 at 7:08 PM Brad Love <brad@nextdimension.cc> wrote:
>
>
> On 18/12/2018 18.05, Brad Love wrote:
> > On 18/12/2018 17.46, Alex Deucher wrote:
> >> On Tue, Dec 18, 2018 at 7:45 AM Mauro Carvalho Chehab
> >> <mchehab@kernel.org> wrote:
> >>> Em Mon, 17 Dec 2018 21:05:11 -0500
> >>> Alex Deucher <alexdeucher@gmail.com> escreveu:
> >>>
> >>>> On Sun, Dec 16, 2018 at 9:23 AM Mauro Carvalho Chehab
> >>>> <mchehab@kernel.org> wrote:
> >>>>> Em Sun, 16 Dec 2018 11:37:02 +0100
> >>>>> Markus Dobel <markus.dobel@gmx.de> escreveu:
> >>>>>
> >>>>>> On 06.12.2018 19:01, Mauro Carvalho Chehab wrote:
> >>>>>>> Em Thu, 06 Dec 2018 18:18:23 +0100
> >>>>>>> Markus Dobel <markus.dobel@gmx.de> escreveu:
> >>>>>>>
> >>>>>>>> Hi everyone,
> >>>>>>>>
> >>>>>>>> I will try if the hack mentioned fixes the issue for me on the weekend
> >>>>>>>> (but I assume, as if effectively removes the function).
> >>>>>>> It should, but it keeps a few changes. Just want to be sure that what
> >>>>>>> would be left won't cause issues. If this works, the logic that would
> >>>>>>> solve Ryzen DMA fixes will be contained into a single point, making
> >>>>>>> easier to maintain it.
> >>>>>> Hi,
> >>>>>>
> >>>>>> I wanted to have this setup running stable for a few days before
> >>>>>> replying, that's why I am answering only now.
> >>>>>>
> >>>>>> But yes, as expected, with Mauro's hack, the driver has been stable for
> >>>>>> me for about a week, with several
> >>>>>> scheduled recordings in tvheadend, none of them missed.
> >>>>>>
> >>>>>> So, adding a reliable detection for affected chipsets, where the `if
> >>>>>> (1)` currently is, should work.
> >>>>> Markus,
> >>>>>
> >>>>> Thanks for testing!
> >>>>>
> >>>>> Brad/Alex,
> >>>>>
> >>>>> I guess we should then stick with this patch:
> >>>>>         https://patchwork.linuxtv.org/patch/53351/
> >>>>>
> >>>>> The past approach that we used on cx88, bttv and other old drivers
> >>>>> were to patch drivers/pci/quirks.c, making them to "taint" DMA
> >>>>> memory controllers that were known to bad affect on media devices,
> >>>>> and then some logic at the drivers to check for such "taint".
> >>>>>
> >>>>> However, that would require to touch another subsystem, with
> >>>>> usually cause delays. Also, as Alex pointed, this could well
> >>>>> be just a matter of incompatibility between the cx23885 and
> >>>>> the Ryzen DMA controller, and may not affect any other drivers.
> >>>>>
> >>>>> So, let's start with a logic like what I proposed, fine
> >>>>> tuning it to the Ryzen DMA controllers with we know have
> >>>>> troubles with the driver.
> >>>>>
> >>>>> We need to list the PCI ID of the memory controllers at the
> >>>>> device ID table on that patch, though. At the RFC patch,
> >>>>> I just added an IOMMU PCI ID from a randon Ryzen CPU:
> >>>>>
> >>>>>         +static struct {
> >>>>>         +       int vendor, dev;
> >>>>>         +} const broken_dev_id[] = {
> >>>>>         +       /* According with
> >>>>>         +        * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
> >>>>>         +        * 0x1451 is PCI ID for the IOMMU found on Ryzen 7
> >>>>>         +        */
> >>>>>         +       { PCI_VENDOR_ID_AMD, 0x1451 },
> >>>>>         +};
> >>>>>         +
> >>>>>
> >>>>> Ideally, the ID for the affected Ryzen DMA engines should be there at
> >>>>> include/linux/pci_ids.h, instead of hard-coded inside a driver.
> >>>>>
> >>>>> Also, we should, instead, add there the PCI IDs of the DMA engines
> >>>>> that are known to have problems with the cx23885.
> >>>> These aren't really DMA engines.  Isn't this just the pcie bridge on the CPU?
> >>> Yeah, it is not the DMA engine itself, but the CPU/chipset support for it.
> >>>
> >>> Let me be a little clearer. The Conexant chipsets for PCI/PCIe engines
> >>> have internally a RISC CPU that it is programmed, in runtime, to do
> >>> DMA scatter/gather. The actual DMA engine is there. For it to work, the
> >>> Northbridge (or the CPU chipset - as nowadays several chipsets integrated
> >>> the Northbridge inside an IP block at the CPU) has to do the counter part,
> >>> by allowing the board's DMA engine to access the mainboard's main memory,
> >>> usually via IOMMU, in a safe way[1].
> >>>
> >>> [1] preventing memory corruption if two devices try to do DMA to the
> >>> same area, or if the DMA from the board tries to write at the same
> >>> time the CPU tries to access it.
> >>>
> >>> Media PCI boards usually push the DMA logic to unusual conditions, as
> >>> a large amount of data is transferred, in a synchronous way,
> >>> between the PCIe card and memory.
> >>>
> >>> If the video stream is recorded, the same physical memory DMA mapped area
> >>> where the data is written by the video board could be used on another DMA
> >>> transfer via the HD disk controller.
> >>>
> >>> It is even possible to setup the Conexant's DMA engine to do transfers
> >>> directly to the GPU's internal memory, causing a PCI to PCI DMA transfer,
> >>> using V4L2 API overlay mode.
> >> Is this supported today?  I didn't think all of the PCI and DMA API
> >> changes had gone upstream yet for PCIe p2p support.  I think you'd
> >> need to disable the IOMMU to work correctly.
> >>
> >>> There was a time where it used to be common to have Intel CPUs (or
> >>> Intel-compatible CPUs) using non-Intel North Bridges. On such time,
> >>> we've seen a lot of troubles with PCI to PCI transfers most of them
> >>> when using non-Intel north bridges.
> >> Well p2p was problematic in general in the early days of pcie since it
> >> wasn't part of the spec.  Things are only recently improving in that
> >> regard.  Linux doesn't even have good pcie p2p support yet.
> >>
> >>> With some north bridges, having the same block of memory mapped
> >>> for two DMA operations (where memory writes come from the video
> >>> card and memory reads from the HD disk controller) was also
> >>> problematic, as the IOMMU had issues on managing two kinds of
> >>> transfer for the same physical memory block.
> >>>
> >>> The report we have on the 95f408bb commit is:
> >>>
> >>>    "media: cx23885: Ryzen DMA related RiSC engine stall fixes
> >>>
> >>>     This bug affects all of Hauppauge QuadHD boards when used on all Ryzen
> >>>     platforms and some XEON platforms. On these platforms it is possible to
> >>>     error out the RiSC engine and cause it to stall, whereafter the only
> >>>     way to reset the board to a working state is to reboot.
> >>> ...
> >>>     [  255.663598] cx23885: cx23885[0]: mpeg risc op code error"
> >>>
> >>> Brad could fill more details here, but I've seen the "risc op code
> >>> error" before with bt878 and cx88 chipsets (with use a similar RISC).
> >>> We usually get such error when there's a problem with the North Bridge
> >>> that was not capable of doing their part at the DMA transfer.
> >>>
> >> What worries me is that the cx DMA engine seems to get into this state
> >> even on non-Ryzen boards and in that case it doesn't seem to recover.
> >> Hence Markus seeing hangs on his board with the workaround enabled
> >> unconditionally.  Shouldn't it not get into that state in the first
> >> place if the bridge is not problematic?  I'm concerned the we are just
> >> papering over the real issue and we may end up causing stability
> >> issues for platforms that are added to the list.
> >
> > This issue was never reported to Hauppauge until the release of Ryzen
> > platform. The company support received lots of tickets and my github
> > page also got issues reported. The issue was only seen on the products
> > with 4 tuner/decoders. Those issues were tested and investigated on
> > github, and I had to source a Ryzen platform to do testing on my own. I
> > definitely saw this issue before the "fix". With this fix in place, the
> > stability that was seen on previous platforms was then seen on Ryzen
> > platforms. It is just after some time, now we see that the method used
> > on ryzen has adverse effects on other platforms.
> >
> > The possibility of adversely affecting various platforms is why I added
> > the three-way option. What I found in all of the reports I received was
> > some motherboard manufacturers released BIOS updates with workarounds.
> > Those workarounds fixed this and other issues. The MSI B350M motherboard
> > I have had not received any BIOS workaround fix as of a few months ago.
> > In the case a BIOS update renders this fix unnecessary and causes
> > adverse affects I want the ability to turn it off.
>
> I'd also just like to add, that those few platforms that did receive
> bios updates that addressed this during the period this was under test,
> were not adversely affected by the eventual fix.

Thanks for clarifying.  I'm ok with the patch.  Although as I
commented on the patch, we might want to use the Host bridge or one of
the pci bridges rather than the IOMMU to detect the platform since the
IOMMU can be disabled.  here's the lspci info for the gen1 ryzen I'm
currently sitting at:

00:00.0 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Root Complex [1022:1450]
00:00.2 IOMMU [0806]: Advanced Micro Devices, Inc. [AMD] Family 17h
(Models 00h-0fh) I/O Memory Management Unit [1022:1451]
00:01.0 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe Dummy Host Bridge [1022:1452]
00:01.1 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe GPP Bridge [1022:1453]
00:01.3 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe GPP Bridge [1022:1453]
00:02.0 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe Dummy Host Bridge [1022:1452]
00:03.0 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe Dummy Host Bridge [1022:1452]
00:03.1 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe GPP Bridge [1022:1453]
00:04.0 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe Dummy Host Bridge [1022:1452]
00:07.0 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe Dummy Host Bridge [1022:1452]
00:07.1 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Internal PCIe GPP Bridge 0 to Bus B [1022:1454]
00:08.0 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) PCIe Dummy Host Bridge [1022:1452]
00:08.1 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Internal PCIe GPP Bridge 0 to Bus B [1022:1454]
00:14.0 SMBus [0c05]: Advanced Micro Devices, Inc. [AMD] FCH SMBus
Controller [1022:790b] (rev 59)
00:14.3 ISA bridge [0601]: Advanced Micro Devices, Inc. [AMD] FCH LPC
Bridge [1022:790e] (rev 51)
00:18.0 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Data Fabric: Device 18h; Function 0 [1022:1460]
00:18.1 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Data Fabric: Device 18h; Function 1 [1022:1461]
00:18.2 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Data Fabric: Device 18h; Function 2 [1022:1462]
00:18.3 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Data Fabric: Device 18h; Function 3 [1022:1463]
00:18.4 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Data Fabric: Device 18h; Function 4 [1022:1464]
00:18.5 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Data Fabric: Device 18h; Function 5 [1022:1465]
00:18.6 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Data Fabric: Device 18h; Function 6 [1022:1466]
00:18.7 Host bridge [0600]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) Data Fabric: Device 18h; Function 7 [1022:1467]
01:00.0 Non-Volatile memory controller [0108]: Samsung Electronics Co
Ltd NVMe SSD Controller SM961/PM961 [144d:a804]
02:00.0 USB controller [0c03]: Advanced Micro Devices, Inc. [AMD] 300
Series Chipset USB 3.1 xHCI Controller [1022:43bb] (rev 02)
02:00.1 SATA controller [0106]: Advanced Micro Devices, Inc. [AMD] 300
Series Chipset SATA Controller [1022:43b7] (rev 02)
02:00.2 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] Device
[1022:43b2] (rev 02)
03:00.0 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] 300
Series Chipset PCIe Port [1022:43b4] (rev 02)
03:01.0 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] 300
Series Chipset PCIe Port [1022:43b4] (rev 02)
03:04.0 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] 300
Series Chipset PCIe Port [1022:43b4] (rev 02)
03:06.0 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] 300
Series Chipset PCIe Port [1022:43b4] (rev 02)
03:07.0 PCI bridge [0604]: Advanced Micro Devices, Inc. [AMD] 300
Series Chipset PCIe Port [1022:43b4] (rev 02)
04:00.0 Ethernet controller [0200]: Intel Corporation I211 Gigabit
Network Connection [8086:1539] (rev 03)
09:00.0 VGA compatible controller [0300]: Advanced Micro Devices, Inc.
[AMD/ATI] Tonga PRO [Radeon R9 285/380] [1002:6939]
09:00.1 Audio device [0403]: Advanced Micro Devices, Inc. [AMD/ATI]
Tonga HDMI Audio [Radeon R9 285/380] [1002:aad8]
0a:00.0 Non-Essential Instrumentation [1300]: Advanced Micro Devices,
Inc. [AMD] Device [1022:145a]
0a:00.2 Encryption controller [1080]: Advanced Micro Devices, Inc.
[AMD] Family 17h (Models 00h-0fh) Platform Security Processor
[1022:1456]
0a:00.3 USB controller [0c03]: Advanced Micro Devices, Inc. [AMD]
Family 17h (Models 00h-0fh) USB 3.0 Host Controller [1022:145c]
0b:00.0 Non-Essential Instrumentation [1300]: Advanced Micro Devices,
Inc. [AMD] Device [1022:1455]
0b:00.2 SATA controller [0106]: Advanced Micro Devices, Inc. [AMD] FCH
SATA Controller [AHCI mode] [1022:7901] (rev 51)
0b:00.3 Audio device [0403]: Advanced Micro Devices, Inc. [AMD] Family
17h (Models 00h-0fh) HD Audio Controller [1022:1457]

Alex

>
>
>
>
> > If you'd like to read the history there is discussion on the open and
> > closed issues here:
> >
> > https://github.com/b-rad-NDi/Ubuntu-media-tree-kernel-builder/issues
> >
> >
> >
> >>> As far as I know, the Hauppauge QuadHD boards can receive 4 different
> >>> HD MPEG-TS streams (either from cable or air transmissions). On cable,
> >>> one transponder can have up to ~40 Mbits/second. So, this board will
> >>> produce 4 streams of up to 40 Mbps each, happening on different times,
> >>> each filled in a synchronous way. As nobody watches 4 channels at
> >>> the same time, it is safe to assume that at least 3 channels will
> >>> be recorded (if not all 4 channels). So, we're talking about 320 MBps
> >>> of traffic that may be competing with other DMA traffic (including
> >>> some from the Kernel itself, in order to handle memory swap).
> >>>
> >>> That can be recording channels for several weeks.
> >>>
> >>> This usually pushes the North Bridge into their limits, and could
> >>> be revealing some North Bridge/IOMMU issues that it would otherwise
> >>> be not noticed under normal traffic.
> >>>
> >> GPUs push a lot of traffic too and we aren't seeing anything like this.
> >>
> >>
> >>>>> There one thing that still bothers me: could this problem be due to
> >>>>> some BIOS setup [1]? If so, are there any ways for dynamically
> >>>>> disabling such features inside the driver?
> >>>>>
> >>>>> [1] like this: https://www.techarp.com/bios-guide/cpu-pci-write-buffer/
> >>>>>
> >>>> possibly?  It's still not clear to me that this is specific to ryzen
> >>>> chips rather than a problem with the DMA setup on the cx board.  Is
> >>>> there a downside to enabling the workaround in general?
> >>> The problem here is that the code with resets the DMA engine (required
> >>> for it to work with Ryzen) causes trouble with non-Ryzen North Bridges.
> >>>
> >>> So, one solution that would fit all doesn't seem to exist.
> >>>
> >>>> The original commit mentioned that xeon platforms were affected as well.
> >>> Xeon uses different chipsets and a different solution for the North
> >>> Bridge functionality, with may explain why some Xeon CPUs have the
> >>> same issue.
> >>>
> >> Shouldn't we add them to the list as well?
> >>
> >>>> Is it possible it's just particular platforms with wonky bioses?
> >>> Good point. Yeah, it could be triggered by a wonky bios or a bad setup
> >>> (like enabling overclock or activating some chipset-specific feature
> >>> that would increase the chance for a DMA transfer to fail).
> >>>
> >>>> Maybe DMI matching would be better?
> >>> Mapping via DMI could work too, but it would be a way harder to map,
> >>> as one would need to have a cx23885 board (if possible one with 4
> >>> tuners) and a series of different machines in order to test it.
> >>>
> >>> Based with previous experiences with bttv and cx88, I suspect that
> >>> we'll end by needing to map all machines with the same chipset.
> >>>
> >> Does anyone have a ryzen and the cx board and can they verify that
> >> they are actually seeing hangs without the special reset code?  So far
> >> I've only see non-Ryzen users complaining about hangs with the
> >> workaround.
> >
> > See above. I'm the Hauppauge engineer that worked on this.
> >
> >
> >
> >
> >
> >> Alex
> >>
> >>>>> Brad,
> >>>>>
> >>>>> From your reports about the DMA issues, do you know what generations
> >>>>> of the Ryzen are affected?
> >>>>>
> >>>>> Alex,
> >>>>>
> >>>>> Do you know if are there any differences at the IP block for the
> >>>>> DMA engine used on different Ryzen CPUs? I mean: I suspect that
> >>>>> the engine for Ryzen 2nd generation would likely be different than
> >>>>> the one at the 1st generation, but, along the same generation, does
> >>>>> the Ryzen 3, 5, 7 and Threadripper use the same DMA engine?
> >>>> + Suravee.  I'm not really familiar with the changes, if any, that are
> >>>> in the pcie bridges on various AMD CPUs.  Or if there are changes, it
> >>>> would be hard to say whether this issue would affect them or not.
> >>>>
> >>>> Alex
> >>>
> >>> Thanks,
> >>> Mauro

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3] cx23885: only reset DMA on problematic CPUs
  2018-12-19 17:07                 ` [PATCH v3] " Brad Love
@ 2018-12-20 13:43                   ` Mauro Carvalho Chehab
  0 siblings, 0 replies; 26+ messages in thread
From: Mauro Carvalho Chehab @ 2018-12-20 13:43 UTC (permalink / raw)
  To: Brad Love; +Cc: linux-media, markus.dobel, alexdeucher, zzam

Em Wed, 19 Dec 2018 11:07:01 -0600
Brad Love <brad@nextdimension.cc> escreveu:

> It is reported that commit 95f408bbc4e4 ("media: cx23885: Ryzen DMA
> related RiSC engine stall fixes") caused regresssions with other CPUs.
> 
> Ensure that the quirk will be applied only for the CPUs that
> are known to cause problems.
> 
> A module option is added for explicit control of the behaviour.
> 
> Fixes: 95f408bbc4e4 ("media: cx23885: Ryzen DMA related RiSC engine stall fixes")
> 
> Signed-off-by: Brad Love <brad@nextdimension.cc>

Thanks!

Patch applied and sent upstream.

Regards,
Mauro

> ---
> Since v2:
> - Replaced sizeof with ARRAY_SIZE
> - Fixed column 80 checkpatch complaint
> Changes since v1:
> - Added module option for three way control
> - Removed '7' from pci id description, Ryzen 3 is the same id
> 
>  drivers/media/pci/cx23885/cx23885-core.c | 55 ++++++++++++++++++++++++++++++--
>  drivers/media/pci/cx23885/cx23885.h      |  2 ++
>  2 files changed, 55 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/media/pci/cx23885/cx23885-core.c b/drivers/media/pci/cx23885/cx23885-core.c
> index 39804d8..e2e3649 100644
> --- a/drivers/media/pci/cx23885/cx23885-core.c
> +++ b/drivers/media/pci/cx23885/cx23885-core.c
> @@ -23,6 +23,7 @@
>  #include <linux/moduleparam.h>
>  #include <linux/kmod.h>
>  #include <linux/kernel.h>
> +#include <linux/pci.h>
>  #include <linux/slab.h>
>  #include <linux/interrupt.h>
>  #include <linux/delay.h>
> @@ -41,6 +42,18 @@ MODULE_AUTHOR("Steven Toth <stoth@linuxtv.org>");
>  MODULE_LICENSE("GPL");
>  MODULE_VERSION(CX23885_VERSION);
>  
> +/*
> + * Some platforms have been found to require periodic resetting of the DMA
> + * engine. Ryzen and XEON platforms are known to be affected. The symptom
> + * encountered is "mpeg risc op code error". Only Ryzen platforms employ
> + * this workaround if the option equals 1. The workaround can be explicitly
> + * disabled for all platforms by setting to 0, the workaround can be forced
> + * on for any platform by setting to 2.
> + */
> +static unsigned int dma_reset_workaround = 1;
> +module_param(dma_reset_workaround, int, 0644);
> +MODULE_PARM_DESC(dma_reset_workaround, "periodic RiSC dma engine reset; 0-force disable, 1-driver detect (default), 2-force enable");
> +
>  static unsigned int debug;
>  module_param(debug, int, 0644);
>  MODULE_PARM_DESC(debug, "enable debug messages");
> @@ -603,8 +616,13 @@ static void cx23885_risc_disasm(struct cx23885_tsport *port,
>  
>  static void cx23885_clear_bridge_error(struct cx23885_dev *dev)
>  {
> -	uint32_t reg1_val = cx_read(TC_REQ); /* read-only */
> -	uint32_t reg2_val = cx_read(TC_REQ_SET);
> +	uint32_t reg1_val, reg2_val;
> +
> +	if (!dev->need_dma_reset)
> +		return;
> +
> +	reg1_val = cx_read(TC_REQ); /* read-only */
> +	reg2_val = cx_read(TC_REQ_SET);
>  
>  	if (reg1_val && reg2_val) {
>  		cx_write(TC_REQ, reg1_val);
> @@ -2058,6 +2076,37 @@ void cx23885_gpio_enable(struct cx23885_dev *dev, u32 mask, int asoutput)
>  	/* TODO: 23-19 */
>  }
>  
> +static struct {
> +	int vendor, dev;
> +} const broken_dev_id[] = {
> +	/* According with
> +	 * https://openbenchmarking.org/system/1703021-RI-AMDZEN08075/Ryzen%207%201800X/lspci,
> +	 * 0x1451 is PCI ID for the IOMMU found on Ryzen
> +	 */
> +	{ PCI_VENDOR_ID_AMD, 0x1451 },
> +};
> +
> +static bool cx23885_does_need_dma_reset(void)
> +{
> +	int i;
> +	struct pci_dev *pdev = NULL;
> +
> +	if (dma_reset_workaround == 0)
> +		return false;
> +	else if (dma_reset_workaround == 2)
> +		return true;
> +
> +	for (i = 0; i < ARRAY_SIZE(broken_dev_id); i++) {
> +		pdev = pci_get_device(broken_dev_id[i].vendor,
> +					broken_dev_id[i].dev, NULL);
> +		if (pdev) {
> +			pci_dev_put(pdev);
> +			return true;
> +		}
> +	}
> +	return false;
> +}
> +
>  static int cx23885_initdev(struct pci_dev *pci_dev,
>  			   const struct pci_device_id *pci_id)
>  {
> @@ -2069,6 +2118,8 @@ static int cx23885_initdev(struct pci_dev *pci_dev,
>  	if (NULL == dev)
>  		return -ENOMEM;
>  
> +	dev->need_dma_reset = cx23885_does_need_dma_reset();
> +
>  	err = v4l2_device_register(&pci_dev->dev, &dev->v4l2_dev);
>  	if (err < 0)
>  		goto fail_free;
> diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
> index d54c7ee..cf965ef 100644
> --- a/drivers/media/pci/cx23885/cx23885.h
> +++ b/drivers/media/pci/cx23885/cx23885.h
> @@ -451,6 +451,8 @@ struct cx23885_dev {
>  	/* Analog raw audio */
>  	struct cx23885_audio_dev   *audio_dev;
>  
> +	/* Does the system require periodic DMA resets? */
> +	unsigned int		need_dma_reset:1;
>  };
>  
>  static inline struct cx23885_dev *to_cx23885(struct v4l2_device *v4l2_dev)



Thanks,
Mauro

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2018-12-20 13:43 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-21 13:45 [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes Markus Dobel
2018-12-05 11:07 ` Mauro Carvalho Chehab
2018-12-06 16:37   ` Brad Love
2018-12-06 17:18     ` Markus Dobel
2018-12-06 18:01       ` Mauro Carvalho Chehab
2018-12-06 18:36         ` Alex Deucher
2018-12-06 19:07           ` Mauro Carvalho Chehab
2018-12-06 19:32             ` Mauro Carvalho Chehab
2018-12-18 22:59               ` [PATCH v2] cx23885: only reset DMA on problematic CPUs Brad Love
2018-12-18 23:49                 ` Alex Deucher
2018-12-19 17:26                   ` Brad Love
2018-12-19 17:40                     ` Brad Love
2018-12-19 11:08                 ` Matthias Schwarzott
2018-12-19 17:09                   ` Brad Love
2018-12-19 17:07                 ` [PATCH v3] " Brad Love
2018-12-20 13:43                   ` Mauro Carvalho Chehab
2018-12-16 10:37         ` [PATCH] Revert 95f408bb Ryzen DMA related RiSC engine stall fixes Markus Dobel
2018-12-16 14:23           ` Mauro Carvalho Chehab
2018-12-18  2:05             ` Alex Deucher
2018-12-18  6:32               ` Markus Dobel
2018-12-18 12:45               ` Mauro Carvalho Chehab
2018-12-18 23:11                 ` Brad Love
2018-12-18 23:46                 ` Alex Deucher
2018-12-19  0:05                   ` Brad Love
2018-12-19  0:08                     ` Brad Love
2018-12-19 19:07                       ` Alex Deucher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.