linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
@ 2021-11-26 15:35 John Garry
  2021-11-29 10:46 ` Ajish.Koshy
  2021-12-09 13:46 ` John Garry
  0 siblings, 2 replies; 17+ messages in thread
From: John Garry @ 2021-11-26 15:35 UTC (permalink / raw)
  To: jinpu.wang, jejb, martin.petersen
  Cc: Viswas.G, Ajish.Koshy, linux-scsi, linux-kernel, John Garry

The driver supports a "direct" mode of operation, where the SMP req frame
is directly copied into the command payload (and vice-versa for the SMP
resp).

To get at the SMP req frame data in the scatterlist the driver uses
phys_to_virt() on the DMA mapped memory dma_addr_t . This is broken,
and subsequently crashes as follows when an IOMMU is enabled:

 Unable to handle kernel paging request at virtual address
ffff0000fcebfb00
	...
 pc : pm80xx_chip_smp_req+0x2d0/0x3d0
 lr : pm80xx_chip_smp_req+0xac/0x3d0
 pm80xx_chip_smp_req+0x2d0/0x3d0
 pm8001_task_exec.constprop.0+0x368/0x520
 pm8001_queue_command+0x1c/0x30
 smp_execute_task_sg+0xdc/0x204
 sas_discover_expander.part.0+0xac/0x6cc
 sas_discover_root_expander+0x8c/0x150
 sas_discover_domain+0x3ac/0x6a0
 process_one_work+0x1d0/0x354
 worker_thread+0x13c/0x470
 kthread+0x17c/0x190
 ret_from_fork+0x10/0x20
 Code: 371806e1 910006d6 6b16033f 54000249 (38766b05)
 ---[ end trace b91d59aaee98ea2d ]---
note: kworker/u192:0[7] exited with preempt_count 1

Instead use kmap{_atomic}().

Signed-off-by: John Garry <john.garry@huawei.com>
--
I would appreciate if someone could test this change a bit more.

Even though my system boots and I can mount the disks now, SCSI error
handling kicks eventually in for some erroneously completed tasks.
That's even on my x86 machine, IIRC. I think the card FW needs updating.

diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c
index b9f6d83ff380..0e2221b4f411 100644
--- a/drivers/scsi/pm8001/pm80xx_hwi.c
+++ b/drivers/scsi/pm8001/pm80xx_hwi.c
@@ -3053,7 +3053,6 @@ mpi_smp_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
 	struct smp_completion_resp *psmpPayload;
 	struct task_status_struct *ts;
 	struct pm8001_device *pm8001_dev;
-	char *pdma_respaddr = NULL;
 
 	psmpPayload = (struct smp_completion_resp *)(piomb + 4);
 	status = le32_to_cpu(psmpPayload->status);
@@ -3080,19 +3079,23 @@ mpi_smp_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
 		if (pm8001_dev)
 			atomic_dec(&pm8001_dev->running_req);
 		if (pm8001_ha->smp_exp_mode == SMP_DIRECT) {
+			struct scatterlist *sg_resp = &t->smp_task.smp_resp;
+			u8 *payload;
+			void *to;
+
 			pm8001_dbg(pm8001_ha, IO,
 				   "DIRECT RESPONSE Length:%d\n",
 				   param);
-			pdma_respaddr = (char *)(phys_to_virt(cpu_to_le64
-						((u64)sg_dma_address
-						(&t->smp_task.smp_resp))));
+			to = kmap_atomic(sg_page(sg_resp));
+			payload = to + sg_resp->offset;
 			for (i = 0; i < param; i++) {
-				*(pdma_respaddr+i) = psmpPayload->_r_a[i];
+				*(payload + i) = psmpPayload->_r_a[i];
 				pm8001_dbg(pm8001_ha, IO,
 					   "SMP Byte%d DMA data 0x%x psmp 0x%x\n",
-					   i, *(pdma_respaddr + i),
+					   i, *(payload + i),
 					   psmpPayload->_r_a[i]);
 			}
+			kunmap_atomic(to);
 		}
 		break;
 	case IO_ABORTED:
@@ -4236,14 +4239,14 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
 	struct sas_task *task = ccb->task;
 	struct domain_device *dev = task->dev;
 	struct pm8001_device *pm8001_dev = dev->lldd_dev;
-	struct scatterlist *sg_req, *sg_resp;
+	struct scatterlist *sg_req, *sg_resp, *smp_req;
 	u32 req_len, resp_len;
 	struct smp_req smp_cmd;
 	u32 opc;
 	struct inbound_queue_table *circularQ;
-	char *preq_dma_addr = NULL;
-	__le64 tmp_addr;
 	u32 i, length;
+	u8 *payload;
+	u8 *to;
 
 	memset(&smp_cmd, 0, sizeof(smp_cmd));
 	/*
@@ -4280,8 +4283,9 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
 		pm8001_ha->smp_exp_mode = SMP_INDIRECT;
 
 
-	tmp_addr = cpu_to_le64((u64)sg_dma_address(&task->smp_task.smp_req));
-	preq_dma_addr = (char *)phys_to_virt(tmp_addr);
+	smp_req = &task->smp_task.smp_req;
+	to = kmap(sg_page(smp_req));
+	payload = to + smp_req->offset;
 
 	/* INDIRECT MODE command settings. Use DMA */
 	if (pm8001_ha->smp_exp_mode == SMP_INDIRECT) {
@@ -4289,7 +4293,7 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
 		/* for SPCv indirect mode. Place the top 4 bytes of
 		 * SMP Request header here. */
 		for (i = 0; i < 4; i++)
-			smp_cmd.smp_req16[i] = *(preq_dma_addr + i);
+			smp_cmd.smp_req16[i] = *(payload + i);
 		/* exclude top 4 bytes for SMP req header */
 		smp_cmd.long_smp_req.long_req_addr =
 			cpu_to_le64((u64)sg_dma_address
@@ -4320,20 +4324,20 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
 		pm8001_dbg(pm8001_ha, IO, "SMP REQUEST DIRECT MODE\n");
 		for (i = 0; i < length; i++)
 			if (i < 16) {
-				smp_cmd.smp_req16[i] = *(preq_dma_addr+i);
+				smp_cmd.smp_req16[i] = *(payload+i);
 				pm8001_dbg(pm8001_ha, IO,
 					   "Byte[%d]:%x (DMA data:%x)\n",
 					   i, smp_cmd.smp_req16[i],
-					   *(preq_dma_addr));
+					   *(payload));
 			} else {
-				smp_cmd.smp_req[i] = *(preq_dma_addr+i);
+				smp_cmd.smp_req[i] = *(payload+i);
 				pm8001_dbg(pm8001_ha, IO,
 					   "Byte[%d]:%x (DMA data:%x)\n",
 					   i, smp_cmd.smp_req[i],
-					   *(preq_dma_addr));
+					   *(payload));
 			}
 	}
-
+	kunmap(sg_page(smp_req));
 	build_smp_cmd(pm8001_dev->device_id, smp_cmd.tag,
 				&smp_cmd, pm8001_ha->smp_exp_mode, length);
 	rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &smp_cmd,
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-11-26 15:35 [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t John Garry
@ 2021-11-29 10:46 ` Ajish.Koshy
  2021-12-06  9:42   ` John Garry
  2021-12-09 13:46 ` John Garry
  1 sibling, 1 reply; 17+ messages in thread
From: Ajish.Koshy @ 2021-11-29 10:46 UTC (permalink / raw)
  To: john.garry, jinpu.wang, jejb, martin.petersen
  Cc: Viswas.G, linux-scsi, linux-kernel

Thanks John for the update. Based on the given issue, 
we never tested on arm server. 

Further arm testing will depend on the availability of 
the server. 

Meanwhile will do further test on x86 and update
on the observations.

-----Original Message-----
From: John Garry <john.garry@huawei.com> 
Sent: Friday, November 26, 2021 09:06 PM
To: jinpu.wang@cloud.ionos.com; jejb@linux.ibm.com; martin.petersen@oracle.com
Cc: Viswas G - I30667 <Viswas.G@microchip.com>; Ajish Koshy - I30923 <Ajish.Koshy@microchip.com>; linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org; John Garry <john.garry@huawei.com>
Subject: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

The driver supports a "direct" mode of operation, where the SMP req frame is directly copied into the command payload (and vice-versa for the SMP resp).

To get at the SMP req frame data in the scatterlist the driver uses
phys_to_virt() on the DMA mapped memory dma_addr_t . This is broken, and subsequently crashes as follows when an IOMMU is enabled:

 Unable to handle kernel paging request at virtual address
ffff0000fcebfb00
        ...
 pc : pm80xx_chip_smp_req+0x2d0/0x3d0
 lr : pm80xx_chip_smp_req+0xac/0x3d0
 pm80xx_chip_smp_req+0x2d0/0x3d0
 pm8001_task_exec.constprop.0+0x368/0x520
 pm8001_queue_command+0x1c/0x30
 smp_execute_task_sg+0xdc/0x204
 sas_discover_expander.part.0+0xac/0x6cc
 sas_discover_root_expander+0x8c/0x150
 sas_discover_domain+0x3ac/0x6a0
 process_one_work+0x1d0/0x354
 worker_thread+0x13c/0x470
 kthread+0x17c/0x190
 ret_from_fork+0x10/0x20
 Code: 371806e1 910006d6 6b16033f 54000249 (38766b05)  ---[ end trace b91d59aaee98ea2d ]---
note: kworker/u192:0[7] exited with preempt_count 1

Instead use kmap{_atomic}().

Signed-off-by: John Garry <john.garry@huawei.com>
--
I would appreciate if someone could test this change a bit more.

Even though my system boots and I can mount the disks now, SCSI error handling kicks eventually in for some erroneously completed tasks.
That's even on my x86 machine, IIRC. I think the card FW needs updating.

diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c
index b9f6d83ff380..0e2221b4f411 100644
--- a/drivers/scsi/pm8001/pm80xx_hwi.c
+++ b/drivers/scsi/pm8001/pm80xx_hwi.c
@@ -3053,7 +3053,6 @@ mpi_smp_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
        struct smp_completion_resp *psmpPayload;
        struct task_status_struct *ts;
        struct pm8001_device *pm8001_dev;
-       char *pdma_respaddr = NULL;

        psmpPayload = (struct smp_completion_resp *)(piomb + 4);
        status = le32_to_cpu(psmpPayload->status); @@ -3080,19 +3079,23 @@ mpi_smp_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
                if (pm8001_dev)
                        atomic_dec(&pm8001_dev->running_req);
                if (pm8001_ha->smp_exp_mode == SMP_DIRECT) {
+                       struct scatterlist *sg_resp = &t->smp_task.smp_resp;
+                       u8 *payload;
+                       void *to;
+
                        pm8001_dbg(pm8001_ha, IO,
                                   "DIRECT RESPONSE Length:%d\n",
                                   param);
-                       pdma_respaddr = (char *)(phys_to_virt(cpu_to_le64
-                                               ((u64)sg_dma_address
-                                               (&t->smp_task.smp_resp))));
+                       to = kmap_atomic(sg_page(sg_resp));
+                       payload = to + sg_resp->offset;
                        for (i = 0; i < param; i++) {
-                               *(pdma_respaddr+i) = psmpPayload->_r_a[i];
+                               *(payload + i) = psmpPayload->_r_a[i];
                                pm8001_dbg(pm8001_ha, IO,
                                           "SMP Byte%d DMA data 0x%x psmp 0x%x\n",
-                                          i, *(pdma_respaddr + i),
+                                          i, *(payload + i),
                                           psmpPayload->_r_a[i]);
                        }
+                       kunmap_atomic(to);
                }
                break;
        case IO_ABORTED:
@@ -4236,14 +4239,14 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
        struct sas_task *task = ccb->task;
        struct domain_device *dev = task->dev;
        struct pm8001_device *pm8001_dev = dev->lldd_dev;
-       struct scatterlist *sg_req, *sg_resp;
+       struct scatterlist *sg_req, *sg_resp, *smp_req;
        u32 req_len, resp_len;
        struct smp_req smp_cmd;
        u32 opc;
        struct inbound_queue_table *circularQ;
-       char *preq_dma_addr = NULL;
-       __le64 tmp_addr;
        u32 i, length;
+       u8 *payload;
+       u8 *to;

        memset(&smp_cmd, 0, sizeof(smp_cmd));
        /*
@@ -4280,8 +4283,9 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
                pm8001_ha->smp_exp_mode = SMP_INDIRECT;


-       tmp_addr = cpu_to_le64((u64)sg_dma_address(&task->smp_task.smp_req));
-       preq_dma_addr = (char *)phys_to_virt(tmp_addr);
+       smp_req = &task->smp_task.smp_req;
+       to = kmap(sg_page(smp_req));
+       payload = to + smp_req->offset;

        /* INDIRECT MODE command settings. Use DMA */
        if (pm8001_ha->smp_exp_mode == SMP_INDIRECT) { @@ -4289,7 +4293,7 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
                /* for SPCv indirect mode. Place the top 4 bytes of
                 * SMP Request header here. */
                for (i = 0; i < 4; i++)
-                       smp_cmd.smp_req16[i] = *(preq_dma_addr + i);
+                       smp_cmd.smp_req16[i] = *(payload + i);
                /* exclude top 4 bytes for SMP req header */
                smp_cmd.long_smp_req.long_req_addr =
                        cpu_to_le64((u64)sg_dma_address @@ -4320,20 +4324,20 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
                pm8001_dbg(pm8001_ha, IO, "SMP REQUEST DIRECT MODE\n");
                for (i = 0; i < length; i++)
                        if (i < 16) {
-                               smp_cmd.smp_req16[i] = *(preq_dma_addr+i);
+                               smp_cmd.smp_req16[i] = *(payload+i);
                                pm8001_dbg(pm8001_ha, IO,
                                           "Byte[%d]:%x (DMA data:%x)\n",
                                           i, smp_cmd.smp_req16[i],
-                                          *(preq_dma_addr));
+                                          *(payload));
                        } else {
-                               smp_cmd.smp_req[i] = *(preq_dma_addr+i);
+                               smp_cmd.smp_req[i] = *(payload+i);
                                pm8001_dbg(pm8001_ha, IO,
                                           "Byte[%d]:%x (DMA data:%x)\n",
                                           i, smp_cmd.smp_req[i],
-                                          *(preq_dma_addr));
+                                          *(payload));
                        }
        }
-
+       kunmap(sg_page(smp_req));
        build_smp_cmd(pm8001_dev->device_id, smp_cmd.tag,
                                &smp_cmd, pm8001_ha->smp_exp_mode, length);
        rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &smp_cmd,
--
2.17.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-11-29 10:46 ` Ajish.Koshy
@ 2021-12-06  9:42   ` John Garry
  2021-12-06 13:24     ` Niklas Cassel
  0 siblings, 1 reply; 17+ messages in thread
From: John Garry @ 2021-12-06  9:42 UTC (permalink / raw)
  To: Ajish.Koshy, jinpu.wang, jejb, martin.petersen
  Cc: Viswas.G, linux-scsi, linux-kernel

On 29/11/2021 10:46, Ajish.Koshy@microchip.com wrote:
> Thanks John for the update. Based on the given issue,
> we never tested on arm server.
> 
> Further arm testing will depend on the availability of
> the server.
> 
> Meanwhile will do further test on x86 and update
> on the observations.

Have you tested on x86 with the IOMMU enabled? From my limited 
experience, out of the box the IOMMU is disabled in the BIOS on x86 
machines - that is a very general statement. But this is not just an 
issue specific to arm64.

Thanks,
John

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-06  9:42   ` John Garry
@ 2021-12-06 13:24     ` Niklas Cassel
  2021-12-07 10:36       ` Ajish.Koshy
  0 siblings, 1 reply; 17+ messages in thread
From: Niklas Cassel @ 2021-12-06 13:24 UTC (permalink / raw)
  To: John Garry
  Cc: Ajish.Koshy, jinpu.wang, jejb, martin.petersen, Viswas.G,
	linux-scsi, linux-kernel, Damien Le Moal

On Mon, Dec 06, 2021 at 09:42:23AM +0000, John Garry wrote:
> On 29/11/2021 10:46, Ajish.Koshy@microchip.com wrote:
> > Thanks John for the update. Based on the given issue,
> > we never tested on arm server.
> > 
> > Further arm testing will depend on the availability of
> > the server.
> > 
> > Meanwhile will do further test on x86 and update
> > on the observations.
> 
> Have you tested on x86 with the IOMMU enabled? From my limited experience,
> out of the box the IOMMU is disabled in the BIOS on x86 machines - that is a
> very general statement. But this is not just an issue specific to arm64.

My limited experience tells me the opposite, that modern x64 servers
have IOMMU enabled by default:

E.g.:
https://www.dell.com/support/manuals/en-uk/poweredge-r7515/r7515_bios_pub/processor-settings?guid=guid-ec36d324-4f45-4bc1-bb51-de2db7cc5cd9&lang=en-us
https://support.hpe.com/hpesc/public/docDisplay?docId=a00025662en_us&docLocale=en_US


This driver not working properly on a system with an IOMMU just tells
us that the number of mainline users of this driver is very low :/


Kind regards,
Niklas

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-06 13:24     ` Niklas Cassel
@ 2021-12-07 10:36       ` Ajish.Koshy
  2021-12-07 13:54         ` John Garry
  0 siblings, 1 reply; 17+ messages in thread
From: Ajish.Koshy @ 2021-12-07 10:36 UTC (permalink / raw)
  To: john.garry
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, damien.lemoal, Niklas.Cassel, Ajish.Koshy,
	Vasanthalakshmi.Tharmarajan

Hi John,

Well I could see this kernel panic [  126.843958] RIP: 0010:pm80xx_chip_smp_req+0x1d6/0x2e0 [pm80xx] with respect to existing driver on my current system x86 64bit after enabling the following 2 kernel boot arguments:
-intel_iommu=on
-iommu.passthrough=0

Post application of this patch,  now not observing the kernel panic and device discovery happens smoothly.


Thanks,
Ajish


 

-----Original Message-----
From: Niklas Cassel <Niklas.Cassel@wdc.com> 
Sent: Monday, December 6, 2021 06:54 PM
To: John Garry <john.garry@huawei.com>
Cc: Ajish Koshy - I30923 <Ajish.Koshy@microchip.com>; jinpu.wang@cloud.ionos.com; jejb@linux.ibm.com; martin.petersen@oracle.com; Viswas G - I30667 <Viswas.G@microchip.com>; linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org; Damien Le Moal <damien.lemoal@opensource.wdc.com>
Subject: Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t

[Some people who received this message don't often get email from niklas.cassel@wdc.com. Learn why this is important at http://aka.ms/LearnAboutSenderIdentification.]

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

On Mon, Dec 06, 2021 at 09:42:23AM +0000, John Garry wrote:
> On 29/11/2021 10:46, Ajish.Koshy@microchip.com wrote:
> > Thanks John for the update. Based on the given issue,
> > we never tested on arm server.
> >
> > Further arm testing will depend on the availability of
> > the server.
> >
> > Meanwhile will do further test on x86 and update
> > on the observations.
>
> Have you tested on x86 with the IOMMU enabled? From my limited experience,
> out of the box the IOMMU is disabled in the BIOS on x86 machines - that is a
> very general statement. But this is not just an issue specific to arm64.

My limited experience tells me the opposite, that modern x64 servers
have IOMMU enabled by default:

E.g.:
https://www.dell.com/support/manuals/en-uk/poweredge-r7515/r7515_bios_pub/processor-settings?guid=guid-ec36d324-4f45-4bc1-bb51-de2db7cc5cd9&lang=en-us
https://support.hpe.com/hpesc/public/docDisplay?docId=a00025662en_us&docLocale=en_US


This driver not working properly on a system with an IOMMU just tells
us that the number of mainline users of this driver is very low :/


Kind regards,
Niklas

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-07 10:36       ` Ajish.Koshy
@ 2021-12-07 13:54         ` John Garry
  2021-12-09 12:04           ` Ajish.Koshy
  0 siblings, 1 reply; 17+ messages in thread
From: John Garry @ 2021-12-07 13:54 UTC (permalink / raw)
  To: Ajish.Koshy
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, damien.lemoal, Niklas.Cassel,
	Vasanthalakshmi.Tharmarajan

On 07/12/2021 10:36, Ajish.Koshy@microchip.com wrote:
> Well I could see this kernel panic [  126.843958] RIP: 0010:pm80xx_chip_smp_req+0x1d6/0x2e0 [pm80xx] with respect to existing driver on my current system x86 64bit after enabling the following 2 kernel boot arguments:
> -intel_iommu=on
> -iommu.passthrough=0
>
OK, so it seems that it was the kernel which was just not enabling the 
IOMMU previously, which would be consistent with what Niklas mentioned.

Anyway, please supply reviewed-by and/or tested-by tags so that the SCSI 
maintainers can pick it up.

I suppose that we should also have:

Fixes: f5860992db55 ("[SCSI] pm80xx: Added SPCv/ve specific hardware
functionalities and relevant changes in common files")

Thanks for testing,
John

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-07 13:54         ` John Garry
@ 2021-12-09 12:04           ` Ajish.Koshy
  2021-12-09 12:19             ` John Garry
  2021-12-09 22:26             ` Damien Le Moal
  0 siblings, 2 replies; 17+ messages in thread
From: Ajish.Koshy @ 2021-12-09 12:04 UTC (permalink / raw)
  To: john.garry
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, damien.lemoal, Niklas.Cassel,
	Vasanthalakshmi.Tharmarajan

Hi John,

Was testing the patch on arm server. Didn't see crash there but observing timeouts and error 
handling getting triggered for drives. But the same code works fine on x86.

At your end do you still face similar situation on arm server ?

Thanks,
Ajish

-----Original Message-----
From: John Garry <john.garry@huawei.com> 
Sent: Tuesday, December 7, 2021 07:24 PM
To: Ajish Koshy - I30923 <Ajish.Koshy@microchip.com>
Cc: jinpu.wang@cloud.ionos.com; jejb@linux.ibm.com; martin.petersen@oracle.com; Viswas G - I30667 <Viswas.G@microchip.com>; linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org; damien.lemoal@opensource.wdc.com; Niklas.Cassel@wdc.com; Vasanthalakshmi Tharmarajan - I30664 <Vasanthalakshmi.Tharmarajan@microchip.com>
Subject: Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t

EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe

On 07/12/2021 10:36, Ajish.Koshy@microchip.com wrote:
> Well I could see this kernel panic [  126.843958] RIP: 0010:pm80xx_chip_smp_req+0x1d6/0x2e0 [pm80xx] with respect to existing driver on my current system x86 64bit after enabling the following 2 kernel boot arguments:
> -intel_iommu=on
> -iommu.passthrough=0
>
OK, so it seems that it was the kernel which was just not enabling the IOMMU previously, which would be consistent with what Niklas mentioned.

Anyway, please supply reviewed-by and/or tested-by tags so that the SCSI maintainers can pick it up.

I suppose that we should also have:

Fixes: f5860992db55 ("[SCSI] pm80xx: Added SPCv/ve specific hardware functionalities and relevant changes in common files")

Thanks for testing,
John

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-09 12:04           ` Ajish.Koshy
@ 2021-12-09 12:19             ` John Garry
  2021-12-09 22:38               ` Damien Le Moal
  2021-12-09 22:26             ` Damien Le Moal
  1 sibling, 1 reply; 17+ messages in thread
From: John Garry @ 2021-12-09 12:19 UTC (permalink / raw)
  To: Ajish.Koshy
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, damien.lemoal, Niklas.Cassel,
	Vasanthalakshmi.Tharmarajan

On 09/12/2021 12:04, Ajish.Koshy@microchip.com wrote:
> Was testing the patch on arm server. Didn't see crash there but observing timeouts and error
> handling getting triggered for drives. But the same code works fine on x86.
> 
> At your end do you still face similar situation on arm server ?

Yeah, I see that as well even without enabling the IOMMU.

root@(none)$ [  163.974907] sas: Enter sas_scsi_recover_host busy: 222 
failed: 222
[  163.981108] sas: sas_scsi_find_task: aborting task 0x000000005c703676
root@(none)$
root@(none)$ [  185.963714] pm80xx0:: pm8001_exec_internal_tmf_task 
757:TMF task[1]timeout.

I figured that it was a card FW issue as I have been using what I got 
out the box, and I have no tool to update the firmware on an arm host.

It seems that SSP and STP commands are not completing for some reason, 
from the "busy: 222" line.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-11-26 15:35 [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t John Garry
  2021-11-29 10:46 ` Ajish.Koshy
@ 2021-12-09 13:46 ` John Garry
  2021-12-10 10:23   ` Ajish.Koshy
  1 sibling, 1 reply; 17+ messages in thread
From: John Garry @ 2021-12-09 13:46 UTC (permalink / raw)
  To: jinpu.wang, jejb, martin.petersen
  Cc: Viswas.G, Ajish.Koshy, linux-scsi, linux-kernel

On 26/11/2021 15:35, John Garry wrote:
>   	/*
> @@ -4280,8 +4283,9 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
>   		pm8001_ha->smp_exp_mode = SMP_INDIRECT;
>   
>   
> -	tmp_addr = cpu_to_le64((u64)sg_dma_address(&task->smp_task.smp_req));
> -	preq_dma_addr = (char *)phys_to_virt(tmp_addr);
> +	smp_req = &task->smp_task.smp_req;
> +	to = kmap(sg_page(smp_req));

This should be a kmap_atomic() as well, as I see the following for when 
CONFIG_DEBUG_ATOMIC_SLEEP is enabled:

[   27.222116]  dump_backtrace+0x0/0x2b4
[   27.225774]  show_stack+0x1c/0x30
[   27.229084]  dump_stack_lvl+0x68/0x84
[   27.232741]  dump_stack+0x20/0x3c
[   27.236049]  __might_resched+0x1d4/0x240
[   27.239967]  __might_sleep+0x70/0xd0
[   27.243536]  pm80xx_chip_smp_req+0x2c4/0x56c
[   27.247802]  pm8001_task_exec.constprop.0+0x718/0x770
[   27.252848]  pm8001_queue_command+0x1c/0x2c
[   27.257026]  smp_execute_task_sg+0x1e8/0x370
[   27.261289]  sas_ex_phy_discover+0x29c/0x31c
[   27.265553]  smp_ata_check_ready+0x74/0x190
[   27.269729]  ata_wait_ready+0xd0/0x224
[   27.273474]  ata_wait_after_reset+0x78/0xac
[   27.277652]  sas_ata_hard_reset+0xf0/0x18c
[   27.281742]  ata_do_reset.constprop.0+0x80/0x9c
[   27.286266]  ata_eh_reset+0xba4/0x1170
[   27.290008]  ata_eh_recover+0x4b0/0x1b40
[   27.293924]  ata_do_eh+0x8c/0x110
[   27.297232]  ata_std_error_handler+0x80/0xb0
[   27.301495]  ata_scsi_port_error_handler+0x3d4/0x9d0
[   27.306454]  async_sas_ata_eh+0x70/0xf8
[   27.310285]  async_run_entry_fn+0x5c/0x1e0
[   27.314375]  process_one_work+0x378/0x630
[   27.318379]  worker_thread+0xa8/0x6bc
[   27.322033]  kthread+0x214/0x230
[   27.325256]  ret_from_fork+0x10/0x20
[   27.328825] pm80xx0:: pm80xx_chip_smp_req  4292:SMP REQUEST INDIRECT MODE

But I don't think that this is the problem which causes error handling 
to kick in later, as discussed in this thread.

> +	payload = to + smp_req->offset;
>   
>   	/* INDIRECT MODE command settings. Use DMA */
>   	if (pm8001_ha->smp_exp_mode == SMP_INDIRECT) {
> @@ -4289,7 +4293,7 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
>   		/* for SPCv indirect mode. Place the top 4 bytes of
>   		 * SMP Request header here. */
>   		for (i = 0; i < 4; i++)
> -			smp_cmd.smp_req16[i] = *(preq_dma_addr + i);
> +			smp_cmd.smp_req16[i] = *(payload + i);


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-09 12:04           ` Ajish.Koshy
  2021-12-09 12:19             ` John Garry
@ 2021-12-09 22:26             ` Damien Le Moal
  1 sibling, 0 replies; 17+ messages in thread
From: Damien Le Moal @ 2021-12-09 22:26 UTC (permalink / raw)
  To: Ajish.Koshy, john.garry
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, Niklas.Cassel, Vasanthalakshmi.Tharmarajan

On 2021/12/09 21:04, Ajish.Koshy@microchip.com wrote:
> Hi John,
> 
> Was testing the patch on arm server. Didn't see crash there but observing
> timeouts and error handling getting triggered for drives. But the same code
> works fine on x86.
> 
> At your end do you still face similar situation on arm server ?
> 
> Thanks, Ajish

Please do not top post. It makes it hard to follow the conversation.

Are the drives you are using SATA or SAS ?
Could you post the output of dmesg related to the errors you are seeing ?

> 
> -----Original Message----- From: John Garry <john.garry@huawei.com> Sent:
> Tuesday, December 7, 2021 07:24 PM To: Ajish Koshy - I30923
> <Ajish.Koshy@microchip.com> Cc: jinpu.wang@cloud.ionos.com;
> jejb@linux.ibm.com; martin.petersen@oracle.com; Viswas G - I30667
> <Viswas.G@microchip.com>; linux-scsi@vger.kernel.org;
> linux-kernel@vger.kernel.org; damien.lemoal@opensource.wdc.com;
> Niklas.Cassel@wdc.com; Vasanthalakshmi Tharmarajan - I30664
> <Vasanthalakshmi.Tharmarajan@microchip.com> Subject: Re: [PATCH] scsi:
> pm8001: Fix phys_to_virt() usage on dma_addr_t
> 
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the
> content is safe
> 
> On 07/12/2021 10:36, Ajish.Koshy@microchip.com wrote:
>> Well I could see this kernel panic [  126.843958] RIP:
>> 0010:pm80xx_chip_smp_req+0x1d6/0x2e0 [pm80xx] with respect to existing
>> driver on my current system x86 64bit after enabling the following 2 kernel
>> boot arguments: -intel_iommu=on -iommu.passthrough=0
>> 
> OK, so it seems that it was the kernel which was just not enabling the IOMMU
> previously, which would be consistent with what Niklas mentioned.
> 
> Anyway, please supply reviewed-by and/or tested-by tags so that the SCSI
> maintainers can pick it up.
> 
> I suppose that we should also have:
> 
> Fixes: f5860992db55 ("[SCSI] pm80xx: Added SPCv/ve specific hardware
> functionalities and relevant changes in common files")
> 
> Thanks for testing, John


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-09 12:19             ` John Garry
@ 2021-12-09 22:38               ` Damien Le Moal
  2021-12-09 23:09                 ` John Garry
  0 siblings, 1 reply; 17+ messages in thread
From: Damien Le Moal @ 2021-12-09 22:38 UTC (permalink / raw)
  To: John Garry, Ajish.Koshy
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, Niklas.Cassel, Vasanthalakshmi.Tharmarajan

On 2021/12/09 21:19, John Garry wrote:
> On 09/12/2021 12:04, Ajish.Koshy@microchip.com wrote:
>> Was testing the patch on arm server. Didn't see crash there but observing timeouts and error
>> handling getting triggered for drives. But the same code works fine on x86.
>>
>> At your end do you still face similar situation on arm server ?
> 
> Yeah, I see that as well even without enabling the IOMMU.
> 
> root@(none)$ [  163.974907] sas: Enter sas_scsi_recover_host busy: 222 
> failed: 222
> [  163.981108] sas: sas_scsi_find_task: aborting task 0x000000005c703676
> root@(none)$
> root@(none)$ [  185.963714] pm80xx0:: pm8001_exec_internal_tmf_task 
> 757:TMF task[1]timeout.
> 
> I figured that it was a card FW issue as I have been using what I got 
> out the box, and I have no tool to update the firmware on an arm host.
> 
> It seems that SSP and STP commands are not completing for some reason, 
> from the "busy: 222" line.

I have this HBA:

c1:00.0 Serial Attached SCSI controller: ATTO Technology, Inc. ExpressSAS 12Gb/s
SAS/SATA HBA (rev 06)
	Subsystem: ATTO Technology, Inc. ExpressSAS H120F

Which uses the pm80xx driver and I do not see such error. E.g.:

[335568.262395] pm80xx 0000:c1:00.0: pm80xx: driver version 0.1.40
[335568.268931] :: pm8001_pci_alloc  529:Setting link rate to default value
[335569.489392] scsi host18: pm80xx
[335570.801031] sas: phy-18:4 added to port-18:0, phy_mask:0x10 (50010860002f5644)
[335570.801225] sas: DOING DISCOVERY on port 0, pid:58830
[335570.801310] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
[335570.807638] sas: ata22: end_device-18:0: dev error handler
[335570.964864] ata22.00: ATA-11: WDC  WUH721818ALN604, PCGNW232, max UDMA/133
[335579.062526] ata22.00: 4394582016 sectors, multi 0: LBA48 NCQ (depth 32)
[335579.070487] ata22.00: Features: NCQ-sndrcv NCQ-prio
[335579.307260] ata22.00: configured for UDMA/133
[335579.313018] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
[335579.323512] scsi 18:0:0:0: Direct-Access     ATA      WDC  WUH721818AL W232
PQ: 0 ANSI: 5
[335579.333243] sas: DONE DISCOVERY on port 0, pid:58830, result:0
[335579.333338] sas: phy-18:5 added to port-18:1, phy_mask:0x20 (50010860002f5645)
[335579.333453] sas: DOING DISCOVERY on port 1, pid:58830
[335579.333596] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
[335579.341596] sas: ata23: end_device-18:1: dev error handler
[335579.341640] sas: ata22: end_device-18:0: dev error handler
[335579.500374] ata23.00: ATA-11: WDC  WUH721818ALN604, PCGNWTW2, max UDMA/133
[335588.427115] ata23.00: 4394582016 sectors, multi 0: LBA48 NCQ (depth 32)
[335588.435158] ata23.00: Features: NCQ-sndrcv NCQ-prio
[335588.513212] ata23.00: configured for UDMA/133
[335588.519027] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
[335588.537683] scsi 18:0:1:0: Direct-Access     ATA      WDC  WUH721818AL WTW2
PQ: 0 ANSI: 5
[335588.565288] sas: DONE DISCOVERY on port 1, pid:58830, result:0
[335588.565543] sas: phy-18:7 added to port-18:2, phy_mask:0x80 (50010860002f5647)
[335588.566917] sas: DOING DISCOVERY on port 2, pid:58830
[335588.567515] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
[335588.574948] sas: ata22: end_device-18:0: dev error handler
[335588.574971] sas: ata23: end_device-18:1: dev error handler
[335588.574979] sas: ata24: end_device-18:2: dev error handler
[335588.732190] ata24.00: ATA-11: WDC  WSH722020ALN604, PCGMW803, max UDMA/133
[335597.778187] ata24.00: 4882956288 sectors, multi 0: LBA48 NCQ (depth 32)
[335597.788081] ata24.00: Features: NCQ-sndrcv NCQ-prio
[335597.850404] ata24.00: configured for UDMA/133
[335597.856225] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
[335597.866680] scsi 18:0:2:0: Direct-Access-ZBC ATA      WDC  WSH722020AL W803
PQ: 0 ANSI: 7
[335597.876485] sas: DONE DISCOVERY on port 2, pid:58830, result:0
[335597.879720] sd 18:0:0:0: [sdd] 4394582016 4096-byte logical blocks: (18.0
TB/16.4 TiB)
[335597.881483] sd 18:0:0:0: Attached scsi generic sg3 type 0
[335597.888827] sd 18:0:0:0: [sdd] Write Protect is off
[335597.888830] sd 18:0:0:0: [sdd] Mode Sense: 00 3a 00 00
[335597.888839] sd 18:0:0:0: [sdd] Write cache: enabled, read cache: enabled,
doesn't support DPO or FUA
[335597.968683] sd 18:0:1:0: [sde] 4394582016 4096-byte logical blocks: (18.0
TB/16.4 TiB)
[335597.969489] sd 18:0:1:0: Attached scsi generic sg4 type 0
[335597.978210] sd 18:0:1:0: [sde] Write Protect is off
[335597.978214] sd 18:0:1:0: [sde] Mode Sense: 00 3a 00 00
[335597.978228] sd 18:0:1:0: [sde] Write cache: enabled, read cache: enabled,
doesn't support DPO or FUA
[335598.053869] sd 18:0:2:0: [sdf] Host-managed zoned block device
[335598.054476] sd 18:0:2:0: Attached scsi generic sg5 type 20
[335598.066428] sd 18:0:2:0: [sdf] 4882956288 4096-byte logical blocks: (20.0
TB/18.2 TiB)
[335598.093762] sd 18:0:2:0: [sdf] Write Protect is off
[335598.100101] sd 18:0:2:0: [sdf] Mode Sense: 00 3a 00 00
[335598.100119] sd 18:0:2:0: [sdf] Write cache: enabled, read cache: enabled,
doesn't support DPO or FUA
[335598.158832] sd 18:0:1:0: [sde] Attached SCSI disk
[335598.158870] sd 18:0:0:0: [sdd] Attached SCSI disk
[335600.015402] sd 18:0:2:0: [sdf] 74508 zones of 65536 logical blocks
[335600.099235] sd 18:0:2:0: [sdf] Attached SCSI disk

The driver is uselessly verbose (for some reasons, the dbg messages show up),
but no errors.

-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-09 22:38               ` Damien Le Moal
@ 2021-12-09 23:09                 ` John Garry
  2021-12-09 23:55                   ` Damien Le Moal
  0 siblings, 1 reply; 17+ messages in thread
From: John Garry @ 2021-12-09 23:09 UTC (permalink / raw)
  To: Damien Le Moal, Ajish.Koshy
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, Niklas.Cassel, Vasanthalakshmi.Tharmarajan

On 09/12/2021 22:38, Damien Le Moal wrote:
>> It seems that SSP and STP commands are not completing for some reason,
>> from the "busy: 222" line.
> I have this HBA:
> 
> c1:00.0 Serial Attached SCSI controller: ATTO Technology, Inc. ExpressSAS 12Gb/s
> SAS/SATA HBA (rev 06)
> 	Subsystem: ATTO Technology, Inc. ExpressSAS H120F
> 
> Which uses the pm80xx driver and I do not see such error. E.g.:
> 
> [335568.262395] pm80xx 0000:c1:00.0: pm80xx: driver version 0.1.40
> [335568.268931] :: pm8001_pci_alloc  529:Setting link rate to default value
> [335569.489392] scsi host18: pm80xx
> [335570.801031] sas: phy-18:4 added to port-18:0, phy_mask:0x10 (50010860002f5644)
> [335570.801225] sas: DOING DISCOVERY on port 0, pid:58830
> [335570.801310] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
> [335570.807638] sas: ata22: end_device-18:0: dev error handler
> [335570.964864] ata22.00: ATA-11: WDC  WUH721818ALN604, PCGNW232, max UDMA/133
> [335579.062526] ata22.00: 4394582016 sectors, multi 0: LBA48 NCQ (depth 32)
> [335579.070487] ata22.00: Features: NCQ-sndrcv NCQ-prio
> [335579.307260] ata22.00: configured for UDMA/133
> [335579.313018] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
> [335579.323512] scsi 18:0:0:0: Direct-Access     ATA      WDC  WUH721818AL W232
> PQ: 0 ANSI: 5
> [335579.333243] sas: DONE DISCOVERY on port 0, pid:58830, result:0
> [335579.333338] sas: phy-18:5 added to port-18:1, phy_mask:0x20 (50010860002f5645)
> [335579.333453] sas: DOING DISCOVERY on port 1, pid:58830
> [335579.333596] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
> [335579.341596] sas: ata23: end_device-18:1: dev error handler
> [335579.341640] sas: ata22: end_device-18:0: dev error handler
> [335579.500374] ata23.00: ATA-11: WDC  WUH721818ALN604, PCGNWTW2, max UDMA/133
> [335588.427115] ata23.00: 4394582016 sectors, multi 0: LBA48 NCQ (depth 32)
> [335588.435158] ata23.00: Features: NCQ-sndrcv NCQ-prio
> [335588.513212] ata23.00: configured for UDMA/133
> [335588.519027] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
> [335588.537683] scsi 18:0:1:0: Direct-Access     ATA      WDC  WUH721818AL WTW2
> PQ: 0 ANSI: 5
> [335588.565288] sas: DONE DISCOVERY on port 1, pid:58830, result:0
> [335588.565543] sas: phy-18:7 added to port-18:2, phy_mask:0x80 (50010860002f5647)
> [335588.566917] sas: DOING DISCOVERY on port 2, pid:58830
> [335588.567515] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
> [335588.574948] sas: ata22: end_device-18:0: dev error handler
> [335588.574971] sas: ata23: end_device-18:1: dev error handler
> [335588.574979] sas: ata24: end_device-18:2: dev error handler
> [335588.732190] ata24.00: ATA-11: WDC  WSH722020ALN604, PCGMW803, max UDMA/133
> [335597.778187] ata24.00: 4882956288 sectors, multi 0: LBA48 NCQ (depth 32)
> [335597.788081] ata24.00: Features: NCQ-sndrcv NCQ-prio
> [335597.850404] ata24.00: configured for UDMA/133
> [335597.856225] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
> [335597.866680] scsi 18:0:2:0: Direct-Access-ZBC ATA      WDC  WSH722020AL W803
> PQ: 0 ANSI: 7
> [335597.876485] sas: DONE DISCOVERY on port 2, pid:58830, result:0
> [335597.879720] sd 18:0:0:0: [sdd] 4394582016 4096-byte logical blocks: (18.0
> TB/16.4 TiB)
> [335597.881483] sd 18:0:0:0: Attached scsi generic sg3 type 0
> [335597.888827] sd 18:0:0:0: [sdd] Write Protect is off
> [335597.888830] sd 18:0:0:0: [sdd] Mode Sense: 00 3a 00 00
> [335597.888839] sd 18:0:0:0: [sdd] Write cache: enabled, read cache: enabled,
> doesn't support DPO or FUA
> [335597.968683] sd 18:0:1:0: [sde] 4394582016 4096-byte logical blocks: (18.0
> TB/16.4 TiB)
> [335597.969489] sd 18:0:1:0: Attached scsi generic sg4 type 0
> [335597.978210] sd 18:0:1:0: [sde] Write Protect is off
> [335597.978214] sd 18:0:1:0: [sde] Mode Sense: 00 3a 00 00
> [335597.978228] sd 18:0:1:0: [sde] Write cache: enabled, read cache: enabled,
> doesn't support DPO or FUA
> [335598.053869] sd 18:0:2:0: [sdf] Host-managed zoned block device
> [335598.054476] sd 18:0:2:0: Attached scsi generic sg5 type 20
> [335598.066428] sd 18:0:2:0: [sdf] 4882956288 4096-byte logical blocks: (20.0
> TB/18.2 TiB)
> [335598.093762] sd 18:0:2:0: [sdf] Write Protect is off
> [335598.100101] sd 18:0:2:0: [sdf] Mode Sense: 00 3a 00 00
> [335598.100119] sd 18:0:2:0: [sdf] Write cache: enabled, read cache: enabled,
> doesn't support DPO or FUA
> [335598.158832] sd 18:0:1:0: [sde] Attached SCSI disk
> [335598.158870] sd 18:0:0:0: [sdd] Attached SCSI disk
> [335600.015402] sd 18:0:2:0: [sdf] 74508 zones of 65536 logical blocks
> [335600.099235] sd 18:0:2:0: [sdf] Attached SCSI disk
> 
> The driver is uselessly verbose (for some reasons, the dbg messages show up),
> but no errors.


Here's my log on my arm64 machine:

root@(none)$ echo 0000:04:00.0 > bind
[83.872341] pm80xx 0000:04:00.0: pm80xx: driver version 0.1.40
[83.878359] :: pm8001_pci_alloc  530:Setting link rate to default value
[84.932057] scsi host0: pm80xx
[87.288239] sas: target proto 0x0 at 500e004aaaaaaa1f:0x10 not handled
[87.294793] sas: ex 500e004aaaaaaa1f phy16 failed to discover
[87.360228] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
[89.734180] ata4.00: ATA-8: SAMSUNG HM320JI, 2SS00_01, max UDMA7
[89.740185] ata4.00: 625142448 sectors, multi 0: LBA48 NCQ (depth 32)
[93.237158] ata4.00: configured for UDMA/133
[93.241435] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
[93.262941] scsi 0:0:0:0: Direct-Access  SEAGATE  ST2000NM0045  N004 PQ: 
0 ANSI: 6
[93.275710] sd 0:0:0:0: [sda] 3907029168 512-byte logical blocks: (2.00 
TB/1.82 TiB)
[93.284159] sd 0:0:0:0: [sda] Write Protect is off
[93.284423] scsi 0:0:1:0: Direct-Access  SEAGATE  ST2000NM0045  N004 PQ: 
0 ANSI: 6
[93.298317] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, 
supports DPO and FUA
[93.300128] scsi 0:0:2:0: Direct-Access  ATASAMSUNG HM320JI  0_01 PQ: 0 
ANSI: 5
[93.301689] sd 0:0:1:0: [sdb] 3907029168 512-byte logical blocks: (2.00 
TB/1.82 TiB)
[93.302366] sd 0:0:1:0: [sdb] Write Protect is off
[93.303636] sd 0:0:1:0: [sdb] Write cache: enabled, read cache: enabled, 
supports DPO and FUA
[93.311335]  sdb: sdb1
[93.315129] sd 0:0:2:0: [sdc] 625142448 512-byte logical blocks: (320 
GB/298 GiB)
[93.315678]  sda: sda1
[93.315898] scsi 0:0:3:0: Direct-Access  SEAGATE  ST1000NM0023  0006 PQ: 
0 ANSI: 6
[93.321138] sd 0:0:0:0: [sda] Attached SCSI disk
[93.328114] sd 0:0:1:0: [sdb] Attached SCSI disk
[93.335947] sd 0:0:2:0: [sdc] Write Protect is off
[93.344840] scsi 0:0:4:0: EnclosureHUAWEIExpander 12Gx16  128  PQ: 0 
ANSI: 6
[93.345274] sd 0:0:3:0: [sdd] 1953525168 512-byte logical blocks: (1.00 
TB/932 GiB)
[93.345634] sd 0:0:3:0: [sdd] Write Protect is off
[93.346268] sd 0:0:3:0: [sdd] Write cache: enabled, read cache: enabled, 
supports DPO and FUA
[93.351997]  sdd: sdd1 sdd2
[93.355067] sd 0:0:3:0: [sdd] Attached SCSI disk
[93.356196] sd 0:0:2:0: [sdc] Write cache: enabled, read cache: enabled, 
doesn't support DPO or FUA
[93.401162]  sdc: sdc1 sdc2 sdc3
[93.406853] sd 0:0:2:0: [sdc] Attached SCSI disk
[93.425508] sas: ex 500e004aaaaaaa1f phy08 change count has changed
[93.435927] sas: target proto 0x0 at 500e004aaaaaaa1f:0x11 not handled
[93.442480] sas: ex 500e004aaaaaaa1f phy17 failed to discover
[93.448737] sas: target proto 0x0 at 500e004aaaaaaa1f:0x12 not handled
[93.455288] sas: ex 500e004aaaaaaa1f phy18 failed to discover
[93.461527] sas: target proto 0x0 at 500e004aaaaaaa1f:0x13 not handled
[93.468081] sas: ex 500e004aaaaaaa1f phy19 failed to discover
[93.479734] sas: ex 500e004aaaaaaa1f phys DID NOT change
[93.500240] sas: ex 500e004aaaaaaa1f phys DID NOT change
[93.510489] sas: ex 500e004aaaaaaa1f phys DID NOT change
[93.520714] sas: ex 500e004aaaaaaa1f phys DID NOT change
[93.531000] sas: ex 500e004aaaaaaa1f phys DID NOT change
[93.541351] sas: ex 500e004aaaaaaa1f phys DID NOT change
[93.551721] sas: ex 500e004aaaaaaa1f phys DID NOT change
[93.562066] sas: ex 500e004aaaaaaa1f phys DID NOT change
root@(none)$
root@(none)$
root@(none)$

And then it's the mounting a partition or examining it which hangs quite 
reliably:

root@(none)$ mount /dev/sda1 mnt
[  104.232655] EXT4-fs (sda1): recovery complete
[  104.237021] EXT4-fs (sda1): mounted filesystem with ordered data 
mode. Opts: (null). Quota mode: none.
root@(none)$ ls mnt
[  136.842085] sas: Enter sas_scsi_recover_host busy: 203 failed: 203
[  136.848286] sas: sas_scsi_find_task: aborting task 0x(____ptrval____)
[  157.280267] pm80xx1:: pm8001_exec_internal_tmf_task  756:TMF 
task[1]timeout.
[  171.283579] pm80xx1:: mpi_ssp_completion  1935:sas IO status 0x1
[  171.283584] pm80xx1:: pm8001_abort_task  1327:rc= -5
[  171.289579] pm80xx1:: mpi_ssp_completion  1946:SAS Address of IO 
Failure Drive:5000c500a7b95a49
[  171.289583] pm80xx1:: mpi_ssp_completion  2184:task 
0x(____ptrval____) done with io_status 0x1 resp 0x0 stat 0x8d but 
aborted by upper layer!
[  171.315904] sas: sas_eh_handle_sas_errors: task 0x(____ptrval____) is 
done
[  171.322769] sas: sas_scsi_find_task: aborting task 0x(____ptrval____)

Earlier today it was the mount command which was hanging. From debugging 
that, I found that the very first SSP command when mounting is sent the 
HW successfully but no completion interrupt ever occurs there - I really 
don't know why. Other SSP commands complete successfully before this and 
after (TMFs in the error handling), including ones which have sgls.

sda is a SAS drive, but I think SATA has the same issue - I was just 
looking at sda.

One thing I noticed in the driver is that it uses mb() in between 
writing to the DMA memory and initiating the HW - I don't think mb is 
strong enough. However I don't think that is my issue - it wouldn't fail 
reliably if it was.

Thanks,
John

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-09 23:09                 ` John Garry
@ 2021-12-09 23:55                   ` Damien Le Moal
  2021-12-10 10:35                     ` John Garry
  0 siblings, 1 reply; 17+ messages in thread
From: Damien Le Moal @ 2021-12-09 23:55 UTC (permalink / raw)
  To: John Garry, Ajish.Koshy
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, Niklas.Cassel, Vasanthalakshmi.Tharmarajan

On 2021/12/10 8:09, John Garry wrote:
> On 09/12/2021 22:38, Damien Le Moal wrote:
>>> It seems that SSP and STP commands are not completing for some reason,
>>> from the "busy: 222" line.
>> I have this HBA:
>>
>> c1:00.0 Serial Attached SCSI controller: ATTO Technology, Inc. ExpressSAS 12Gb/s
>> SAS/SATA HBA (rev 06)
>> 	Subsystem: ATTO Technology, Inc. ExpressSAS H120F
>>
>> Which uses the pm80xx driver and I do not see such error. E.g.:
>>
>> [335568.262395] pm80xx 0000:c1:00.0: pm80xx: driver version 0.1.40
>> [335568.268931] :: pm8001_pci_alloc  529:Setting link rate to default value
>> [335569.489392] scsi host18: pm80xx
>> [335570.801031] sas: phy-18:4 added to port-18:0, phy_mask:0x10 (50010860002f5644)
>> [335570.801225] sas: DOING DISCOVERY on port 0, pid:58830
>> [335570.801310] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
>> [335570.807638] sas: ata22: end_device-18:0: dev error handler
>> [335570.964864] ata22.00: ATA-11: WDC  WUH721818ALN604, PCGNW232, max UDMA/133
>> [335579.062526] ata22.00: 4394582016 sectors, multi 0: LBA48 NCQ (depth 32)
>> [335579.070487] ata22.00: Features: NCQ-sndrcv NCQ-prio
>> [335579.307260] ata22.00: configured for UDMA/133
>> [335579.313018] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
>> [335579.323512] scsi 18:0:0:0: Direct-Access     ATA      WDC  WUH721818AL W232
>> PQ: 0 ANSI: 5
>> [335579.333243] sas: DONE DISCOVERY on port 0, pid:58830, result:0
>> [335579.333338] sas: phy-18:5 added to port-18:1, phy_mask:0x20 (50010860002f5645)
>> [335579.333453] sas: DOING DISCOVERY on port 1, pid:58830
>> [335579.333596] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
>> [335579.341596] sas: ata23: end_device-18:1: dev error handler
>> [335579.341640] sas: ata22: end_device-18:0: dev error handler
>> [335579.500374] ata23.00: ATA-11: WDC  WUH721818ALN604, PCGNWTW2, max UDMA/133
>> [335588.427115] ata23.00: 4394582016 sectors, multi 0: LBA48 NCQ (depth 32)
>> [335588.435158] ata23.00: Features: NCQ-sndrcv NCQ-prio
>> [335588.513212] ata23.00: configured for UDMA/133
>> [335588.519027] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
>> [335588.537683] scsi 18:0:1:0: Direct-Access     ATA      WDC  WUH721818AL WTW2
>> PQ: 0 ANSI: 5
>> [335588.565288] sas: DONE DISCOVERY on port 1, pid:58830, result:0
>> [335588.565543] sas: phy-18:7 added to port-18:2, phy_mask:0x80 (50010860002f5647)
>> [335588.566917] sas: DOING DISCOVERY on port 2, pid:58830
>> [335588.567515] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
>> [335588.574948] sas: ata22: end_device-18:0: dev error handler
>> [335588.574971] sas: ata23: end_device-18:1: dev error handler
>> [335588.574979] sas: ata24: end_device-18:2: dev error handler
>> [335588.732190] ata24.00: ATA-11: WDC  WSH722020ALN604, PCGMW803, max UDMA/133
>> [335597.778187] ata24.00: 4882956288 sectors, multi 0: LBA48 NCQ (depth 32)
>> [335597.788081] ata24.00: Features: NCQ-sndrcv NCQ-prio
>> [335597.850404] ata24.00: configured for UDMA/133
>> [335597.856225] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
>> [335597.866680] scsi 18:0:2:0: Direct-Access-ZBC ATA      WDC  WSH722020AL W803
>> PQ: 0 ANSI: 7
>> [335597.876485] sas: DONE DISCOVERY on port 2, pid:58830, result:0
>> [335597.879720] sd 18:0:0:0: [sdd] 4394582016 4096-byte logical blocks: (18.0
>> TB/16.4 TiB)
>> [335597.881483] sd 18:0:0:0: Attached scsi generic sg3 type 0
>> [335597.888827] sd 18:0:0:0: [sdd] Write Protect is off
>> [335597.888830] sd 18:0:0:0: [sdd] Mode Sense: 00 3a 00 00
>> [335597.888839] sd 18:0:0:0: [sdd] Write cache: enabled, read cache: enabled,
>> doesn't support DPO or FUA
>> [335597.968683] sd 18:0:1:0: [sde] 4394582016 4096-byte logical blocks: (18.0
>> TB/16.4 TiB)
>> [335597.969489] sd 18:0:1:0: Attached scsi generic sg4 type 0
>> [335597.978210] sd 18:0:1:0: [sde] Write Protect is off
>> [335597.978214] sd 18:0:1:0: [sde] Mode Sense: 00 3a 00 00
>> [335597.978228] sd 18:0:1:0: [sde] Write cache: enabled, read cache: enabled,
>> doesn't support DPO or FUA
>> [335598.053869] sd 18:0:2:0: [sdf] Host-managed zoned block device
>> [335598.054476] sd 18:0:2:0: Attached scsi generic sg5 type 20
>> [335598.066428] sd 18:0:2:0: [sdf] 4882956288 4096-byte logical blocks: (20.0
>> TB/18.2 TiB)
>> [335598.093762] sd 18:0:2:0: [sdf] Write Protect is off
>> [335598.100101] sd 18:0:2:0: [sdf] Mode Sense: 00 3a 00 00
>> [335598.100119] sd 18:0:2:0: [sdf] Write cache: enabled, read cache: enabled,
>> doesn't support DPO or FUA
>> [335598.158832] sd 18:0:1:0: [sde] Attached SCSI disk
>> [335598.158870] sd 18:0:0:0: [sdd] Attached SCSI disk
>> [335600.015402] sd 18:0:2:0: [sdf] 74508 zones of 65536 logical blocks
>> [335600.099235] sd 18:0:2:0: [sdf] Attached SCSI disk
>>
>> The driver is uselessly verbose (for some reasons, the dbg messages show up),
>> but no errors.
> 
> 
> Here's my log on my arm64 machine:
> 
> root@(none)$ echo 0000:04:00.0 > bind
> [83.872341] pm80xx 0000:04:00.0: pm80xx: driver version 0.1.40
> [83.878359] :: pm8001_pci_alloc  530:Setting link rate to default value
> [84.932057] scsi host0: pm80xx
> [87.288239] sas: target proto 0x0 at 500e004aaaaaaa1f:0x10 not handled
> [87.294793] sas: ex 500e004aaaaaaa1f phy16 failed to discover
> [87.360228] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
> [89.734180] ata4.00: ATA-8: SAMSUNG HM320JI, 2SS00_01, max UDMA7
> [89.740185] ata4.00: 625142448 sectors, multi 0: LBA48 NCQ (depth 32)
> [93.237158] ata4.00: configured for UDMA/133
> [93.241435] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
> [93.262941] scsi 0:0:0:0: Direct-Access  SEAGATE  ST2000NM0045  N004 PQ: 
> 0 ANSI: 6
> [93.275710] sd 0:0:0:0: [sda] 3907029168 512-byte logical blocks: (2.00 
> TB/1.82 TiB)
> [93.284159] sd 0:0:0:0: [sda] Write Protect is off
> [93.284423] scsi 0:0:1:0: Direct-Access  SEAGATE  ST2000NM0045  N004 PQ: 
> 0 ANSI: 6
> [93.298317] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, 
> supports DPO and FUA
> [93.300128] scsi 0:0:2:0: Direct-Access  ATASAMSUNG HM320JI  0_01 PQ: 0 
> ANSI: 5
> [93.301689] sd 0:0:1:0: [sdb] 3907029168 512-byte logical blocks: (2.00 
> TB/1.82 TiB)
> [93.302366] sd 0:0:1:0: [sdb] Write Protect is off
> [93.303636] sd 0:0:1:0: [sdb] Write cache: enabled, read cache: enabled, 
> supports DPO and FUA
> [93.311335]  sdb: sdb1
> [93.315129] sd 0:0:2:0: [sdc] 625142448 512-byte logical blocks: (320 
> GB/298 GiB)
> [93.315678]  sda: sda1
> [93.315898] scsi 0:0:3:0: Direct-Access  SEAGATE  ST1000NM0023  0006 PQ: 
> 0 ANSI: 6
> [93.321138] sd 0:0:0:0: [sda] Attached SCSI disk
> [93.328114] sd 0:0:1:0: [sdb] Attached SCSI disk
> [93.335947] sd 0:0:2:0: [sdc] Write Protect is off
> [93.344840] scsi 0:0:4:0: EnclosureHUAWEIExpander 12Gx16  128  PQ: 0 
> ANSI: 6
> [93.345274] sd 0:0:3:0: [sdd] 1953525168 512-byte logical blocks: (1.00 
> TB/932 GiB)
> [93.345634] sd 0:0:3:0: [sdd] Write Protect is off
> [93.346268] sd 0:0:3:0: [sdd] Write cache: enabled, read cache: enabled, 
> supports DPO and FUA
> [93.351997]  sdd: sdd1 sdd2
> [93.355067] sd 0:0:3:0: [sdd] Attached SCSI disk
> [93.356196] sd 0:0:2:0: [sdc] Write cache: enabled, read cache: enabled, 
> doesn't support DPO or FUA
> [93.401162]  sdc: sdc1 sdc2 sdc3
> [93.406853] sd 0:0:2:0: [sdc] Attached SCSI disk
> [93.425508] sas: ex 500e004aaaaaaa1f phy08 change count has changed
> [93.435927] sas: target proto 0x0 at 500e004aaaaaaa1f:0x11 not handled
> [93.442480] sas: ex 500e004aaaaaaa1f phy17 failed to discover
> [93.448737] sas: target proto 0x0 at 500e004aaaaaaa1f:0x12 not handled
> [93.455288] sas: ex 500e004aaaaaaa1f phy18 failed to discover
> [93.461527] sas: target proto 0x0 at 500e004aaaaaaa1f:0x13 not handled
> [93.468081] sas: ex 500e004aaaaaaa1f phy19 failed to discover
> [93.479734] sas: ex 500e004aaaaaaa1f phys DID NOT change
> [93.500240] sas: ex 500e004aaaaaaa1f phys DID NOT change
> [93.510489] sas: ex 500e004aaaaaaa1f phys DID NOT change
> [93.520714] sas: ex 500e004aaaaaaa1f phys DID NOT change
> [93.531000] sas: ex 500e004aaaaaaa1f phys DID NOT change
> [93.541351] sas: ex 500e004aaaaaaa1f phys DID NOT change
> [93.551721] sas: ex 500e004aaaaaaa1f phys DID NOT change
> [93.562066] sas: ex 500e004aaaaaaa1f phys DID NOT change
> root@(none)$
> root@(none)$
> root@(none)$
> 
> And then it's the mounting a partition or examining it which hangs quite 
> reliably:
> 
> root@(none)$ mount /dev/sda1 mnt
> [  104.232655] EXT4-fs (sda1): recovery complete
> [  104.237021] EXT4-fs (sda1): mounted filesystem with ordered data 
> mode. Opts: (null). Quota mode: none.
> root@(none)$ ls mnt
> [  136.842085] sas: Enter sas_scsi_recover_host busy: 203 failed: 203
> [  136.848286] sas: sas_scsi_find_task: aborting task 0x(____ptrval____)
> [  157.280267] pm80xx1:: pm8001_exec_internal_tmf_task  756:TMF 
> task[1]timeout.
> [  171.283579] pm80xx1:: mpi_ssp_completion  1935:sas IO status 0x1
> [  171.283584] pm80xx1:: pm8001_abort_task  1327:rc= -5
> [  171.289579] pm80xx1:: mpi_ssp_completion  1946:SAS Address of IO 
> Failure Drive:5000c500a7b95a49
> [  171.289583] pm80xx1:: mpi_ssp_completion  2184:task 
> 0x(____ptrval____) done with io_status 0x1 resp 0x0 stat 0x8d but 
> aborted by upper layer!
> [  171.315904] sas: sas_eh_handle_sas_errors: task 0x(____ptrval____) is 
> done
> [  171.322769] sas: sas_scsi_find_task: aborting task 0x(____ptrval____)
> 
> Earlier today it was the mount command which was hanging. From debugging 
> that, I found that the very first SSP command when mounting is sent the 
> HW successfully but no completion interrupt ever occurs there - I really 
> don't know why. Other SSP commands complete successfully before this and 
> after (TMFs in the error handling), including ones which have sgls.
> 
> sda is a SAS drive, but I think SATA has the same issue - I was just 
> looking at sda.
> 
> One thing I noticed in the driver is that it uses mb() in between 
> writing to the DMA memory and initiating the HW - I don't think mb is 
> strong enough. However I don't think that is my issue - it wouldn't fail 
> reliably if it was.

Weird. I do not have an arm host to test. Could it be that the card FW is
crashing ? Can you recover from the above ? Or do you have to power cycle for
the HDD to be accessible again ?

Other possibility may be an IRQ controller issue with the platform ?

> 
> Thanks,
> John


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-09 13:46 ` John Garry
@ 2021-12-10 10:23   ` Ajish.Koshy
  2021-12-10 10:44     ` John Garry
  0 siblings, 1 reply; 17+ messages in thread
From: Ajish.Koshy @ 2021-12-10 10:23 UTC (permalink / raw)
  To: john.garry, jinpu.wang, jejb, martin.petersen
  Cc: Viswas.G, linux-scsi, linux-kernel, Vasanthalakshmi.Tharmarajan

Hi John,

In my humble opinion what we observed very earlier was with respect smp_request()/response() crash and this patch resolved it. Given that the issue was not only specific to ARM, issue was observed on x86 too with IOMMU enabled. Device discovery went fine post application of this patch on x86. 

What we are observing right now on error handling/timeouts for commands on drives will be altogether different issue that needs separate debugging on ARM platform with separate patch since this is a very initial execution of pm80xx driver on ARM platform. 

This patch is acceptable. Let me know your further views. 

Thanks,
Ajish

On 26/11/2021 15:35, John Garry wrote:
>       /*
> @@ -4280,8 +4283,9 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
>               pm8001_ha->smp_exp_mode = SMP_INDIRECT;
>
>
> -     tmp_addr = cpu_to_le64((u64)sg_dma_address(&task->smp_task.smp_req));
> -     preq_dma_addr = (char *)phys_to_virt(tmp_addr);
> +     smp_req = &task->smp_task.smp_req;
> +     to = kmap(sg_page(smp_req));

This should be a kmap_atomic() as well, as I see the following for when
CONFIG_DEBUG_ATOMIC_SLEEP is enabled:

[   27.222116]  dump_backtrace+0x0/0x2b4
[   27.225774]  show_stack+0x1c/0x30
[   27.229084]  dump_stack_lvl+0x68/0x84
[   27.232741]  dump_stack+0x20/0x3c
[   27.236049]  __might_resched+0x1d4/0x240
[   27.239967]  __might_sleep+0x70/0xd0
[   27.243536]  pm80xx_chip_smp_req+0x2c4/0x56c
[   27.247802]  pm8001_task_exec.constprop.0+0x718/0x770
[   27.252848]  pm8001_queue_command+0x1c/0x2c
[   27.257026]  smp_execute_task_sg+0x1e8/0x370
[   27.261289]  sas_ex_phy_discover+0x29c/0x31c
[   27.265553]  smp_ata_check_ready+0x74/0x190
[   27.269729]  ata_wait_ready+0xd0/0x224
[   27.273474]  ata_wait_after_reset+0x78/0xac
[   27.277652]  sas_ata_hard_reset+0xf0/0x18c
[   27.281742]  ata_do_reset.constprop.0+0x80/0x9c
[   27.286266]  ata_eh_reset+0xba4/0x1170
[   27.290008]  ata_eh_recover+0x4b0/0x1b40
[   27.293924]  ata_do_eh+0x8c/0x110
[   27.297232]  ata_std_error_handler+0x80/0xb0
[   27.301495]  ata_scsi_port_error_handler+0x3d4/0x9d0
[   27.306454]  async_sas_ata_eh+0x70/0xf8
[   27.310285]  async_run_entry_fn+0x5c/0x1e0
[   27.314375]  process_one_work+0x378/0x630
[   27.318379]  worker_thread+0xa8/0x6bc
[   27.322033]  kthread+0x214/0x230
[   27.325256]  ret_from_fork+0x10/0x20
[   27.328825] pm80xx0:: pm80xx_chip_smp_req  4292:SMP REQUEST INDIRECT MODE

But I don't think that this is the problem which causes error handling
to kick in later, as discussed in this thread.

> +     payload = to + smp_req->offset;
>
>       /* INDIRECT MODE command settings. Use DMA */
>       if (pm8001_ha->smp_exp_mode == SMP_INDIRECT) {
> @@ -4289,7 +4293,7 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
>               /* for SPCv indirect mode. Place the top 4 bytes of
>                * SMP Request header here. */
>               for (i = 0; i < 4; i++)
> -                     smp_cmd.smp_req16[i] = *(preq_dma_addr + i);
> +                     smp_cmd.smp_req16[i] = *(payload + i);


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-09 23:55                   ` Damien Le Moal
@ 2021-12-10 10:35                     ` John Garry
  2021-12-11  0:19                       ` Damien Le Moal
  0 siblings, 1 reply; 17+ messages in thread
From: John Garry @ 2021-12-10 10:35 UTC (permalink / raw)
  To: Damien Le Moal, Ajish.Koshy
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, Niklas.Cassel, Vasanthalakshmi.Tharmarajan

On 09/12/2021 23:55, Damien Le Moal wrote:
>> Earlier today it was the mount command which was hanging. From debugging
>> that, I found that the very first SSP command when mounting is sent the
>> HW successfully but no completion interrupt ever occurs there - I really
>> don't know why. Other SSP commands complete successfully before this and
>> after (TMFs in the error handling), including ones which have sgls.
>>
>> sda is a SAS drive, but I think SATA has the same issue - I was just
>> looking at sda.
>>
>> One thing I noticed in the driver is that it uses mb() in between
>> writing to the DMA memory and initiating the HW - I don't think mb is
>> strong enough. However I don't think that is my issue - it wouldn't fail
>> reliably if it was.
> Weird. 

Yeah, quite strange.

I will also note that these earlier logs are also red flags, which I 
have not investigated:

[87.288239] sas: target proto 0x0 at 500e004aaaaaaa1f:0x10 not handled
[87.294793] sas: ex 500e004aaaaaaa1f phy16 failed to discover

> I do not have an arm host to test. Could it be that the card FW is
> crashing ?

But the later TMFs seem to succeed, so I doubt it's crashing. I did 
wonder if it's going into some low-power/idle mode and just not 
responding, but not sure on that.

> Can you recover from the above ? 

It never really recovers and is always caught up in some error handling.

> Or do you have to power cycle for
> the HDD to be accessible again ?

Power cycle is necessary to recover as we can't remove the driver when 
it is in error handling

> 
> Other possibility may be an IRQ controller issue with the platform ?
> 

Highly unlikely. I did wonder if the interrupts are properly allocated 
and requested, and they look ok from /proc/interrupts

I also tried limiting the CPUs we bring up to a single CPU and so that 
we only use a single MSIx and a single HW queue, and now get this crash:

[7.775168] loop: module loaded
[7.783226] pm80xx 0000:04:00.0: Adding to iommu group 0
[7.795787] pm80xx 0000:04:00.0: pm80xx: driver version 0.1.40
[7.806789] pm80xx 0000:04:00.0: enabling device (0140 -> 0142)
[7.818910] :: pm8001_pci_alloc  530:Setting link rate to default value
[8.866618] scsi host0: pm80xx
[8.879056] pm80xx0:: process_oq  4169:Firmware Fatal error! Regval:0xc0f
[8.885842] pm80xx0:: print_scratchpad_registers 
4130:MSGU_SCRATCH_PAD_0: 0x40002000
[8.893661] pm80xx0:: print_scratchpad_registers 
4132:MSGU_SCRATCH_PAD_1:0xc0f
[8.900958] pm80xx0:: print_scratchpad_registers 
4134:MSGU_SCRATCH_PAD_2: 0x0
[8.908169] pm80xx0:: print_scratchpad_registers 
4136:MSGU_SCRATCH_PAD_3: 0x30000000
[8.915986] pm80xx0:: print_scratchpad_registers 
4138:MSGU_HOST_SCRATCH_PAD_0: 0x0
[8.923630] pm80xx0:: print_scratchpad_registers 
4140:MSGU_HOST_SCRATCH_PAD_1: 0x0
[8.931274] pm80xx0:: print_scratchpad_registers 
4142:MSGU_HOST_SCRATCH_PAD_2: 0x0
[8.938917] pm80xx0:: print_scratchpad_registers 
4144:MSGU_HOST_SCRATCH_PAD_3: 0x0
[8.946561] pm80xx0:: print_scratchpad_registers 
4146:MSGU_HOST_SCRATCH_PAD_4: 0x0
[8.954205] pm80xx0:: print_scratchpad_registers 
4148:MSGU_HOST_SCRATCH_PAD_5: 0x0
[8.961849] pm80xx0:: print_scratchpad_registers 
4150:MSGU_RSVD_SCRATCH_PAD_0: 0x0
[8.969493] pm80xx0:: print_scratchpad_registers 
4152:MSGU_RSVD_SCRATCH_PAD_1: 0x0
[8.977143] Unable to handle kernel NULL pointer dereference at virtual 
address 0000000000000018
[8.994782] Mem abort info:
[8.997565]   ESR = 0x96000004
[9.006782]   EC = 0x25: DABT (current EL), IL = 32 bits
[9.018781]   SET = 0, FnV = 0
[9.021824]   EA = 0, S1PTW = 0
[9.030797]   FSC = 0x04: level 0 translation fault
[9.038794] Data abort info:
[9.041662]   ISV = 0, ISS = 0x00000004
[9.050781]   CM = 0, WnR = 0
[9.053737] [0000000000000018] user address but active_mm is swapper
[9.070782] Internal error: Oops: 96000004 [#1] PREEMPT SMP
[9.076343] Modules linked in:
[9.079387] CPU: 0 PID: 20 Comm: kworker/0:2 Not tainted 
5.16.0-rc4-00002-ge23d68774178-dirty #328
[9.088333] Hardware name: Huawei D06 /D06, BIOS Hisilicon D06 UEFI RC0 - 
V1.16.01 03/15/2019
[9.096844] Workqueue: pm80xx pm8001_work_fn
[9.101108] pstate: 00400009 (nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[9.108057] pc : pm8001_work_fn+0x298/0x690
[9.112229] lr : process_one_work+0x1d0/0x354
[9.116574] sp : ffff800012d23d50
[9.119876] x29: ffff800012d23d50 x28: 0000000000000000 x27: 0000000000000000
[9.127000] x26: ffff8000117e8bc0 x25: ffff8000113aaeb0 x24: ffff00209d4b0000
[9.134124] x23: ffff0020ad23b280 x22: ffff00209d4b8000 x21: 0000000000000001
[9.141249] x20: 0000000000000000 x19: 0000000000000038 x18: 0000000000000000
[9.148373] x17: 4441505f48435441 x16: 5243535f44565352 x15: 000000052ff6b548
[9.155496] x14: 0000000000000018 x13: 0000000000000018 x12: 0000000000000000
[9.162620] x11: 0000000000000014 x10: 00000000000009a0 x9 : ffff002086ef6074
[9.169743] x8 : fefefefefefefeff x7 : 0000000000000018 x6 : ffff002086ef6074
[9.176867] x5 : 0000787830386d70 x4 : ffff00209d5e0000 x3 : 0000000000000000
[9.183990] x2 : ffff00209d5e0038 x1 : ffff800010a20120 x0 : 0000000000000051
[9.191114] Call trace:
[9.193547]  pm8001_work_fn+0x298/0x690
[9.197372]  process_one_work+0x1d0/0x354
[9.201369]  worker_thread+0x13c/0x470
[9.205105]  kthread+0x17c/0x190
[9.208321]  ret_from_fork+0x10/0x20
[9.211886] Code: 17fffff1 310006bf 54fffde0 f9400c54 (f9400e80)
[9.217968] ---[ end trace de649a9be2843866 ]---
[9.339812] pm80xx0:: process_oq  4169:Firmware Fatal error! Regval:0xc0f
[9.346602] pm80xx0:: print_scratchpad_registers 
4130:MSGU_SCRATCH_PAD_0: 0x40002000
[9.354420] pm80xx0:: print_scratchpad_registers 
4132:MSGU_SCRATCH_PAD_1:0xc0f
[9.361717] pm80xx0:: print_scratchpad_registers 
4134:MSGU_SCRATCH_PAD_2: 0x0
[9.368927] pm80xx0:: print_scratchpad_registers 
4136:MSGU_SCRATCH_PAD_3: 0x30000000
[9.376744] pm80xx0:: print_scratchpad_registers 
4138:MSGU_HOST_SCRATCH_PAD_0: 0x0
[9.384388] pm80xx0:: print_scratchpad_registers 
4140:MSGU_HOST_SCRATCH_PAD_1: 0x0
[9.392032] pm80xx0:: print_scratchpad_registers 
4142:MSGU_HOST_SCRATCH_PAD_2: 0x0
[9.399676] pm80xx0:: print_scratchpad_registers 
4144:MSGU_HOST_SCRATCH_PAD_3: 0x0
[9.407319] pm80xx0:: print_scratchpad_registers 
4146:MSGU_HOST_SCRATCH_PAD_4: 0x0
[9.414963] pm80xx0:: print_scratchpad_registers 
4148:MSGU_HOST_SCRATCH_PAD_5: 0x0
[9.422607] pm80xx0:: print_scratchpad_registers 
4150:MSGU_RSVD_SCRATCH_PAD_0: 0x0
[9.430251] pm80xx0:: print_scratchpad_registers 
4152:MSGU_RSVD_SCRATCH_PAD_1: 0x0
[   10.028906] Freeing initrd memory: 413456K

...

Thanks,
John

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-10 10:23   ` Ajish.Koshy
@ 2021-12-10 10:44     ` John Garry
  0 siblings, 0 replies; 17+ messages in thread
From: John Garry @ 2021-12-10 10:44 UTC (permalink / raw)
  To: Ajish.Koshy, jinpu.wang, jejb, martin.petersen
  Cc: Viswas.G, linux-scsi, linux-kernel, Vasanthalakshmi.Tharmarajan

On 10/12/2021 10:23, Ajish.Koshy@microchip.com wrote:
> Hi John,

Please config your editor to wrap at ~72 characters and don't top-post.

> 
> In my humble opinion what we observed very earlier was with respect smp_request()/response() crash and this patch resolved it. Given that the issue was not only specific to ARM, issue was observed on x86 too with IOMMU enabled. Device discovery went fine post application of this patch on x86.
> 
> What we are observing right now on error handling/timeouts for commands on drives will be altogether different issue that needs separate debugging on ARM platform with separate patch since this is a very initial execution of pm80xx driver on ARM platform.
> 
> This patch is acceptable. Let me know your further views.
> 

As I mentioned earlier about this patch, a v2 is needed as we need 
kmap_atomic() in both cases. But I would rather resolve all issues 
before getting that merged - it has been broken this way for a long time.

So I mentioned a new issue in my response to Damien - maxcpus=1 on the 
command line crashes on boot. I can imagine that x86 also has that issue.

As for the timeouts, why would the FW not respond in the way I 
described? I would always expect a completion, even for erroneous 
commands. Maybe it is an interrupt issue in the driver. Is there some 
diagnostics I can check - there seems to be a lot of "stuff" in the 
sysfs scsi_host folder.

Thanks,
John

> Thanks,
> Ajish
> 
> On 26/11/2021 15:35, John Garry wrote:
>>        /*
>> @@ -4280,8 +4283,9 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha,
>>                pm8001_ha->smp_exp_mode = SMP_INDIRECT;
>>
>>
>> -     tmp_addr = cpu_to_le64((u64)sg_dma_address(&task->smp_task.smp_req));
>> -     preq_dma_addr = (char *)phys_to_virt(tmp_addr);
>> +     smp_req = &task->smp_task.smp_req;
>> +     to = kmap(sg_page(smp_req));
> This should be a kmap_atomic() as well, as I see the following for when
> CONFIG_DEBUG_ATOMIC_SLEEP is enabled:


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t
  2021-12-10 10:35                     ` John Garry
@ 2021-12-11  0:19                       ` Damien Le Moal
  0 siblings, 0 replies; 17+ messages in thread
From: Damien Le Moal @ 2021-12-11  0:19 UTC (permalink / raw)
  To: John Garry, Ajish.Koshy
  Cc: jinpu.wang, jejb, martin.petersen, Viswas.G, linux-scsi,
	linux-kernel, Niklas.Cassel, Vasanthalakshmi.Tharmarajan

On 2021/12/10 19:35, John Garry wrote:
> On 09/12/2021 23:55, Damien Le Moal wrote:
>>> Earlier today it was the mount command which was hanging. From debugging
>>> that, I found that the very first SSP command when mounting is sent the
>>> HW successfully but no completion interrupt ever occurs there - I really
>>> don't know why. Other SSP commands complete successfully before this and
>>> after (TMFs in the error handling), including ones which have sgls.
>>>
>>> sda is a SAS drive, but I think SATA has the same issue - I was just
>>> looking at sda.
>>>
>>> One thing I noticed in the driver is that it uses mb() in between
>>> writing to the DMA memory and initiating the HW - I don't think mb is
>>> strong enough. However I don't think that is my issue - it wouldn't fail
>>> reliably if it was.
>> Weird. 
> 
> Yeah, quite strange.
> 
> I will also note that these earlier logs are also red flags, which I 
> have not investigated:
> 
> [87.288239] sas: target proto 0x0 at 500e004aaaaaaa1f:0x10 not handled
> [87.294793] sas: ex 500e004aaaaaaa1f phy16 failed to discover
> 
>> I do not have an arm host to test. Could it be that the card FW is
>> crashing ?
> 
> But the later TMFs seem to succeed, so I doubt it's crashing. I did 
> wonder if it's going into some low-power/idle mode and just not 
> responding, but not sure on that.
> 
>> Can you recover from the above ? 
> 
> It never really recovers and is always caught up in some error handling.
> 
>> Or do you have to power cycle for
>> the HDD to be accessible again ?
> 
> Power cycle is necessary to recover as we can't remove the driver when 
> it is in error handling
> 
>>
>> Other possibility may be an IRQ controller issue with the platform ?
>>
> 
> Highly unlikely. I did wonder if the interrupts are properly allocated 
> and requested, and they look ok from /proc/interrupts
> 
> I also tried limiting the CPUs we bring up to a single CPU and so that 
> we only use a single MSIx and a single HW queue, and now get this crash:
> 
> [7.775168] loop: module loaded
> [7.783226] pm80xx 0000:04:00.0: Adding to iommu group 0
> [7.795787] pm80xx 0000:04:00.0: pm80xx: driver version 0.1.40
> [7.806789] pm80xx 0000:04:00.0: enabling device (0140 -> 0142)
> [7.818910] :: pm8001_pci_alloc  530:Setting link rate to default value
> [8.866618] scsi host0: pm80xx
> [8.879056] pm80xx0:: process_oq  4169:Firmware Fatal error! Regval:0xc0f

I have an old-ish Adaptec card that throws something similar at me if I connect
a host-managed SMR drive to the HBA. After that message, nothing works at all so
in my case I suspect that the FW gets into a really bad state/crashes.

> [8.885842] pm80xx0:: print_scratchpad_registers 
> 4130:MSGU_SCRATCH_PAD_0: 0x40002000
> [8.893661] pm80xx0:: print_scratchpad_registers 
> 4132:MSGU_SCRATCH_PAD_1:0xc0f
> [8.900958] pm80xx0:: print_scratchpad_registers 
> 4134:MSGU_SCRATCH_PAD_2: 0x0
> [8.908169] pm80xx0:: print_scratchpad_registers 
> 4136:MSGU_SCRATCH_PAD_3: 0x30000000
> [8.915986] pm80xx0:: print_scratchpad_registers 
> 4138:MSGU_HOST_SCRATCH_PAD_0: 0x0
> [8.923630] pm80xx0:: print_scratchpad_registers 
> 4140:MSGU_HOST_SCRATCH_PAD_1: 0x0
> [8.931274] pm80xx0:: print_scratchpad_registers 
> 4142:MSGU_HOST_SCRATCH_PAD_2: 0x0
> [8.938917] pm80xx0:: print_scratchpad_registers 
> 4144:MSGU_HOST_SCRATCH_PAD_3: 0x0
> [8.946561] pm80xx0:: print_scratchpad_registers 
> 4146:MSGU_HOST_SCRATCH_PAD_4: 0x0
> [8.954205] pm80xx0:: print_scratchpad_registers 
> 4148:MSGU_HOST_SCRATCH_PAD_5: 0x0
> [8.961849] pm80xx0:: print_scratchpad_registers 
> 4150:MSGU_RSVD_SCRATCH_PAD_0: 0x0
> [8.969493] pm80xx0:: print_scratchpad_registers 
> 4152:MSGU_RSVD_SCRATCH_PAD_1: 0x0
> [8.977143] Unable to handle kernel NULL pointer dereference at virtual 
> address 0000000000000018

Is this with or without your phys_to_virt() dma/iommu fix patch ?
I do remember seeing lots of crashes/hangs with that old-ish Adaptec HBA on x86
host. I can try again to see if the errors are similar. I definitely hit the
iommu problem with that card as I had to boot with iommu=off to, well, be able
to boot :)

Next time I go to the lab, I will plug this card again to test with your iommu
patch.

> [8.994782] Mem abort info:
> [8.997565]   ESR = 0x96000004
> [9.006782]   EC = 0x25: DABT (current EL), IL = 32 bits
> [9.018781]   SET = 0, FnV = 0
> [9.021824]   EA = 0, S1PTW = 0
> [9.030797]   FSC = 0x04: level 0 translation fault
> [9.038794] Data abort info:
> [9.041662]   ISV = 0, ISS = 0x00000004
> [9.050781]   CM = 0, WnR = 0
> [9.053737] [0000000000000018] user address but active_mm is swapper
> [9.070782] Internal error: Oops: 96000004 [#1] PREEMPT SMP
> [9.076343] Modules linked in:
> [9.079387] CPU: 0 PID: 20 Comm: kworker/0:2 Not tainted 
> 5.16.0-rc4-00002-ge23d68774178-dirty #328
> [9.088333] Hardware name: Huawei D06 /D06, BIOS Hisilicon D06 UEFI RC0 - 
> V1.16.01 03/15/2019
> [9.096844] Workqueue: pm80xx pm8001_work_fn
> [9.101108] pstate: 00400009 (nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [9.108057] pc : pm8001_work_fn+0x298/0x690
> [9.112229] lr : process_one_work+0x1d0/0x354
> [9.116574] sp : ffff800012d23d50
> [9.119876] x29: ffff800012d23d50 x28: 0000000000000000 x27: 0000000000000000
> [9.127000] x26: ffff8000117e8bc0 x25: ffff8000113aaeb0 x24: ffff00209d4b0000
> [9.134124] x23: ffff0020ad23b280 x22: ffff00209d4b8000 x21: 0000000000000001
> [9.141249] x20: 0000000000000000 x19: 0000000000000038 x18: 0000000000000000
> [9.148373] x17: 4441505f48435441 x16: 5243535f44565352 x15: 000000052ff6b548
> [9.155496] x14: 0000000000000018 x13: 0000000000000018 x12: 0000000000000000
> [9.162620] x11: 0000000000000014 x10: 00000000000009a0 x9 : ffff002086ef6074
> [9.169743] x8 : fefefefefefefeff x7 : 0000000000000018 x6 : ffff002086ef6074
> [9.176867] x5 : 0000787830386d70 x4 : ffff00209d5e0000 x3 : 0000000000000000
> [9.183990] x2 : ffff00209d5e0038 x1 : ffff800010a20120 x0 : 0000000000000051
> [9.191114] Call trace:
> [9.193547]  pm8001_work_fn+0x298/0x690
> [9.197372]  process_one_work+0x1d0/0x354
> [9.201369]  worker_thread+0x13c/0x470
> [9.205105]  kthread+0x17c/0x190
> [9.208321]  ret_from_fork+0x10/0x20
> [9.211886] Code: 17fffff1 310006bf 54fffde0 f9400c54 (f9400e80)
> [9.217968] ---[ end trace de649a9be2843866 ]---
> [9.339812] pm80xx0:: process_oq  4169:Firmware Fatal error! Regval:0xc0f
> [9.346602] pm80xx0:: print_scratchpad_registers 
> 4130:MSGU_SCRATCH_PAD_0: 0x40002000
> [9.354420] pm80xx0:: print_scratchpad_registers 
> 4132:MSGU_SCRATCH_PAD_1:0xc0f
> [9.361717] pm80xx0:: print_scratchpad_registers 
> 4134:MSGU_SCRATCH_PAD_2: 0x0
> [9.368927] pm80xx0:: print_scratchpad_registers 
> 4136:MSGU_SCRATCH_PAD_3: 0x30000000
> [9.376744] pm80xx0:: print_scratchpad_registers 
> 4138:MSGU_HOST_SCRATCH_PAD_0: 0x0
> [9.384388] pm80xx0:: print_scratchpad_registers 
> 4140:MSGU_HOST_SCRATCH_PAD_1: 0x0
> [9.392032] pm80xx0:: print_scratchpad_registers 
> 4142:MSGU_HOST_SCRATCH_PAD_2: 0x0
> [9.399676] pm80xx0:: print_scratchpad_registers 
> 4144:MSGU_HOST_SCRATCH_PAD_3: 0x0
> [9.407319] pm80xx0:: print_scratchpad_registers 
> 4146:MSGU_HOST_SCRATCH_PAD_4: 0x0
> [9.414963] pm80xx0:: print_scratchpad_registers 
> 4148:MSGU_HOST_SCRATCH_PAD_5: 0x0
> [9.422607] pm80xx0:: print_scratchpad_registers 
> 4150:MSGU_RSVD_SCRATCH_PAD_0: 0x0
> [9.430251] pm80xx0:: print_scratchpad_registers 
> 4152:MSGU_RSVD_SCRATCH_PAD_1: 0x0
> [   10.028906] Freeing initrd memory: 413456K
> 
> ...
> 
> Thanks,
> John


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2021-12-11  0:19 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-26 15:35 [PATCH] scsi: pm8001: Fix phys_to_virt() usage on dma_addr_t John Garry
2021-11-29 10:46 ` Ajish.Koshy
2021-12-06  9:42   ` John Garry
2021-12-06 13:24     ` Niklas Cassel
2021-12-07 10:36       ` Ajish.Koshy
2021-12-07 13:54         ` John Garry
2021-12-09 12:04           ` Ajish.Koshy
2021-12-09 12:19             ` John Garry
2021-12-09 22:38               ` Damien Le Moal
2021-12-09 23:09                 ` John Garry
2021-12-09 23:55                   ` Damien Le Moal
2021-12-10 10:35                     ` John Garry
2021-12-11  0:19                       ` Damien Le Moal
2021-12-09 22:26             ` Damien Le Moal
2021-12-09 13:46 ` John Garry
2021-12-10 10:23   ` Ajish.Koshy
2021-12-10 10:44     ` John Garry

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).