linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] Switchtec Fixes and Improvements
@ 2021-09-24 11:08 kelvin.cao
  2021-09-24 11:08 ` [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access kelvin.cao
                   ` (7 more replies)
  0 siblings, 8 replies; 32+ messages in thread
From: kelvin.cao @ 2021-09-24 11:08 UTC (permalink / raw)
  To: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel
  Cc: kelvin.cao, kelvincao

From: Kelvin Cao <kelvin.cao@microchip.com>

Hi,

Please find a bunch of patches for the switchtec driver collected over the
last few months.

The first 2 patches fix two minor bugs. Patch 3 updates the method of
getting management VEP instance ID based on a new firmware change. Patch
4 replaces ENOTSUPP with EOPNOTSUPP to follow the preference of kernel.
And the last patch adds check of event support to align with new firmware
implementation.

This patchset is based on v5.15-rc2.

Thanks,
Kelvin

Kelvin Cao (4):
  PCI/switchtec: Error out MRPC execution when no GAS access
  PCI/switchtec: Fix a MRPC error status handling issue
  PCI/switchtec: Update the way of getting management VEP instance ID
  PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP

Logan Gunthorpe (1):
  PCI/switchtec: Add check of event support

 drivers/pci/switch/switchtec.c | 87 +++++++++++++++++++++++++++-------
 include/linux/switchtec.h      |  1 +
 2 files changed, 71 insertions(+), 17 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
@ 2021-09-24 11:08 ` kelvin.cao
  2021-10-01 20:18   ` Bjorn Helgaas
  2021-09-24 11:08 ` [PATCH 2/5] PCI/switchtec: Fix a MRPC error status handling issue kelvin.cao
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 32+ messages in thread
From: kelvin.cao @ 2021-09-24 11:08 UTC (permalink / raw)
  To: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel
  Cc: kelvin.cao, kelvincao

From: Kelvin Cao <kelvin.cao@microchip.com>

After a firmware hard reset, MRPC command executions, which are based
on the PCI BAR (which Microchip refers to as GAS) read/write, will hang
indefinitely. This is because after a reset, the host will fail all GAS
reads (get all 1s), in which case the driver won't get a valid MRPC
status.

Add a read check to GAS access when a MRPC command execution doesn't
response timely, error out if the check fails.

Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
---
 drivers/pci/switch/switchtec.c | 59 ++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 0b301f8be9ed..092653487021 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -45,6 +45,7 @@ enum mrpc_state {
 	MRPC_QUEUED,
 	MRPC_RUNNING,
 	MRPC_DONE,
+	MRPC_IO_ERROR,
 };
 
 struct switchtec_user {
@@ -66,6 +67,13 @@ struct switchtec_user {
 	int event_cnt;
 };
 
+static int check_access(struct switchtec_dev *stdev)
+{
+	u32 device = ioread32(&stdev->mmio_sys_info->device_id);
+
+	return stdev->pdev->device == device;
+}
+
 static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
 {
 	struct switchtec_user *stuser;
@@ -113,6 +121,7 @@ static void stuser_set_state(struct switchtec_user *stuser,
 		[MRPC_QUEUED] = "QUEUED",
 		[MRPC_RUNNING] = "RUNNING",
 		[MRPC_DONE] = "DONE",
+		[MRPC_IO_ERROR] = "IO_ERROR",
 	};
 
 	stuser->state = state;
@@ -184,6 +193,21 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser)
 	return 0;
 }
 
+static void mrpc_cleanup_cmd(struct switchtec_dev *stdev)
+{
+	/* requires the mrpc_mutex to already be held when called */
+	struct switchtec_user *stuser = list_entry(stdev->mrpc_queue.next,
+						   struct switchtec_user, list);
+
+	stuser->cmd_done = true;
+	wake_up_interruptible(&stuser->cmd_comp);
+	list_del_init(&stuser->list);
+	stuser_put(stuser);
+	stdev->mrpc_busy = 0;
+
+	mrpc_cmd_submit(stdev);
+}
+
 static void mrpc_complete_cmd(struct switchtec_dev *stdev)
 {
 	/* requires the mrpc_mutex to already be held when called */
@@ -223,13 +247,7 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
 		memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data,
 			      stuser->read_len);
 out:
-	stuser->cmd_done = true;
-	wake_up_interruptible(&stuser->cmd_comp);
-	list_del_init(&stuser->list);
-	stuser_put(stuser);
-	stdev->mrpc_busy = 0;
-
-	mrpc_cmd_submit(stdev);
+	mrpc_cleanup_cmd(stdev);
 }
 
 static void mrpc_event_work(struct work_struct *work)
@@ -246,6 +264,23 @@ static void mrpc_event_work(struct work_struct *work)
 	mutex_unlock(&stdev->mrpc_mutex);
 }
 
+static void mrpc_error_complete_cmd(struct switchtec_dev *stdev)
+{
+	/* requires the mrpc_mutex to already be held when called */
+
+	struct switchtec_user *stuser;
+
+	if (list_empty(&stdev->mrpc_queue))
+		return;
+
+	stuser = list_entry(stdev->mrpc_queue.next,
+			    struct switchtec_user, list);
+
+	stuser_set_state(stuser, MRPC_IO_ERROR);
+
+	mrpc_cleanup_cmd(stdev);
+}
+
 static void mrpc_timeout_work(struct work_struct *work)
 {
 	struct switchtec_dev *stdev;
@@ -257,6 +292,11 @@ static void mrpc_timeout_work(struct work_struct *work)
 
 	mutex_lock(&stdev->mrpc_mutex);
 
+	if (!check_access(stdev)) {
+		mrpc_error_complete_cmd(stdev);
+		goto out;
+	}
+
 	if (stdev->dma_mrpc)
 		status = stdev->dma_mrpc->status;
 	else
@@ -544,6 +584,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
 	if (rc)
 		return rc;
 
+	if (stuser->state == MRPC_IO_ERROR) {
+		mutex_unlock(&stdev->mrpc_mutex);
+		return -EIO;
+	}
+
 	if (stuser->state != MRPC_DONE) {
 		mutex_unlock(&stdev->mrpc_mutex);
 		return -EBADE;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 2/5] PCI/switchtec: Fix a MRPC error status handling issue
  2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
  2021-09-24 11:08 ` [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access kelvin.cao
@ 2021-09-24 11:08 ` kelvin.cao
  2021-09-24 11:08 ` [PATCH 3/5] PCI/switchtec: Update the way of getting management VEP instance ID kelvin.cao
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 32+ messages in thread
From: kelvin.cao @ 2021-09-24 11:08 UTC (permalink / raw)
  To: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel
  Cc: kelvin.cao, kelvincao

From: Kelvin Cao <kelvin.cao@microchip.com>

If an error is encountered when executing a MRPC command, the firmware
will set the status register to SWITCHTEC_MRPC_STATUS_ERROR and return
the error code in the return value register.

Add handling of SWITCHTEC_MRPC_STATUS_ERROR on status register when
completing a MRPC command.

Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
---
 drivers/pci/switch/switchtec.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 092653487021..76f14ed15445 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -230,7 +230,8 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
 	stuser_set_state(stuser, MRPC_DONE);
 	stuser->return_code = 0;
 
-	if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE)
+	if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE &&
+	    stuser->status != SWITCHTEC_MRPC_STATUS_ERROR)
 		goto out;
 
 	if (stdev->dma_mrpc)
@@ -614,7 +615,8 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
 out:
 	mutex_unlock(&stdev->mrpc_mutex);
 
-	if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE)
+	if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE ||
+	    stuser->status == SWITCHTEC_MRPC_STATUS_ERROR)
 		return size;
 	else if (stuser->status == SWITCHTEC_MRPC_STATUS_INTERRUPTED)
 		return -ENXIO;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 3/5] PCI/switchtec: Update the way of getting management VEP instance ID
  2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
  2021-09-24 11:08 ` [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access kelvin.cao
  2021-09-24 11:08 ` [PATCH 2/5] PCI/switchtec: Fix a MRPC error status handling issue kelvin.cao
@ 2021-09-24 11:08 ` kelvin.cao
  2021-09-24 11:08 ` [PATCH 4/5] PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP kelvin.cao
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 32+ messages in thread
From: kelvin.cao @ 2021-09-24 11:08 UTC (permalink / raw)
  To: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel
  Cc: kelvin.cao, kelvincao

From: Kelvin Cao <kelvin.cao@microchip.com>

Gen4 firmware adds DMA VEP and NVMe VEP support in VEP (virtual EP)
instance ID register in addtion to management EP, update the way of
getting management VEP instance ID.

Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
---
 drivers/pci/switch/switchtec.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 76f14ed15445..b76094e2c885 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -1125,7 +1125,7 @@ static int ioctl_pff_to_port(struct switchtec_dev *stdev,
 			break;
 		}
 
-		reg = ioread32(&pcfg->vep_pff_inst_id);
+		reg = ioread32(&pcfg->vep_pff_inst_id) & 0xFF;
 		if (reg == p.pff) {
 			p.port = SWITCHTEC_IOCTL_PFF_VEP;
 			break;
@@ -1171,7 +1171,7 @@ static int ioctl_port_to_pff(struct switchtec_dev *stdev,
 		p.pff = ioread32(&pcfg->usp_pff_inst_id);
 		break;
 	case SWITCHTEC_IOCTL_PFF_VEP:
-		p.pff = ioread32(&pcfg->vep_pff_inst_id);
+		p.pff = ioread32(&pcfg->vep_pff_inst_id) & 0xFF;
 		break;
 	default:
 		if (p.port > ARRAY_SIZE(pcfg->dsp_pff_inst_id))
@@ -1545,7 +1545,7 @@ static void init_pff(struct switchtec_dev *stdev)
 	if (reg < stdev->pff_csr_count)
 		stdev->pff_local[reg] = 1;
 
-	reg = ioread32(&pcfg->vep_pff_inst_id);
+	reg = ioread32(&pcfg->vep_pff_inst_id) & 0xFF;
 	if (reg < stdev->pff_csr_count)
 		stdev->pff_local[reg] = 1;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 4/5] PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP
  2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
                   ` (2 preceding siblings ...)
  2021-09-24 11:08 ` [PATCH 3/5] PCI/switchtec: Update the way of getting management VEP instance ID kelvin.cao
@ 2021-09-24 11:08 ` kelvin.cao
  2021-09-24 11:08 ` [PATCH 5/5] PCI/switchtec: Add check of event support kelvin.cao
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 32+ messages in thread
From: kelvin.cao @ 2021-09-24 11:08 UTC (permalink / raw)
  To: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel
  Cc: kelvin.cao, kelvincao

From: Kelvin Cao <kelvin.cao@microchip.com>

ENOTSUPP is not a SUSV4 error code, and the following checkpatch.pl
warning will be given for new patches which still use ENOTSUPP.

    WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP

See the link below for the discussion.

Link: https://lore.kernel.org/netdev/20200511165319.2251678-1-kuba@kernel.org/

Replace ENOTSUPP with EOPNOTSUPP to align with future patches which will
be using EOPNOTSUPP.

Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
---
 drivers/pci/switch/switchtec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index b76094e2c885..20cec2367084 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -368,7 +368,7 @@ static ssize_t field ## _show(struct device *dev, \
 		return io_string_show(buf, &si->gen4.field, \
 				      sizeof(si->gen4.field)); \
 	else \
-		return -ENOTSUPP; \
+		return -EOPNOTSUPP; \
 } \
 \
 static DEVICE_ATTR_RO(field)
@@ -660,7 +660,7 @@ static int ioctl_flash_info(struct switchtec_dev *stdev,
 		info.flash_length = ioread32(&fi->gen4.flash_length);
 		info.num_partitions = SWITCHTEC_NUM_PARTITIONS_GEN4;
 	} else {
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 	}
 
 	if (copy_to_user(uinfo, &info, sizeof(info)))
@@ -868,7 +868,7 @@ static int ioctl_flash_part_info(struct switchtec_dev *stdev,
 		if (ret)
 			return ret;
 	} else {
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 	}
 
 	if (copy_to_user(uinfo, &info, sizeof(info)))
@@ -1603,7 +1603,7 @@ static int switchtec_init_pci(struct switchtec_dev *stdev,
 	else if (stdev->gen == SWITCHTEC_GEN4)
 		part_id = &stdev->mmio_sys_info->gen4.partition_id;
 	else
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	stdev->partition = ioread8(part_id);
 	stdev->partition_count = ioread8(&stdev->mmio_ntb->partition_count);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 5/5] PCI/switchtec: Add check of event support
  2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
                   ` (3 preceding siblings ...)
  2021-09-24 11:08 ` [PATCH 4/5] PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP kelvin.cao
@ 2021-09-24 11:08 ` kelvin.cao
  2021-09-24 15:53 ` [PATCH 0/5] Switchtec Fixes and Improvements Logan Gunthorpe
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 32+ messages in thread
From: kelvin.cao @ 2021-09-24 11:08 UTC (permalink / raw)
  To: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel
  Cc: kelvin.cao, kelvincao

From: Logan Gunthorpe <logang@deltatee.com>

Not all events are supported by every gen/variant of the Switchtec
firmware. To solve this, since Gen4, a new bit in each event header
is introduced to indicate if an event is supported by the firmware.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
---
 drivers/pci/switch/switchtec.c | 8 +++++++-
 include/linux/switchtec.h      | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 20cec2367084..739e063a6b85 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -1016,6 +1016,9 @@ static int event_ctl(struct switchtec_dev *stdev,
 		return PTR_ERR(reg);
 
 	hdr = ioread32(reg);
+	if (hdr & SWITCHTEC_EVENT_NOT_SUPP)
+		return -EOPNOTSUPP;
+
 	for (i = 0; i < ARRAY_SIZE(ctl->data); i++)
 		ctl->data[i] = ioread32(&reg[i + 1]);
 
@@ -1088,7 +1091,7 @@ static int ioctl_event_ctl(struct switchtec_dev *stdev,
 		for (ctl.index = 0; ctl.index < nr_idxs; ctl.index++) {
 			ctl.flags = event_flags;
 			ret = event_ctl(stdev, &ctl);
-			if (ret < 0)
+			if (ret < 0 && ret != -EOPNOTSUPP)
 				return ret;
 		}
 	} else {
@@ -1395,6 +1398,9 @@ static int mask_event(struct switchtec_dev *stdev, int eid, int idx)
 	hdr_reg = event_regs[eid].map_reg(stdev, off, idx);
 	hdr = ioread32(hdr_reg);
 
+	if (hdr & SWITCHTEC_EVENT_NOT_SUPP)
+		return 0;
+
 	if (!(hdr & SWITCHTEC_EVENT_OCCURRED && hdr & SWITCHTEC_EVENT_EN_IRQ))
 		return 0;
 
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 082f1d51957a..be24056ac00f 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -19,6 +19,7 @@
 #define SWITCHTEC_EVENT_EN_CLI   BIT(2)
 #define SWITCHTEC_EVENT_EN_IRQ   BIT(3)
 #define SWITCHTEC_EVENT_FATAL    BIT(4)
+#define SWITCHTEC_EVENT_NOT_SUPP BIT(31)
 
 #define SWITCHTEC_DMA_MRPC_EN	BIT(0)
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 0/5] Switchtec Fixes and Improvements
  2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
                   ` (4 preceding siblings ...)
  2021-09-24 11:08 ` [PATCH 5/5] PCI/switchtec: Add check of event support kelvin.cao
@ 2021-09-24 15:53 ` Logan Gunthorpe
  2021-09-25  5:27   ` Kelvin.Cao
  2021-09-27 16:39 ` Bjorn Helgaas
  2021-10-08 17:05 ` Bjorn Helgaas
  7 siblings, 1 reply; 32+ messages in thread
From: Logan Gunthorpe @ 2021-09-24 15:53 UTC (permalink / raw)
  To: kelvin.cao, kurt.schwemmer, bhelgaas, linux-pci, linux-kernel; +Cc: kelvincao



On 2021-09-24 5:08 a.m., kelvin.cao@microchip.com wrote:
> From: Kelvin Cao <kelvin.cao@microchip.com>
> 
> Hi,
> 
> Please find a bunch of patches for the switchtec driver collected over the
> last few months.
> 
> The first 2 patches fix two minor bugs. Patch 3 updates the method of
> getting management VEP instance ID based on a new firmware change. Patch
> 4 replaces ENOTSUPP with EOPNOTSUPP to follow the preference of kernel.
> And the last patch adds check of event support to align with new firmware
> implementation.
> 
> This patchset is based on v5.15-rc2.
> 
> Thanks,
> Kelvin
> 
> Kelvin Cao (4):
>   PCI/switchtec: Error out MRPC execution when no GAS access
>   PCI/switchtec: Fix a MRPC error status handling issue
>   PCI/switchtec: Update the way of getting management VEP instance ID
>   PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP
> 
> Logan Gunthorpe (1):
>   PCI/switchtec: Add check of event support
> 
>  drivers/pci/switch/switchtec.c | 87 +++++++++++++++++++++++++++-------
>  include/linux/switchtec.h      |  1 +
>  2 files changed, 71 insertions(+), 17 deletions(-)

Thanks Kelvin! This all looks good to me.

For the entire series (except the last patch which I wrote):

Reviewed-by: Logan Gunthorpe <logang@deltatee.com>

Logan

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 0/5] Switchtec Fixes and Improvements
  2021-09-24 15:53 ` [PATCH 0/5] Switchtec Fixes and Improvements Logan Gunthorpe
@ 2021-09-25  5:27   ` Kelvin.Cao
  0 siblings, 0 replies; 32+ messages in thread
From: Kelvin.Cao @ 2021-09-25  5:27 UTC (permalink / raw)
  To: kurt.schwemmer, bhelgaas, logang, linux-kernel, linux-pci; +Cc: kelvincao

On Fri, 2021-09-24 at 09:53 -0600, Logan Gunthorpe wrote:
> On 2021-09-24 5:08 a.m., kelvin.cao@microchip.com wrote:
> > From: Kelvin Cao <kelvin.cao@microchip.com>
> > 
> > Hi,
> > 
> > Please find a bunch of patches for the switchtec driver collected
> > over the
> > last few months.
> > 
> > The first 2 patches fix two minor bugs. Patch 3 updates the method
> > of
> > getting management VEP instance ID based on a new firmware change.
> > Patch
> > 4 replaces ENOTSUPP with EOPNOTSUPP to follow the preference of
> > kernel.
> > And the last patch adds check of event support to align with new
> > firmware
> > implementation.
> > 
> > This patchset is based on v5.15-rc2.
> > 
> > Thanks,
> > Kelvin
> > 
> > Kelvin Cao (4):
> >   PCI/switchtec: Error out MRPC execution when no GAS access
> >   PCI/switchtec: Fix a MRPC error status handling issue
> >   PCI/switchtec: Update the way of getting management VEP instance
> > ID
> >   PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP
> > 
> > Logan Gunthorpe (1):
> >   PCI/switchtec: Add check of event support
> > 
> >  drivers/pci/switch/switchtec.c | 87 +++++++++++++++++++++++++++---
> > ----
> >  include/linux/switchtec.h      |  1 +
> >  2 files changed, 71 insertions(+), 17 deletions(-)
> 
> Thanks Kelvin! This all looks good to me.
> 
> For the entire series (except the last patch which I wrote):
> 
> Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
> 
> Logan

Thanks for the review!

Kelvin

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 0/5] Switchtec Fixes and Improvements
  2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
                   ` (5 preceding siblings ...)
  2021-09-24 15:53 ` [PATCH 0/5] Switchtec Fixes and Improvements Logan Gunthorpe
@ 2021-09-27 16:39 ` Bjorn Helgaas
  2021-09-27 18:25   ` Kelvin.Cao
  2021-10-08 17:05 ` Bjorn Helgaas
  7 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2021-09-27 16:39 UTC (permalink / raw)
  To: kelvin.cao
  Cc: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel, kelvincao

On Fri, Sep 24, 2021 at 11:08:37AM +0000, kelvin.cao@microchip.com wrote:
> From: Kelvin Cao <kelvin.cao@microchip.com>
> 
> Hi,
> 
> Please find a bunch of patches for the switchtec driver collected over the
> last few months.
> 
> The first 2 patches fix two minor bugs. Patch 3 updates the method of
> getting management VEP instance ID based on a new firmware change. Patch
> 4 replaces ENOTSUPP with EOPNOTSUPP to follow the preference of kernel.
> And the last patch adds check of event support to align with new firmware
> implementation.
> 
> This patchset is based on v5.15-rc2.
> 
> Thanks,
> Kelvin
> 
> Kelvin Cao (4):
>   PCI/switchtec: Error out MRPC execution when no GAS access
>   PCI/switchtec: Fix a MRPC error status handling issue
>   PCI/switchtec: Update the way of getting management VEP instance ID
>   PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP
> 
> Logan Gunthorpe (1):
>   PCI/switchtec: Add check of event support
> 
>  drivers/pci/switch/switchtec.c | 87 +++++++++++++++++++++++++++-------
>  include/linux/switchtec.h      |  1 +
>  2 files changed, 71 insertions(+), 17 deletions(-)

Applied with Logan's reviewed-by to pci/switchtec for v5.16, thanks!

I tweaked the comment spacing in "PCI/switchtec: Error out MRPC
execution when no GAS access" so it matches the existing similar
comments.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 0/5] Switchtec Fixes and Improvements
  2021-09-27 16:39 ` Bjorn Helgaas
@ 2021-09-27 18:25   ` Kelvin.Cao
  0 siblings, 0 replies; 32+ messages in thread
From: Kelvin.Cao @ 2021-09-27 18:25 UTC (permalink / raw)
  To: helgaas
  Cc: kurt.schwemmer, bhelgaas, kelvincao, logang, linux-kernel, linux-pci

On Mon, 2021-09-27 at 11:39 -0500, Bjorn Helgaas wrote:
> On Fri, Sep 24, 2021 at 11:08:37AM +0000, kelvin.cao@microchip.com
> wrote:
> > From: Kelvin Cao <kelvin.cao@microchip.com>
> > 
> > Hi,
> > 
> > Please find a bunch of patches for the switchtec driver collected
> > over the
> > last few months.
> > 
> > The first 2 patches fix two minor bugs. Patch 3 updates the method
> > of
> > getting management VEP instance ID based on a new firmware change.
> > Patch
> > 4 replaces ENOTSUPP with EOPNOTSUPP to follow the preference of
> > kernel.
> > And the last patch adds check of event support to align with new
> > firmware
> > implementation.
> > 
> > This patchset is based on v5.15-rc2.
> > 
> > Thanks,
> > Kelvin
> > 
> > Kelvin Cao (4):
> >   PCI/switchtec: Error out MRPC execution when no GAS access
> >   PCI/switchtec: Fix a MRPC error status handling issue
> >   PCI/switchtec: Update the way of getting management VEP instance
> > ID
> >   PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP
> > 
> > Logan Gunthorpe (1):
> >   PCI/switchtec: Add check of event support
> > 
> >  drivers/pci/switch/switchtec.c | 87 +++++++++++++++++++++++++++---
> > ----
> >  include/linux/switchtec.h      |  1 +
> >  2 files changed, 71 insertions(+), 17 deletions(-)
> 
> Applied with Logan's reviewed-by to pci/switchtec for v5.16, thanks!
> 
> I tweaked the comment spacing in "PCI/switchtec: Error out MRPC
> execution when no GAS access" so it matches the existing similar
> comments.

Thanks for your time!

I guess the tweak was made to "PCI/switchtec: Replace ENOTSUPP with
EOPNOTSUPP" which also confirmed that "Link:" tag should not be put to
a discussion reference URL line in the comment.

Kelvin

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-09-24 11:08 ` [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access kelvin.cao
@ 2021-10-01 20:18   ` Bjorn Helgaas
  2021-10-01 20:29     ` Logan Gunthorpe
  2021-10-01 22:58     ` Kelvin.Cao
  0 siblings, 2 replies; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-01 20:18 UTC (permalink / raw)
  To: kelvin.cao
  Cc: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel, kelvincao

On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com wrote:
> From: Kelvin Cao <kelvin.cao@microchip.com>
> 
> After a firmware hard reset, MRPC command executions, which are based
> on the PCI BAR (which Microchip refers to as GAS) read/write, will hang
> indefinitely. This is because after a reset, the host will fail all GAS
> reads (get all 1s), in which case the driver won't get a valid MRPC
> status.

Trying to write a merge commit log for this, but having a hard time
summarizing it.  It sounds like it covers both Switchtec-specific
(firmware and MRPC commands) and generic PCIe behavior (MMIO read
failures).

This has something to do with a firmware hard reset.  What is that?
Is that like a firmware reboot?  A device reset, e.g., FLR or
secondary bus reset, that causes a firmware reboot?  A device reset
initiated by firmware?

Anyway, apparently when that happens, MMIO reads to the switch fail
(timeout or error completion on PCIe) for a while.  If a device reset
is involved, that much is standard PCIe behavior.  And the driver sees
~0 data from those failed reads.  That's not part of the PCIe spec,
but is typical root complex behavior.

But you said the MRPC commands hang indefinitely.  Presumably MMIO
reads would start succeeding eventually when the device becomes ready,
so I don't know how that translates to "indefinitely."

Weird to refer to a PCI BAR as "GAS".  Maybe expanding the acronym
would help it make sense.

What does "host" refer to?  I guess it's the switch (the
switchtec_dev), since you say it fails MMIO reads?

Naming comment below.

> Add a read check to GAS access when a MRPC command execution doesn't
> response timely, error out if the check fails.
> 
> Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
> ---
>  drivers/pci/switch/switchtec.c | 59 ++++++++++++++++++++++++++++++----
>  1 file changed, 52 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> index 0b301f8be9ed..092653487021 100644
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -45,6 +45,7 @@ enum mrpc_state {
>  	MRPC_QUEUED,
>  	MRPC_RUNNING,
>  	MRPC_DONE,
> +	MRPC_IO_ERROR,
>  };
>  
>  struct switchtec_user {
> @@ -66,6 +67,13 @@ struct switchtec_user {
>  	int event_cnt;
>  };
>  
> +static int check_access(struct switchtec_dev *stdev)
> +{
> +	u32 device = ioread32(&stdev->mmio_sys_info->device_id);
> +
> +	return stdev->pdev->device == device;
> +}

Didn't notice this before, but the "check_access()" name is not very
helpful because it doesn't give a clue about what the return value
means.  Does 0 mean no error?  Does 1 mean no error?  From reading the
implementation, I can see that 0 is actually the error case, but I
can't tell from the name.

>  static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
>  {
>  	struct switchtec_user *stuser;
> @@ -113,6 +121,7 @@ static void stuser_set_state(struct switchtec_user *stuser,
>  		[MRPC_QUEUED] = "QUEUED",
>  		[MRPC_RUNNING] = "RUNNING",
>  		[MRPC_DONE] = "DONE",
> +		[MRPC_IO_ERROR] = "IO_ERROR",
>  	};
>  
>  	stuser->state = state;
> @@ -184,6 +193,21 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser)
>  	return 0;
>  }
>  
> +static void mrpc_cleanup_cmd(struct switchtec_dev *stdev)
> +{
> +	/* requires the mrpc_mutex to already be held when called */
> +	struct switchtec_user *stuser = list_entry(stdev->mrpc_queue.next,
> +						   struct switchtec_user, list);
> +
> +	stuser->cmd_done = true;
> +	wake_up_interruptible(&stuser->cmd_comp);
> +	list_del_init(&stuser->list);
> +	stuser_put(stuser);
> +	stdev->mrpc_busy = 0;
> +
> +	mrpc_cmd_submit(stdev);
> +}
> +
>  static void mrpc_complete_cmd(struct switchtec_dev *stdev)
>  {
>  	/* requires the mrpc_mutex to already be held when called */
> @@ -223,13 +247,7 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
>  		memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data,
>  			      stuser->read_len);
>  out:
> -	stuser->cmd_done = true;
> -	wake_up_interruptible(&stuser->cmd_comp);
> -	list_del_init(&stuser->list);
> -	stuser_put(stuser);
> -	stdev->mrpc_busy = 0;
> -
> -	mrpc_cmd_submit(stdev);
> +	mrpc_cleanup_cmd(stdev);
>  }
>  
>  static void mrpc_event_work(struct work_struct *work)
> @@ -246,6 +264,23 @@ static void mrpc_event_work(struct work_struct *work)
>  	mutex_unlock(&stdev->mrpc_mutex);
>  }
>  
> +static void mrpc_error_complete_cmd(struct switchtec_dev *stdev)
> +{
> +	/* requires the mrpc_mutex to already be held when called */
> +
> +	struct switchtec_user *stuser;
> +
> +	if (list_empty(&stdev->mrpc_queue))
> +		return;
> +
> +	stuser = list_entry(stdev->mrpc_queue.next,
> +			    struct switchtec_user, list);
> +
> +	stuser_set_state(stuser, MRPC_IO_ERROR);
> +
> +	mrpc_cleanup_cmd(stdev);
> +}
> +
>  static void mrpc_timeout_work(struct work_struct *work)
>  {
>  	struct switchtec_dev *stdev;
> @@ -257,6 +292,11 @@ static void mrpc_timeout_work(struct work_struct *work)
>  
>  	mutex_lock(&stdev->mrpc_mutex);
>  
> +	if (!check_access(stdev)) {
> +		mrpc_error_complete_cmd(stdev);
> +		goto out;
> +	}
> +
>  	if (stdev->dma_mrpc)
>  		status = stdev->dma_mrpc->status;
>  	else
> @@ -544,6 +584,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
>  	if (rc)
>  		return rc;
>  
> +	if (stuser->state == MRPC_IO_ERROR) {
> +		mutex_unlock(&stdev->mrpc_mutex);
> +		return -EIO;
> +	}
> +
>  	if (stuser->state != MRPC_DONE) {
>  		mutex_unlock(&stdev->mrpc_mutex);
>  		return -EBADE;
> -- 
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-01 20:18   ` Bjorn Helgaas
@ 2021-10-01 20:29     ` Logan Gunthorpe
  2021-10-01 23:49       ` Kelvin.Cao
  2021-10-01 22:58     ` Kelvin.Cao
  1 sibling, 1 reply; 32+ messages in thread
From: Logan Gunthorpe @ 2021-10-01 20:29 UTC (permalink / raw)
  To: Bjorn Helgaas, kelvin.cao
  Cc: kurt.schwemmer, bhelgaas, linux-pci, linux-kernel, kelvincao



On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com wrote:
>> From: Kelvin Cao <kelvin.cao@microchip.com>
>>
>> After a firmware hard reset, MRPC command executions, which are based
>> on the PCI BAR (which Microchip refers to as GAS) read/write, will hang
>> indefinitely. This is because after a reset, the host will fail all GAS
>> reads (get all 1s), in which case the driver won't get a valid MRPC
>> status.
> 
> Trying to write a merge commit log for this, but having a hard time
> summarizing it.  It sounds like it covers both Switchtec-specific
> (firmware and MRPC commands) and generic PCIe behavior (MMIO read
> failures).
> 
> This has something to do with a firmware hard reset.  What is that?
> Is that like a firmware reboot?  A device reset, e.g., FLR or
> secondary bus reset, that causes a firmware reboot?  A device reset
> initiated by firmware?
> 
> Anyway, apparently when that happens, MMIO reads to the switch fail
> (timeout or error completion on PCIe) for a while.  If a device reset
> is involved, that much is standard PCIe behavior.  And the driver sees
> ~0 data from those failed reads.  That's not part of the PCIe spec,
> but is typical root complex behavior.
> 
> But you said the MRPC commands hang indefinitely.  Presumably MMIO
> reads would start succeeding eventually when the device becomes ready,
> so I don't know how that translates to "indefinitely."

I suspect Kelvin can expand on this and fix the issue below. But in my
experience, the MMIO will read ~0 forever after a firmware reset, until
the system is rebooted. Presumably on systems that have good hot plug
support they are supposed to recover. Though I've never seen that.

The MMIO read that signals the MRPC status always returns ~0 and the
userspace request will eventually time out.

> Weird to refer to a PCI BAR as "GAS".  Maybe expanding the acronym
> would help it make sense.
GAS is the term used by the firmware developers and is in all their
documentation. It stands for Global Address Space.

> What does "host" refer to?  I guess it's the switch (the
> switchtec_dev), since you say it fails MMIO reads?

Yes, a bit confusing. The firmware is dead or not setup right so MMIO
reads are not succeeding and the root complex is returning ~0 to the
driver on reads.

Logan

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-01 20:18   ` Bjorn Helgaas
  2021-10-01 20:29     ` Logan Gunthorpe
@ 2021-10-01 22:58     ` Kelvin.Cao
  2021-10-01 23:52       ` Logan Gunthorpe
  1 sibling, 1 reply; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-01 22:58 UTC (permalink / raw)
  To: helgaas
  Cc: kurt.schwemmer, bhelgaas, kelvincao, logang, linux-kernel, linux-pci

On Fri, 2021-10-01 at 15:18 -0500, Bjorn Helgaas wrote:
> On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com
> wrote:
> > From: Kelvin Cao <kelvin.cao@microchip.com>
> > 
> > After a firmware hard reset, MRPC command executions, which are
> > based
> > on the PCI BAR (which Microchip refers to as GAS) read/write, will
> > hang
> > indefinitely. This is because after a reset, the host will fail all
> > GAS
> > reads (get all 1s), in which case the driver won't get a valid MRPC
> > status.
> 
> Trying to write a merge commit log for this, but having a hard time
> summarizing it.  It sounds like it covers both Switchtec-specific
> (firmware and MRPC commands) and generic PCIe behavior (MMIO read
> failures).
> 
> This has something to do with a firmware hard reset.  What is that?
> Is that like a firmware reboot?  A device reset, e.g., FLR or
> secondary bus reset, that causes a firmware reboot?  A device reset
> initiated by firmware?
> 
> Anyway, apparently when that happens, MMIO reads to the switch fail
> (timeout or error completion on PCIe) for a while.  If a device reset
> is involved, that much is standard PCIe behavior.  And the driver
> sees
> ~0 data from those failed reads.  That's not part of the PCIe spec,
> but is typical root complex behavior.
> 
> But you said the MRPC commands hang indefinitely.  Presumably MMIO
> reads would start succeeding eventually when the device becomes
> ready,
> so I don't know how that translates to "indefinitely."
> 
> Weird to refer to a PCI BAR as "GAS".  Maybe expanding the acronym
> would help it make sense.
> 
> What does "host" refer to?  I guess it's the switch (the
> switchtec_dev), since you say it fails MMIO reads?
> 
> Naming comment below.
> 
> > Add a read check to GAS access when a MRPC command execution
> > doesn't
> > response timely, error out if the check fails.
> > 
> > Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
> > ---
> >  drivers/pci/switch/switchtec.c | 59
> > ++++++++++++++++++++++++++++++----
> >  1 file changed, 52 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/pci/switch/switchtec.c
> > b/drivers/pci/switch/switchtec.c
> > index 0b301f8be9ed..092653487021 100644
> > --- a/drivers/pci/switch/switchtec.c
> > +++ b/drivers/pci/switch/switchtec.c
> > @@ -45,6 +45,7 @@ enum mrpc_state {
> >       MRPC_QUEUED,
> >       MRPC_RUNNING,
> >       MRPC_DONE,
> > +     MRPC_IO_ERROR,
> >  };
> > 
> >  struct switchtec_user {
> > @@ -66,6 +67,13 @@ struct switchtec_user {
> >       int event_cnt;
> >  };
> > 
> > +static int check_access(struct switchtec_dev *stdev)
> > +{
> > +     u32 device = ioread32(&stdev->mmio_sys_info->device_id);
> > +
> > +     return stdev->pdev->device == device;
> > +}
> 
> Didn't notice this before, but the "check_access()" name is not very
> helpful because it doesn't give a clue about what the return value
> means.  Does 0 mean no error?  Does 1 mean no error?  From reading
> the
> implementation, I can see that 0 is actually the error case, but I
> can't tell from the name.

Yes, will improve the naming, like change it to "has_gas_access()" in
v2 if a v2 patchset is preferred.
> 
> >  static struct switchtec_user *stuser_create(struct switchtec_dev
> > *stdev)
> >  {
> >       struct switchtec_user *stuser;
> > @@ -113,6 +121,7 @@ static void stuser_set_state(struct
> > switchtec_user *stuser,
> >               [MRPC_QUEUED] = "QUEUED",
> >               [MRPC_RUNNING] = "RUNNING",
> >               [MRPC_DONE] = "DONE",
> > +             [MRPC_IO_ERROR] = "IO_ERROR",
> >       };
> > 
> >       stuser->state = state;
> > @@ -184,6 +193,21 @@ static int mrpc_queue_cmd(struct
> > switchtec_user *stuser)
> >       return 0;
> >  }
> > 
> > +static void mrpc_cleanup_cmd(struct switchtec_dev *stdev)
> > +{
> > +     /* requires the mrpc_mutex to already be held when called */
> > +     struct switchtec_user *stuser = list_entry(stdev-
> > >mrpc_queue.next,
> > +                                                struct
> > switchtec_user, list);
> > +
> > +     stuser->cmd_done = true;
> > +     wake_up_interruptible(&stuser->cmd_comp);
> > +     list_del_init(&stuser->list);
> > +     stuser_put(stuser);
> > +     stdev->mrpc_busy = 0;
> > +
> > +     mrpc_cmd_submit(stdev);
> > +}
> > +
> >  static void mrpc_complete_cmd(struct switchtec_dev *stdev)
> >  {
> >       /* requires the mrpc_mutex to already be held when called */
> > @@ -223,13 +247,7 @@ static void mrpc_complete_cmd(struct
> > switchtec_dev *stdev)
> >               memcpy_fromio(stuser->data, &stdev->mmio_mrpc-
> > >output_data,
> >                             stuser->read_len);
> >  out:
> > -     stuser->cmd_done = true;
> > -     wake_up_interruptible(&stuser->cmd_comp);
> > -     list_del_init(&stuser->list);
> > -     stuser_put(stuser);
> > -     stdev->mrpc_busy = 0;
> > -
> > -     mrpc_cmd_submit(stdev);
> > +     mrpc_cleanup_cmd(stdev);
> >  }
> > 
> >  static void mrpc_event_work(struct work_struct *work)
> > @@ -246,6 +264,23 @@ static void mrpc_event_work(struct work_struct
> > *work)
> >       mutex_unlock(&stdev->mrpc_mutex);
> >  }
> > 
> > +static void mrpc_error_complete_cmd(struct switchtec_dev *stdev)
> > +{
> > +     /* requires the mrpc_mutex to already be held when called */
> > +
> > +     struct switchtec_user *stuser;
> > +
> > +     if (list_empty(&stdev->mrpc_queue))
> > +             return;
> > +
> > +     stuser = list_entry(stdev->mrpc_queue.next,
> > +                         struct switchtec_user, list);
> > +
> > +     stuser_set_state(stuser, MRPC_IO_ERROR);
> > +
> > +     mrpc_cleanup_cmd(stdev);
> > +}
> > +
> >  static void mrpc_timeout_work(struct work_struct *work)
> >  {
> >       struct switchtec_dev *stdev;
> > @@ -257,6 +292,11 @@ static void mrpc_timeout_work(struct
> > work_struct *work)
> > 
> >       mutex_lock(&stdev->mrpc_mutex);
> > 
> > +     if (!check_access(stdev)) {
> > +             mrpc_error_complete_cmd(stdev);
> > +             goto out;
> > +     }
> > +
> >       if (stdev->dma_mrpc)
> >               status = stdev->dma_mrpc->status;
> >       else
> > @@ -544,6 +584,11 @@ static ssize_t switchtec_dev_read(struct file
> > *filp, char __user *data,
> >       if (rc)
> >               return rc;
> > 
> > +     if (stuser->state == MRPC_IO_ERROR) {
> > +             mutex_unlock(&stdev->mrpc_mutex);
> > +             return -EIO;
> > +     }
> > +
> >       if (stuser->state != MRPC_DONE) {
> >               mutex_unlock(&stdev->mrpc_mutex);
> >               return -EBADE;
> > --
> > 2.25.1
> > 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-01 20:29     ` Logan Gunthorpe
@ 2021-10-01 23:49       ` Kelvin.Cao
  2021-10-02 15:11         ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-01 23:49 UTC (permalink / raw)
  To: helgaas, logang
  Cc: kurt.schwemmer, bhelgaas, linux-pci, linux-kernel, kelvincao

On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you
> know the content is safe
> 
> On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com
> > wrote:
> > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > 
> > > After a firmware hard reset, MRPC command executions, which are
> > > based
> > > on the PCI BAR (which Microchip refers to as GAS) read/write,
> > > will hang
> > > indefinitely. This is because after a reset, the host will fail
> > > all GAS
> > > reads (get all 1s), in which case the driver won't get a valid
> > > MRPC
> > > status.
> > 
> > Trying to write a merge commit log for this, but having a hard time
> > summarizing it.  It sounds like it covers both Switchtec-specific
> > (firmware and MRPC commands) and generic PCIe behavior (MMIO read
> > failures).
> > 
> > This has something to do with a firmware hard reset.  What is that?
> > Is that like a firmware reboot?  A device reset, e.g., FLR or
> > secondary bus reset, that causes a firmware reboot?  A device reset
> > initiated by firmware?
A firmware reset can be triggered by a reset command issued to the
firmware to reboot it.
> > Anyway, apparently when that happens, MMIO reads to the switch fail
> > (timeout or error completion on PCIe) for a while.  If a device
> > reset
> > is involved, that much is standard PCIe behavior.  And the driver
> > sees
> > ~0 data from those failed reads.  That's not part of the PCIe spec,
> > but is typical root complex behavior.
> > 
> > But you said the MRPC commands hang indefinitely.  Presumably MMIO
> > reads would start succeeding eventually when the device becomes
> > ready,
> > so I don't know how that translates to "indefinitely."
> 
> I suspect Kelvin can expand on this and fix the issue below. But in
> my
> experience, the MMIO will read ~0 forever after a firmware reset,
> until
> the system is rebooted. Presumably on systems that have good hot plug
> support they are supposed to recover. Though I've never seen that.

This is also my observation, all MMIO read will fail (~0 returned)
until the system is rebooted or a PCI rescan is performed.

> The MMIO read that signals the MRPC status always returns ~0 and the
> userspace request will eventually time out.

The problem in this case is that, in DMA MRPC mode, the status (in host
memory) is always initialized to 'in progress', and it's up to the
firmware to update it to 'done' after the command is executed in the
firmware. After a firmware reset is performed, the firmware cannot be
triggered to start a MRPC command, therefore the status in host memory
remains 'in progress' in the driver, which prevents a MRPC from timing
out. I should have included this in the message.
> 
> > Weird to refer to a PCI BAR as "GAS".  Maybe expanding the acronym
> > would help it make sense.
> GAS is the term used by the firmware developers and is in all their
> documentation. It stands for Global Address Space.
> 
> > What does "host" refer to?  I guess it's the switch (the
> > switchtec_dev), since you say it fails MMIO reads?
> 
> Yes, a bit confusing. The firmware is dead or not setup right so MMIO
> reads are not succeeding and the root complex is returning ~0 to the
> driver on reads.
Ditto. Will update in v2.
> 
> Logan

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-01 22:58     ` Kelvin.Cao
@ 2021-10-01 23:52       ` Logan Gunthorpe
  2021-10-02  0:05         ` Kelvin.Cao
  0 siblings, 1 reply; 32+ messages in thread
From: Logan Gunthorpe @ 2021-10-01 23:52 UTC (permalink / raw)
  To: Kelvin.Cao, helgaas
  Cc: kurt.schwemmer, bhelgaas, kelvincao, linux-kernel, linux-pci




On 2021-10-01 4:58 p.m., Kelvin.Cao@microchip.com wrote:
> On Fri, 2021-10-01 at 15:18 -0500, Bjorn Helgaas wrote:
>> Didn't notice this before, but the "check_access()" name is not very
>> helpful because it doesn't give a clue about what the return value
>> means.  Does 0 mean no error?  Does 1 mean no error?  From reading
>> the
>> implementation, I can see that 0 is actually the error case, but I
>> can't tell from the name.
> 
> Yes, will improve the naming, like change it to "has_gas_access()" in
> v2 if a v2 patchset is preferred.

I'd keep the GAS name out of the kernel. How about something along the
lines of is_firmware_running()? Maybe a comment for the function would
be good as well.

Logan

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-01 23:52       ` Logan Gunthorpe
@ 2021-10-02  0:05         ` Kelvin.Cao
  0 siblings, 0 replies; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-02  0:05 UTC (permalink / raw)
  To: helgaas, logang
  Cc: kurt.schwemmer, bhelgaas, kelvincao, linux-kernel, linux-pci

On Fri, 2021-10-01 at 17:52 -0600, Logan Gunthorpe wrote:
> On 2021-10-01 4:58 p.m., Kelvin.Cao@microchip.com wrote:
> > On Fri, 2021-10-01 at 15:18 -0500, Bjorn Helgaas wrote:
> > > Didn't notice this before, but the "check_access()" name is not
> > > very
> > > helpful because it doesn't give a clue about what the return
> > > value
> > > means.  Does 0 mean no error?  Does 1 mean no error?  From
> > > reading
> > > the
> > > implementation, I can see that 0 is actually the error case, but
> > > I
> > > can't tell from the name.
> > 
> > Yes, will improve the naming, like change it to "has_gas_access()"
> > in
> > v2 if a v2 patchset is preferred.
> 
> I'd keep the GAS name out of the kernel. How about something along
> the
> lines of is_firmware_running()? Maybe a comment for the function
> would
> be good as well.
> 
Yes, that'll be an improvement.
> Logan

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-01 23:49       ` Kelvin.Cao
@ 2021-10-02 15:11         ` Bjorn Helgaas
  2021-10-04 20:51           ` Kelvin.Cao
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-02 15:11 UTC (permalink / raw)
  To: Kelvin.Cao
  Cc: logang, kurt.schwemmer, bhelgaas, linux-pci, linux-kernel, kelvincao

On Fri, Oct 01, 2021 at 11:49:18PM +0000, Kelvin.Cao@microchip.com wrote:
> On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > know the content is safe
> > 
> > On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > > On Fri, Sep 24, 2021 at 11:08:38AM +0000, kelvin.cao@microchip.com
> > > wrote:
> > > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > > 
> > > > After a firmware hard reset, MRPC command executions, which
> > > > are based on the PCI BAR (which Microchip refers to as GAS)
> > > > read/write, will hang indefinitely. This is because after a
> > > > reset, the host will fail all GAS reads (get all 1s), in which
> > > > case the driver won't get a valid MRPC status.
> > > 
> > > Trying to write a merge commit log for this, but having a hard
> > > time summarizing it.  It sounds like it covers both
> > > Switchtec-specific (firmware and MRPC commands) and generic PCIe
> > > behavior (MMIO read failures).
> > > 
> > > This has something to do with a firmware hard reset.  What is
> > > that?  Is that like a firmware reboot?  A device reset, e.g.,
> > > FLR or secondary bus reset, that causes a firmware reboot?  A
> > > device reset initiated by firmware?
>
> A firmware reset can be triggered by a reset command issued to the
> firmware to reboot it.

So I guess this reset command was issued by the driver?

> > > Anyway, apparently when that happens, MMIO reads to the switch
> > > fail (timeout or error completion on PCIe) for a while.  If a
> > > device reset is involved, that much is standard PCIe behavior.
> > > And the driver sees ~0 data from those failed reads.  That's not
> > > part of the PCIe spec, but is typical root complex behavior.
> > > 
> > > But you said the MRPC commands hang indefinitely.  Presumably
> > > MMIO reads would start succeeding eventually when the device
> > > becomes ready, so I don't know how that translates to
> > > "indefinitely."
> > 
> > I suspect Kelvin can expand on this and fix the issue below. But
> > in my experience, the MMIO will read ~0 forever after a firmware
> > reset, until the system is rebooted. Presumably on systems that
> > have good hot plug support they are supposed to recover. Though
> > I've never seen that.
> 
> This is also my observation, all MMIO read will fail (~0 returned)
> until the system is rebooted or a PCI rescan is performed.

This made sense until you said MMIO reads would start working after a
PCI rescan.  A rescan doesn't really do anything special other than
doing config accesses to the device.  Two things come to mind:

1) Rescan does a config read of the Vendor ID, and devices may
respond with "Configuration Request Retry Status" if they are not
ready.  In this event, Linux retries this for a while.  This scenario
doesn't quite fit because it sounds like this is a device-specific
reset initiated by the driver, and CRS is not permited in this case.
PCIe r5.0, sec 2.3.1, says:

  A device Function is explicitly not permitted to return CRS
  following a software-initiated reset (other than an FLR) of the
  device, e.g., by the device's software driver writing to a
  device-specific reset bit.

2) The device may lose its bus and device number configuration after a
reset, so it must capture bus and device numbers from config writes.
I don't think Linux does this explicitly, but a rescan does do config
writes, which could accidentally fix something (PCIe r5.0, sec 2.2.9).

> > The MMIO read that signals the MRPC status always returns ~0 and the
> > userspace request will eventually time out.
> 
> The problem in this case is that, in DMA MRPC mode, the status (in host
> memory) is always initialized to 'in progress', and it's up to the
> firmware to update it to 'done' after the command is executed in the
> firmware. After a firmware reset is performed, the firmware cannot be
> triggered to start a MRPC command, therefore the status in host memory
> remains 'in progress' in the driver, which prevents a MRPC from timing
> out. I should have included this in the message.

I *thought* the problem was that the PCIe Memory Read failed and the
Root Complex fabricated ~0 data to complete the CPU read.  But now I'm
not sure, because it sounds like it might be that the PCIe transaction
succeeds, but it reads data that hasn't been updated by the firmware,
i.e., it reads 'in progress' because firmware hasn't updated it to
'done'.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-02 15:11         ` Bjorn Helgaas
@ 2021-10-04 20:51           ` Kelvin.Cao
  2021-10-05 20:11             ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-04 20:51 UTC (permalink / raw)
  To: helgaas
  Cc: kurt.schwemmer, bhelgaas, kelvincao, logang, linux-kernel, linux-pci

On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> On Fri, Oct 01, 2021 at 11:49:18PM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> > > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > > know the content is safe
> > > 
> > > On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > > > On Fri, Sep 24, 2021 at 11:08:38AM +0000, 
> > > > kelvin.cao@microchip.com
> > > > wrote:
> > > > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > > > 
> > > > > After a firmware hard reset, MRPC command executions, which
> > > > > are based on the PCI BAR (which Microchip refers to as GAS)
> > > > > read/write, will hang indefinitely. This is because after a
> > > > > reset, the host will fail all GAS reads (get all 1s), in
> > > > > which
> > > > > case the driver won't get a valid MRPC status.
> > > > 
> > > > Trying to write a merge commit log for this, but having a hard
> > > > time summarizing it.  It sounds like it covers both
> > > > Switchtec-specific (firmware and MRPC commands) and generic
> > > > PCIe
> > > > behavior (MMIO read failures).
> > > > 
> > > > This has something to do with a firmware hard reset.  What is
> > > > that?  Is that like a firmware reboot?  A device reset, e.g.,
> > > > FLR or secondary bus reset, that causes a firmware reboot?  A
> > > > device reset initiated by firmware?
> > 
> > A firmware reset can be triggered by a reset command issued to the
> > firmware to reboot it.
> 
> So I guess this reset command was issued by the driver?
Yes, the reset command can be issued by a userspace utility to the
firmware via the driver. In some other cases, user can also issue the
reset command, via a sideband interface (like UART), to the firmware. 
> 
> > > > Anyway, apparently when that happens, MMIO reads to the switch
> > > > fail (timeout or error completion on PCIe) for a while.  If a
> > > > device reset is involved, that much is standard PCIe behavior.
> > > > And the driver sees ~0 data from those failed reads.  That's
> > > > not
> > > > part of the PCIe spec, but is typical root complex behavior.
> > > > 
> > > > But you said the MRPC commands hang indefinitely.  Presumably
> > > > MMIO reads would start succeeding eventually when the device
> > > > becomes ready, so I don't know how that translates to
> > > > "indefinitely."
> > > 
> > > I suspect Kelvin can expand on this and fix the issue below. But
> > > in my experience, the MMIO will read ~0 forever after a firmware
> > > reset, until the system is rebooted. Presumably on systems that
> > > have good hot plug support they are supposed to recover. Though
> > > I've never seen that.
> > 
> > This is also my observation, all MMIO read will fail (~0 returned)
> > until the system is rebooted or a PCI rescan is performed.
> 
> This made sense until you said MMIO reads would start working after a
> PCI rescan.  A rescan doesn't really do anything special other than
> doing config accesses to the device.  Two things come to mind:
> 
> 1) Rescan does a config read of the Vendor ID, and devices may
> respond with "Configuration Request Retry Status" if they are not
> ready.  In this event, Linux retries this for a while.  This scenario
> doesn't quite fit because it sounds like this is a device-specific
> reset initiated by the driver, and CRS is not permited in this case.
> PCIe r5.0, sec 2.3.1, says:
> 
>   A device Function is explicitly not permitted to return CRS
>   following a software-initiated reset (other than an FLR) of the
>   device, e.g., by the device's software driver writing to a
>   device-specific reset bit.
> 
> 2) The device may lose its bus and device number configuration after
> a
> reset, so it must capture bus and device numbers from config writes.
> I don't think Linux does this explicitly, but a rescan does do config
> writes, which could accidentally fix something (PCIe r5.0, sec
> 2.2.9).

Thanks Bjorn. It makes perfect sense!
> 
> > > The MMIO read that signals the MRPC status always returns ~0 and
> > > the
> > > userspace request will eventually time out.
> > 
> > The problem in this case is that, in DMA MRPC mode, the status (in
> > host
> > memory) is always initialized to 'in progress', and it's up to the
> > firmware to update it to 'done' after the command is executed in
> > the
> > firmware. After a firmware reset is performed, the firmware cannot
> > be
> > triggered to start a MRPC command, therefore the status in host
> > memory
> > remains 'in progress' in the driver, which prevents a MRPC from
> > timing
> > out. I should have included this in the message.
> 
> I *thought* the problem was that the PCIe Memory Read failed and the
> Root Complex fabricated ~0 data to complete the CPU read.  But now
> I'm
> not sure, because it sounds like it might be that the PCIe
> transaction
> succeeds, but it reads data that hasn't been updated by the firmware,
> i.e., it reads 'in progress' because firmware hasn't updated it to
> 'done'.
> 
> Bjorn

The original message was sort of misleading. After a firmware reset,
CPU getting ~0 for the PCIe Memory Read doesn't explain the hang. In a
MRPC execution (DMA MRPC mode), the MRPC status which is located in the
host moemory, gets initialized by the CPU and updated/finilized by the
firmware. In the situation of a firmware reset, any MRPC initiated
afterwards will not get the status updated by the firmware per the
reason you pointed out above (or similar, to my understanding, firmware
can no longer DMA data to host memory in such cases), therefore the
MRPC execution will never end.


Kelvin


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-04 20:51           ` Kelvin.Cao
@ 2021-10-05 20:11             ` Bjorn Helgaas
  2021-10-06  0:37               ` Kelvin.Cao
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-05 20:11 UTC (permalink / raw)
  To: Kelvin.Cao
  Cc: kurt.schwemmer, bhelgaas, kelvincao, logang, linux-kernel, linux-pci

On Mon, Oct 04, 2021 at 08:51:06PM +0000, Kelvin.Cao@microchip.com wrote:
> On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > On Fri, Oct 01, 2021 at 11:49:18PM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> > > > EXTERNAL EMAIL: Do not click links or open attachments unless you
> > > > know the content is safe
> > > > 
> > > > On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > > > > On Fri, Sep 24, 2021 at 11:08:38AM +0000, 
> > > > > kelvin.cao@microchip.com
> > > > > wrote:
> > > > > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > > > > 
> > > > > > After a firmware hard reset, MRPC command executions,
> > > > > > which are based on the PCI BAR (which Microchip refers to
> > > > > > as GAS) read/write, will hang indefinitely. This is
> > > > > > because after a reset, the host will fail all GAS reads
> > > > > > (get all 1s), in which case the driver won't get a valid
> > > > > > MRPC status.
> > > > > 
> > > > > Trying to write a merge commit log for this, but having a
> > > > > hard time summarizing it.  It sounds like it covers both
> > > > > Switchtec-specific (firmware and MRPC commands) and generic
> > > > > PCIe behavior (MMIO read failures).
> > > > > 
> > > > > This has something to do with a firmware hard reset.  What
> > > > > is that?  Is that like a firmware reboot?  A device reset,
> > > > > e.g., FLR or secondary bus reset, that causes a firmware
> > > > > reboot?  A device reset initiated by firmware?
> > > 
> > > A firmware reset can be triggered by a reset command issued to
> > > the firmware to reboot it.
> > 
> > So I guess this reset command was issued by the driver?
>
> Yes, the reset command can be issued by a userspace utility to the
> firmware via the driver. In some other cases, user can also issue
> the reset command, via a sideband interface (like UART), to the
> firmware. 

OK, thanks.  That means CRS is not a factor here because this is not
an FLR or similar reset.

> > > > > Anyway, apparently when that happens, MMIO reads to the switch
> > > > > fail (timeout or error completion on PCIe) for a while.  If a
> > > > > device reset is involved, that much is standard PCIe behavior.
> > > > > And the driver sees ~0 data from those failed reads.  That's
> > > > > not
> > > > > part of the PCIe spec, but is typical root complex behavior.
> > > > > 
> > > > > But you said the MRPC commands hang indefinitely.  Presumably
> > > > > MMIO reads would start succeeding eventually when the device
> > > > > becomes ready, so I don't know how that translates to
> > > > > "indefinitely."
> > > > 
> > > > I suspect Kelvin can expand on this and fix the issue below. But
> > > > in my experience, the MMIO will read ~0 forever after a firmware
> > > > reset, until the system is rebooted. Presumably on systems that
> > > > have good hot plug support they are supposed to recover. Though
> > > > I've never seen that.
> > > 
> > > This is also my observation, all MMIO read will fail (~0 returned)
> > > until the system is rebooted or a PCI rescan is performed.
> > 
> > This made sense until you said MMIO reads would start working after a
> > PCI rescan.  A rescan doesn't really do anything special other than
> > doing config accesses to the device.  Two things come to mind:
> > 
> > 1) Rescan does a config read of the Vendor ID, and devices may
> > respond with "Configuration Request Retry Status" if they are not
> > ready.  In this event, Linux retries this for a while.  This scenario
> > doesn't quite fit because it sounds like this is a device-specific
> > reset initiated by the driver, and CRS is not permited in this case.
> > PCIe r5.0, sec 2.3.1, says:
> > 
> >   A device Function is explicitly not permitted to return CRS
> >   following a software-initiated reset (other than an FLR) of the
> >   device, e.g., by the device's software driver writing to a
> >   device-specific reset bit.
> > 
> > 2) The device may lose its bus and device number configuration
> > after a reset, so it must capture bus and device numbers from
> > config writes.  I don't think Linux does this explicitly, but a
> > rescan does do config writes, which could accidentally fix
> > something (PCIe r5.0, sec 2.2.9).
> 
> Thanks Bjorn. It makes perfect sense!
> > 
> > > > The MMIO read that signals the MRPC status always returns ~0
> > > > and the userspace request will eventually time out.
> > > 
> > > The problem in this case is that, in DMA MRPC mode, the status
> > > (in host memory) is always initialized to 'in progress', and
> > > it's up to the firmware to update it to 'done' after the command
> > > is executed in the firmware. After a firmware reset is
> > > performed, the firmware cannot be triggered to start a MRPC
> > > command, therefore the status in host memory remains 'in
> > > progress' in the driver, which prevents a MRPC from timing out.
> > > I should have included this in the message.
> > 
> > I *thought* the problem was that the PCIe Memory Read failed and
> > the Root Complex fabricated ~0 data to complete the CPU read.  But
> > now I'm not sure, because it sounds like it might be that the PCIe
> > transaction succeeds, but it reads data that hasn't been updated
> > by the firmware, i.e., it reads 'in progress' because firmware
> > hasn't updated it to 'done'.
> 
> The original message was sort of misleading. After a firmware reset,
> CPU getting ~0 for the PCIe Memory Read doesn't explain the hang. In
> a MRPC execution (DMA MRPC mode), the MRPC status which is located
> in the host memory, gets initialized by the CPU and
> updated/finalized by the firmware. In the situation of a firmware
> reset, any MRPC initiated afterwards will not get the status updated
> by the firmware per the reason you pointed out above (or similar, to
> my understanding, firmware can no longer DMA data to host memory in
> such cases), therefore the MRPC execution will never end.

I'm glad this makes sense to you, because it still doesn't to me.

check_access() does an MMIO read to something in BAR0.  If that read
returns ~0, it means either the PCIe Memory Read was successful and
the Switchtec device supplied ~0 data (maybe because firmware has not
initialized that part of the BAR) or the PCIe Memory Read failed and
the root complex fabricated the ~0 data.

I'd like to know which one is happening so we can clarify the commit
log text about "MRPC command executions hang indefinitely" and "host
wil fail all GAS reads."  It's not clear whether these are PCIe
protocol issues or driver/firmware interaction issues.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-05 20:11             ` Bjorn Helgaas
@ 2021-10-06  0:37               ` Kelvin.Cao
  2021-10-06  2:33                 ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-06  0:37 UTC (permalink / raw)
  To: helgaas
  Cc: kurt.schwemmer, bhelgaas, linux-pci, kelvincao, logang, linux-kernel

On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> On Mon, Oct 04, 2021 at 08:51:06PM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > On Fri, Oct 01, 2021 at 11:49:18PM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > > On Fri, 2021-10-01 at 14:29 -0600, Logan Gunthorpe wrote:
> > > > > EXTERNAL EMAIL: Do not click links or open attachments unless
> > > > > you
> > > > > know the content is safe
> > > > > 
> > > > > On 2021-10-01 2:18 p.m., Bjorn Helgaas wrote:
> > > > > > On Fri, Sep 24, 2021 at 11:08:38AM +0000,
> > > > > > kelvin.cao@microchip.com
> > > > > > wrote:
> > > > > > > From: Kelvin Cao <kelvin.cao@microchip.com>
> > > > > > > 
> > > > > > > After a firmware hard reset, MRPC command executions,
> > > > > > > which are based on the PCI BAR (which Microchip refers to
> > > > > > > as GAS) read/write, will hang indefinitely. This is
> > > > > > > because after a reset, the host will fail all GAS reads
> > > > > > > (get all 1s), in which case the driver won't get a valid
> > > > > > > MRPC status.
> > > > > > 
> > > > > > Trying to write a merge commit log for this, but having a
> > > > > > hard time summarizing it.  It sounds like it covers both
> > > > > > Switchtec-specific (firmware and MRPC commands) and generic
> > > > > > PCIe behavior (MMIO read failures).
> > > > > > 
> > > > > > This has something to do with a firmware hard reset.  What
> > > > > > is that?  Is that like a firmware reboot?  A device reset,
> > > > > > e.g., FLR or secondary bus reset, that causes a firmware
> > > > > > reboot?  A device reset initiated by firmware?
> > > > 
> > > > A firmware reset can be triggered by a reset command issued to
> > > > the firmware to reboot it.
> > > 
> > > So I guess this reset command was issued by the driver?
> > 
> > Yes, the reset command can be issued by a userspace utility to the
> > firmware via the driver. In some other cases, user can also issue
> > the reset command, via a sideband interface (like UART), to the
> > firmware.
> 
> OK, thanks.  That means CRS is not a factor here because this is not
> an FLR or similar reset.
> 
> > > > > > Anyway, apparently when that happens, MMIO reads to the
> > > > > > switch
> > > > > > fail (timeout or error completion on PCIe) for a while.  If
> > > > > > a
> > > > > > device reset is involved, that much is standard PCIe
> > > > > > behavior.
> > > > > > And the driver sees ~0 data from those failed
> > > > > > reads.  That's
> > > > > > not
> > > > > > part of the PCIe spec, but is typical root complex
> > > > > > behavior.
> > > > > > 
> > > > > > But you said the MRPC commands hang
> > > > > > indefinitely.  Presumably
> > > > > > MMIO reads would start succeeding eventually when the
> > > > > > device
> > > > > > becomes ready, so I don't know how that translates to
> > > > > > "indefinitely."
> > > > > 
> > > > > I suspect Kelvin can expand on this and fix the issue below.
> > > > > But
> > > > > in my experience, the MMIO will read ~0 forever after a
> > > > > firmware
> > > > > reset, until the system is rebooted. Presumably on systems
> > > > > that
> > > > > have good hot plug support they are supposed to recover.
> > > > > Though
> > > > > I've never seen that.
> > > > 
> > > > This is also my observation, all MMIO read will fail (~0
> > > > returned)
> > > > until the system is rebooted or a PCI rescan is performed.
> > > 
> > > This made sense until you said MMIO reads would start working
> > > after a
> > > PCI rescan.  A rescan doesn't really do anything special other
> > > than
> > > doing config accesses to the device.  Two things come to mind:
> > > 
> > > 1) Rescan does a config read of the Vendor ID, and devices may
> > > respond with "Configuration Request Retry Status" if they are not
> > > ready.  In this event, Linux retries this for a while.  This
> > > scenario
> > > doesn't quite fit because it sounds like this is a device-
> > > specific
> > > reset initiated by the driver, and CRS is not permited in this
> > > case.
> > > PCIe r5.0, sec 2.3.1, says:
> > > 
> > >   A device Function is explicitly not permitted to return CRS
> > >   following a software-initiated reset (other than an FLR) of the
> > >   device, e.g., by the device's software driver writing to a
> > >   device-specific reset bit.
> > > 
> > > 2) The device may lose its bus and device number configuration
> > > after a reset, so it must capture bus and device numbers from
> > > config writes.  I don't think Linux does this explicitly, but a
> > > rescan does do config writes, which could accidentally fix
> > > something (PCIe r5.0, sec 2.2.9).
> > 
> > Thanks Bjorn. It makes perfect sense!
> > > > > The MMIO read that signals the MRPC status always returns ~0
> > > > > and the userspace request will eventually time out.
> > > > 
> > > > The problem in this case is that, in DMA MRPC mode, the status
> > > > (in host memory) is always initialized to 'in progress', and
> > > > it's up to the firmware to update it to 'done' after the
> > > > command
> > > > is executed in the firmware. After a firmware reset is
> > > > performed, the firmware cannot be triggered to start a MRPC
> > > > command, therefore the status in host memory remains 'in
> > > > progress' in the driver, which prevents a MRPC from timing out.
> > > > I should have included this in the message.
> > > 
> > > I *thought* the problem was that the PCIe Memory Read failed and
> > > the Root Complex fabricated ~0 data to complete the CPU
> > > read.  But
> > > now I'm not sure, because it sounds like it might be that the
> > > PCIe
> > > transaction succeeds, but it reads data that hasn't been updated
> > > by the firmware, i.e., it reads 'in progress' because firmware
> > > hasn't updated it to 'done'.
> > 
> > The original message was sort of misleading. After a firmware
> > reset,
> > CPU getting ~0 for the PCIe Memory Read doesn't explain the hang.
> > In
> > a MRPC execution (DMA MRPC mode), the MRPC status which is located
> > in the host memory, gets initialized by the CPU and
> > updated/finalized by the firmware. In the situation of a firmware
> > reset, any MRPC initiated afterwards will not get the status
> > updated
> > by the firmware per the reason you pointed out above (or similar,
> > to
> > my understanding, firmware can no longer DMA data to host memory in
> > such cases), therefore the MRPC execution will never end.
> 
> I'm glad this makes sense to you, because it still doesn't to me.
> 
> check_access() does an MMIO read to something in BAR0.  If that read
> returns ~0, it means either the PCIe Memory Read was successful and
> the Switchtec device supplied ~0 data (maybe because firmware has not
> initialized that part of the BAR) or the PCIe Memory Read failed and
> the root complex fabricated the ~0 data.
> 
> I'd like to know which one is happening so we can clarify the commit
> log text about "MRPC command executions hang indefinitely" and "host
> wil fail all GAS reads."  It's not clear whether these are PCIe
> protocol issues or driver/firmware interaction issues.
> 
> Bjorn

I think it's the latter case, the ~0 data was fabricated by the root
complex, as the MMIO read in check_access() always returns ~0 until a
reboot or a rescan happens. 

Kelvin

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-06  0:37               ` Kelvin.Cao
@ 2021-10-06  2:33                 ` Bjorn Helgaas
  2021-10-06  5:49                   ` Kelvin.Cao
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-06  2:33 UTC (permalink / raw)
  To: Kelvin.Cao
  Cc: kurt.schwemmer, bhelgaas, linux-pci, kelvincao, logang, linux-kernel

On Wed, Oct 06, 2021 at 12:37:02AM +0000, Kelvin.Cao@microchip.com wrote:
> On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > On Mon, Oct 04, 2021 at 08:51:06PM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:

> > > > I *thought* the problem was that the PCIe Memory Read failed
> > > > and the Root Complex fabricated ~0 data to complete the CPU
> > > > read.  But now I'm not sure, because it sounds like it might
> > > > be that the PCIe transaction succeeds, but it reads data that
> > > > hasn't been updated by the firmware, i.e., it reads 'in
> > > > progress' because firmware hasn't updated it to 'done'.
> > > 
> > > The original message was sort of misleading. After a firmware
> > > reset, CPU getting ~0 for the PCIe Memory Read doesn't explain
> > > the hang.  In a MRPC execution (DMA MRPC mode), the MRPC status
> > > which is located in the host memory, gets initialized by the CPU
> > > and updated/finalized by the firmware. In the situation of a
> > > firmware reset, any MRPC initiated afterwards will not get the
> > > status updated by the firmware per the reason you pointed out
> > > above (or similar, to my understanding, firmware can no longer
> > > DMA data to host memory in such cases), therefore the MRPC
> > > execution will never end.
> > 
> > I'm glad this makes sense to you, because it still doesn't to me.
> > 
> > check_access() does an MMIO read to something in BAR0.  If that
> > read returns ~0, it means either the PCIe Memory Read was
> > successful and the Switchtec device supplied ~0 data (maybe
> > because firmware has not initialized that part of the BAR) or the
> > PCIe Memory Read failed and the root complex fabricated the ~0
> > data.
> > 
> > I'd like to know which one is happening so we can clarify the
> > commit log text about "MRPC command executions hang indefinitely"
> > and "host wil fail all GAS reads."  It's not clear whether these
> > are PCIe protocol issues or driver/firmware interaction issues.
> 
> I think it's the latter case, the ~0 data was fabricated by the root
> complex, as the MMIO read in check_access() always returns ~0 until
> a reboot or a rescan happens. 

If the root complex fabricates ~0, that means a PCIe transaction
failed, i.e., the device didn't respond.  Rescan only does config
reads and writes.  Why should that cause the PCIe transactions to
magically start working?

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-06  2:33                 ` Bjorn Helgaas
@ 2021-10-06  5:49                   ` Kelvin.Cao
  2021-10-06 14:19                     ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-06  5:49 UTC (permalink / raw)
  To: helgaas
  Cc: kurt.schwemmer, bhelgaas, linux-pci, logang, linux-kernel, kelvincao

On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2021 at 12:37:02AM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > On Mon, Oct 04, 2021 at 08:51:06PM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > I *thought* the problem was that the PCIe Memory Read failed
> > > > > and the Root Complex fabricated ~0 data to complete the CPU
> > > > > read.  But now I'm not sure, because it sounds like it might
> > > > > be that the PCIe transaction succeeds, but it reads data that
> > > > > hasn't been updated by the firmware, i.e., it reads 'in
> > > > > progress' because firmware hasn't updated it to 'done'.
> > > > 
> > > > The original message was sort of misleading. After a firmware
> > > > reset, CPU getting ~0 for the PCIe Memory Read doesn't explain
> > > > the hang.  In a MRPC execution (DMA MRPC mode), the MRPC status
> > > > which is located in the host memory, gets initialized by the
> > > > CPU
> > > > and updated/finalized by the firmware. In the situation of a
> > > > firmware reset, any MRPC initiated afterwards will not get the
> > > > status updated by the firmware per the reason you pointed out
> > > > above (or similar, to my understanding, firmware can no longer
> > > > DMA data to host memory in such cases), therefore the MRPC
> > > > execution will never end.
> > > 
> > > I'm glad this makes sense to you, because it still doesn't to me.
> > > 
> > > check_access() does an MMIO read to something in BAR0.  If that
> > > read returns ~0, it means either the PCIe Memory Read was
> > > successful and the Switchtec device supplied ~0 data (maybe
> > > because firmware has not initialized that part of the BAR) or the
> > > PCIe Memory Read failed and the root complex fabricated the ~0
> > > data.
> > > 
> > > I'd like to know which one is happening so we can clarify the
> > > commit log text about "MRPC command executions hang indefinitely"
> > > and "host wil fail all GAS reads."  It's not clear whether these
> > > are PCIe protocol issues or driver/firmware interaction issues.
> > 
> > I think it's the latter case, the ~0 data was fabricated by the
> > root
> > complex, as the MMIO read in check_access() always returns ~0 until
> > a reboot or a rescan happens.
> 
> If the root complex fabricates ~0, that means a PCIe transaction
> failed, i.e., the device didn't respond.  Rescan only does config
> reads and writes.  Why should that cause the PCIe transactions to
> magically start working?

I took a closer look. What I observed was like this. A firmware reset
cleared some CSR settings including the MSE and MBE bits and the Base
Address Registers. With a rescan (removing the switch to which the
management EP was binded from root port and rescan), the management EP
was re-enumerated and driver was re-probed, so that the settings
cleared by the firmware reset was properly setup again, therefore PCIe
transactions start working.

Kelvin

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-06  5:49                   ` Kelvin.Cao
@ 2021-10-06 14:19                     ` Bjorn Helgaas
  2021-10-06 19:00                       ` Kelvin.Cao
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-06 14:19 UTC (permalink / raw)
  To: Kelvin.Cao
  Cc: kurt.schwemmer, bhelgaas, linux-pci, logang, linux-kernel, kelvincao

On Wed, Oct 06, 2021 at 05:49:29AM +0000, Kelvin.Cao@microchip.com wrote:
> On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2021 at 12:37:02AM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > > On Mon, Oct 04, 2021 at 08:51:06PM +0000, 
> > > > Kelvin.Cao@microchip.com
> > > > wrote:
> > > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > > I *thought* the problem was that the PCIe Memory Read
> > > > > > failed and the Root Complex fabricated ~0 data to complete
> > > > > > the CPU read.  But now I'm not sure, because it sounds
> > > > > > like it might be that the PCIe transaction succeeds, but
> > > > > > it reads data that hasn't been updated by the firmware,
> > > > > > i.e., it reads 'in progress' because firmware hasn't
> > > > > > updated it to 'done'.
> > > > > 
> > > > > The original message was sort of misleading. After a
> > > > > firmware reset, CPU getting ~0 for the PCIe Memory Read
> > > > > doesn't explain the hang.  In a MRPC execution (DMA MRPC
> > > > > mode), the MRPC status which is located in the host memory,
> > > > > gets initialized by the CPU and updated/finalized by the
> > > > > firmware. In the situation of a firmware reset, any MRPC
> > > > > initiated afterwards will not get the status updated by the
> > > > > firmware per the reason you pointed out above (or similar,
> > > > > to my understanding, firmware can no longer DMA data to host
> > > > > memory in such cases), therefore the MRPC execution will
> > > > > never end.
> > > > 
> > > > I'm glad this makes sense to you, because it still doesn't to
> > > > me.
> > > > 
> > > > check_access() does an MMIO read to something in BAR0.  If
> > > > that read returns ~0, it means either the PCIe Memory Read was
> > > > successful and the Switchtec device supplied ~0 data (maybe
> > > > because firmware has not initialized that part of the BAR) or
> > > > the PCIe Memory Read failed and the root complex fabricated
> > > > the ~0 data.
> > > > 
> > > > I'd like to know which one is happening so we can clarify the
> > > > commit log text about "MRPC command executions hang
> > > > indefinitely" and "host wil fail all GAS reads."  It's not
> > > > clear whether these are PCIe protocol issues or
> > > > driver/firmware interaction issues.
> > > 
> > > I think it's the latter case, the ~0 data was fabricated by the
> > > root complex, as the MMIO read in check_access() always returns
> > > ~0 until a reboot or a rescan happens.
> > 
> > If the root complex fabricates ~0, that means a PCIe transaction
> > failed, i.e., the device didn't respond.  Rescan only does config
> > reads and writes.  Why should that cause the PCIe transactions to
> > magically start working?
> 
> I took a closer look. What I observed was like this. A firmware
> reset cleared some CSR settings including the MSE and MBE bits and
> the Base Address Registers. With a rescan (removing the switch to
> which the management EP was binded from root port and rescan), the
> management EP was re-enumerated and driver was re-probed, so that
> the settings cleared by the firmware reset was properly setup again,
> therefore PCIe transactions start working.

I think what you just said is that 

  - the driver asked the firmware to reset the device

  - the firmware did reset the device, which cleared Memory Space
    Enable

  - nothing restored the device config after the reset, so Memory
    Space Enable remains cleared

  - the driver does MMIO reads to figure out when the reset has
    completed

  - the device doesn't respond to the PCIe Memory Reads because Memory
    Space Enable is cleared

  - the root complex sees a timeout or error completion and fabricates
    ~0 data for the CPU read

  - the driver sees ~0 data from the MMIO read and thinks the device
    or firmware is hung

If that's all true, I think the patch is sort of a band-aid that
doesn't fix the problem at all but only makes the driver's response to
it marginally better.  But the device is still unusable until a rescan
or reboot.

So I think we should drop this patch and do something to restore the
device state after the reset.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-06 14:19                     ` Bjorn Helgaas
@ 2021-10-06 19:00                       ` Kelvin.Cao
  2021-10-06 20:20                         ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-06 19:00 UTC (permalink / raw)
  To: helgaas
  Cc: kurt.schwemmer, bhelgaas, kelvincao, linux-pci, logang, linux-kernel

On Wed, 2021-10-06 at 09:19 -0500, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2021 at 05:49:29AM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> > > On Wed, Oct 06, 2021 at 12:37:02AM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > > > On Mon, Oct 04, 2021 at 08:51:06PM +0000,
> > > > > Kelvin.Cao@microchip.com
> > > > > wrote:
> > > > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > > > I *thought* the problem was that the PCIe Memory Read
> > > > > > > failed and the Root Complex fabricated ~0 data to
> > > > > > > complete
> > > > > > > the CPU read.  But now I'm not sure, because it sounds
> > > > > > > like it might be that the PCIe transaction succeeds, but
> > > > > > > it reads data that hasn't been updated by the firmware,
> > > > > > > i.e., it reads 'in progress' because firmware hasn't
> > > > > > > updated it to 'done'.
> > > > > > 
> > > > > > The original message was sort of misleading. After a
> > > > > > firmware reset, CPU getting ~0 for the PCIe Memory Read
> > > > > > doesn't explain the hang.  In a MRPC execution (DMA MRPC
> > > > > > mode), the MRPC status which is located in the host memory,
> > > > > > gets initialized by the CPU and updated/finalized by the
> > > > > > firmware. In the situation of a firmware reset, any MRPC
> > > > > > initiated afterwards will not get the status updated by the
> > > > > > firmware per the reason you pointed out above (or similar,
> > > > > > to my understanding, firmware can no longer DMA data to
> > > > > > host
> > > > > > memory in such cases), therefore the MRPC execution will
> > > > > > never end.
> > > > > 
> > > > > I'm glad this makes sense to you, because it still doesn't to
> > > > > me.
> > > > > 
> > > > > check_access() does an MMIO read to something in BAR0.  If
> > > > > that read returns ~0, it means either the PCIe Memory Read
> > > > > was
> > > > > successful and the Switchtec device supplied ~0 data (maybe
> > > > > because firmware has not initialized that part of the BAR) or
> > > > > the PCIe Memory Read failed and the root complex fabricated
> > > > > the ~0 data.
> > > > > 
> > > > > I'd like to know which one is happening so we can clarify the
> > > > > commit log text about "MRPC command executions hang
> > > > > indefinitely" and "host wil fail all GAS reads."  It's not
> > > > > clear whether these are PCIe protocol issues or
> > > > > driver/firmware interaction issues.
> > > > 
> > > > I think it's the latter case, the ~0 data was fabricated by the
> > > > root complex, as the MMIO read in check_access() always returns
> > > > ~0 until a reboot or a rescan happens.
> > > 
> > > If the root complex fabricates ~0, that means a PCIe transaction
> > > failed, i.e., the device didn't respond.  Rescan only does config
> > > reads and writes.  Why should that cause the PCIe transactions to
> > > magically start working?
> > 
> > I took a closer look. What I observed was like this. A firmware
> > reset cleared some CSR settings including the MSE and MBE bits and
> > the Base Address Registers. With a rescan (removing the switch to
> > which the management EP was binded from root port and rescan), the
> > management EP was re-enumerated and driver was re-probed, so that
> > the settings cleared by the firmware reset was properly setup
> > again,
> > therefore PCIe transactions start working.
> 
> I think what you just said is that
> 
>   - the driver asked the firmware to reset the device
> 
>   - the firmware did reset the device, which cleared Memory Space
>     Enable
> 
>   - nothing restored the device config after the reset, so Memory
>     Space Enable remains cleared
> 
>   - the driver does MMIO reads to figure out when the reset has
>     completed
> 
>   - the device doesn't respond to the PCIe Memory Reads because
> Memory
>     Space Enable is cleared
> 
>   - the root complex sees a timeout or error completion and
> fabricates
>     ~0 data for the CPU read
> 
>   - the driver sees ~0 data from the MMIO read and thinks the device
>     or firmware is hung
> 
> If that's all true, I think the patch is sort of a band-aid that
> doesn't fix the problem at all but only makes the driver's response
> to
> it marginally better.  But the device is still unusable until a
> rescan
> or reboot.
> 
> So I think we should drop this patch and do something to restore the
> device state after the reset.

Do you mean we should do something at the driver level to automatically
try to restore the device state after the reset? I was thinking it's up
to the user to make the call to restore the device state or take other
actions, so that returning an error code from MRPC to indicate what
happened would be good enough for the driver. 

Can you possibly shed light on what might be a reasonable way to
restore the device state in the driver if applicable? I was just doing
it by leveraging the remove and rescan interfaces in the sysfs.

> 
> Bjorn

That's all true. I lean towards keeping the patch as I think making the
response better under the following situations might not be bad.

  - The firmware reset case, as we discussed. I'd think it's still
useful for users to get a fast error return which indicates something
bad happened and some actions need to be taken either to abort or try
to recover. In this case, we are assuming that a firmware reset will
boot the firmware successfully.

  - The firwmare crashes and doesn't respond, which normally is the
reason for users to issue a firmware reset command to try to recover it
via either the driver or a sideband interface. The firmware may not be
able to recover by a reset in some extream situations like hardware
errors, so that an error return is probably all the users can get
before another level of recovery happens.

So I'd think this patch is still making the driver better in some way.

Kelvin


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-06 19:00                       ` Kelvin.Cao
@ 2021-10-06 20:20                         ` Bjorn Helgaas
  2021-10-06 21:27                           ` Kelvin.Cao
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-06 20:20 UTC (permalink / raw)
  To: Kelvin.Cao
  Cc: kurt.schwemmer, bhelgaas, kelvincao, linux-pci, logang, linux-kernel

On Wed, Oct 06, 2021 at 07:00:55PM +0000, Kelvin.Cao@microchip.com wrote:
> On Wed, 2021-10-06 at 09:19 -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2021 at 05:49:29AM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> > > > On Wed, Oct 06, 2021 at 12:37:02AM +0000, 
> > > > Kelvin.Cao@microchip.com
> > > > wrote:
> > > > > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > > > > On Mon, Oct 04, 2021 at 08:51:06PM +0000,
> > > > > > Kelvin.Cao@microchip.com
> > > > > > wrote:
> > > > > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > > > > I *thought* the problem was that the PCIe Memory Read
> > > > > > > > failed and the Root Complex fabricated ~0 data to
> > > > > > > > complete
> > > > > > > > the CPU read.  But now I'm not sure, because it sounds
> > > > > > > > like it might be that the PCIe transaction succeeds, but
> > > > > > > > it reads data that hasn't been updated by the firmware,
> > > > > > > > i.e., it reads 'in progress' because firmware hasn't
> > > > > > > > updated it to 'done'.
> > > > > > > 
> > > > > > > The original message was sort of misleading. After a
> > > > > > > firmware reset, CPU getting ~0 for the PCIe Memory Read
> > > > > > > doesn't explain the hang.  In a MRPC execution (DMA MRPC
> > > > > > > mode), the MRPC status which is located in the host memory,
> > > > > > > gets initialized by the CPU and updated/finalized by the
> > > > > > > firmware. In the situation of a firmware reset, any MRPC
> > > > > > > initiated afterwards will not get the status updated by the
> > > > > > > firmware per the reason you pointed out above (or similar,
> > > > > > > to my understanding, firmware can no longer DMA data to
> > > > > > > host
> > > > > > > memory in such cases), therefore the MRPC execution will
> > > > > > > never end.
> > > > > > 
> > > > > > I'm glad this makes sense to you, because it still doesn't to
> > > > > > me.
> > > > > > 
> > > > > > check_access() does an MMIO read to something in BAR0.  If
> > > > > > that read returns ~0, it means either the PCIe Memory Read
> > > > > > was
> > > > > > successful and the Switchtec device supplied ~0 data (maybe
> > > > > > because firmware has not initialized that part of the BAR) or
> > > > > > the PCIe Memory Read failed and the root complex fabricated
> > > > > > the ~0 data.
> > > > > > 
> > > > > > I'd like to know which one is happening so we can clarify the
> > > > > > commit log text about "MRPC command executions hang
> > > > > > indefinitely" and "host wil fail all GAS reads."  It's not
> > > > > > clear whether these are PCIe protocol issues or
> > > > > > driver/firmware interaction issues.
> > > > > 
> > > > > I think it's the latter case, the ~0 data was fabricated by the
> > > > > root complex, as the MMIO read in check_access() always returns
> > > > > ~0 until a reboot or a rescan happens.
> > > > 
> > > > If the root complex fabricates ~0, that means a PCIe transaction
> > > > failed, i.e., the device didn't respond.  Rescan only does config
> > > > reads and writes.  Why should that cause the PCIe transactions to
> > > > magically start working?
> > > 
> > > I took a closer look. What I observed was like this. A firmware
> > > reset cleared some CSR settings including the MSE and MBE bits and
> > > the Base Address Registers. With a rescan (removing the switch to
> > > which the management EP was binded from root port and rescan), the
> > > management EP was re-enumerated and driver was re-probed, so that
> > > the settings cleared by the firmware reset was properly setup
> > > again,
> > > therefore PCIe transactions start working.
> > 
> > I think what you just said is that
> > 
> >   - the driver asked the firmware to reset the device
> > 
> >   - the firmware did reset the device, which cleared Memory Space
> >     Enable
> > 
> >   - nothing restored the device config after the reset, so Memory
> >     Space Enable remains cleared
> > 
> >   - the driver does MMIO reads to figure out when the reset has
> >     completed
> > 
> >   - the device doesn't respond to the PCIe Memory Reads because
> > Memory
> >     Space Enable is cleared
> > 
> >   - the root complex sees a timeout or error completion and
> > fabricates
> >     ~0 data for the CPU read
> > 
> >   - the driver sees ~0 data from the MMIO read and thinks the device
> >     or firmware is hung
> > 
> > If that's all true, I think the patch is sort of a band-aid that
> > doesn't fix the problem at all but only makes the driver's response
> > to
> > it marginally better.  But the device is still unusable until a
> > rescan
> > or reboot.
> > 
> > So I think we should drop this patch and do something to restore the
> > device state after the reset.
> 
> Do you mean we should do something at the driver level to automatically
> try to restore the device state after the reset? I was thinking it's up
> to the user to make the call to restore the device state or take other
> actions, so that returning an error code from MRPC to indicate what
> happened would be good enough for the driver. 

It sounds like this is a completely predictable situation.  Why would
you want manual user intervention?

I'm pretty sure there are drivers that save state *before* the reset
and restore it afterwards.

> Can you possibly shed light on what might be a reasonable way to
> restore the device state in the driver if applicable? I was just doing
> it by leveraging the remove and rescan interfaces in the sysfs.
> 
> That's all true. I lean towards keeping the patch as I think making the
> response better under the following situations might not be bad.
> 
>   - The firmware reset case, as we discussed. I'd think it's still
> useful for users to get a fast error return which indicates something
> bad happened and some actions need to be taken either to abort or try
> to recover. In this case, we are assuming that a firmware reset will
> boot the firmware successfully.

So wait, you mean you just intentionally ask the firmware to reset,
knowing that the device will be unusable until the user reboots or
does a manual rescan?  And the way to improve this is for the driver
to report an error to the user instead of hanging?  I *guess* that
might be some sort of improvement, but seems like a pretty small one.

>   - The firwmare crashes and doesn't respond, which normally is the
> reason for users to issue a firmware reset command to try to recover it
> via either the driver or a sideband interface. The firmware may not be
> able to recover by a reset in some extream situations like hardware
> errors, so that an error return is probably all the users can get
> before another level of recovery happens.
> 
> So I'd think this patch is still making the driver better in some way.
> 
> Kelvin
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-06 20:20                         ` Bjorn Helgaas
@ 2021-10-06 21:27                           ` Kelvin.Cao
  2021-10-07 21:23                             ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-06 21:27 UTC (permalink / raw)
  To: helgaas
  Cc: kurt.schwemmer, bhelgaas, kelvincao, logang, linux-kernel, linux-pci

On Wed, 2021-10-06 at 15:20 -0500, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2021 at 07:00:55PM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Wed, 2021-10-06 at 09:19 -0500, Bjorn Helgaas wrote:
> > > On Wed, Oct 06, 2021 at 05:49:29AM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > > On Tue, 2021-10-05 at 21:33 -0500, Bjorn Helgaas wrote:
> > > > > On Wed, Oct 06, 2021 at 12:37:02AM +0000,
> > > > > Kelvin.Cao@microchip.com
> > > > > wrote:
> > > > > > On Tue, 2021-10-05 at 15:11 -0500, Bjorn Helgaas wrote:
> > > > > > > On Mon, Oct 04, 2021 at 08:51:06PM +0000,
> > > > > > > Kelvin.Cao@microchip.com
> > > > > > > wrote:
> > > > > > > > On Sat, 2021-10-02 at 10:11 -0500, Bjorn Helgaas wrote:
> > > > > > > > > I *thought* the problem was that the PCIe Memory Read
> > > > > > > > > failed and the Root Complex fabricated ~0 data to
> > > > > > > > > complete
> > > > > > > > > the CPU read.  But now I'm not sure, because it
> > > > > > > > > sounds
> > > > > > > > > like it might be that the PCIe transaction succeeds,
> > > > > > > > > but
> > > > > > > > > it reads data that hasn't been updated by the
> > > > > > > > > firmware,
> > > > > > > > > i.e., it reads 'in progress' because firmware hasn't
> > > > > > > > > updated it to 'done'.
> > > > > > > > 
> > > > > > > > The original message was sort of misleading. After a
> > > > > > > > firmware reset, CPU getting ~0 for the PCIe Memory Read
> > > > > > > > doesn't explain the hang.  In a MRPC execution (DMA
> > > > > > > > MRPC
> > > > > > > > mode), the MRPC status which is located in the host
> > > > > > > > memory,
> > > > > > > > gets initialized by the CPU and updated/finalized by
> > > > > > > > the
> > > > > > > > firmware. In the situation of a firmware reset, any
> > > > > > > > MRPC
> > > > > > > > initiated afterwards will not get the status updated by
> > > > > > > > the
> > > > > > > > firmware per the reason you pointed out above (or
> > > > > > > > similar,
> > > > > > > > to my understanding, firmware can no longer DMA data to
> > > > > > > > host
> > > > > > > > memory in such cases), therefore the MRPC execution
> > > > > > > > will
> > > > > > > > never end.
> > > > > > > 
> > > > > > > I'm glad this makes sense to you, because it still
> > > > > > > doesn't to
> > > > > > > me.
> > > > > > > 
> > > > > > > check_access() does an MMIO read to something in
> > > > > > > BAR0.  If
> > > > > > > that read returns ~0, it means either the PCIe Memory
> > > > > > > Read
> > > > > > > was
> > > > > > > successful and the Switchtec device supplied ~0 data
> > > > > > > (maybe
> > > > > > > because firmware has not initialized that part of the
> > > > > > > BAR) or
> > > > > > > the PCIe Memory Read failed and the root complex
> > > > > > > fabricated
> > > > > > > the ~0 data.
> > > > > > > 
> > > > > > > I'd like to know which one is happening so we can clarify
> > > > > > > the
> > > > > > > commit log text about "MRPC command executions hang
> > > > > > > indefinitely" and "host wil fail all GAS reads."  It's
> > > > > > > not
> > > > > > > clear whether these are PCIe protocol issues or
> > > > > > > driver/firmware interaction issues.
> > > > > > 
> > > > > > I think it's the latter case, the ~0 data was fabricated by
> > > > > > the
> > > > > > root complex, as the MMIO read in check_access() always
> > > > > > returns
> > > > > > ~0 until a reboot or a rescan happens.
> > > > > 
> > > > > If the root complex fabricates ~0, that means a PCIe
> > > > > transaction
> > > > > failed, i.e., the device didn't respond.  Rescan only does
> > > > > config
> > > > > reads and writes.  Why should that cause the PCIe
> > > > > transactions to
> > > > > magically start working?
> > > > 
> > > > I took a closer look. What I observed was like this. A firmware
> > > > reset cleared some CSR settings including the MSE and MBE bits
> > > > and
> > > > the Base Address Registers. With a rescan (removing the switch
> > > > to
> > > > which the management EP was binded from root port and rescan),
> > > > the
> > > > management EP was re-enumerated and driver was re-probed, so
> > > > that
> > > > the settings cleared by the firmware reset was properly setup
> > > > again,
> > > > therefore PCIe transactions start working.
> > > 
> > > I think what you just said is that
> > > 
> > >   - the driver asked the firmware to reset the device
> > > 
> > >   - the firmware did reset the device, which cleared Memory Space
> > >     Enable
> > > 
> > >   - nothing restored the device config after the reset, so Memory
> > >     Space Enable remains cleared
> > > 
> > >   - the driver does MMIO reads to figure out when the reset has
> > >     completed
> > > 
> > >   - the device doesn't respond to the PCIe Memory Reads because
> > > Memory
> > >     Space Enable is cleared
> > > 
> > >   - the root complex sees a timeout or error completion and
> > > fabricates
> > >     ~0 data for the CPU read
> > > 
> > >   - the driver sees ~0 data from the MMIO read and thinks the
> > > device
> > >     or firmware is hung
> > > 
> > > If that's all true, I think the patch is sort of a band-aid that
> > > doesn't fix the problem at all but only makes the driver's
> > > response
> > > to
> > > it marginally better.  But the device is still unusable until a
> > > rescan
> > > or reboot.
> > > 
> > > So I think we should drop this patch and do something to restore
> > > the
> > > device state after the reset.
> > 
> > Do you mean we should do something at the driver level to
> > automatically
> > try to restore the device state after the reset? I was thinking
> > it's up
> > to the user to make the call to restore the device state or take
> > other
> > actions, so that returning an error code from MRPC to indicate what
> > happened would be good enough for the driver.
> 
> It sounds like this is a completely predictable situation.  Why would
> you want manual user intervention?
> 
> I'm pretty sure there are drivers that save state *before* the reset
> and restore it afterwards.

Sometimes it's not predicatable. We have various interfaces to talk to
the firmware, like in-band, UART, TWI, etc. If a firmware reset is
issued to the firmware via the TWI sideband interface from BMC, neither
the host utility which might be periodically issueing MRPC commands to
monitor the switch nor the driver will be aware of this.

If the reset command is issued via the driver, as it's just another
MRPC command initiated by a user space process, the driver doesn't
actually know what will be happening unless it decodes each MRPC it
forwards to the firmware and checks for the reset command, but I doubt
it's something the driver should do.

> 
> > Can you possibly shed light on what might be a reasonable way to
> > restore the device state in the driver if applicable? I was just
> > doing
> > it by leveraging the remove and rescan interfaces in the sysfs.
> > 
> > That's all true. I lean towards keeping the patch as I think making
> > the
> > response better under the following situations might not be bad.
> > 
> >   - The firmware reset case, as we discussed. I'd think it's still
> > useful for users to get a fast error return which indicates
> > something
> > bad happened and some actions need to be taken either to abort or
> > try
> > to recover. In this case, we are assuming that a firmware reset
> > will
> > boot the firmware successfully.
> 
> So wait, you mean you just intentionally ask the firmware to reset,
> knowing that the device will be unusable until the user reboots or
> does a manual rescan?  And the way to improve this is for the driver
> to report an error to the user instead of hanging?  I *guess* that
> might be some sort of improvement, but seems like a pretty small one.

Yes, however, I believe it's something our users really like to have...
With this, they can do their user space programming/scripting more
easily in a synchronous fashion.

> 
> >   - The firwmare crashes and doesn't respond, which normally is the
> > reason for users to issue a firmware reset command to try to
> > recover it
> > via either the driver or a sideband interface. The firmware may not
> > be
> > able to recover by a reset in some extream situations like hardware
> > errors, so that an error return is probably all the users can get
> > before another level of recovery happens.
> > 
> > So I'd think this patch is still making the driver better in some
> > way.
> > 
> > Kelvin
> > 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-06 21:27                           ` Kelvin.Cao
@ 2021-10-07 21:23                             ` Bjorn Helgaas
  2021-10-08  0:06                               ` Kelvin.Cao
  0 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-07 21:23 UTC (permalink / raw)
  To: Kelvin.Cao
  Cc: kurt.schwemmer, bhelgaas, kelvincao, logang, linux-kernel, linux-pci

On Wed, Oct 06, 2021 at 09:27:49PM +0000, Kelvin.Cao@microchip.com wrote:
> On Wed, 2021-10-06 at 15:20 -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2021 at 07:00:55PM +0000, Kelvin.Cao@microchip.com
> > wrote:

> > So wait, you mean you just intentionally ask the firmware to
> > reset, knowing that the device will be unusable until the user
> > reboots or does a manual rescan?  And the way to improve this is
> > for the driver to report an error to the user instead of hanging?
> > I *guess* that might be some sort of improvement, but seems like a
> > pretty small one.
> 
> Yes, however, I believe it's something our users really like to
> have...  With this, they can do their user space
> programming/scripting more easily in a synchronous fashion.
> 
> > >   - The firwmare crashes and doesn't respond, which normally is
> > >   the reason for users to issue a firmware reset command to try
> > >   to recover it via either the driver or a sideband interface.
> > >   The firmware may not be able to recover by a reset in some
> > >   extream situations like hardware errors, so that an error
> > >   return is probably all the users can get before another level
> > >   of recovery happens.
> > > 
> > > So I'd think this patch is still making the driver better in
> > > some way.

OK.  I still think the fact that all these different mechanisms can
reset the device behind your back and make the switch and anything on
the other side of it just stop working is ..., well, let's just say
it's quite surprising to me.

Well, at least this isn't quite so much a mystery anymore and maybe we
can improve the commit log.  E.g., maybe something like this:

  A firmware hard reset may be initiated by various mechanisms
  including a UART interface, TWI sideband interface from BMC, MRPC
  command from userspace, etc.  The switchtec management driver is
  unaware of these resets.

  The reset clears PCI state including the BARs and Memory Space
  Enable bits, so the device no longer responds to the MMIO accesses
  the driver uses to operate it.

  MMIO reads to the device will fail with a PCIe error.  When the root
  complex handles that error, it typically fabricates ~0 data to
  complete the CPU read.

  Check for this sort of error by reading the device ID from MMIO
  space.  This ID can never be ~0, so if we see that value, it
  probably means the PCIe Memory Read failed and we should return an
  error indication to the application using the switchtec driver.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-07 21:23                             ` Bjorn Helgaas
@ 2021-10-08  0:06                               ` Kelvin.Cao
  2021-10-08 11:03                                 ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Kelvin.Cao @ 2021-10-08  0:06 UTC (permalink / raw)
  To: helgaas
  Cc: kurt.schwemmer, bhelgaas, linux-pci, kelvincao, logang, linux-kernel

On Thu, 2021-10-07 at 16:23 -0500, Bjorn Helgaas wrote:
> On Wed, Oct 06, 2021 at 09:27:49PM +0000, Kelvin.Cao@microchip.com
> wrote:
> > On Wed, 2021-10-06 at 15:20 -0500, Bjorn Helgaas wrote:
> > > On Wed, Oct 06, 2021 at 07:00:55PM +0000, 
> > > Kelvin.Cao@microchip.com
> > > wrote:
> > > So wait, you mean you just intentionally ask the firmware to
> > > reset, knowing that the device will be unusable until the user
> > > reboots or does a manual rescan?  And the way to improve this is
> > > for the driver to report an error to the user instead of hanging?
> > > I *guess* that might be some sort of improvement, but seems like
> > > a
> > > pretty small one.
> > 
> > Yes, however, I believe it's something our users really like to
> > have...  With this, they can do their user space
> > programming/scripting more easily in a synchronous fashion.
> > 
> > > >   - The firwmare crashes and doesn't respond, which normally is
> > > >   the reason for users to issue a firmware reset command to try
> > > >   to recover it via either the driver or a sideband interface.
> > > >   The firmware may not be able to recover by a reset in some
> > > >   extream situations like hardware errors, so that an error
> > > >   return is probably all the users can get before another level
> > > >   of recovery happens.
> > > > 
> > > > So I'd think this patch is still making the driver better in
> > > > some way.
> 
> OK.  I still think the fact that all these different mechanisms can
> reset the device behind your back and make the switch and anything on
> the other side of it just stop working is ..., well, let's just say
> it's quite surprising to me.

Actually there're mechanisms like permission control to limit what
people can do in the firmware, so I guess it's not as bad as it sounds
like. 
> 
> Well, at least this isn't quite so much a mystery anymore and maybe
> we
> can improve the commit log.  E.g., maybe something like this:
> 
>   A firmware hard reset may be initiated by various mechanisms
>   including a UART interface, TWI sideband interface from BMC, MRPC
>   command from userspace, etc.  The switchtec management driver is
>   unaware of these resets.
> 
>   The reset clears PCI state including the BARs and Memory Space
>   Enable bits, so the device no longer responds to the MMIO accesses
>   the driver uses to operate it.
> 
>   MMIO reads to the device will fail with a PCIe error.  When the
> root
>   complex handles that error, it typically fabricates ~0 data to
>   complete the CPU read.
> 
>   Check for this sort of error by reading the device ID from MMIO
>   space.  This ID can never be ~0, so if we see that value, it
>   probably means the PCIe Memory Read failed and we should return an
>   error indication to the application using the switchtec driver.

It looks good to me, the commit log removes the ambiguity. Let me know
if you prefer a v2 patchset with the updated commit log and naming
issue fix.

Thank you Bjorn for your patience and time! 

Kelvin

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access
  2021-10-08  0:06                               ` Kelvin.Cao
@ 2021-10-08 11:03                                 ` Bjorn Helgaas
  0 siblings, 0 replies; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-08 11:03 UTC (permalink / raw)
  To: Kelvin.Cao
  Cc: kurt.schwemmer, bhelgaas, linux-pci, kelvincao, logang, linux-kernel

On Fri, Oct 08, 2021 at 12:06:18AM +0000, Kelvin.Cao@microchip.com wrote:
> On Thu, 2021-10-07 at 16:23 -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 06, 2021 at 09:27:49PM +0000, Kelvin.Cao@microchip.com
> > wrote:
> > > On Wed, 2021-10-06 at 15:20 -0500, Bjorn Helgaas wrote:
> > > > On Wed, Oct 06, 2021 at 07:00:55PM +0000, 
> > > > Kelvin.Cao@microchip.com
> > > > wrote:
> > > > So wait, you mean you just intentionally ask the firmware to
> > > > reset, knowing that the device will be unusable until the user
> > > > reboots or does a manual rescan?  And the way to improve this is
> > > > for the driver to report an error to the user instead of hanging?
> > > > I *guess* that might be some sort of improvement, but seems like
> > > > a
> > > > pretty small one.
> > > 
> > > Yes, however, I believe it's something our users really like to
> > > have...  With this, they can do their user space
> > > programming/scripting more easily in a synchronous fashion.
> > > 
> > > > >   - The firwmare crashes and doesn't respond, which normally is
> > > > >   the reason for users to issue a firmware reset command to try
> > > > >   to recover it via either the driver or a sideband interface.
> > > > >   The firmware may not be able to recover by a reset in some
> > > > >   extream situations like hardware errors, so that an error
> > > > >   return is probably all the users can get before another level
> > > > >   of recovery happens.
> > > > > 
> > > > > So I'd think this patch is still making the driver better in
> > > > > some way.
> > 
> > OK.  I still think the fact that all these different mechanisms can
> > reset the device behind your back and make the switch and anything on
> > the other side of it just stop working is ..., well, let's just say
> > it's quite surprising to me.
> 
> Actually there're mechanisms like permission control to limit what
> people can do in the firmware, so I guess it's not as bad as it sounds
> like. 
> > 
> > Well, at least this isn't quite so much a mystery anymore and maybe
> > we
> > can improve the commit log.  E.g., maybe something like this:
> > 
> >   A firmware hard reset may be initiated by various mechanisms
> >   including a UART interface, TWI sideband interface from BMC, MRPC
> >   command from userspace, etc.  The switchtec management driver is
> >   unaware of these resets.
> > 
> >   The reset clears PCI state including the BARs and Memory Space
> >   Enable bits, so the device no longer responds to the MMIO accesses
> >   the driver uses to operate it.
> > 
> >   MMIO reads to the device will fail with a PCIe error.  When the
> > root
> >   complex handles that error, it typically fabricates ~0 data to
> >   complete the CPU read.
> > 
> >   Check for this sort of error by reading the device ID from MMIO
> >   space.  This ID can never be ~0, so if we see that value, it
> >   probably means the PCIe Memory Read failed and we should return an
> >   error indication to the application using the switchtec driver.
> 
> It looks good to me, the commit log removes the ambiguity. Let me know
> if you prefer a v2 patchset with the updated commit log and naming
> issue fix.

Yes, if you post a v2 of this patch, I'll update my pci/switchtec
branch with it.  Thanks for your patience!

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 0/5] Switchtec Fixes and Improvements
  2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
                   ` (6 preceding siblings ...)
  2021-09-27 16:39 ` Bjorn Helgaas
@ 2021-10-08 17:05 ` Bjorn Helgaas
  2021-10-08 17:23   ` Logan Gunthorpe
  7 siblings, 1 reply; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-08 17:05 UTC (permalink / raw)
  To: kelvin.cao
  Cc: kurt.schwemmer, logang, bhelgaas, linux-pci, linux-kernel, kelvincao

On Fri, Sep 24, 2021 at 11:08:37AM +0000, kelvin.cao@microchip.com wrote:
> From: Kelvin Cao <kelvin.cao@microchip.com>
> 
> Hi,
> 
> Please find a bunch of patches for the switchtec driver collected over the
> last few months.

Question: Is there a reason this driver should be in drivers/pci/?

It doesn't use any internal PCI core interfaces, e.g., it doesn't
include drivers/pci/pci.h, and AFAICT it's really just a driver for a
PCI device that happens to be a switch.

I don't really *care* that it's in drivers/pci; I rely on Kurt and
Logan to review changes.  The only problem it presents for me is that
I have to write merge commit logs for the changes.  You'd think that
would be trivial, but since I don't know much about the driver, it
does end up being work for me.

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 0/5] Switchtec Fixes and Improvements
  2021-10-08 17:05 ` Bjorn Helgaas
@ 2021-10-08 17:23   ` Logan Gunthorpe
  2021-10-08 18:25     ` Bjorn Helgaas
  0 siblings, 1 reply; 32+ messages in thread
From: Logan Gunthorpe @ 2021-10-08 17:23 UTC (permalink / raw)
  To: Bjorn Helgaas, kelvin.cao
  Cc: kurt.schwemmer, bhelgaas, linux-pci, linux-kernel, kelvincao




On 2021-10-08 11:05 a.m., Bjorn Helgaas wrote:
> On Fri, Sep 24, 2021 at 11:08:37AM +0000, kelvin.cao@microchip.com wrote:
>> From: Kelvin Cao <kelvin.cao@microchip.com>
>>
>> Hi,
>>
>> Please find a bunch of patches for the switchtec driver collected over the
>> last few months.
> 
> Question: Is there a reason this driver should be in drivers/pci/?
> 
> It doesn't use any internal PCI core interfaces, e.g., it doesn't
> include drivers/pci/pci.h, and AFAICT it's really just a driver for a
> PCI device that happens to be a switch.
> 
> I don't really *care* that it's in drivers/pci; I rely on Kurt and
> Logan to review changes.  The only problem it presents for me is that
> I have to write merge commit logs for the changes.  You'd think that
> would be trivial, but since I don't know much about the driver, it
> does end up being work for me.

We did discuss this when it was originally merged.

The main reason we want it in the PCI tree is so that it's in a sensible
spot in the Kconfig hierarchy (under PCI support). Seeing it is still
PCI hardware. Dropping it into the miscellaneous devices mess (or
similar) is less than desirable. Moreover, it's not like the maintainers
for misc have any additional knowledge that would make them better
qualified to merge these changes. In fact, I'm sure they'd have less
knowledge and we wouldn't have gotten to the bottom of this last issue
if it had been a different maintainer.

In the future I'll try to be more careful in my reviews to ensure we
have a better understanding and clearer commit messages. If there's
anything else we can do to make your job easier, please let us know.

Thanks,

Logan

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 0/5] Switchtec Fixes and Improvements
  2021-10-08 17:23   ` Logan Gunthorpe
@ 2021-10-08 18:25     ` Bjorn Helgaas
  0 siblings, 0 replies; 32+ messages in thread
From: Bjorn Helgaas @ 2021-10-08 18:25 UTC (permalink / raw)
  To: Logan Gunthorpe
  Cc: kelvin.cao, kurt.schwemmer, bhelgaas, linux-pci, linux-kernel, kelvincao

On Fri, Oct 08, 2021 at 11:23:46AM -0600, Logan Gunthorpe wrote:
> On 2021-10-08 11:05 a.m., Bjorn Helgaas wrote:
> > On Fri, Sep 24, 2021 at 11:08:37AM +0000, kelvin.cao@microchip.com wrote:
> >> From: Kelvin Cao <kelvin.cao@microchip.com>
> >>
> >> Hi,
> >>
> >> Please find a bunch of patches for the switchtec driver collected over the
> >> last few months.
> > 
> > Question: Is there a reason this driver should be in drivers/pci/?
> > 
> > It doesn't use any internal PCI core interfaces, e.g., it doesn't
> > include drivers/pci/pci.h, and AFAICT it's really just a driver for a
> > PCI device that happens to be a switch.
> > 
> > I don't really *care* that it's in drivers/pci; I rely on Kurt and
> > Logan to review changes.  The only problem it presents for me is that
> > I have to write merge commit logs for the changes.  You'd think that
> > would be trivial, but since I don't know much about the driver, it
> > does end up being work for me.
> 
> We did discuss this when it was originally merged.

Thanks, I thought I remembered talking about it, but didn't bother to
dig it up.

> The main reason we want it in the PCI tree is so that it's in a sensible
> spot in the Kconfig hierarchy (under PCI support). Seeing it is still
> PCI hardware. Dropping it into the miscellaneous devices mess (or
> similar) is less than desirable. Moreover, it's not like the maintainers
> for misc have any additional knowledge that would make them better
> qualified to merge these changes. In fact, I'm sure they'd have less
> knowledge and we wouldn't have gotten to the bottom of this last issue
> if it had been a different maintainer.
> 
> In the future I'll try to be more careful in my reviews to ensure we
> have a better understanding and clearer commit messages. If there's
> anything else we can do to make your job easier, please let us know.

Oh, please don't take this as me complaining about anybody's reviews!
I honestly just look for your or Kurt's ack.  I think I just need to
be a little less fixated on writing the merge commit logs :)

Bjorn

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2021-10-08 18:25 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-24 11:08 [PATCH 0/5] Switchtec Fixes and Improvements kelvin.cao
2021-09-24 11:08 ` [PATCH 1/5] PCI/switchtec: Error out MRPC execution when no GAS access kelvin.cao
2021-10-01 20:18   ` Bjorn Helgaas
2021-10-01 20:29     ` Logan Gunthorpe
2021-10-01 23:49       ` Kelvin.Cao
2021-10-02 15:11         ` Bjorn Helgaas
2021-10-04 20:51           ` Kelvin.Cao
2021-10-05 20:11             ` Bjorn Helgaas
2021-10-06  0:37               ` Kelvin.Cao
2021-10-06  2:33                 ` Bjorn Helgaas
2021-10-06  5:49                   ` Kelvin.Cao
2021-10-06 14:19                     ` Bjorn Helgaas
2021-10-06 19:00                       ` Kelvin.Cao
2021-10-06 20:20                         ` Bjorn Helgaas
2021-10-06 21:27                           ` Kelvin.Cao
2021-10-07 21:23                             ` Bjorn Helgaas
2021-10-08  0:06                               ` Kelvin.Cao
2021-10-08 11:03                                 ` Bjorn Helgaas
2021-10-01 22:58     ` Kelvin.Cao
2021-10-01 23:52       ` Logan Gunthorpe
2021-10-02  0:05         ` Kelvin.Cao
2021-09-24 11:08 ` [PATCH 2/5] PCI/switchtec: Fix a MRPC error status handling issue kelvin.cao
2021-09-24 11:08 ` [PATCH 3/5] PCI/switchtec: Update the way of getting management VEP instance ID kelvin.cao
2021-09-24 11:08 ` [PATCH 4/5] PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP kelvin.cao
2021-09-24 11:08 ` [PATCH 5/5] PCI/switchtec: Add check of event support kelvin.cao
2021-09-24 15:53 ` [PATCH 0/5] Switchtec Fixes and Improvements Logan Gunthorpe
2021-09-25  5:27   ` Kelvin.Cao
2021-09-27 16:39 ` Bjorn Helgaas
2021-09-27 18:25   ` Kelvin.Cao
2021-10-08 17:05 ` Bjorn Helgaas
2021-10-08 17:23   ` Logan Gunthorpe
2021-10-08 18:25     ` Bjorn Helgaas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).