linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
@ 2021-07-05  9:00 Renius Chen
  2021-07-05 10:02 ` Ulf Hansson
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-07-05  9:00 UTC (permalink / raw)
  To: ulf.hansson, adrian.hunter
  Cc: linux-mmc, linux-kernel, Ben.Chuang, Renius Chen

During a sequence of random 4K read operations, the performance will be
reduced due to spending much time on entering/exiting the low power state
between requests. We disable the low power state negotiation of GL9763E
during a sequence of random 4K read operations to improve the performance
and enable it again after the operations have finished.

Signed-off-by: Renius Chen <reniuschengl@gmail.com>
---
 drivers/mmc/host/sdhci-pci-gli.c | 68 ++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
index 302a7579a9b3..5f1f332b4241 100644
--- a/drivers/mmc/host/sdhci-pci-gli.c
+++ b/drivers/mmc/host/sdhci-pci-gli.c
@@ -88,6 +88,9 @@
 #define PCIE_GLI_9763E_SCR	 0x8E0
 #define   GLI_9763E_SCR_AXI_REQ	   BIT(9)
 
+#define PCIE_GLI_9763E_CFG       0x8A0
+#define   GLI_9763E_CFG_LPSN_DIS   BIT(12)
+
 #define PCIE_GLI_9763E_CFG2      0x8A4
 #define   GLI_9763E_CFG2_L1DLY     GENMASK(28, 19)
 #define   GLI_9763E_CFG2_L1DLY_MID 0x54
@@ -128,6 +131,11 @@
 
 #define GLI_MAX_TUNING_LOOP 40
 
+struct gli_host {
+	bool start_4k_r;
+	int continuous_4k_r;
+};
+
 /* Genesys Logic chipset */
 static inline void gl9750_wt_on(struct sdhci_host *host)
 {
@@ -691,6 +699,62 @@ static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc)
 	sdhci_dumpregs(mmc_priv(mmc));
 }
 
+static void gl9763e_set_low_power_negotiation(struct sdhci_pci_slot *slot, bool enable)
+{
+	struct pci_dev *pdev = slot->chip->pdev;
+	u32 value;
+
+	pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
+	value &= ~GLI_9763E_VHS_REV;
+	value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_W);
+	pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
+
+	pci_read_config_dword(pdev, PCIE_GLI_9763E_CFG, &value);
+
+	if (enable)
+		value &= ~GLI_9763E_CFG_LPSN_DIS;
+	else
+		value |= GLI_9763E_CFG_LPSN_DIS;
+
+	pci_write_config_dword(pdev, PCIE_GLI_9763E_CFG, value);
+
+	pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
+	value &= ~GLI_9763E_VHS_REV;
+	value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_R);
+	pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
+}
+
+static void gl9763e_request(struct mmc_host *mmc, struct mmc_request *mrq)
+{
+	struct sdhci_host *host = mmc_priv(mmc);
+	struct mmc_command *cmd;
+	struct sdhci_pci_slot *slot = sdhci_priv(host);
+	struct gli_host *gli_host = sdhci_pci_priv(slot);
+
+	cmd = mrq->cmd;
+
+	if (cmd && (cmd->opcode == MMC_READ_MULTIPLE_BLOCK) && (cmd->data->blocks == 8)) {
+		gli_host->continuous_4k_r++;
+
+		if ((!gli_host->start_4k_r) && (gli_host->continuous_4k_r >= 3)) {
+			gl9763e_set_low_power_negotiation(slot, false);
+
+			gli_host->start_4k_r = true;
+		}
+	} else {
+		gli_host->continuous_4k_r = 0;
+
+		if (gli_host->start_4k_r)	{
+			gl9763e_set_low_power_negotiation(slot, true);
+
+			gli_host->start_4k_r = false;
+		}
+	}
+
+	sdhci_request(mmc, mrq);
+}
+
+
 static void sdhci_gl9763e_cqe_pre_enable(struct mmc_host *mmc)
 {
 	struct cqhci_host *cq_host = mmc->cqe_private;
@@ -848,6 +912,9 @@ static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot)
 	gli_pcie_enable_msi(slot);
 	host->mmc_host_ops.hs400_enhanced_strobe =
 					gl9763e_hs400_enhanced_strobe;
+
+	host->mmc_host_ops.request = gl9763e_request;
+
 	gli_set_gl9763e(slot);
 	sdhci_enable_v4_mode(host);
 
@@ -913,4 +980,5 @@ const struct sdhci_pci_fixes sdhci_gl9763e = {
 	.suspend	= sdhci_cqhci_gli_suspend,
 #endif
 	.add_host       = gl9763e_add_host,
+	.priv_size      = sizeof(struct gli_host),
 };
-- 
2.27.0


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-05  9:00 [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E Renius Chen
@ 2021-07-05 10:02 ` Ulf Hansson
  2021-07-05 10:59   ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Ulf Hansson @ 2021-07-05 10:02 UTC (permalink / raw)
  To: Renius Chen
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

On Mon, 5 Jul 2021 at 11:00, Renius Chen <reniuschengl@gmail.com> wrote:
>
> During a sequence of random 4K read operations, the performance will be
> reduced due to spending much time on entering/exiting the low power state
> between requests. We disable the low power state negotiation of GL9763E
> during a sequence of random 4K read operations to improve the performance
> and enable it again after the operations have finished.
>
> Signed-off-by: Renius Chen <reniuschengl@gmail.com>
> ---
>  drivers/mmc/host/sdhci-pci-gli.c | 68 ++++++++++++++++++++++++++++++++
>  1 file changed, 68 insertions(+)
>
> diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
> index 302a7579a9b3..5f1f332b4241 100644
> --- a/drivers/mmc/host/sdhci-pci-gli.c
> +++ b/drivers/mmc/host/sdhci-pci-gli.c
> @@ -88,6 +88,9 @@
>  #define PCIE_GLI_9763E_SCR      0x8E0
>  #define   GLI_9763E_SCR_AXI_REQ           BIT(9)
>
> +#define PCIE_GLI_9763E_CFG       0x8A0
> +#define   GLI_9763E_CFG_LPSN_DIS   BIT(12)
> +
>  #define PCIE_GLI_9763E_CFG2      0x8A4
>  #define   GLI_9763E_CFG2_L1DLY     GENMASK(28, 19)
>  #define   GLI_9763E_CFG2_L1DLY_MID 0x54
> @@ -128,6 +131,11 @@
>
>  #define GLI_MAX_TUNING_LOOP 40
>
> +struct gli_host {
> +       bool start_4k_r;
> +       int continuous_4k_r;
> +};
> +
>  /* Genesys Logic chipset */
>  static inline void gl9750_wt_on(struct sdhci_host *host)
>  {
> @@ -691,6 +699,62 @@ static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc)
>         sdhci_dumpregs(mmc_priv(mmc));
>  }
>
> +static void gl9763e_set_low_power_negotiation(struct sdhci_pci_slot *slot, bool enable)
> +{
> +       struct pci_dev *pdev = slot->chip->pdev;
> +       u32 value;
> +
> +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> +       value &= ~GLI_9763E_VHS_REV;
> +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_W);
> +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> +
> +       pci_read_config_dword(pdev, PCIE_GLI_9763E_CFG, &value);
> +
> +       if (enable)
> +               value &= ~GLI_9763E_CFG_LPSN_DIS;
> +       else
> +               value |= GLI_9763E_CFG_LPSN_DIS;
> +
> +       pci_write_config_dword(pdev, PCIE_GLI_9763E_CFG, value);
> +
> +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> +       value &= ~GLI_9763E_VHS_REV;
> +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_R);
> +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> +}
> +
> +static void gl9763e_request(struct mmc_host *mmc, struct mmc_request *mrq)
> +{
> +       struct sdhci_host *host = mmc_priv(mmc);
> +       struct mmc_command *cmd;
> +       struct sdhci_pci_slot *slot = sdhci_priv(host);
> +       struct gli_host *gli_host = sdhci_pci_priv(slot);
> +
> +       cmd = mrq->cmd;
> +
> +       if (cmd && (cmd->opcode == MMC_READ_MULTIPLE_BLOCK) && (cmd->data->blocks == 8)) {
> +               gli_host->continuous_4k_r++;
> +
> +               if ((!gli_host->start_4k_r) && (gli_host->continuous_4k_r >= 3)) {
> +                       gl9763e_set_low_power_negotiation(slot, false);
> +
> +                       gli_host->start_4k_r = true;
> +               }
> +       } else {
> +               gli_host->continuous_4k_r = 0;
> +
> +               if (gli_host->start_4k_r)       {
> +                       gl9763e_set_low_power_negotiation(slot, true);
> +
> +                       gli_host->start_4k_r = false;
> +               }
> +       }

The above code is trying to figure out what kind of storage use case
that is running, based on information about the buffers. This does not
work, simply because the buffers don't give you all the information
you need to make the right decisions.

Moreover, I am sure you would try to follow up with additional changes
on top, trying to tweak the behaviour to fit another use case - and so
on. My point is, this code doesn't belong in the lowest layer drivers.

To move forward, I suggest you explore using runtime PM in combination
with dev PM qos. In this way, the driver could implement a default
behaviour, which can be tweaked from upper layer governors for
example, but also from user space (via sysfs) allowing more
flexibility and potentially support for various more use cases.

> +
> +       sdhci_request(mmc, mrq);
> +}
> +
> +
>  static void sdhci_gl9763e_cqe_pre_enable(struct mmc_host *mmc)
>  {
>         struct cqhci_host *cq_host = mmc->cqe_private;
> @@ -848,6 +912,9 @@ static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot)
>         gli_pcie_enable_msi(slot);
>         host->mmc_host_ops.hs400_enhanced_strobe =
>                                         gl9763e_hs400_enhanced_strobe;
> +
> +       host->mmc_host_ops.request = gl9763e_request;
> +
>         gli_set_gl9763e(slot);
>         sdhci_enable_v4_mode(host);
>
> @@ -913,4 +980,5 @@ const struct sdhci_pci_fixes sdhci_gl9763e = {
>         .suspend        = sdhci_cqhci_gli_suspend,
>  #endif
>         .add_host       = gl9763e_add_host,
> +       .priv_size      = sizeof(struct gli_host),
>  };
> --

Kind regards
Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-05 10:02 ` Ulf Hansson
@ 2021-07-05 10:59   ` Renius Chen
  2021-07-05 12:50     ` Ulf Hansson
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-07-05 10:59 UTC (permalink / raw)
  To: Ulf Hansson
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月5日 週一 下午6:03寫道:
>
> On Mon, 5 Jul 2021 at 11:00, Renius Chen <reniuschengl@gmail.com> wrote:
> >
> > During a sequence of random 4K read operations, the performance will be
> > reduced due to spending much time on entering/exiting the low power state
> > between requests. We disable the low power state negotiation of GL9763E
> > during a sequence of random 4K read operations to improve the performance
> > and enable it again after the operations have finished.
> >
> > Signed-off-by: Renius Chen <reniuschengl@gmail.com>
> > ---
> >  drivers/mmc/host/sdhci-pci-gli.c | 68 ++++++++++++++++++++++++++++++++
> >  1 file changed, 68 insertions(+)
> >
> > diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
> > index 302a7579a9b3..5f1f332b4241 100644
> > --- a/drivers/mmc/host/sdhci-pci-gli.c
> > +++ b/drivers/mmc/host/sdhci-pci-gli.c
> > @@ -88,6 +88,9 @@
> >  #define PCIE_GLI_9763E_SCR      0x8E0
> >  #define   GLI_9763E_SCR_AXI_REQ           BIT(9)
> >
> > +#define PCIE_GLI_9763E_CFG       0x8A0
> > +#define   GLI_9763E_CFG_LPSN_DIS   BIT(12)
> > +
> >  #define PCIE_GLI_9763E_CFG2      0x8A4
> >  #define   GLI_9763E_CFG2_L1DLY     GENMASK(28, 19)
> >  #define   GLI_9763E_CFG2_L1DLY_MID 0x54
> > @@ -128,6 +131,11 @@
> >
> >  #define GLI_MAX_TUNING_LOOP 40
> >
> > +struct gli_host {
> > +       bool start_4k_r;
> > +       int continuous_4k_r;
> > +};
> > +
> >  /* Genesys Logic chipset */
> >  static inline void gl9750_wt_on(struct sdhci_host *host)
> >  {
> > @@ -691,6 +699,62 @@ static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc)
> >         sdhci_dumpregs(mmc_priv(mmc));
> >  }
> >
> > +static void gl9763e_set_low_power_negotiation(struct sdhci_pci_slot *slot, bool enable)
> > +{
> > +       struct pci_dev *pdev = slot->chip->pdev;
> > +       u32 value;
> > +
> > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > +       value &= ~GLI_9763E_VHS_REV;
> > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_W);
> > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > +
> > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_CFG, &value);
> > +
> > +       if (enable)
> > +               value &= ~GLI_9763E_CFG_LPSN_DIS;
> > +       else
> > +               value |= GLI_9763E_CFG_LPSN_DIS;
> > +
> > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_CFG, value);
> > +
> > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > +       value &= ~GLI_9763E_VHS_REV;
> > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_R);
> > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > +}
> > +
> > +static void gl9763e_request(struct mmc_host *mmc, struct mmc_request *mrq)
> > +{
> > +       struct sdhci_host *host = mmc_priv(mmc);
> > +       struct mmc_command *cmd;
> > +       struct sdhci_pci_slot *slot = sdhci_priv(host);
> > +       struct gli_host *gli_host = sdhci_pci_priv(slot);
> > +
> > +       cmd = mrq->cmd;
> > +
> > +       if (cmd && (cmd->opcode == MMC_READ_MULTIPLE_BLOCK) && (cmd->data->blocks == 8)) {
> > +               gli_host->continuous_4k_r++;
> > +
> > +               if ((!gli_host->start_4k_r) && (gli_host->continuous_4k_r >= 3)) {
> > +                       gl9763e_set_low_power_negotiation(slot, false);
> > +
> > +                       gli_host->start_4k_r = true;
> > +               }
> > +       } else {
> > +               gli_host->continuous_4k_r = 0;
> > +
> > +               if (gli_host->start_4k_r)       {
> > +                       gl9763e_set_low_power_negotiation(slot, true);
> > +
> > +                       gli_host->start_4k_r = false;
> > +               }
> > +       }
>
> The above code is trying to figure out what kind of storage use case
> that is running, based on information about the buffers. This does not
> work, simply because the buffers don't give you all the information
> you need to make the right decisions.
>
> Moreover, I am sure you would try to follow up with additional changes
> on top, trying to tweak the behaviour to fit another use case - and so
> on. My point is, this code doesn't belong in the lowest layer drivers.
>
> To move forward, I suggest you explore using runtime PM in combination
> with dev PM qos. In this way, the driver could implement a default
> behaviour, which can be tweaked from upper layer governors for
> example, but also from user space (via sysfs) allowing more
> flexibility and potentially support for various more use cases.
>

Hi Ulf,

Thanks for advice.

But we'll meet the performance issue only during a seqence of requests
of read commands with 4K data length.

So what we have to do is looking into the requests to monitor such
behaviors and disable the low power state negotiation of GL9763e. And
the information from the request buffer is sufficient for this
purpose.

We don't even care about if we disable the low power state negotiation
by a wrong decision because we'll enable it again by any requests
which are not read commands or their data length is not 4K. Disabling
the low power state negotiation of GL9763e not only has no side
effects but also helps its performance.

The behavior is only about the low power state negotiation of GL9763e
and 4K reads, and not related to runtime PM, so that we monitor the
requests and implement it in the driver of GL9763e.

Due to this behavior will only affect our GL9763e but not other
devices, so we think it could be implemented in the lower layer driver
of GL9763e, but not higher level or user space. And we are trying to
modify only our sdhci-pci-gli.c but not other mmc common codes.

Thank you!

> > +
> > +       sdhci_request(mmc, mrq);
> > +}
> > +
> > +
> >  static void sdhci_gl9763e_cqe_pre_enable(struct mmc_host *mmc)
> >  {
> >         struct cqhci_host *cq_host = mmc->cqe_private;
> > @@ -848,6 +912,9 @@ static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot)
> >         gli_pcie_enable_msi(slot);
> >         host->mmc_host_ops.hs400_enhanced_strobe =
> >                                         gl9763e_hs400_enhanced_strobe;
> > +
> > +       host->mmc_host_ops.request = gl9763e_request;
> > +
> >         gli_set_gl9763e(slot);
> >         sdhci_enable_v4_mode(host);
> >
> > @@ -913,4 +980,5 @@ const struct sdhci_pci_fixes sdhci_gl9763e = {
> >         .suspend        = sdhci_cqhci_gli_suspend,
> >  #endif
> >         .add_host       = gl9763e_add_host,
> > +       .priv_size      = sizeof(struct gli_host),
> >  };
> > --
>
> Kind regards
> Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-05 10:59   ` Renius Chen
@ 2021-07-05 12:50     ` Ulf Hansson
  2021-07-05 15:09       ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Ulf Hansson @ 2021-07-05 12:50 UTC (permalink / raw)
  To: Renius Chen
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

On Mon, 5 Jul 2021 at 12:59, Renius Chen <reniuschengl@gmail.com> wrote:
>
> Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月5日 週一 下午6:03寫道:
> >
> > On Mon, 5 Jul 2021 at 11:00, Renius Chen <reniuschengl@gmail.com> wrote:
> > >
> > > During a sequence of random 4K read operations, the performance will be
> > > reduced due to spending much time on entering/exiting the low power state
> > > between requests. We disable the low power state negotiation of GL9763E
> > > during a sequence of random 4K read operations to improve the performance
> > > and enable it again after the operations have finished.
> > >
> > > Signed-off-by: Renius Chen <reniuschengl@gmail.com>
> > > ---
> > >  drivers/mmc/host/sdhci-pci-gli.c | 68 ++++++++++++++++++++++++++++++++
> > >  1 file changed, 68 insertions(+)
> > >
> > > diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
> > > index 302a7579a9b3..5f1f332b4241 100644
> > > --- a/drivers/mmc/host/sdhci-pci-gli.c
> > > +++ b/drivers/mmc/host/sdhci-pci-gli.c
> > > @@ -88,6 +88,9 @@
> > >  #define PCIE_GLI_9763E_SCR      0x8E0
> > >  #define   GLI_9763E_SCR_AXI_REQ           BIT(9)
> > >
> > > +#define PCIE_GLI_9763E_CFG       0x8A0
> > > +#define   GLI_9763E_CFG_LPSN_DIS   BIT(12)
> > > +
> > >  #define PCIE_GLI_9763E_CFG2      0x8A4
> > >  #define   GLI_9763E_CFG2_L1DLY     GENMASK(28, 19)
> > >  #define   GLI_9763E_CFG2_L1DLY_MID 0x54
> > > @@ -128,6 +131,11 @@
> > >
> > >  #define GLI_MAX_TUNING_LOOP 40
> > >
> > > +struct gli_host {
> > > +       bool start_4k_r;
> > > +       int continuous_4k_r;
> > > +};
> > > +
> > >  /* Genesys Logic chipset */
> > >  static inline void gl9750_wt_on(struct sdhci_host *host)
> > >  {
> > > @@ -691,6 +699,62 @@ static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc)
> > >         sdhci_dumpregs(mmc_priv(mmc));
> > >  }
> > >
> > > +static void gl9763e_set_low_power_negotiation(struct sdhci_pci_slot *slot, bool enable)
> > > +{
> > > +       struct pci_dev *pdev = slot->chip->pdev;
> > > +       u32 value;
> > > +
> > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > > +       value &= ~GLI_9763E_VHS_REV;
> > > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_W);
> > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > > +
> > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_CFG, &value);
> > > +
> > > +       if (enable)
> > > +               value &= ~GLI_9763E_CFG_LPSN_DIS;
> > > +       else
> > > +               value |= GLI_9763E_CFG_LPSN_DIS;
> > > +
> > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_CFG, value);
> > > +
> > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > > +       value &= ~GLI_9763E_VHS_REV;
> > > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_R);
> > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > > +}
> > > +
> > > +static void gl9763e_request(struct mmc_host *mmc, struct mmc_request *mrq)
> > > +{
> > > +       struct sdhci_host *host = mmc_priv(mmc);
> > > +       struct mmc_command *cmd;
> > > +       struct sdhci_pci_slot *slot = sdhci_priv(host);
> > > +       struct gli_host *gli_host = sdhci_pci_priv(slot);
> > > +
> > > +       cmd = mrq->cmd;
> > > +
> > > +       if (cmd && (cmd->opcode == MMC_READ_MULTIPLE_BLOCK) && (cmd->data->blocks == 8)) {
> > > +               gli_host->continuous_4k_r++;
> > > +
> > > +               if ((!gli_host->start_4k_r) && (gli_host->continuous_4k_r >= 3)) {
> > > +                       gl9763e_set_low_power_negotiation(slot, false);
> > > +
> > > +                       gli_host->start_4k_r = true;
> > > +               }
> > > +       } else {
> > > +               gli_host->continuous_4k_r = 0;
> > > +
> > > +               if (gli_host->start_4k_r)       {
> > > +                       gl9763e_set_low_power_negotiation(slot, true);
> > > +
> > > +                       gli_host->start_4k_r = false;
> > > +               }
> > > +       }
> >
> > The above code is trying to figure out what kind of storage use case
> > that is running, based on information about the buffers. This does not
> > work, simply because the buffers don't give you all the information
> > you need to make the right decisions.
> >
> > Moreover, I am sure you would try to follow up with additional changes
> > on top, trying to tweak the behaviour to fit another use case - and so
> > on. My point is, this code doesn't belong in the lowest layer drivers.
> >
> > To move forward, I suggest you explore using runtime PM in combination
> > with dev PM qos. In this way, the driver could implement a default
> > behaviour, which can be tweaked from upper layer governors for
> > example, but also from user space (via sysfs) allowing more
> > flexibility and potentially support for various more use cases.
> >
>
> Hi Ulf,
>
> Thanks for advice.
>
> But we'll meet the performance issue only during a seqence of requests
> of read commands with 4K data length.
>
> So what we have to do is looking into the requests to monitor such
> behaviors and disable the low power state negotiation of GL9763e. And
> the information from the request buffer is sufficient for this
> purpose.
>
> We don't even care about if we disable the low power state negotiation
> by a wrong decision because we'll enable it again by any requests
> which are not read commands or their data length is not 4K. Disabling
> the low power state negotiation of GL9763e not only has no side
> effects but also helps its performance.
>
> The behavior is only about the low power state negotiation of GL9763e
> and 4K reads, and not related to runtime PM, so that we monitor the
> requests and implement it in the driver of GL9763e.

I don't agree, sorry.

The request doesn't tell you about the behavior/performance of the
eMMC/SD card. You can have some average idea, but things vary
depending on what eMMC/SD card that is being used - and over time when
the card gets used, for example.

But, let's not discuss use cases and exactly how to tune the behavior,
that's a separate discussion.

To repeat what I said, my main point is that this kind of code doesn't
belong in the driver. Instead, please try using runtime PM and dev PM
Qos.

A rather simple attempt would be to deploy runtime PM support and play
with a default autosuspend timeout instead. Would that work for you?

>
> Due to this behavior will only affect our GL9763e but not other
> devices, so we think it could be implemented in the lower layer driver
> of GL9763e, but not higher level or user space. And we are trying to
> modify only our sdhci-pci-gli.c but not other mmc common codes.
>

That's exactly the problem.

In principle, you want to apply some policy to balance performance vs
the energy cost, which is a generic problem that all mmc drivers
share.

So far, the approach have been to run as fast as possible while there
are requests in the queue - and then power off things with runtime PM,
etc, after some period of idle. This certainly can be improved, but it
needs to be done generically on not through independent hacks in
drivers.

Kind regards
Uffe

> Thank you!
>
> > > +
> > > +       sdhci_request(mmc, mrq);
> > > +}
> > > +
> > > +
> > >  static void sdhci_gl9763e_cqe_pre_enable(struct mmc_host *mmc)
> > >  {
> > >         struct cqhci_host *cq_host = mmc->cqe_private;
> > > @@ -848,6 +912,9 @@ static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot)
> > >         gli_pcie_enable_msi(slot);
> > >         host->mmc_host_ops.hs400_enhanced_strobe =
> > >                                         gl9763e_hs400_enhanced_strobe;
> > > +
> > > +       host->mmc_host_ops.request = gl9763e_request;
> > > +
> > >         gli_set_gl9763e(slot);
> > >         sdhci_enable_v4_mode(host);
> > >
> > > @@ -913,4 +980,5 @@ const struct sdhci_pci_fixes sdhci_gl9763e = {
> > >         .suspend        = sdhci_cqhci_gli_suspend,
> > >  #endif
> > >         .add_host       = gl9763e_add_host,
> > > +       .priv_size      = sizeof(struct gli_host),
> > >  };
> > > --
> >
> > Kind regards
> > Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-05 12:50     ` Ulf Hansson
@ 2021-07-05 15:09       ` Renius Chen
  2021-07-06  9:16         ` Ulf Hansson
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-07-05 15:09 UTC (permalink / raw)
  To: Ulf Hansson
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月5日 週一 下午8:51寫道:
>
> On Mon, 5 Jul 2021 at 12:59, Renius Chen <reniuschengl@gmail.com> wrote:
> >
> > Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月5日 週一 下午6:03寫道:
> > >
> > > On Mon, 5 Jul 2021 at 11:00, Renius Chen <reniuschengl@gmail.com> wrote:
> > > >
> > > > During a sequence of random 4K read operations, the performance will be
> > > > reduced due to spending much time on entering/exiting the low power state
> > > > between requests. We disable the low power state negotiation of GL9763E
> > > > during a sequence of random 4K read operations to improve the performance
> > > > and enable it again after the operations have finished.
> > > >
> > > > Signed-off-by: Renius Chen <reniuschengl@gmail.com>
> > > > ---
> > > >  drivers/mmc/host/sdhci-pci-gli.c | 68 ++++++++++++++++++++++++++++++++
> > > >  1 file changed, 68 insertions(+)
> > > >
> > > > diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
> > > > index 302a7579a9b3..5f1f332b4241 100644
> > > > --- a/drivers/mmc/host/sdhci-pci-gli.c
> > > > +++ b/drivers/mmc/host/sdhci-pci-gli.c
> > > > @@ -88,6 +88,9 @@
> > > >  #define PCIE_GLI_9763E_SCR      0x8E0
> > > >  #define   GLI_9763E_SCR_AXI_REQ           BIT(9)
> > > >
> > > > +#define PCIE_GLI_9763E_CFG       0x8A0
> > > > +#define   GLI_9763E_CFG_LPSN_DIS   BIT(12)
> > > > +
> > > >  #define PCIE_GLI_9763E_CFG2      0x8A4
> > > >  #define   GLI_9763E_CFG2_L1DLY     GENMASK(28, 19)
> > > >  #define   GLI_9763E_CFG2_L1DLY_MID 0x54
> > > > @@ -128,6 +131,11 @@
> > > >
> > > >  #define GLI_MAX_TUNING_LOOP 40
> > > >
> > > > +struct gli_host {
> > > > +       bool start_4k_r;
> > > > +       int continuous_4k_r;
> > > > +};
> > > > +
> > > >  /* Genesys Logic chipset */
> > > >  static inline void gl9750_wt_on(struct sdhci_host *host)
> > > >  {
> > > > @@ -691,6 +699,62 @@ static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc)
> > > >         sdhci_dumpregs(mmc_priv(mmc));
> > > >  }
> > > >
> > > > +static void gl9763e_set_low_power_negotiation(struct sdhci_pci_slot *slot, bool enable)
> > > > +{
> > > > +       struct pci_dev *pdev = slot->chip->pdev;
> > > > +       u32 value;
> > > > +
> > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > > > +       value &= ~GLI_9763E_VHS_REV;
> > > > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_W);
> > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > > > +
> > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_CFG, &value);
> > > > +
> > > > +       if (enable)
> > > > +               value &= ~GLI_9763E_CFG_LPSN_DIS;
> > > > +       else
> > > > +               value |= GLI_9763E_CFG_LPSN_DIS;
> > > > +
> > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_CFG, value);
> > > > +
> > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > > > +       value &= ~GLI_9763E_VHS_REV;
> > > > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_R);
> > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > > > +}
> > > > +
> > > > +static void gl9763e_request(struct mmc_host *mmc, struct mmc_request *mrq)
> > > > +{
> > > > +       struct sdhci_host *host = mmc_priv(mmc);
> > > > +       struct mmc_command *cmd;
> > > > +       struct sdhci_pci_slot *slot = sdhci_priv(host);
> > > > +       struct gli_host *gli_host = sdhci_pci_priv(slot);
> > > > +
> > > > +       cmd = mrq->cmd;
> > > > +
> > > > +       if (cmd && (cmd->opcode == MMC_READ_MULTIPLE_BLOCK) && (cmd->data->blocks == 8)) {
> > > > +               gli_host->continuous_4k_r++;
> > > > +
> > > > +               if ((!gli_host->start_4k_r) && (gli_host->continuous_4k_r >= 3)) {
> > > > +                       gl9763e_set_low_power_negotiation(slot, false);
> > > > +
> > > > +                       gli_host->start_4k_r = true;
> > > > +               }
> > > > +       } else {
> > > > +               gli_host->continuous_4k_r = 0;
> > > > +
> > > > +               if (gli_host->start_4k_r)       {
> > > > +                       gl9763e_set_low_power_negotiation(slot, true);
> > > > +
> > > > +                       gli_host->start_4k_r = false;
> > > > +               }
> > > > +       }
> > >
> > > The above code is trying to figure out what kind of storage use case
> > > that is running, based on information about the buffers. This does not
> > > work, simply because the buffers don't give you all the information
> > > you need to make the right decisions.
> > >
> > > Moreover, I am sure you would try to follow up with additional changes
> > > on top, trying to tweak the behaviour to fit another use case - and so
> > > on. My point is, this code doesn't belong in the lowest layer drivers.
> > >
> > > To move forward, I suggest you explore using runtime PM in combination
> > > with dev PM qos. In this way, the driver could implement a default
> > > behaviour, which can be tweaked from upper layer governors for
> > > example, but also from user space (via sysfs) allowing more
> > > flexibility and potentially support for various more use cases.
> > >
> >
> > Hi Ulf,
> >
> > Thanks for advice.
> >
> > But we'll meet the performance issue only during a seqence of requests
> > of read commands with 4K data length.
> >
> > So what we have to do is looking into the requests to monitor such
> > behaviors and disable the low power state negotiation of GL9763e. And
> > the information from the request buffer is sufficient for this
> > purpose.
> >
> > We don't even care about if we disable the low power state negotiation
> > by a wrong decision because we'll enable it again by any requests
> > which are not read commands or their data length is not 4K. Disabling
> > the low power state negotiation of GL9763e not only has no side
> > effects but also helps its performance.
> >
> > The behavior is only about the low power state negotiation of GL9763e
> > and 4K reads, and not related to runtime PM, so that we monitor the
> > requests and implement it in the driver of GL9763e.
>
> I don't agree, sorry.
>
> The request doesn't tell you about the behavior/performance of the
> eMMC/SD card. You can have some average idea, but things vary
> depending on what eMMC/SD card that is being used - and over time when
> the card gets used, for example.
>
> But, let's not discuss use cases and exactly how to tune the behavior,
> that's a separate discussion.
>
> To repeat what I said, my main point is that this kind of code doesn't
> belong in the driver. Instead, please try using runtime PM and dev PM
> Qos.
>
> A rather simple attempt would be to deploy runtime PM support and play
> with a default autosuspend timeout instead. Would that work for you?
>

Hi Ulf,


Thanks for your explanation.

I think there may be some misunderstandings here.

Our purpose is to avoid our GL9763e from entering ASPM L1 state during
a sequence of 4K read requests. So we don't have to consider about the
behavior/performance of the eMMC/SD card and what eMMC/SD card that is
being used. We just need to know what kind of requests we are
receiving now from the PCIe root port.

Besides, the APSM L1 is purely hardware behavior in GL9763e and has no
corresponding relationship with runtime PM. It's not activated by
driver and the behaviors are not handled by software. I think runtime
PM is used to handle the behaviors of D0/D3 of the device, but not the
link status of ASPM L0s, L1, etc.

I agree that the policy of balancing performance vs the energy cost is
a generic problem that all mmc drivers share. But our driver of
GL9763e is a host driver, the setting in this patch is also only for
GL9763e, could not be used by other devices. It depends on our
specific hardware design so that it is not a generic solution or
policy. So I think to implement such a patch in our specific GL9763e
driver to execute the specific actions just for our hardware design is
reasonable.

Thanks a lot.


Best regards,

Renius

> >
> > Due to this behavior will only affect our GL9763e but not other
> > devices, so we think it could be implemented in the lower layer driver
> > of GL9763e, but not higher level or user space. And we are trying to
> > modify only our sdhci-pci-gli.c but not other mmc common codes.
> >
>
> That's exactly the problem.
>
> In principle, you want to apply some policy to balance performance vs
> the energy cost, which is a generic problem that all mmc drivers
> share.
>
> So far, the approach have been to run as fast as possible while there
> are requests in the queue - and then power off things with runtime PM,
> etc, after some period of idle. This certainly can be improved, but it
> needs to be done generically on not through independent hacks in
> drivers.
>
> Kind regards
> Uffe
>
> > Thank you!
> >
> > > > +
> > > > +       sdhci_request(mmc, mrq);
> > > > +}
> > > > +
> > > > +
> > > >  static void sdhci_gl9763e_cqe_pre_enable(struct mmc_host *mmc)
> > > >  {
> > > >         struct cqhci_host *cq_host = mmc->cqe_private;
> > > > @@ -848,6 +912,9 @@ static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot)
> > > >         gli_pcie_enable_msi(slot);
> > > >         host->mmc_host_ops.hs400_enhanced_strobe =
> > > >                                         gl9763e_hs400_enhanced_strobe;
> > > > +
> > > > +       host->mmc_host_ops.request = gl9763e_request;
> > > > +
> > > >         gli_set_gl9763e(slot);
> > > >         sdhci_enable_v4_mode(host);
> > > >
> > > > @@ -913,4 +980,5 @@ const struct sdhci_pci_fixes sdhci_gl9763e = {
> > > >         .suspend        = sdhci_cqhci_gli_suspend,
> > > >  #endif
> > > >         .add_host       = gl9763e_add_host,
> > > > +       .priv_size      = sizeof(struct gli_host),
> > > >  };
> > > > --
> > >
> > > Kind regards
> > > Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-05 15:09       ` Renius Chen
@ 2021-07-06  9:16         ` Ulf Hansson
  2021-07-06  9:54           ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Ulf Hansson @ 2021-07-06  9:16 UTC (permalink / raw)
  To: Renius Chen
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

On Mon, 5 Jul 2021 at 17:09, Renius Chen <reniuschengl@gmail.com> wrote:
>
> Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月5日 週一 下午8:51寫道:
> >
> > On Mon, 5 Jul 2021 at 12:59, Renius Chen <reniuschengl@gmail.com> wrote:
> > >
> > > Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月5日 週一 下午6:03寫道:
> > > >
> > > > On Mon, 5 Jul 2021 at 11:00, Renius Chen <reniuschengl@gmail.com> wrote:
> > > > >
> > > > > During a sequence of random 4K read operations, the performance will be
> > > > > reduced due to spending much time on entering/exiting the low power state
> > > > > between requests. We disable the low power state negotiation of GL9763E
> > > > > during a sequence of random 4K read operations to improve the performance
> > > > > and enable it again after the operations have finished.
> > > > >
> > > > > Signed-off-by: Renius Chen <reniuschengl@gmail.com>
> > > > > ---
> > > > >  drivers/mmc/host/sdhci-pci-gli.c | 68 ++++++++++++++++++++++++++++++++
> > > > >  1 file changed, 68 insertions(+)
> > > > >
> > > > > diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
> > > > > index 302a7579a9b3..5f1f332b4241 100644
> > > > > --- a/drivers/mmc/host/sdhci-pci-gli.c
> > > > > +++ b/drivers/mmc/host/sdhci-pci-gli.c
> > > > > @@ -88,6 +88,9 @@
> > > > >  #define PCIE_GLI_9763E_SCR      0x8E0
> > > > >  #define   GLI_9763E_SCR_AXI_REQ           BIT(9)
> > > > >
> > > > > +#define PCIE_GLI_9763E_CFG       0x8A0
> > > > > +#define   GLI_9763E_CFG_LPSN_DIS   BIT(12)
> > > > > +
> > > > >  #define PCIE_GLI_9763E_CFG2      0x8A4
> > > > >  #define   GLI_9763E_CFG2_L1DLY     GENMASK(28, 19)
> > > > >  #define   GLI_9763E_CFG2_L1DLY_MID 0x54
> > > > > @@ -128,6 +131,11 @@
> > > > >
> > > > >  #define GLI_MAX_TUNING_LOOP 40
> > > > >
> > > > > +struct gli_host {
> > > > > +       bool start_4k_r;
> > > > > +       int continuous_4k_r;
> > > > > +};
> > > > > +
> > > > >  /* Genesys Logic chipset */
> > > > >  static inline void gl9750_wt_on(struct sdhci_host *host)
> > > > >  {
> > > > > @@ -691,6 +699,62 @@ static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc)
> > > > >         sdhci_dumpregs(mmc_priv(mmc));
> > > > >  }
> > > > >
> > > > > +static void gl9763e_set_low_power_negotiation(struct sdhci_pci_slot *slot, bool enable)
> > > > > +{
> > > > > +       struct pci_dev *pdev = slot->chip->pdev;
> > > > > +       u32 value;
> > > > > +
> > > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > > > > +       value &= ~GLI_9763E_VHS_REV;
> > > > > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_W);
> > > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > > > > +
> > > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_CFG, &value);
> > > > > +
> > > > > +       if (enable)
> > > > > +               value &= ~GLI_9763E_CFG_LPSN_DIS;
> > > > > +       else
> > > > > +               value |= GLI_9763E_CFG_LPSN_DIS;
> > > > > +
> > > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_CFG, value);
> > > > > +
> > > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > > > > +       value &= ~GLI_9763E_VHS_REV;
> > > > > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_R);
> > > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > > > > +}
> > > > > +
> > > > > +static void gl9763e_request(struct mmc_host *mmc, struct mmc_request *mrq)
> > > > > +{
> > > > > +       struct sdhci_host *host = mmc_priv(mmc);
> > > > > +       struct mmc_command *cmd;
> > > > > +       struct sdhci_pci_slot *slot = sdhci_priv(host);
> > > > > +       struct gli_host *gli_host = sdhci_pci_priv(slot);
> > > > > +
> > > > > +       cmd = mrq->cmd;
> > > > > +
> > > > > +       if (cmd && (cmd->opcode == MMC_READ_MULTIPLE_BLOCK) && (cmd->data->blocks == 8)) {
> > > > > +               gli_host->continuous_4k_r++;
> > > > > +
> > > > > +               if ((!gli_host->start_4k_r) && (gli_host->continuous_4k_r >= 3)) {
> > > > > +                       gl9763e_set_low_power_negotiation(slot, false);
> > > > > +
> > > > > +                       gli_host->start_4k_r = true;
> > > > > +               }
> > > > > +       } else {
> > > > > +               gli_host->continuous_4k_r = 0;
> > > > > +
> > > > > +               if (gli_host->start_4k_r)       {
> > > > > +                       gl9763e_set_low_power_negotiation(slot, true);
> > > > > +
> > > > > +                       gli_host->start_4k_r = false;
> > > > > +               }
> > > > > +       }
> > > >
> > > > The above code is trying to figure out what kind of storage use case
> > > > that is running, based on information about the buffers. This does not
> > > > work, simply because the buffers don't give you all the information
> > > > you need to make the right decisions.
> > > >
> > > > Moreover, I am sure you would try to follow up with additional changes
> > > > on top, trying to tweak the behaviour to fit another use case - and so
> > > > on. My point is, this code doesn't belong in the lowest layer drivers.
> > > >
> > > > To move forward, I suggest you explore using runtime PM in combination
> > > > with dev PM qos. In this way, the driver could implement a default
> > > > behaviour, which can be tweaked from upper layer governors for
> > > > example, but also from user space (via sysfs) allowing more
> > > > flexibility and potentially support for various more use cases.
> > > >
> > >
> > > Hi Ulf,
> > >
> > > Thanks for advice.
> > >
> > > But we'll meet the performance issue only during a seqence of requests
> > > of read commands with 4K data length.
> > >
> > > So what we have to do is looking into the requests to monitor such
> > > behaviors and disable the low power state negotiation of GL9763e. And
> > > the information from the request buffer is sufficient for this
> > > purpose.
> > >
> > > We don't even care about if we disable the low power state negotiation
> > > by a wrong decision because we'll enable it again by any requests
> > > which are not read commands or their data length is not 4K. Disabling
> > > the low power state negotiation of GL9763e not only has no side
> > > effects but also helps its performance.
> > >
> > > The behavior is only about the low power state negotiation of GL9763e
> > > and 4K reads, and not related to runtime PM, so that we monitor the
> > > requests and implement it in the driver of GL9763e.
> >
> > I don't agree, sorry.
> >
> > The request doesn't tell you about the behavior/performance of the
> > eMMC/SD card. You can have some average idea, but things vary
> > depending on what eMMC/SD card that is being used - and over time when
> > the card gets used, for example.
> >
> > But, let's not discuss use cases and exactly how to tune the behavior,
> > that's a separate discussion.
> >
> > To repeat what I said, my main point is that this kind of code doesn't
> > belong in the driver. Instead, please try using runtime PM and dev PM
> > Qos.
> >
> > A rather simple attempt would be to deploy runtime PM support and play
> > with a default autosuspend timeout instead. Would that work for you?
> >
>
> Hi Ulf,
>
>
> Thanks for your explanation.
>
> I think there may be some misunderstandings here.

I fully understand what you want to do.

>
> Our purpose is to avoid our GL9763e from entering ASPM L1 state during
> a sequence of 4K read requests. So we don't have to consider about the
> behavior/performance of the eMMC/SD card and what eMMC/SD card that is
> being used. We just need to know what kind of requests we are
> receiving now from the PCIe root port.
>
> Besides, the APSM L1 is purely hardware behavior in GL9763e and has no
> corresponding relationship with runtime PM. It's not activated by
> driver and the behaviors are not handled by software. I think runtime
> PM is used to handle the behaviors of D0/D3 of the device, but not the
> link status of ASPM L0s, L1, etc.

Maybe runtime PM isn't the perfect fit for this type of use case.

That still doesn't matter to to me, I will not accept this kind of
governor/policy based code for use cases, in drivers. It doesn't
belong there.

>
> I agree that the policy of balancing performance vs the energy cost is
> a generic problem that all mmc drivers share. But our driver of
> GL9763e is a host driver, the setting in this patch is also only for
> GL9763e, could not be used by other devices. It depends on our
> specific hardware design so that it is not a generic solution or
> policy. So I think to implement such a patch in our specific GL9763e
> driver to execute the specific actions just for our hardware design is
> reasonable.

From the use case point of view, the GL9763e hardware design isn't at
all specific.

In many cases, controllers/platforms have support for low power states
that one want to enter to avoid wasting energy. The difficult part is
to know *when* it makes sense to enter a low power state, as it also
introduces a latency when the power needs to be restored for the
device, to allow it to serve a new request.

To me, it sounds like you may have been too aggressive on avoid
wasting energy. If I understand correctly the idle period you use is
20/21 us, while most other drivers use 50-100 ms as idle period.

[...]

Kind regards
Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-06  9:16         ` Ulf Hansson
@ 2021-07-06  9:54           ` Renius Chen
  2021-07-06 10:08             ` Ulf Hansson
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-07-06  9:54 UTC (permalink / raw)
  To: Ulf Hansson
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月6日 週二 下午5:16寫道:
>
> On Mon, 5 Jul 2021 at 17:09, Renius Chen <reniuschengl@gmail.com> wrote:
> >
> > Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月5日 週一 下午8:51寫道:
> > >
> > > On Mon, 5 Jul 2021 at 12:59, Renius Chen <reniuschengl@gmail.com> wrote:
> > > >
> > > > Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月5日 週一 下午6:03寫道:
> > > > >
> > > > > On Mon, 5 Jul 2021 at 11:00, Renius Chen <reniuschengl@gmail.com> wrote:
> > > > > >
> > > > > > During a sequence of random 4K read operations, the performance will be
> > > > > > reduced due to spending much time on entering/exiting the low power state
> > > > > > between requests. We disable the low power state negotiation of GL9763E
> > > > > > during a sequence of random 4K read operations to improve the performance
> > > > > > and enable it again after the operations have finished.
> > > > > >
> > > > > > Signed-off-by: Renius Chen <reniuschengl@gmail.com>
> > > > > > ---
> > > > > >  drivers/mmc/host/sdhci-pci-gli.c | 68 ++++++++++++++++++++++++++++++++
> > > > > >  1 file changed, 68 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
> > > > > > index 302a7579a9b3..5f1f332b4241 100644
> > > > > > --- a/drivers/mmc/host/sdhci-pci-gli.c
> > > > > > +++ b/drivers/mmc/host/sdhci-pci-gli.c
> > > > > > @@ -88,6 +88,9 @@
> > > > > >  #define PCIE_GLI_9763E_SCR      0x8E0
> > > > > >  #define   GLI_9763E_SCR_AXI_REQ           BIT(9)
> > > > > >
> > > > > > +#define PCIE_GLI_9763E_CFG       0x8A0
> > > > > > +#define   GLI_9763E_CFG_LPSN_DIS   BIT(12)
> > > > > > +
> > > > > >  #define PCIE_GLI_9763E_CFG2      0x8A4
> > > > > >  #define   GLI_9763E_CFG2_L1DLY     GENMASK(28, 19)
> > > > > >  #define   GLI_9763E_CFG2_L1DLY_MID 0x54
> > > > > > @@ -128,6 +131,11 @@
> > > > > >
> > > > > >  #define GLI_MAX_TUNING_LOOP 40
> > > > > >
> > > > > > +struct gli_host {
> > > > > > +       bool start_4k_r;
> > > > > > +       int continuous_4k_r;
> > > > > > +};
> > > > > > +
> > > > > >  /* Genesys Logic chipset */
> > > > > >  static inline void gl9750_wt_on(struct sdhci_host *host)
> > > > > >  {
> > > > > > @@ -691,6 +699,62 @@ static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc)
> > > > > >         sdhci_dumpregs(mmc_priv(mmc));
> > > > > >  }
> > > > > >
> > > > > > +static void gl9763e_set_low_power_negotiation(struct sdhci_pci_slot *slot, bool enable)
> > > > > > +{
> > > > > > +       struct pci_dev *pdev = slot->chip->pdev;
> > > > > > +       u32 value;
> > > > > > +
> > > > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > > > > > +       value &= ~GLI_9763E_VHS_REV;
> > > > > > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_W);
> > > > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > > > > > +
> > > > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_CFG, &value);
> > > > > > +
> > > > > > +       if (enable)
> > > > > > +               value &= ~GLI_9763E_CFG_LPSN_DIS;
> > > > > > +       else
> > > > > > +               value |= GLI_9763E_CFG_LPSN_DIS;
> > > > > > +
> > > > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_CFG, value);
> > > > > > +
> > > > > > +       pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
> > > > > > +       value &= ~GLI_9763E_VHS_REV;
> > > > > > +       value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_R);
> > > > > > +       pci_write_config_dword(pdev, PCIE_GLI_9763E_VHS, value);
> > > > > > +}
> > > > > > +
> > > > > > +static void gl9763e_request(struct mmc_host *mmc, struct mmc_request *mrq)
> > > > > > +{
> > > > > > +       struct sdhci_host *host = mmc_priv(mmc);
> > > > > > +       struct mmc_command *cmd;
> > > > > > +       struct sdhci_pci_slot *slot = sdhci_priv(host);
> > > > > > +       struct gli_host *gli_host = sdhci_pci_priv(slot);
> > > > > > +
> > > > > > +       cmd = mrq->cmd;
> > > > > > +
> > > > > > +       if (cmd && (cmd->opcode == MMC_READ_MULTIPLE_BLOCK) && (cmd->data->blocks == 8)) {
> > > > > > +               gli_host->continuous_4k_r++;
> > > > > > +
> > > > > > +               if ((!gli_host->start_4k_r) && (gli_host->continuous_4k_r >= 3)) {
> > > > > > +                       gl9763e_set_low_power_negotiation(slot, false);
> > > > > > +
> > > > > > +                       gli_host->start_4k_r = true;
> > > > > > +               }
> > > > > > +       } else {
> > > > > > +               gli_host->continuous_4k_r = 0;
> > > > > > +
> > > > > > +               if (gli_host->start_4k_r)       {
> > > > > > +                       gl9763e_set_low_power_negotiation(slot, true);
> > > > > > +
> > > > > > +                       gli_host->start_4k_r = false;
> > > > > > +               }
> > > > > > +       }
> > > > >
> > > > > The above code is trying to figure out what kind of storage use case
> > > > > that is running, based on information about the buffers. This does not
> > > > > work, simply because the buffers don't give you all the information
> > > > > you need to make the right decisions.
> > > > >
> > > > > Moreover, I am sure you would try to follow up with additional changes
> > > > > on top, trying to tweak the behaviour to fit another use case - and so
> > > > > on. My point is, this code doesn't belong in the lowest layer drivers.
> > > > >
> > > > > To move forward, I suggest you explore using runtime PM in combination
> > > > > with dev PM qos. In this way, the driver could implement a default
> > > > > behaviour, which can be tweaked from upper layer governors for
> > > > > example, but also from user space (via sysfs) allowing more
> > > > > flexibility and potentially support for various more use cases.
> > > > >
> > > >
> > > > Hi Ulf,
> > > >
> > > > Thanks for advice.
> > > >
> > > > But we'll meet the performance issue only during a seqence of requests
> > > > of read commands with 4K data length.
> > > >
> > > > So what we have to do is looking into the requests to monitor such
> > > > behaviors and disable the low power state negotiation of GL9763e. And
> > > > the information from the request buffer is sufficient for this
> > > > purpose.
> > > >
> > > > We don't even care about if we disable the low power state negotiation
> > > > by a wrong decision because we'll enable it again by any requests
> > > > which are not read commands or their data length is not 4K. Disabling
> > > > the low power state negotiation of GL9763e not only has no side
> > > > effects but also helps its performance.
> > > >
> > > > The behavior is only about the low power state negotiation of GL9763e
> > > > and 4K reads, and not related to runtime PM, so that we monitor the
> > > > requests and implement it in the driver of GL9763e.
> > >
> > > I don't agree, sorry.
> > >
> > > The request doesn't tell you about the behavior/performance of the
> > > eMMC/SD card. You can have some average idea, but things vary
> > > depending on what eMMC/SD card that is being used - and over time when
> > > the card gets used, for example.
> > >
> > > But, let's not discuss use cases and exactly how to tune the behavior,
> > > that's a separate discussion.
> > >
> > > To repeat what I said, my main point is that this kind of code doesn't
> > > belong in the driver. Instead, please try using runtime PM and dev PM
> > > Qos.
> > >
> > > A rather simple attempt would be to deploy runtime PM support and play
> > > with a default autosuspend timeout instead. Would that work for you?
> > >
> >
> > Hi Ulf,
> >
> >
> > Thanks for your explanation.
> >
> > I think there may be some misunderstandings here.
>
> I fully understand what you want to do.
>
> >
> > Our purpose is to avoid our GL9763e from entering ASPM L1 state during
> > a sequence of 4K read requests. So we don't have to consider about the
> > behavior/performance of the eMMC/SD card and what eMMC/SD card that is
> > being used. We just need to know what kind of requests we are
> > receiving now from the PCIe root port.
> >
> > Besides, the APSM L1 is purely hardware behavior in GL9763e and has no
> > corresponding relationship with runtime PM. It's not activated by
> > driver and the behaviors are not handled by software. I think runtime
> > PM is used to handle the behaviors of D0/D3 of the device, but not the
> > link status of ASPM L0s, L1, etc.
>
> Maybe runtime PM isn't the perfect fit for this type of use case.
>
> That still doesn't matter to to me, I will not accept this kind of
> governor/policy based code for use cases, in drivers. It doesn't
> belong there.
>

Hi Ulf,

The behavior of this patch is to set the value of a GL9763e vendor
specified register. Why it doesn't belong to GL9763e driver but other
common codes?

> >
> > I agree that the policy of balancing performance vs the energy cost is
> > a generic problem that all mmc drivers share. But our driver of
> > GL9763e is a host driver, the setting in this patch is also only for
> > GL9763e, could not be used by other devices. It depends on our
> > specific hardware design so that it is not a generic solution or
> > policy. So I think to implement such a patch in our specific GL9763e
> > driver to execute the specific actions just for our hardware design is
> > reasonable.
>
> From the use case point of view, the GL9763e hardware design isn't at
> all specific.
>
> In many cases, controllers/platforms have support for low power states
> that one want to enter to avoid wasting energy. The difficult part is
> to know *when* it makes sense to enter a low power state, as it also
> introduces a latency when the power needs to be restored for the
> device, to allow it to serve a new request.
>
> To me, it sounds like you may have been too aggressive on avoid
> wasting energy. If I understand correctly the idle period you use is
> 20/21 us, while most other drivers use 50-100 ms as idle period.
>

Yes, according to our customer's test for the GL9763e, if the ASPM L1
entry delay of GL9763e, which is the idle period you mentioned, is
larger than 20/21 us, it will not pass the PLT test. The PLT is
requested by Google for evaluating the product's battery life. The
product won't be accepted by Google if it fails the PLT test. So we
set the ASPM L1 entry delay to 20/21us.

With such a short idle period, during 4K reads, the idle time between
the read requests will be larger than 20/21us, so GL9763e will enter
ASPM L1 very frequently to impact the performance.

The bad performance of 4K reads was highlighted by Google, too. Our
customer has to pass both the PLT test and 4K read performance test by
Google's request. So after some discussions with our customer and
Google, we decided to submit such a patch to get the best balance to
satisfy Google's requiremnet.

The function and the register is vendor specified of GL9763e, so we
access it in the vendor driver of GL9763e. Add some functions in other
mmc general codes to do something only for GL9763e and can not be
applied by other devices might be a little bit strange and difficult
to implement and design?


Thanks for your comments.


Best regards,

Renius

> [...]
>
> Kind regards
> Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-06  9:54           ` Renius Chen
@ 2021-07-06 10:08             ` Ulf Hansson
  2021-07-06 10:56               ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Ulf Hansson @ 2021-07-06 10:08 UTC (permalink / raw)
  To: Renius Chen
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

[...]

> > > Thanks for your explanation.
> > >
> > > I think there may be some misunderstandings here.
> >
> > I fully understand what you want to do.
> >
> > >
> > > Our purpose is to avoid our GL9763e from entering ASPM L1 state during
> > > a sequence of 4K read requests. So we don't have to consider about the
> > > behavior/performance of the eMMC/SD card and what eMMC/SD card that is
> > > being used. We just need to know what kind of requests we are
> > > receiving now from the PCIe root port.
> > >
> > > Besides, the APSM L1 is purely hardware behavior in GL9763e and has no
> > > corresponding relationship with runtime PM. It's not activated by
> > > driver and the behaviors are not handled by software. I think runtime
> > > PM is used to handle the behaviors of D0/D3 of the device, but not the
> > > link status of ASPM L0s, L1, etc.
> >
> > Maybe runtime PM isn't the perfect fit for this type of use case.
> >
> > That still doesn't matter to to me, I will not accept this kind of
> > governor/policy based code for use cases, in drivers. It doesn't
> > belong there.
> >
>
> Hi Ulf,
>
> The behavior of this patch is to set the value of a GL9763e vendor
> specified register. Why it doesn't belong to GL9763e driver but other
> common codes?

Let me try one more time.

The code that is needed to put the GL9763e HW into low power state
(writing to GL9763e specific register) certainly belongs in the
driver.

The code that monitors for a specific use case does not.

>
> > >
> > > I agree that the policy of balancing performance vs the energy cost is
> > > a generic problem that all mmc drivers share. But our driver of
> > > GL9763e is a host driver, the setting in this patch is also only for
> > > GL9763e, could not be used by other devices. It depends on our
> > > specific hardware design so that it is not a generic solution or
> > > policy. So I think to implement such a patch in our specific GL9763e
> > > driver to execute the specific actions just for our hardware design is
> > > reasonable.
> >
> > From the use case point of view, the GL9763e hardware design isn't at
> > all specific.
> >
> > In many cases, controllers/platforms have support for low power states
> > that one want to enter to avoid wasting energy. The difficult part is
> > to know *when* it makes sense to enter a low power state, as it also
> > introduces a latency when the power needs to be restored for the
> > device, to allow it to serve a new request.
> >
> > To me, it sounds like you may have been too aggressive on avoid
> > wasting energy. If I understand correctly the idle period you use is
> > 20/21 us, while most other drivers use 50-100 ms as idle period.
> >
>
> Yes, according to our customer's test for the GL9763e, if the ASPM L1
> entry delay of GL9763e, which is the idle period you mentioned, is
> larger than 20/21 us, it will not pass the PLT test. The PLT is
> requested by Google for evaluating the product's battery life. The
> product won't be accepted by Google if it fails the PLT test. So we
> set the ASPM L1 entry delay to 20/21us.
>
> With such a short idle period, during 4K reads, the idle time between
> the read requests will be larger than 20/21us, so GL9763e will enter
> ASPM L1 very frequently to impact the performance.
>
> The bad performance of 4K reads was highlighted by Google, too. Our
> customer has to pass both the PLT test and 4K read performance test by
> Google's request. So after some discussions with our customer and
> Google, we decided to submit such a patch to get the best balance to
> satisfy Google's requiremnet.
>
> The function and the register is vendor specified of GL9763e, so we
> access it in the vendor driver of GL9763e. Add some functions in other
> mmc general codes to do something only for GL9763e and can not be
> applied by other devices might be a little bit strange and difficult
> to implement and design?

I haven't said implementation need to be easy, but suggest a few
options to move forward.

What did state and I am not going to change my opinion on this, is the
governor code that monitors for use cases, don't belong in the driver.

Kind regards
Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-06 10:08             ` Ulf Hansson
@ 2021-07-06 10:56               ` Renius Chen
  2021-07-07 12:15                 ` Ulf Hansson
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-07-06 10:56 UTC (permalink / raw)
  To: Ulf Hansson
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月6日 週二 下午6:08寫道:
>
> [...]
>
> > > > Thanks for your explanation.
> > > >
> > > > I think there may be some misunderstandings here.
> > >
> > > I fully understand what you want to do.
> > >
> > > >
> > > > Our purpose is to avoid our GL9763e from entering ASPM L1 state during
> > > > a sequence of 4K read requests. So we don't have to consider about the
> > > > behavior/performance of the eMMC/SD card and what eMMC/SD card that is
> > > > being used. We just need to know what kind of requests we are
> > > > receiving now from the PCIe root port.
> > > >
> > > > Besides, the APSM L1 is purely hardware behavior in GL9763e and has no
> > > > corresponding relationship with runtime PM. It's not activated by
> > > > driver and the behaviors are not handled by software. I think runtime
> > > > PM is used to handle the behaviors of D0/D3 of the device, but not the
> > > > link status of ASPM L0s, L1, etc.
> > >
> > > Maybe runtime PM isn't the perfect fit for this type of use case.
> > >
> > > That still doesn't matter to to me, I will not accept this kind of
> > > governor/policy based code for use cases, in drivers. It doesn't
> > > belong there.
> > >
> >
> > Hi Ulf,
> >
> > The behavior of this patch is to set the value of a GL9763e vendor
> > specified register. Why it doesn't belong to GL9763e driver but other
> > common codes?
>
> Let me try one more time.
>
> The code that is needed to put the GL9763e HW into low power state
> (writing to GL9763e specific register) certainly belongs in the
> driver.
>
> The code that monitors for a specific use case does not.
>
> >
> > > >
> > > > I agree that the policy of balancing performance vs the energy cost is
> > > > a generic problem that all mmc drivers share. But our driver of
> > > > GL9763e is a host driver, the setting in this patch is also only for
> > > > GL9763e, could not be used by other devices. It depends on our
> > > > specific hardware design so that it is not a generic solution or
> > > > policy. So I think to implement such a patch in our specific GL9763e
> > > > driver to execute the specific actions just for our hardware design is
> > > > reasonable.
> > >
> > > From the use case point of view, the GL9763e hardware design isn't at
> > > all specific.
> > >
> > > In many cases, controllers/platforms have support for low power states
> > > that one want to enter to avoid wasting energy. The difficult part is
> > > to know *when* it makes sense to enter a low power state, as it also
> > > introduces a latency when the power needs to be restored for the
> > > device, to allow it to serve a new request.
> > >
> > > To me, it sounds like you may have been too aggressive on avoid
> > > wasting energy. If I understand correctly the idle period you use is
> > > 20/21 us, while most other drivers use 50-100 ms as idle period.
> > >
> >
> > Yes, according to our customer's test for the GL9763e, if the ASPM L1
> > entry delay of GL9763e, which is the idle period you mentioned, is
> > larger than 20/21 us, it will not pass the PLT test. The PLT is
> > requested by Google for evaluating the product's battery life. The
> > product won't be accepted by Google if it fails the PLT test. So we
> > set the ASPM L1 entry delay to 20/21us.
> >
> > With such a short idle period, during 4K reads, the idle time between
> > the read requests will be larger than 20/21us, so GL9763e will enter
> > ASPM L1 very frequently to impact the performance.
> >
> > The bad performance of 4K reads was highlighted by Google, too. Our
> > customer has to pass both the PLT test and 4K read performance test by
> > Google's request. So after some discussions with our customer and
> > Google, we decided to submit such a patch to get the best balance to
> > satisfy Google's requiremnet.
> >
> > The function and the register is vendor specified of GL9763e, so we
> > access it in the vendor driver of GL9763e. Add some functions in other
> > mmc general codes to do something only for GL9763e and can not be
> > applied by other devices might be a little bit strange and difficult
> > to implement and design?
>
> I haven't said implementation need to be easy, but suggest a few
> options to move forward.
>
> What did state and I am not going to change my opinion on this, is the
> governor code that monitors for use cases, don't belong in the driver.
>

Hi Ulf,


Thanks, I understand what you mean.

I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
drivers/mmc/host folder, and found that in some SD/MMC host controller
driver codes such as alcor.c, cavium.c, ...etc, there are also
behaviors for monitoring the request in their driver. What's the
difference between theirs and ours?

And if the code that monitors the requstes does not belong the driver,
where should I implement the code and how to add some functions only
for GL9763e in that place, in your opinion?

Thanks for clarifying my questions.


Best regards,

Renius

> Kind regards
> Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-06 10:56               ` Renius Chen
@ 2021-07-07 12:15                 ` Ulf Hansson
  2021-07-07 13:49                   ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Ulf Hansson @ 2021-07-07 12:15 UTC (permalink / raw)
  To: Renius Chen
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

[...]

>
> Thanks, I understand what you mean.
>
> I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
> drivers/mmc/host folder, and found that in some SD/MMC host controller
> driver codes such as alcor.c, cavium.c, ...etc, there are also
> behaviors for monitoring the request in their driver. What's the
> difference between theirs and ours?

Those checks are there to allow the HWs to be supported properly.

>
> And if the code that monitors the requstes does not belong the driver,
> where should I implement the code and how to add some functions only
> for GL9763e in that place, in your opinion?

Honestly, I am not sure what suits your use case best.

So far we have used runtime PM with a default auto suspend timeout, in
combination with dev PM Qos. In other words, run as fast as possible
to complete the requests in the queue then go back to idle and enter a
low power state. Clearly, that seems not to be sufficient for your use
case, sorry.

Kind regards
Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-07 12:15                 ` Ulf Hansson
@ 2021-07-07 13:49                   ` Renius Chen
  2021-07-14  2:15                     ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-07-07 13:49 UTC (permalink / raw)
  To: Ulf Hansson
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月7日 週三 下午8:16寫道:
>
> [...]
>
> >
> > Thanks, I understand what you mean.
> >
> > I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
> > drivers/mmc/host folder, and found that in some SD/MMC host controller
> > driver codes such as alcor.c, cavium.c, ...etc, there are also
> > behaviors for monitoring the request in their driver. What's the
> > difference between theirs and ours?
>
> Those checks are there to allow the HWs to be supported properly.
>
> >
> > And if the code that monitors the requstes does not belong the driver,
> > where should I implement the code and how to add some functions only
> > for GL9763e in that place, in your opinion?
>
> Honestly, I am not sure what suits your use case best.
>
> So far we have used runtime PM with a default auto suspend timeout, in
> combination with dev PM Qos. In other words, run as fast as possible
> to complete the requests in the queue then go back to idle and enter a
> low power state. Clearly, that seems not to be sufficient for your use
> case, sorry.
>
Yes, the runtime PM, auto suspend, and PM Qos are all about the
suspend/resume behaviors of the system or related to power states such
as D0/D3 of the device. But these are totally different from the ASPM
L0s/L1 for link states. Entering/exiting the ASPM is pure hardware
behavior on the link layer and is not handled by any codes in
drivers/mmc/core or drivers/mmc/host. We'd like to try to modify the
patch by your opinions, but we are also confused about what or where
suits our use case best. So we wonder how to start the modification
and may need some suggestions to deal with the work, sorry.

Thank you.


Best regards,

Renius


> Kind regards
> Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-07 13:49                   ` Renius Chen
@ 2021-07-14  2:15                     ` Renius Chen
  2021-07-16 10:27                       ` Adrian Hunter
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-07-14  2:15 UTC (permalink / raw)
  To: Ulf Hansson
  Cc: Adrian Hunter, linux-mmc, Linux Kernel Mailing List, Ben Chuang

Hi Adrain,

What do you think of this patch?
Or do you have any ideas or suggestions about the modification for
Ulf's comments?

Thank you.


Best regards,

Renius

Renius Chen <reniuschengl@gmail.com> 於 2021年7月7日 週三 下午9:49寫道:
>
> Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月7日 週三 下午8:16寫道:
> >
> > [...]
> >
> > >
> > > Thanks, I understand what you mean.
> > >
> > > I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
> > > drivers/mmc/host folder, and found that in some SD/MMC host controller
> > > driver codes such as alcor.c, cavium.c, ...etc, there are also
> > > behaviors for monitoring the request in their driver. What's the
> > > difference between theirs and ours?
> >
> > Those checks are there to allow the HWs to be supported properly.
> >
> > >
> > > And if the code that monitors the requstes does not belong the driver,
> > > where should I implement the code and how to add some functions only
> > > for GL9763e in that place, in your opinion?
> >
> > Honestly, I am not sure what suits your use case best.
> >
> > So far we have used runtime PM with a default auto suspend timeout, in
> > combination with dev PM Qos. In other words, run as fast as possible
> > to complete the requests in the queue then go back to idle and enter a
> > low power state. Clearly, that seems not to be sufficient for your use
> > case, sorry.
> >
> Yes, the runtime PM, auto suspend, and PM Qos are all about the
> suspend/resume behaviors of the system or related to power states such
> as D0/D3 of the device. But these are totally different from the ASPM
> L0s/L1 for link states. Entering/exiting the ASPM is pure hardware
> behavior on the link layer and is not handled by any codes in
> drivers/mmc/core or drivers/mmc/host. We'd like to try to modify the
> patch by your opinions, but we are also confused about what or where
> suits our use case best. So we wonder how to start the modification
> and may need some suggestions to deal with the work, sorry.
>
> Thank you.
>
>
> Best regards,
>
> Renius
>
>
> > Kind regards
> > Uffe

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-14  2:15                     ` Renius Chen
@ 2021-07-16 10:27                       ` Adrian Hunter
  2021-07-19  9:26                         ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Adrian Hunter @ 2021-07-16 10:27 UTC (permalink / raw)
  To: Renius Chen, Ulf Hansson; +Cc: linux-mmc, Linux Kernel Mailing List, Ben Chuang

On 14/07/21 5:15 am, Renius Chen wrote:
> Hi Adrain,
> 
> What do you think of this patch?
> Or do you have any ideas or suggestions about the modification for
> Ulf's comments?

Perhaps try to define your power management requirements in terms of
latencies instead of request size, and then take the issue to the
power management mailing list and power management maintainers for
suggestions.  You will probably need to point out why runtime PM doesn't
met your requirements.

> 
> Thank you.
> 
> 
> Best regards,
> 
> Renius
> 
> Renius Chen <reniuschengl@gmail.com> 於 2021年7月7日 週三 下午9:49寫道:
>>
>> Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月7日 週三 下午8:16寫道:
>>>
>>> [...]
>>>
>>>>
>>>> Thanks, I understand what you mean.
>>>>
>>>> I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
>>>> drivers/mmc/host folder, and found that in some SD/MMC host controller
>>>> driver codes such as alcor.c, cavium.c, ...etc, there are also
>>>> behaviors for monitoring the request in their driver. What's the
>>>> difference between theirs and ours?
>>>
>>> Those checks are there to allow the HWs to be supported properly.
>>>
>>>>
>>>> And if the code that monitors the requstes does not belong the driver,
>>>> where should I implement the code and how to add some functions only
>>>> for GL9763e in that place, in your opinion?
>>>
>>> Honestly, I am not sure what suits your use case best.
>>>
>>> So far we have used runtime PM with a default auto suspend timeout, in
>>> combination with dev PM Qos. In other words, run as fast as possible
>>> to complete the requests in the queue then go back to idle and enter a
>>> low power state. Clearly, that seems not to be sufficient for your use
>>> case, sorry.
>>>
>> Yes, the runtime PM, auto suspend, and PM Qos are all about the
>> suspend/resume behaviors of the system or related to power states such
>> as D0/D3 of the device. But these are totally different from the ASPM
>> L0s/L1 for link states. Entering/exiting the ASPM is pure hardware
>> behavior on the link layer and is not handled by any codes in
>> drivers/mmc/core or drivers/mmc/host. We'd like to try to modify the
>> patch by your opinions, but we are also confused about what or where
>> suits our use case best. So we wonder how to start the modification
>> and may need some suggestions to deal with the work, sorry.
>>
>> Thank you.
>>
>>
>> Best regards,
>>
>> Renius
>>
>>
>>> Kind regards
>>> Uffe


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-16 10:27                       ` Adrian Hunter
@ 2021-07-19  9:26                         ` Renius Chen
  2021-08-04  6:27                           ` Adrian Hunter
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-07-19  9:26 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ulf Hansson, linux-mmc, Linux Kernel Mailing List, Ben Chuang

Adrian Hunter <adrian.hunter@intel.com> 於 2021年7月16日 週五 下午6:27寫道:
>
> On 14/07/21 5:15 am, Renius Chen wrote:
> > Hi Adrain,
> >
> > What do you think of this patch?
> > Or do you have any ideas or suggestions about the modification for
> > Ulf's comments?
>
> Perhaps try to define your power management requirements in terms of
> latencies instead of request size, and then take the issue to the
> power management mailing list and power management maintainers for
> suggestions.  You will probably need to point out why runtime PM doesn't
> met your requirements.
>

Hi Adrain,


Thanks for your advice.

Our purpose is only to improve the performance of 4K reads, and we
hope that it doesn't affect any other use cases. If we look into the
latencies, it may affect not only 4K reads but also some other use
cases.

Behaviors of ASPM is controlled by circuits of hardware. Drivers only
enable or disable ASPM or set some parameters for ASPM, and are not
able to know when the device enters or exits the L0s/L1 state. So the
PM part of drivers may not suit this case.

This patch could be simply divided into two parts:
1. Monitor requests.
2. Set a vendor specific register of GL9763e.

The part 2 is no problems we think. And Ulf thinks that the behaviors
of part 1 should not be implemented in sdhci-pci-gli.c. Do you have
any suggestions on where we can implement the monitoring?

Thank you.


Best regards,

Renius

> >
> > Thank you.
> >
> >
> > Best regards,
> >
> > Renius
> >
> > Renius Chen <reniuschengl@gmail.com> 於 2021年7月7日 週三 下午9:49寫道:
> >>
> >> Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月7日 週三 下午8:16寫道:
> >>>
> >>> [...]
> >>>
> >>>>
> >>>> Thanks, I understand what you mean.
> >>>>
> >>>> I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
> >>>> drivers/mmc/host folder, and found that in some SD/MMC host controller
> >>>> driver codes such as alcor.c, cavium.c, ...etc, there are also
> >>>> behaviors for monitoring the request in their driver. What's the
> >>>> difference between theirs and ours?
> >>>
> >>> Those checks are there to allow the HWs to be supported properly.
> >>>
> >>>>
> >>>> And if the code that monitors the requstes does not belong the driver,
> >>>> where should I implement the code and how to add some functions only
> >>>> for GL9763e in that place, in your opinion?
> >>>
> >>> Honestly, I am not sure what suits your use case best.
> >>>
> >>> So far we have used runtime PM with a default auto suspend timeout, in
> >>> combination with dev PM Qos. In other words, run as fast as possible
> >>> to complete the requests in the queue then go back to idle and enter a
> >>> low power state. Clearly, that seems not to be sufficient for your use
> >>> case, sorry.
> >>>
> >> Yes, the runtime PM, auto suspend, and PM Qos are all about the
> >> suspend/resume behaviors of the system or related to power states such
> >> as D0/D3 of the device. But these are totally different from the ASPM
> >> L0s/L1 for link states. Entering/exiting the ASPM is pure hardware
> >> behavior on the link layer and is not handled by any codes in
> >> drivers/mmc/core or drivers/mmc/host. We'd like to try to modify the
> >> patch by your opinions, but we are also confused about what or where
> >> suits our use case best. So we wonder how to start the modification
> >> and may need some suggestions to deal with the work, sorry.
> >>
> >> Thank you.
> >>
> >>
> >> Best regards,
> >>
> >> Renius
> >>
> >>
> >>> Kind regards
> >>> Uffe
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-07-19  9:26                         ` Renius Chen
@ 2021-08-04  6:27                           ` Adrian Hunter
  2021-08-10  4:23                             ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Adrian Hunter @ 2021-08-04  6:27 UTC (permalink / raw)
  To: Renius Chen; +Cc: Ulf Hansson, linux-mmc, Linux Kernel Mailing List, Ben Chuang

On 19/07/21 12:26 pm, Renius Chen wrote:
> Adrian Hunter <adrian.hunter@intel.com> 於 2021年7月16日 週五 下午6:27寫道:
>>
>> On 14/07/21 5:15 am, Renius Chen wrote:
>>> Hi Adrain,
>>>
>>> What do you think of this patch?
>>> Or do you have any ideas or suggestions about the modification for
>>> Ulf's comments?
>>
>> Perhaps try to define your power management requirements in terms of
>> latencies instead of request size, and then take the issue to the
>> power management mailing list and power management maintainers for
>> suggestions.  You will probably need to point out why runtime PM doesn't
>> met your requirements.
>>
> 
> Hi Adrain,
> 
> 
> Thanks for your advice.
> 
> Our purpose is only to improve the performance of 4K reads, and we
> hope that it doesn't affect any other use cases. If we look into the
> latencies, it may affect not only 4K reads but also some other use
> cases.

I just meant that, if you present the problem to people on the power
management mailing lists,  you probably need to describe the problem at
an engineering level, instead of describing your solution at a
programming level.

> 
> Behaviors of ASPM is controlled by circuits of hardware. Drivers only
> enable or disable ASPM or set some parameters for ASPM, and are not
> able to know when the device enters or exits the L0s/L1 state. So the
> PM part of drivers may not suit this case.
> 
> This patch could be simply divided into two parts:
> 1. Monitor requests.
> 2. Set a vendor specific register of GL9763e.
> 
> The part 2 is no problems we think. And Ulf thinks that the behaviors
> of part 1 should not be implemented in sdhci-pci-gli.c. Do you have
> any suggestions on where we can implement the monitoring?
> 
> Thank you.
> 
> 
> Best regards,
> 
> Renius
> 
>>>
>>> Thank you.
>>>
>>>
>>> Best regards,
>>>
>>> Renius
>>>
>>> Renius Chen <reniuschengl@gmail.com> 於 2021年7月7日 週三 下午9:49寫道:
>>>>
>>>> Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月7日 週三 下午8:16寫道:
>>>>>
>>>>> [...]
>>>>>
>>>>>>
>>>>>> Thanks, I understand what you mean.
>>>>>>
>>>>>> I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
>>>>>> drivers/mmc/host folder, and found that in some SD/MMC host controller
>>>>>> driver codes such as alcor.c, cavium.c, ...etc, there are also
>>>>>> behaviors for monitoring the request in their driver. What's the
>>>>>> difference between theirs and ours?
>>>>>
>>>>> Those checks are there to allow the HWs to be supported properly.
>>>>>
>>>>>>
>>>>>> And if the code that monitors the requstes does not belong the driver,
>>>>>> where should I implement the code and how to add some functions only
>>>>>> for GL9763e in that place, in your opinion?
>>>>>
>>>>> Honestly, I am not sure what suits your use case best.
>>>>>
>>>>> So far we have used runtime PM with a default auto suspend timeout, in
>>>>> combination with dev PM Qos. In other words, run as fast as possible
>>>>> to complete the requests in the queue then go back to idle and enter a
>>>>> low power state. Clearly, that seems not to be sufficient for your use
>>>>> case, sorry.
>>>>>
>>>> Yes, the runtime PM, auto suspend, and PM Qos are all about the
>>>> suspend/resume behaviors of the system or related to power states such
>>>> as D0/D3 of the device. But these are totally different from the ASPM
>>>> L0s/L1 for link states. Entering/exiting the ASPM is pure hardware
>>>> behavior on the link layer and is not handled by any codes in
>>>> drivers/mmc/core or drivers/mmc/host. We'd like to try to modify the
>>>> patch by your opinions, but we are also confused about what or where
>>>> suits our use case best. So we wonder how to start the modification
>>>> and may need some suggestions to deal with the work, sorry.
>>>>
>>>> Thank you.
>>>>
>>>>
>>>> Best regards,
>>>>
>>>> Renius
>>>>
>>>>
>>>>> Kind regards
>>>>> Uffe
>>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-08-04  6:27                           ` Adrian Hunter
@ 2021-08-10  4:23                             ` Renius Chen
  2021-08-17 10:30                               ` Renius Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Renius Chen @ 2021-08-10  4:23 UTC (permalink / raw)
  To: linux-pm
  Cc: Ulf Hansson, linux-mmc, Linux Kernel Mailing List, Ben Chuang,
	Adrian Hunter

Hi,


First I'd like to appreciate your time reading this mail.

We had some issues with submitting a patch to MMC and the reviewer
suggested us to look for some help from the PM mailing list.

GL9763e is a PCIe card reader. During a sequence of random 4K reads,
due to the long idle period time between read requests, GL9763e will
enter ASPM L1 very frequently. Hence the performance of random 4K
reads is very worse.

We tried to enlarge the ASPM L1 entry delay to avoid GL9763e from
entering ASPM L1 by the idle period time during 4K reads. But such an
adjustment also affects other use cases. It will reduce the frequency
of entering ASPM L1 under all conditions so that the battery life will
be shorter. This will cause the PLT test to fail.

So we develop a patch to balance the performance of 4K reads and the
battery life. Our purpose is only to improve the performance of 4K
reads, but not to affect any other use cases. First, we monitor the
requests, when a sequence of 4K reads is performing, we'll modify the
value of a vendor specified register in GL9763e to disable ASPM L1 by
the GL9763e hardware. Then re-enable ASPM L1 after the 4K reads are
finished.

But MMC reviewers think such behaviors may not be suitable for a MMC
host driver and believe that there may be some better ways to achieve
our goals.

So I'm here to ask for your advice. Do you have any ideas for this
case? Are this scenario and ASPM related to runtime PM? In my
cognition, the entering and exiting of ASPM L0s and L1 are pure
hardware behaviors and not handled by software, they are different
from suspend/resume and runtime PM and D0/D3, right?

Thanks a lot.


Best regards,

Renius

Adrian Hunter <adrian.hunter@intel.com> 於 2021年8月4日 週三 下午2:26寫道:
>
> On 19/07/21 12:26 pm, Renius Chen wrote:
> > Adrian Hunter <adrian.hunter@intel.com> 於 2021年7月16日 週五 下午6:27寫道:
> >>
> >> On 14/07/21 5:15 am, Renius Chen wrote:
> >>> Hi Adrain,
> >>>
> >>> What do you think of this patch?
> >>> Or do you have any ideas or suggestions about the modification for
> >>> Ulf's comments?
> >>
> >> Perhaps try to define your power management requirements in terms of
> >> latencies instead of request size, and then take the issue to the
> >> power management mailing list and power management maintainers for
> >> suggestions.  You will probably need to point out why runtime PM doesn't
> >> met your requirements.
> >>
> >
> > Hi Adrain,
> >
> >
> > Thanks for your advice.
> >
> > Our purpose is only to improve the performance of 4K reads, and we
> > hope that it doesn't affect any other use cases. If we look into the
> > latencies, it may affect not only 4K reads but also some other use
> > cases.
>
> I just meant that, if you present the problem to people on the power
> management mailing lists,  you probably need to describe the problem at
> an engineering level, instead of describing your solution at a
> programming level.
>
> >
> > Behaviors of ASPM is controlled by circuits of hardware. Drivers only
> > enable or disable ASPM or set some parameters for ASPM, and are not
> > able to know when the device enters or exits the L0s/L1 state. So the
> > PM part of drivers may not suit this case.
> >
> > This patch could be simply divided into two parts:
> > 1. Monitor requests.
> > 2. Set a vendor specific register of GL9763e.
> >
> > The part 2 is no problems we think. And Ulf thinks that the behaviors
> > of part 1 should not be implemented in sdhci-pci-gli.c. Do you have
> > any suggestions on where we can implement the monitoring?
> >
> > Thank you.
> >
> >
> > Best regards,
> >
> > Renius
> >
> >>>
> >>> Thank you.
> >>>
> >>>
> >>> Best regards,
> >>>
> >>> Renius
> >>>
> >>> Renius Chen <reniuschengl@gmail.com> 於 2021年7月7日 週三 下午9:49寫道:
> >>>>
> >>>> Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月7日 週三 下午8:16寫道:
> >>>>>
> >>>>> [...]
> >>>>>
> >>>>>>
> >>>>>> Thanks, I understand what you mean.
> >>>>>>
> >>>>>> I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
> >>>>>> drivers/mmc/host folder, and found that in some SD/MMC host controller
> >>>>>> driver codes such as alcor.c, cavium.c, ...etc, there are also
> >>>>>> behaviors for monitoring the request in their driver. What's the
> >>>>>> difference between theirs and ours?
> >>>>>
> >>>>> Those checks are there to allow the HWs to be supported properly.
> >>>>>
> >>>>>>
> >>>>>> And if the code that monitors the requstes does not belong the driver,
> >>>>>> where should I implement the code and how to add some functions only
> >>>>>> for GL9763e in that place, in your opinion?
> >>>>>
> >>>>> Honestly, I am not sure what suits your use case best.
> >>>>>
> >>>>> So far we have used runtime PM with a default auto suspend timeout, in
> >>>>> combination with dev PM Qos. In other words, run as fast as possible
> >>>>> to complete the requests in the queue then go back to idle and enter a
> >>>>> low power state. Clearly, that seems not to be sufficient for your use
> >>>>> case, sorry.
> >>>>>
> >>>> Yes, the runtime PM, auto suspend, and PM Qos are all about the
> >>>> suspend/resume behaviors of the system or related to power states such
> >>>> as D0/D3 of the device. But these are totally different from the ASPM
> >>>> L0s/L1 for link states. Entering/exiting the ASPM is pure hardware
> >>>> behavior on the link layer and is not handled by any codes in
> >>>> drivers/mmc/core or drivers/mmc/host. We'd like to try to modify the
> >>>> patch by your opinions, but we are also confused about what or where
> >>>> suits our use case best. So we wonder how to start the modification
> >>>> and may need some suggestions to deal with the work, sorry.
> >>>>
> >>>> Thank you.
> >>>>
> >>>>
> >>>> Best regards,
> >>>>
> >>>> Renius
> >>>>
> >>>>
> >>>>> Kind regards
> >>>>> Uffe
> >>
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E
  2021-08-10  4:23                             ` Renius Chen
@ 2021-08-17 10:30                               ` Renius Chen
  0 siblings, 0 replies; 17+ messages in thread
From: Renius Chen @ 2021-08-17 10:30 UTC (permalink / raw)
  To: linux-pm, sebastian.reichel, rjw
  Cc: Ulf Hansson, linux-mmc, Ben Chuang, Adrian Hunter,
	Linux Kernel Mailing List

Hi PM maintainers,


Thanks for being patient with me.

Do you have any comments or suggestions about the discussions?

Please kindly let me know if you have any thought on how to proceed with this.

Thank you.


Best regards,

Renius

Renius Chen <reniuschengl@gmail.com> 於 2021年8月10日 週二 下午12:23寫道:
>
> Hi,
>
>
> First I'd like to appreciate your time reading this mail.
>
> We had some issues with submitting a patch to MMC and the reviewer
> suggested us to look for some help from the PM mailing list.
>
> GL9763e is a PCIe card reader. During a sequence of random 4K reads,
> due to the long idle period time between read requests, GL9763e will
> enter ASPM L1 very frequently. Hence the performance of random 4K
> reads is very worse.
>
> We tried to enlarge the ASPM L1 entry delay to avoid GL9763e from
> entering ASPM L1 by the idle period time during 4K reads. But such an
> adjustment also affects other use cases. It will reduce the frequency
> of entering ASPM L1 under all conditions so that the battery life will
> be shorter. This will cause the PLT test to fail.
>
> So we develop a patch to balance the performance of 4K reads and the
> battery life. Our purpose is only to improve the performance of 4K
> reads, but not to affect any other use cases. First, we monitor the
> requests, when a sequence of 4K reads is performing, we'll modify the
> value of a vendor specified register in GL9763e to disable ASPM L1 by
> the GL9763e hardware. Then re-enable ASPM L1 after the 4K reads are
> finished.
>
> But MMC reviewers think such behaviors may not be suitable for a MMC
> host driver and believe that there may be some better ways to achieve
> our goals.
>
> So I'm here to ask for your advice. Do you have any ideas for this
> case? Are this scenario and ASPM related to runtime PM? In my
> cognition, the entering and exiting of ASPM L0s and L1 are pure
> hardware behaviors and not handled by software, they are different
> from suspend/resume and runtime PM and D0/D3, right?
>
> Thanks a lot.
>
>
> Best regards,
>
> Renius
>
> Adrian Hunter <adrian.hunter@intel.com> 於 2021年8月4日 週三 下午2:26寫道:
> >
> > On 19/07/21 12:26 pm, Renius Chen wrote:
> > > Adrian Hunter <adrian.hunter@intel.com> 於 2021年7月16日 週五 下午6:27寫道:
> > >>
> > >> On 14/07/21 5:15 am, Renius Chen wrote:
> > >>> Hi Adrain,
> > >>>
> > >>> What do you think of this patch?
> > >>> Or do you have any ideas or suggestions about the modification for
> > >>> Ulf's comments?
> > >>
> > >> Perhaps try to define your power management requirements in terms of
> > >> latencies instead of request size, and then take the issue to the
> > >> power management mailing list and power management maintainers for
> > >> suggestions.  You will probably need to point out why runtime PM doesn't
> > >> met your requirements.
> > >>
> > >
> > > Hi Adrain,
> > >
> > >
> > > Thanks for your advice.
> > >
> > > Our purpose is only to improve the performance of 4K reads, and we
> > > hope that it doesn't affect any other use cases. If we look into the
> > > latencies, it may affect not only 4K reads but also some other use
> > > cases.
> >
> > I just meant that, if you present the problem to people on the power
> > management mailing lists,  you probably need to describe the problem at
> > an engineering level, instead of describing your solution at a
> > programming level.
> >
> > >
> > > Behaviors of ASPM is controlled by circuits of hardware. Drivers only
> > > enable or disable ASPM or set some parameters for ASPM, and are not
> > > able to know when the device enters or exits the L0s/L1 state. So the
> > > PM part of drivers may not suit this case.
> > >
> > > This patch could be simply divided into two parts:
> > > 1. Monitor requests.
> > > 2. Set a vendor specific register of GL9763e.
> > >
> > > The part 2 is no problems we think. And Ulf thinks that the behaviors
> > > of part 1 should not be implemented in sdhci-pci-gli.c. Do you have
> > > any suggestions on where we can implement the monitoring?
> > >
> > > Thank you.
> > >
> > >
> > > Best regards,
> > >
> > > Renius
> > >
> > >>>
> > >>> Thank you.
> > >>>
> > >>>
> > >>> Best regards,
> > >>>
> > >>> Renius
> > >>>
> > >>> Renius Chen <reniuschengl@gmail.com> 於 2021年7月7日 週三 下午9:49寫道:
> > >>>>
> > >>>> Ulf Hansson <ulf.hansson@linaro.org> 於 2021年7月7日 週三 下午8:16寫道:
> > >>>>>
> > >>>>> [...]
> > >>>>>
> > >>>>>>
> > >>>>>> Thanks, I understand what you mean.
> > >>>>>>
> > >>>>>> I simply searched for the keyword "MMC_READ_MULTIPLE_BLOCK" in the
> > >>>>>> drivers/mmc/host folder, and found that in some SD/MMC host controller
> > >>>>>> driver codes such as alcor.c, cavium.c, ...etc, there are also
> > >>>>>> behaviors for monitoring the request in their driver. What's the
> > >>>>>> difference between theirs and ours?
> > >>>>>
> > >>>>> Those checks are there to allow the HWs to be supported properly.
> > >>>>>
> > >>>>>>
> > >>>>>> And if the code that monitors the requstes does not belong the driver,
> > >>>>>> where should I implement the code and how to add some functions only
> > >>>>>> for GL9763e in that place, in your opinion?
> > >>>>>
> > >>>>> Honestly, I am not sure what suits your use case best.
> > >>>>>
> > >>>>> So far we have used runtime PM with a default auto suspend timeout, in
> > >>>>> combination with dev PM Qos. In other words, run as fast as possible
> > >>>>> to complete the requests in the queue then go back to idle and enter a
> > >>>>> low power state. Clearly, that seems not to be sufficient for your use
> > >>>>> case, sorry.
> > >>>>>
> > >>>> Yes, the runtime PM, auto suspend, and PM Qos are all about the
> > >>>> suspend/resume behaviors of the system or related to power states such
> > >>>> as D0/D3 of the device. But these are totally different from the ASPM
> > >>>> L0s/L1 for link states. Entering/exiting the ASPM is pure hardware
> > >>>> behavior on the link layer and is not handled by any codes in
> > >>>> drivers/mmc/core or drivers/mmc/host. We'd like to try to modify the
> > >>>> patch by your opinions, but we are also confused about what or where
> > >>>> suits our use case best. So we wonder how to start the modification
> > >>>> and may need some suggestions to deal with the work, sorry.
> > >>>>
> > >>>> Thank you.
> > >>>>
> > >>>>
> > >>>> Best regards,
> > >>>>
> > >>>> Renius
> > >>>>
> > >>>>
> > >>>>> Kind regards
> > >>>>> Uffe
> > >>
> >

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2021-08-17 10:30 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-05  9:00 [PATCH] [v2] mmc: sdhci-pci-gli: Improve Random 4K Read Performance of GL9763E Renius Chen
2021-07-05 10:02 ` Ulf Hansson
2021-07-05 10:59   ` Renius Chen
2021-07-05 12:50     ` Ulf Hansson
2021-07-05 15:09       ` Renius Chen
2021-07-06  9:16         ` Ulf Hansson
2021-07-06  9:54           ` Renius Chen
2021-07-06 10:08             ` Ulf Hansson
2021-07-06 10:56               ` Renius Chen
2021-07-07 12:15                 ` Ulf Hansson
2021-07-07 13:49                   ` Renius Chen
2021-07-14  2:15                     ` Renius Chen
2021-07-16 10:27                       ` Adrian Hunter
2021-07-19  9:26                         ` Renius Chen
2021-08-04  6:27                           ` Adrian Hunter
2021-08-10  4:23                             ` Renius Chen
2021-08-17 10:30                               ` Renius Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).