All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] nvme: Fix cache alignment
@ 2021-01-30 17:53 Marek Vasut
  2021-02-02  3:55 ` Bin Meng
  0 siblings, 1 reply; 22+ messages in thread
From: Marek Vasut @ 2021-01-30 17:53 UTC (permalink / raw)
  To: u-boot

The various structures in the driver are already correcty padded and
cache aligned in memory, however the cache operations are called on
the structure sizes, which themselves might not be cache aligned. Add
the necessary rounding to fix this, which permits the nvme to work on
arm64.

Signed-off-by: Marek Vasut <marek.vasut+renesas@gmail.com>
Cc: Bin Meng <bmeng.cn@gmail.com>
---
 drivers/nvme/nvme.c | 50 +++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/drivers/nvme/nvme.c b/drivers/nvme/nvme.c
index 5d6331ad34..758415a53b 100644
--- a/drivers/nvme/nvme.c
+++ b/drivers/nvme/nvme.c
@@ -53,6 +53,27 @@ struct nvme_queue {
 	unsigned long cmdid_data[];
 };
 
+static void nvme_align_dcache_range(void *start, unsigned long size,
+				    unsigned long *s, unsigned long *e)
+{
+	*s = rounddown((uintptr_t)start, ARCH_DMA_MINALIGN);
+	*e = roundup((uintptr_t)start + size, ARCH_DMA_MINALIGN);
+}
+
+static void nvme_flush_dcache_range(void *start, unsigned long size)
+{
+	unsigned long s, e;
+	nvme_align_dcache_range(start, size, &s, &e);
+	flush_dcache_range(s, e);
+}
+
+static void nvme_invalidate_dcache_range(void *start, unsigned long size)
+{
+	unsigned long s, e;
+	nvme_align_dcache_range(start, size, &s, &e);
+	invalidate_dcache_range(s, e);
+}
+
 static int nvme_wait_ready(struct nvme_dev *dev, bool enabled)
 {
 	u32 bit = enabled ? NVME_CSTS_RDY : 0;
@@ -129,8 +150,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, u64 *prp2,
 	}
 	*prp2 = (ulong)dev->prp_pool;
 
-	flush_dcache_range((ulong)dev->prp_pool, (ulong)dev->prp_pool +
-			   dev->prp_entry_num * sizeof(u64));
+	nvme_flush_dcache_range(dev->prp_pool, dev->prp_entry_num * sizeof(u64));
 
 	return 0;
 }
@@ -144,10 +164,8 @@ static __le16 nvme_get_cmd_id(void)
 
 static u16 nvme_read_completion_status(struct nvme_queue *nvmeq, u16 index)
 {
-	u64 start = (ulong)&nvmeq->cqes[index];
-	u64 stop = start + sizeof(struct nvme_completion);
-
-	invalidate_dcache_range(start, stop);
+	nvme_invalidate_dcache_range(&nvmeq->cqes[index],
+				     sizeof(struct nvme_completion));
 
 	return le16_to_cpu(readw(&(nvmeq->cqes[index].status)));
 }
@@ -163,8 +181,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
 	u16 tail = nvmeq->sq_tail;
 
 	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
-	flush_dcache_range((ulong)&nvmeq->sq_cmds[tail],
-			   (ulong)&nvmeq->sq_cmds[tail] + sizeof(*cmd));
+	nvme_flush_dcache_range(&nvmeq->sq_cmds[tail], sizeof(*cmd));
 
 	if (++tail == nvmeq->q_depth)
 		tail = 0;
@@ -338,8 +355,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 	memset((void *)nvmeq->cqes, 0, NVME_CQ_SIZE(nvmeq->q_depth));
-	flush_dcache_range((ulong)nvmeq->cqes,
-			   (ulong)nvmeq->cqes + NVME_CQ_SIZE(nvmeq->q_depth));
+	nvme_flush_dcache_range(nvmeq->cqes, NVME_CQ_SIZE(nvmeq->q_depth));
 	dev->online_queues++;
 }
 
@@ -466,13 +482,13 @@ int nvme_identify(struct nvme_dev *dev, unsigned nsid,
 
 	c.identify.cns = cpu_to_le32(cns);
 
-	invalidate_dcache_range(dma_addr,
-				dma_addr + sizeof(struct nvme_id_ctrl));
+	nvme_invalidate_dcache_range((void *)dma_addr,
+				     sizeof(struct nvme_id_ctrl));
 
 	ret = nvme_submit_admin_cmd(dev, &c, NULL);
 	if (!ret)
-		invalidate_dcache_range(dma_addr,
-					dma_addr + sizeof(struct nvme_id_ctrl));
+		nvme_invalidate_dcache_range((void *)dma_addr,
+					     sizeof(struct nvme_id_ctrl));
 
 	return ret;
 }
@@ -729,8 +745,7 @@ static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
 	u16 lbas = 1 << (dev->max_transfer_shift - ns->lba_shift);
 	u64 total_lbas = blkcnt;
 
-	flush_dcache_range((unsigned long)buffer,
-			   (unsigned long)buffer + total_len);
+	nvme_flush_dcache_range(buffer, total_len);
 
 	c.rw.opcode = read ? nvme_cmd_read : nvme_cmd_write;
 	c.rw.flags = 0;
@@ -767,8 +782,7 @@ static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
 	}
 
 	if (read)
-		invalidate_dcache_range((unsigned long)buffer,
-					(unsigned long)buffer + total_len);
+		nvme_invalidate_dcache_range(buffer, total_len);
 
 	return (total_len - temp_len) >> desc->log2blksz;
 }
-- 
2.29.2

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-01-30 17:53 [PATCH] nvme: Fix cache alignment Marek Vasut
@ 2021-02-02  3:55 ` Bin Meng
  2021-02-02  8:05   ` Marek Vasut
                     ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Bin Meng @ 2021-02-02  3:55 UTC (permalink / raw)
  To: u-boot

On Sun, Jan 31, 2021 at 1:53 AM Marek Vasut <marek.vasut@gmail.com> wrote:
>
> The various structures in the driver are already correcty padded and

typo: correctly

> cache aligned in memory, however the cache operations are called on
> the structure sizes, which themselves might not be cache aligned. Add
> the necessary rounding to fix this, which permits the nvme to work on
> arm64.

+ARM guys

Which ARM64 SoC did you test this with?

The round down in this patch should be unnecessary. But it's better to
figure out which call to dcache_xxx() with an unaligned end address.

>
> Signed-off-by: Marek Vasut <marek.vasut+renesas@gmail.com>
> Cc: Bin Meng <bmeng.cn@gmail.com>
> ---
>  drivers/nvme/nvme.c | 50 +++++++++++++++++++++++++++++----------------
>  1 file changed, 32 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/nvme/nvme.c b/drivers/nvme/nvme.c
> index 5d6331ad34..758415a53b 100644
> --- a/drivers/nvme/nvme.c
> +++ b/drivers/nvme/nvme.c
> @@ -53,6 +53,27 @@ struct nvme_queue {
>         unsigned long cmdid_data[];
>  };
>
> +static void nvme_align_dcache_range(void *start, unsigned long size,
> +                                   unsigned long *s, unsigned long *e)
> +{
> +       *s = rounddown((uintptr_t)start, ARCH_DMA_MINALIGN);
> +       *e = roundup((uintptr_t)start + size, ARCH_DMA_MINALIGN);
> +}
> +
> +static void nvme_flush_dcache_range(void *start, unsigned long size)
> +{
> +       unsigned long s, e;
> +       nvme_align_dcache_range(start, size, &s, &e);
> +       flush_dcache_range(s, e);
> +}
> +
> +static void nvme_invalidate_dcache_range(void *start, unsigned long size)
> +{
> +       unsigned long s, e;
> +       nvme_align_dcache_range(start, size, &s, &e);
> +       invalidate_dcache_range(s, e);
> +}
> +
>  static int nvme_wait_ready(struct nvme_dev *dev, bool enabled)
>  {
>         u32 bit = enabled ? NVME_CSTS_RDY : 0;
> @@ -129,8 +150,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, u64 *prp2,
>         }
>         *prp2 = (ulong)dev->prp_pool;
>
> -       flush_dcache_range((ulong)dev->prp_pool, (ulong)dev->prp_pool +
> -                          dev->prp_entry_num * sizeof(u64));
> +       nvme_flush_dcache_range(dev->prp_pool, dev->prp_entry_num * sizeof(u64));
>
>         return 0;
>  }
> @@ -144,10 +164,8 @@ static __le16 nvme_get_cmd_id(void)
>
>  static u16 nvme_read_completion_status(struct nvme_queue *nvmeq, u16 index)
>  {
> -       u64 start = (ulong)&nvmeq->cqes[index];
> -       u64 stop = start + sizeof(struct nvme_completion);
> -
> -       invalidate_dcache_range(start, stop);
> +       nvme_invalidate_dcache_range(&nvmeq->cqes[index],
> +                                    sizeof(struct nvme_completion));
>
>         return le16_to_cpu(readw(&(nvmeq->cqes[index].status)));
>  }
> @@ -163,8 +181,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
>         u16 tail = nvmeq->sq_tail;
>
>         memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> -       flush_dcache_range((ulong)&nvmeq->sq_cmds[tail],
> -                          (ulong)&nvmeq->sq_cmds[tail] + sizeof(*cmd));
> +       nvme_flush_dcache_range(&nvmeq->sq_cmds[tail], sizeof(*cmd));
>
>         if (++tail == nvmeq->q_depth)
>                 tail = 0;
> @@ -338,8 +355,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
>         nvmeq->cq_phase = 1;
>         nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
>         memset((void *)nvmeq->cqes, 0, NVME_CQ_SIZE(nvmeq->q_depth));
> -       flush_dcache_range((ulong)nvmeq->cqes,
> -                          (ulong)nvmeq->cqes + NVME_CQ_SIZE(nvmeq->q_depth));
> +       nvme_flush_dcache_range(nvmeq->cqes, NVME_CQ_SIZE(nvmeq->q_depth));
>         dev->online_queues++;
>  }
>
> @@ -466,13 +482,13 @@ int nvme_identify(struct nvme_dev *dev, unsigned nsid,
>
>         c.identify.cns = cpu_to_le32(cns);
>
> -       invalidate_dcache_range(dma_addr,
> -                               dma_addr + sizeof(struct nvme_id_ctrl));
> +       nvme_invalidate_dcache_range((void *)dma_addr,
> +                                    sizeof(struct nvme_id_ctrl));
>
>         ret = nvme_submit_admin_cmd(dev, &c, NULL);
>         if (!ret)
> -               invalidate_dcache_range(dma_addr,
> -                                       dma_addr + sizeof(struct nvme_id_ctrl));
> +               nvme_invalidate_dcache_range((void *)dma_addr,
> +                                            sizeof(struct nvme_id_ctrl));
>
>         return ret;
>  }
> @@ -729,8 +745,7 @@ static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
>         u16 lbas = 1 << (dev->max_transfer_shift - ns->lba_shift);
>         u64 total_lbas = blkcnt;
>
> -       flush_dcache_range((unsigned long)buffer,
> -                          (unsigned long)buffer + total_len);
> +       nvme_flush_dcache_range(buffer, total_len);
>
>         c.rw.opcode = read ? nvme_cmd_read : nvme_cmd_write;
>         c.rw.flags = 0;
> @@ -767,8 +782,7 @@ static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
>         }
>
>         if (read)
> -               invalidate_dcache_range((unsigned long)buffer,
> -                                       (unsigned long)buffer + total_len);
> +               nvme_invalidate_dcache_range(buffer, total_len);
>
>         return (total_len - temp_len) >> desc->log2blksz;
>  }

Regards,
Bin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02  3:55 ` Bin Meng
@ 2021-02-02  8:05   ` Marek Vasut
  2021-02-02  8:54     ` Bin Meng
  2021-02-02 13:04   ` Andre Przywara
  2021-02-02 16:23   ` Andre Przywara
  2 siblings, 1 reply; 22+ messages in thread
From: Marek Vasut @ 2021-02-02  8:05 UTC (permalink / raw)
  To: u-boot

On 2/2/21 4:55 AM, Bin Meng wrote:

Hi,

>> The various structures in the driver are already correcty padded and
> 
> typo: correctly
> 
>> cache aligned in memory, however the cache operations are called on
>> the structure sizes, which themselves might not be cache aligned. Add
>> the necessary rounding to fix this, which permits the nvme to work on
>> arm64.
> 
> +ARM guys
> 
> Which ARM64 SoC did you test this with?

RCar3, although that's irrelevant, the problem will happen on any arm or 
arm64, and possibly any other system which needs cache management.

> The round down in this patch should be unnecessary.

Can you explain why ?

> But it's better to
> figure out which call to dcache_xxx() with an unaligned end address.

If you look at the code, most of them can (and do) trigger this, 
therefore they need such alignment, as explained in the commit message.

[...]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02  8:05   ` Marek Vasut
@ 2021-02-02  8:54     ` Bin Meng
  2021-02-02  9:04       ` Marek Vasut
  0 siblings, 1 reply; 22+ messages in thread
From: Bin Meng @ 2021-02-02  8:54 UTC (permalink / raw)
  To: u-boot

On Tue, Feb 2, 2021 at 4:05 PM Marek Vasut <marek.vasut@gmail.com> wrote:
>
> On 2/2/21 4:55 AM, Bin Meng wrote:
>
> Hi,
>
> >> The various structures in the driver are already correcty padded and
> >
> > typo: correctly
> >
> >> cache aligned in memory, however the cache operations are called on
> >> the structure sizes, which themselves might not be cache aligned. Add
> >> the necessary rounding to fix this, which permits the nvme to work on
> >> arm64.
> >
> > +ARM guys
> >
> > Which ARM64 SoC did you test this with?
>
> RCar3, although that's irrelevant, the problem will happen on any arm or
> arm64, and possibly any other system which needs cache management.

There was a recent change to nvme.c that fixed a cache issue on ARMv8
so I thought this might be platform related.

>
> > The round down in this patch should be unnecessary.
>
> Can you explain why ?

I just took a further look and most of the start address should be
cache line aligned (4KiB aligned) except the
nvme_read_completion_status(). It's only 16 bytes aligned which might
not be cache line aligned.

>
> > But it's better to
> > figure out which call to dcache_xxx() with an unaligned end address.
>
> If you look at the code, most of them can (and do) trigger this,
> therefore they need such alignment, as explained in the commit message.

Now I wonder what's the correct implementation of the
invalidate_dcache_range() and flush_dcache_range() in U-Boot?
Shouldn't the round down/up happen in these APIs instead of doing such
in drivers?

Regards,
Bin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02  8:54     ` Bin Meng
@ 2021-02-02  9:04       ` Marek Vasut
  2021-02-02  9:12         ` Bin Meng
  0 siblings, 1 reply; 22+ messages in thread
From: Marek Vasut @ 2021-02-02  9:04 UTC (permalink / raw)
  To: u-boot

On 2/2/21 9:54 AM, Bin Meng wrote:
[...]
>>>> cache aligned in memory, however the cache operations are called on
>>>> the structure sizes, which themselves might not be cache aligned. Add
>>>> the necessary rounding to fix this, which permits the nvme to work on
>>>> arm64.
>>>
>>> +ARM guys
>>>
>>> Which ARM64 SoC did you test this with?
>>
>> RCar3, although that's irrelevant, the problem will happen on any arm or
>> arm64, and possibly any other system which needs cache management.
> 
> There was a recent change to nvme.c that fixed a cache issue on ARMv8
> so I thought this might be platform related.

I used master, so unlikely.

>>> The round down in this patch should be unnecessary.
>>
>> Can you explain why ?
> 
> I just took a further look and most of the start address should be
> cache line aligned (4KiB aligned) except the
> nvme_read_completion_status(). It's only 16 bytes aligned which might
> not be cache line aligned.

Right, there are various arm chips with 32B/64B alignment requirements.

>>> But it's better to
>>> figure out which call to dcache_xxx() with an unaligned end address.
>>
>> If you look at the code, most of them can (and do) trigger this,
>> therefore they need such alignment, as explained in the commit message.
> 
> Now I wonder what's the correct implementation of the
> invalidate_dcache_range() and flush_dcache_range() in U-Boot?
> Shouldn't the round down/up happen in these APIs instead of doing such
> in drivers?

Definitely not, because then the rounding might flush/invalidate cache 
over areas where this could cause a problem (e.g. neighboring DMA 
descriptors). The driver must do the cache management correctly.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02  9:04       ` Marek Vasut
@ 2021-02-02  9:12         ` Bin Meng
  2021-02-02 16:09           ` Marek Vasut
  0 siblings, 1 reply; 22+ messages in thread
From: Bin Meng @ 2021-02-02  9:12 UTC (permalink / raw)
  To: u-boot

On Tue, Feb 2, 2021 at 5:04 PM Marek Vasut <marek.vasut@gmail.com> wrote:
>
> On 2/2/21 9:54 AM, Bin Meng wrote:
> [...]
> >>>> cache aligned in memory, however the cache operations are called on
> >>>> the structure sizes, which themselves might not be cache aligned. Add
> >>>> the necessary rounding to fix this, which permits the nvme to work on
> >>>> arm64.
> >>>
> >>> +ARM guys
> >>>
> >>> Which ARM64 SoC did you test this with?
> >>
> >> RCar3, although that's irrelevant, the problem will happen on any arm or
> >> arm64, and possibly any other system which needs cache management.
> >
> > There was a recent change to nvme.c that fixed a cache issue on ARMv8
> > so I thought this might be platform related.
>
> I used master, so unlikely.

It's strange this issue was not exposed last time when this driver was
tested on ARMv8.

>
> >>> The round down in this patch should be unnecessary.
> >>
> >> Can you explain why ?
> >
> > I just took a further look and most of the start address should be
> > cache line aligned (4KiB aligned) except the
> > nvme_read_completion_status(). It's only 16 bytes aligned which might
> > not be cache line aligned.
>
> Right, there are various arm chips with 32B/64B alignment requirements.
>
> >>> But it's better to
> >>> figure out which call to dcache_xxx() with an unaligned end address.
> >>
> >> If you look at the code, most of them can (and do) trigger this,
> >> therefore they need such alignment, as explained in the commit message.
> >
> > Now I wonder what's the correct implementation of the
> > invalidate_dcache_range() and flush_dcache_range() in U-Boot?
> > Shouldn't the round down/up happen in these APIs instead of doing such
> > in drivers?
>
> Definitely not, because then the rounding might flush/invalidate cache
> over areas where this could cause a problem (e.g. neighboring DMA
> descriptors). The driver must do the cache management correctly.

Well we can implement in these APIs and document its expected usage.
Either way a driver has to do the cache management correctly. Not
doing it in the driver eliminates some duplications of rounding
up/down.

For this case, I believe we just need to take care of
nvme_read_completion_status().

Regards,
Bin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02  3:55 ` Bin Meng
  2021-02-02  8:05   ` Marek Vasut
@ 2021-02-02 13:04   ` Andre Przywara
  2021-02-02 16:08     ` Marek Vasut
  2021-02-02 16:23   ` Andre Przywara
  2 siblings, 1 reply; 22+ messages in thread
From: Andre Przywara @ 2021-02-02 13:04 UTC (permalink / raw)
  To: u-boot

On Tue, 2 Feb 2021 11:55:50 +0800
Bin Meng <bmeng.cn@gmail.com> wrote:

Hi,

> On Sun, Jan 31, 2021 at 1:53 AM Marek Vasut <marek.vasut@gmail.com> wrote:
> >
> > The various structures in the driver are already correcty padded and  
> 
> typo: correctly
> 
> > cache aligned in memory, however the cache operations are called on
> > the structure sizes, which themselves might not be cache aligned. Add
> > the necessary rounding to fix this, which permits the nvme to work on
> > arm64.  
> 
> +ARM guys
> 
> Which ARM64 SoC did you test this with?
> 
> The round down in this patch should be unnecessary. But it's better to
> figure out which call to dcache_xxx() with an unaligned end address.

I agree. There is no requirement for the actual cache maintenance
instruction's address to be aligned, and also we align everything
already in the implementations of flush_dcache_range() and
invalidate_dcache_range().

Actually I think rounding here is *wrong*, as we paper over the
requirement of the *buffer* to be cache line aligned. So this must be
assured at buffer allocation time, and just rounding before calling
cache maintenance merely avoids (read: silences!) the warnings.

I think drivers/net/bcmgenet.c does the right thing.

Happy to provide more detailed rationale and explanations if needed.

Cheers,
Andre
 
> >
> > Signed-off-by: Marek Vasut <marek.vasut+renesas@gmail.com>
> > Cc: Bin Meng <bmeng.cn@gmail.com>
> > ---
> >  drivers/nvme/nvme.c | 50 +++++++++++++++++++++++++++++----------------
> >  1 file changed, 32 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/nvme/nvme.c b/drivers/nvme/nvme.c
> > index 5d6331ad34..758415a53b 100644
> > --- a/drivers/nvme/nvme.c
> > +++ b/drivers/nvme/nvme.c
> > @@ -53,6 +53,27 @@ struct nvme_queue {
> >         unsigned long cmdid_data[];
> >  };
> >
> > +static void nvme_align_dcache_range(void *start, unsigned long size,
> > +                                   unsigned long *s, unsigned long *e)
> > +{
> > +       *s = rounddown((uintptr_t)start, ARCH_DMA_MINALIGN);
> > +       *e = roundup((uintptr_t)start + size, ARCH_DMA_MINALIGN);
> > +}
> > +
> > +static void nvme_flush_dcache_range(void *start, unsigned long size)
> > +{
> > +       unsigned long s, e;
> > +       nvme_align_dcache_range(start, size, &s, &e);
> > +       flush_dcache_range(s, e);
> > +}
> > +
> > +static void nvme_invalidate_dcache_range(void *start, unsigned long size)
> > +{
> > +       unsigned long s, e;
> > +       nvme_align_dcache_range(start, size, &s, &e);
> > +       invalidate_dcache_range(s, e);
> > +}
> > +
> >  static int nvme_wait_ready(struct nvme_dev *dev, bool enabled)
> >  {
> >         u32 bit = enabled ? NVME_CSTS_RDY : 0;
> > @@ -129,8 +150,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, u64 *prp2,
> >         }
> >         *prp2 = (ulong)dev->prp_pool;
> >
> > -       flush_dcache_range((ulong)dev->prp_pool, (ulong)dev->prp_pool +
> > -                          dev->prp_entry_num * sizeof(u64));
> > +       nvme_flush_dcache_range(dev->prp_pool, dev->prp_entry_num * sizeof(u64));
> >
> >         return 0;
> >  }
> > @@ -144,10 +164,8 @@ static __le16 nvme_get_cmd_id(void)
> >
> >  static u16 nvme_read_completion_status(struct nvme_queue *nvmeq, u16 index)
> >  {
> > -       u64 start = (ulong)&nvmeq->cqes[index];
> > -       u64 stop = start + sizeof(struct nvme_completion);
> > -
> > -       invalidate_dcache_range(start, stop);
> > +       nvme_invalidate_dcache_range(&nvmeq->cqes[index],
> > +                                    sizeof(struct nvme_completion));
> >
> >         return le16_to_cpu(readw(&(nvmeq->cqes[index].status)));
> >  }
> > @@ -163,8 +181,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
> >         u16 tail = nvmeq->sq_tail;
> >
> >         memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> > -       flush_dcache_range((ulong)&nvmeq->sq_cmds[tail],
> > -                          (ulong)&nvmeq->sq_cmds[tail] + sizeof(*cmd));
> > +       nvme_flush_dcache_range(&nvmeq->sq_cmds[tail], sizeof(*cmd));
> >
> >         if (++tail == nvmeq->q_depth)
> >                 tail = 0;
> > @@ -338,8 +355,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
> >         nvmeq->cq_phase = 1;
> >         nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
> >         memset((void *)nvmeq->cqes, 0, NVME_CQ_SIZE(nvmeq->q_depth));
> > -       flush_dcache_range((ulong)nvmeq->cqes,
> > -                          (ulong)nvmeq->cqes + NVME_CQ_SIZE(nvmeq->q_depth));
> > +       nvme_flush_dcache_range(nvmeq->cqes, NVME_CQ_SIZE(nvmeq->q_depth));
> >         dev->online_queues++;
> >  }
> >
> > @@ -466,13 +482,13 @@ int nvme_identify(struct nvme_dev *dev, unsigned nsid,
> >
> >         c.identify.cns = cpu_to_le32(cns);
> >
> > -       invalidate_dcache_range(dma_addr,
> > -                               dma_addr + sizeof(struct nvme_id_ctrl));
> > +       nvme_invalidate_dcache_range((void *)dma_addr,
> > +                                    sizeof(struct nvme_id_ctrl));
> >
> >         ret = nvme_submit_admin_cmd(dev, &c, NULL);
> >         if (!ret)
> > -               invalidate_dcache_range(dma_addr,
> > -                                       dma_addr + sizeof(struct nvme_id_ctrl));
> > +               nvme_invalidate_dcache_range((void *)dma_addr,
> > +                                            sizeof(struct nvme_id_ctrl));
> >
> >         return ret;
> >  }
> > @@ -729,8 +745,7 @@ static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
> >         u16 lbas = 1 << (dev->max_transfer_shift - ns->lba_shift);
> >         u64 total_lbas = blkcnt;
> >
> > -       flush_dcache_range((unsigned long)buffer,
> > -                          (unsigned long)buffer + total_len);
> > +       nvme_flush_dcache_range(buffer, total_len);
> >
> >         c.rw.opcode = read ? nvme_cmd_read : nvme_cmd_write;
> >         c.rw.flags = 0;
> > @@ -767,8 +782,7 @@ static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
> >         }
> >
> >         if (read)
> > -               invalidate_dcache_range((unsigned long)buffer,
> > -                                       (unsigned long)buffer + total_len);
> > +               nvme_invalidate_dcache_range(buffer, total_len);
> >
> >         return (total_len - temp_len) >> desc->log2blksz;
> >  }  
> 
> Regards,
> Bin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02 13:04   ` Andre Przywara
@ 2021-02-02 16:08     ` Marek Vasut
  0 siblings, 0 replies; 22+ messages in thread
From: Marek Vasut @ 2021-02-02 16:08 UTC (permalink / raw)
  To: u-boot

On 2/2/21 2:04 PM, Andre Przywara wrote:

Hi,

[...]

>>> The various structures in the driver are already correcty padded and
>>
>> typo: correctly
>>
>>> cache aligned in memory, however the cache operations are called on
>>> the structure sizes, which themselves might not be cache aligned. Add
>>> the necessary rounding to fix this, which permits the nvme to work on
>>> arm64.
>>
>> +ARM guys
>>
>> Which ARM64 SoC did you test this with?
>>
>> The round down in this patch should be unnecessary. But it's better to
>> figure out which call to dcache_xxx() with an unaligned end address.
> 
> I agree. There is no requirement for the actual cache maintenance
> instruction's address to be aligned, and also we align everything
> already in the implementations of flush_dcache_range() and
> invalidate_dcache_range().

I would suggest you insert check_cache_range() into 
arch/arm/cpu/armv8/cache_v8.c invalidate_dcache_range() and 
flush_dcache_range() which would prove you wrong.

> Actually I think rounding here is *wrong*, as we paper over the
> requirement of the *buffer* to be cache line aligned. So this must be
> assured at buffer allocation time, and just rounding before calling
> cache maintenance merely avoids (read: silences!) the warnings.
> 
> I think drivers/net/bcmgenet.c does the right thing.
> 
> Happy to provide more detailed rationale and explanations if needed.

Please do spend some more time on this, possibly even test your claim on 
real hardware, and then provide more details.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02  9:12         ` Bin Meng
@ 2021-02-02 16:09           ` Marek Vasut
  0 siblings, 0 replies; 22+ messages in thread
From: Marek Vasut @ 2021-02-02 16:09 UTC (permalink / raw)
  To: u-boot

On 2/2/21 10:12 AM, Bin Meng wrote:
[...]
>>>>>> cache aligned in memory, however the cache operations are called on
>>>>>> the structure sizes, which themselves might not be cache aligned. Add
>>>>>> the necessary rounding to fix this, which permits the nvme to work on
>>>>>> arm64.
>>>>>
>>>>> +ARM guys
>>>>>
>>>>> Which ARM64 SoC did you test this with?
>>>>
>>>> RCar3, although that's irrelevant, the problem will happen on any arm or
>>>> arm64, and possibly any other system which needs cache management.
>>>
>>> There was a recent change to nvme.c that fixed a cache issue on ARMv8
>>> so I thought this might be platform related.
>>
>> I used master, so unlikely.
> 
> It's strange this issue was not exposed last time when this driver was
> tested on ARMv8.

Which ARMv8 platform ?

>>>>> The round down in this patch should be unnecessary.
>>>>
>>>> Can you explain why ?
>>>
>>> I just took a further look and most of the start address should be
>>> cache line aligned (4KiB aligned) except the
>>> nvme_read_completion_status(). It's only 16 bytes aligned which might
>>> not be cache line aligned.
>>
>> Right, there are various arm chips with 32B/64B alignment requirements.
>>
>>>>> But it's better to
>>>>> figure out which call to dcache_xxx() with an unaligned end address.
>>>>
>>>> If you look at the code, most of them can (and do) trigger this,
>>>> therefore they need such alignment, as explained in the commit message.
>>>
>>> Now I wonder what's the correct implementation of the
>>> invalidate_dcache_range() and flush_dcache_range() in U-Boot?
>>> Shouldn't the round down/up happen in these APIs instead of doing such
>>> in drivers?
>>
>> Definitely not, because then the rounding might flush/invalidate cache
>> over areas where this could cause a problem (e.g. neighboring DMA
>> descriptors). The driver must do the cache management correctly.
> 
> Well we can implement in these APIs and document its expected usage.

That would be harmful, because you cannot really flush/invalidate half 
of cache line. Consider you have two 16B DMA descriptors next to each 
other, which is often the case, then the driver has to implement proper 
cache handling.

That this particular driver already has structures reasonably well 
padded that they can be flushed by adding rounding is an exception.

> Either way a driver has to do the cache management correctly. Not
> doing it in the driver eliminates some duplications of rounding
> up/down.

Indeed, and rounding up/down is the correct way to do it in this driver, 
see the commit message.

> For this case, I believe we just need to take care of
> nvme_read_completion_status().

Can you validate this ?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02  3:55 ` Bin Meng
  2021-02-02  8:05   ` Marek Vasut
  2021-02-02 13:04   ` Andre Przywara
@ 2021-02-02 16:23   ` Andre Przywara
  2021-02-02 21:18     ` Marek Vasut
  2 siblings, 1 reply; 22+ messages in thread
From: Andre Przywara @ 2021-02-02 16:23 UTC (permalink / raw)
  To: u-boot

On Tue, 2 Feb 2021 11:55:50 +0800
Bin Meng <bmeng.cn@gmail.com> wrote:

Hi,

had a look at the code, those are my findings:

> On Sun, Jan 31, 2021 at 1:53 AM Marek Vasut <marek.vasut@gmail.com> wrote:
> >
> > The various structures in the driver are already correcty padded and  
> 
> typo: correctly
> 
> > cache aligned in memory, however the cache operations are called on
> > the structure sizes, which themselves might not be cache aligned. Add
> > the necessary rounding to fix this, which permits the nvme to work on
> > arm64.  
> 
> +ARM guys
> 
> Which ARM64 SoC did you test this with?
> 
> The round down in this patch should be unnecessary. But it's better to
> figure out which call to dcache_xxx() with an unaligned end address.
> 
> >
> > Signed-off-by: Marek Vasut <marek.vasut+renesas@gmail.com>
> > Cc: Bin Meng <bmeng.cn@gmail.com>
> > ---
> >  drivers/nvme/nvme.c | 50 +++++++++++++++++++++++++++++----------------
> >  1 file changed, 32 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/nvme/nvme.c b/drivers/nvme/nvme.c
> > index 5d6331ad34..758415a53b 100644
> > --- a/drivers/nvme/nvme.c
> > +++ b/drivers/nvme/nvme.c
> > @@ -53,6 +53,27 @@ struct nvme_queue {
> >         unsigned long cmdid_data[];
> >  };
> >
> > +static void nvme_align_dcache_range(void *start, unsigned long size,
> > +                                   unsigned long *s, unsigned long *e)
> > +{
> > +       *s = rounddown((uintptr_t)start, ARCH_DMA_MINALIGN);
> > +       *e = roundup((uintptr_t)start + size, ARCH_DMA_MINALIGN);
> > +}

As mentioned in the other email, just rounding the value that we are
going to pass to the cache maintenance operation does not make sense,
so this function should go.

> > +
> > +static void nvme_flush_dcache_range(void *start, unsigned long size)
> > +{
> > +       unsigned long s, e;
> > +       nvme_align_dcache_range(start, size, &s, &e);
> > +       flush_dcache_range(s, e);

There is no good reason for alignment restrictions when it comes to
clean (& invalidate), so there is no need for this wrapper.

> > +}
> > +
> > +static void nvme_invalidate_dcache_range(void *start, unsigned long size)
> > +{
> > +       unsigned long s, e;
> > +       nvme_align_dcache_range(start, size, &s, &e);
> > +       invalidate_dcache_range(s, e);

invalidate is tricky, we could use a wrapper similar to
invalidate_dcache_check() in bcmgenet.c. However I am not sure this is
very useful here, because most requests are already for a whole buffer
size (page or block).

Alignment of the buffer address will be checked by the implementation
of invalidate_dcache_range().

> > +}
> > +
> >  static int nvme_wait_ready(struct nvme_dev *dev, bool enabled)
> >  {
> >         u32 bit = enabled ? NVME_CSTS_RDY : 0;
> > @@ -129,8 +150,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, u64 *prp2,
> >         }
> >         *prp2 = (ulong)dev->prp_pool;
> >
> > -       flush_dcache_range((ulong)dev->prp_pool, (ulong)dev->prp_pool +
> > -                          dev->prp_entry_num * sizeof(u64));
> > +       nvme_flush_dcache_range(dev->prp_pool, dev->prp_entry_num * sizeof(u64));

As mentioned above, there should be no reason to check or round addresses for flush().
It gets a bit more tricky if buffers overlap with buffers that are also
invalidated at some point, but that does not seem to be the case here.

> >
> >         return 0;
> >  }
> > @@ -144,10 +164,8 @@ static __le16 nvme_get_cmd_id(void)
> >
> >  static u16 nvme_read_completion_status(struct nvme_queue *nvmeq, u16 index)
> >  {
> > -       u64 start = (ulong)&nvmeq->cqes[index];
> > -       u64 stop = start + sizeof(struct nvme_completion);
> > -
> > -       invalidate_dcache_range(start, stop);

IIUC, this wants to invalidate a single CQ entry, which is 16 bytes
long. So this will also invalidate the neighboring entry, at least
(with 64 byte cache line size even three others).
Does the hardware require the queue entries to be consecutive?
If not, we could blow up struct nvme_completion to have the size of a
cache line.
If yes, this gets a bit more tricky.

One generic solution would be to use "coherent" memory, which is
basically mapped as uncached. And then put the CQ array in there. I
think we already have some support in U-Boot for that?

> > +       nvme_invalidate_dcache_range(&nvmeq->cqes[index],
> > +                                    sizeof(struct nvme_completion));
> >
> >         return le16_to_cpu(readw(&(nvmeq->cqes[index].status)));
> >  }
> > @@ -163,8 +181,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
> >         u16 tail = nvmeq->sq_tail;
> >
> >         memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> > -       flush_dcache_range((ulong)&nvmeq->sq_cmds[tail],
> > -                          (ulong)&nvmeq->sq_cmds[tail] + sizeof(*cmd));
> > +       nvme_flush_dcache_range(&nvmeq->sq_cmds[tail], sizeof(*cmd));
> >
> >         if (++tail == nvmeq->q_depth)
> >                 tail = 0;
> > @@ -338,8 +355,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
> >         nvmeq->cq_phase = 1;
> >         nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
> >         memset((void *)nvmeq->cqes, 0, NVME_CQ_SIZE(nvmeq->q_depth));
> > -       flush_dcache_range((ulong)nvmeq->cqes,
> > -                          (ulong)nvmeq->cqes + NVME_CQ_SIZE(nvmeq->q_depth));
> > +       nvme_flush_dcache_range(nvmeq->cqes, NVME_CQ_SIZE(nvmeq->q_depth));

This will flush multiple CQ entries, same as above. If a neighboring
entry is waiting for the status bit to flip (as above), this might be
harmful, as it might be overwritten.
If we can't blow up a single CQ entry, uncached memory might be better?

> >         dev->online_queues++;
> >  }
> >
> > @@ -466,13 +482,13 @@ int nvme_identify(struct nvme_dev *dev, unsigned nsid,
> >
> >         c.identify.cns = cpu_to_le32(cns);
> >
> > -       invalidate_dcache_range(dma_addr,
> > -                               dma_addr + sizeof(struct nvme_id_ctrl));

If I counted correctly, then nvme_id_ctrl is 4K in size, so that should
be fine. And dma_addr seems to come from a "page_size" aligned
allocation, so that should be fine as well.

> > +       nvme_invalidate_dcache_range((void *)dma_addr,
> > +                                    sizeof(struct nvme_id_ctrl));
> >
> >         ret = nvme_submit_admin_cmd(dev, &c, NULL);
> >         if (!ret)
> > -               invalidate_dcache_range(dma_addr,
> > -                                       dma_addr + sizeof(struct nvme_id_ctrl));

same as above.

> > +               nvme_invalidate_dcache_range((void *)dma_addr,
> > +                                            sizeof(struct nvme_id_ctrl));
> >
> >         return ret;
> >  }
> > @@ -729,8 +745,7 @@ static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
> >         u16 lbas = 1 << (dev->max_transfer_shift - ns->lba_shift);
> >         u64 total_lbas = blkcnt;
> >
> > -       flush_dcache_range((unsigned long)buffer,
> > -                          (unsigned long)buffer + total_len);
> > +       nvme_flush_dcache_range(buffer, total_len);
> >
> >         c.rw.opcode = read ? nvme_cmd_read : nvme_cmd_write;
> >         c.rw.flags = 0;
> > @@ -767,8 +782,7 @@ static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
> >         }
> >
> >         if (read)
> > -               invalidate_dcache_range((unsigned long)buffer,
> > -                                       (unsigned long)buffer + total_len);

total_len is probably fine, that seems to be at least as big as
the block size, which is hopefully bigger than a cache line.
"buffer" comes directly from the UCLASS_BLK layer, does anyone know
where those buffers are allocated?

Cheers,
Andre


> > +               nvme_invalidate_dcache_range(buffer, total_len);
> >
> >         return (total_len - temp_len) >> desc->log2blksz;
> >  }  
> 
> Regards,
> Bin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02 16:23   ` Andre Przywara
@ 2021-02-02 21:18     ` Marek Vasut
  2021-02-03 10:42       ` Andre Przywara
  0 siblings, 1 reply; 22+ messages in thread
From: Marek Vasut @ 2021-02-02 21:18 UTC (permalink / raw)
  To: u-boot

On 2/2/21 5:23 PM, Andre Przywara wrote:
[...]

>>>   drivers/nvme/nvme.c | 50 +++++++++++++++++++++++++++++----------------
>>>   1 file changed, 32 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/nvme/nvme.c b/drivers/nvme/nvme.c
>>> index 5d6331ad34..758415a53b 100644
>>> --- a/drivers/nvme/nvme.c
>>> +++ b/drivers/nvme/nvme.c
>>> @@ -53,6 +53,27 @@ struct nvme_queue {
>>>          unsigned long cmdid_data[];
>>>   };
>>>
>>> +static void nvme_align_dcache_range(void *start, unsigned long size,
>>> +                                   unsigned long *s, unsigned long *e)
>>> +{
>>> +       *s = rounddown((uintptr_t)start, ARCH_DMA_MINALIGN);
>>> +       *e = roundup((uintptr_t)start + size, ARCH_DMA_MINALIGN);
>>> +}
> 
> As mentioned in the other email, just rounding the value that we are
> going to pass to the cache maintenance operation does not make sense,
> so this function should go.

You keep saying that this patch makes no sense, but I don't see any 
explanation _why_ there is a problem.

>>> +
>>> +static void nvme_flush_dcache_range(void *start, unsigned long size)
>>> +{
>>> +       unsigned long s, e;
>>> +       nvme_align_dcache_range(start, size, &s, &e);
>>> +       flush_dcache_range(s, e);
> 
> There is no good reason for alignment restrictions when it comes to
> clean (& invalidate), so there is no need for this wrapper.

Is that on ARM64-specific or is that applicable in general ? The driver 
is expected to work on any CPU. The flushed area start/end address must 
be cache line aligned, hence the wrapper is correct and needed.

>>> +}
>>> +
>>> +static void nvme_invalidate_dcache_range(void *start, unsigned long size)
>>> +{
>>> +       unsigned long s, e;
>>> +       nvme_align_dcache_range(start, size, &s, &e);
>>> +       invalidate_dcache_range(s, e);
> 
> invalidate is tricky, we could use a wrapper similar to
> invalidate_dcache_check() in bcmgenet.c. However I am not sure this is
> very useful here, because most requests are already for a whole buffer
> size (page or block).
> 
> Alignment of the buffer address will be checked by the implementation
> of invalidate_dcache_range().

Both invalidate_dcache_range() and flush_dcache_range() will perform no 
operation if either start or end address is not cache-line aligned. If a 
driver passes such unaligned address to either function, then the driver 
must be fixed, because it will fail to perform cache flush/invalidate 
ops silently and then fail silently, e.g. like this nvme driver does.

>>> +}
>>> +
>>>   static int nvme_wait_ready(struct nvme_dev *dev, bool enabled)
>>>   {
>>>          u32 bit = enabled ? NVME_CSTS_RDY : 0;
>>> @@ -129,8 +150,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, u64 *prp2,
>>>          }
>>>          *prp2 = (ulong)dev->prp_pool;
>>>
>>> -       flush_dcache_range((ulong)dev->prp_pool, (ulong)dev->prp_pool +
>>> -                          dev->prp_entry_num * sizeof(u64));
>>> +       nvme_flush_dcache_range(dev->prp_pool, dev->prp_entry_num * sizeof(u64));
> 
> As mentioned above, there should be no reason to check or round addresses for flush().

Again, if the start/end address of the flush/invalidate_dcache_range 
operation is unaligned, the operation is not performed, hence the 
alignment is mandatory.

> It gets a bit more tricky if buffers overlap with buffers that are also
> invalidated at some point, but that does not seem to be the case here.

The buffers are correctly padded already, hence there is no problem with 
overlap, and hence the rounding is exactly what has to be done.

>>>
>>>          return 0;
>>>   }
>>> @@ -144,10 +164,8 @@ static __le16 nvme_get_cmd_id(void)
>>>
>>>   static u16 nvme_read_completion_status(struct nvme_queue *nvmeq, u16 index)
>>>   {
>>> -       u64 start = (ulong)&nvmeq->cqes[index];
>>> -       u64 stop = start + sizeof(struct nvme_completion);
>>> -
>>> -       invalidate_dcache_range(start, stop);
> 
> IIUC, this wants to invalidate a single CQ entry, which is 16 bytes
> long. So this will also invalidate the neighboring entry, at least
> (with 64 byte cache line size even three others).
> Does the hardware require the queue entries to be consecutive?
> If not, we could blow up struct nvme_completion to have the size of a
> cache line.
> If yes, this gets a bit more tricky.

The CQ entry is correctly padded, so this concern does not apply.
Note that this is also mentioned in the commit message.

> One generic solution would be to use "coherent" memory, which is
> basically mapped as uncached. And then put the CQ array in there. I
> think we already have some support in U-Boot for that?

Yes, some drivers seems to be misusing coherent memory where it makes 
zero sense to use it. Here it makes zero sense, since the CQ entries are 
cacheline aligned and all that needs to be done is correctly apply the 
cache operations, which is done by this patch.

>>> +       nvme_invalidate_dcache_range(&nvmeq->cqes[index],
>>> +                                    sizeof(struct nvme_completion));
>>>
>>>          return le16_to_cpu(readw(&(nvmeq->cqes[index].status)));
>>>   }
>>> @@ -163,8 +181,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
>>>          u16 tail = nvmeq->sq_tail;
>>>
>>>          memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>> -       flush_dcache_range((ulong)&nvmeq->sq_cmds[tail],
>>> -                          (ulong)&nvmeq->sq_cmds[tail] + sizeof(*cmd));
>>> +       nvme_flush_dcache_range(&nvmeq->sq_cmds[tail], sizeof(*cmd));
>>>
>>>          if (++tail == nvmeq->q_depth)
>>>                  tail = 0;
>>> @@ -338,8 +355,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
>>>          nvmeq->cq_phase = 1;
>>>          nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
>>>          memset((void *)nvmeq->cqes, 0, NVME_CQ_SIZE(nvmeq->q_depth));
>>> -       flush_dcache_range((ulong)nvmeq->cqes,
>>> -                          (ulong)nvmeq->cqes + NVME_CQ_SIZE(nvmeq->q_depth));
>>> +       nvme_flush_dcache_range(nvmeq->cqes, NVME_CQ_SIZE(nvmeq->q_depth));
> 
> This will flush multiple CQ entries, same as above. If a neighboring
> entry is waiting for the status bit to flip (as above), this might be
> harmful, as it might be overwritten.
> If we can't blow up a single CQ entry, uncached memory might be better?

No, see above.

[...]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-02 21:18     ` Marek Vasut
@ 2021-02-03 10:42       ` Andre Przywara
  2021-02-03 13:08         ` Marek Vasut
  0 siblings, 1 reply; 22+ messages in thread
From: Andre Przywara @ 2021-02-03 10:42 UTC (permalink / raw)
  To: u-boot

On Tue, 2 Feb 2021 22:18:47 +0100
Marek Vasut <marek.vasut@gmail.com> wrote:

Hi,

> On 2/2/21 5:23 PM, Andre Przywara wrote:
> [...]
> 
> >>>   drivers/nvme/nvme.c | 50 +++++++++++++++++++++++++++++----------------
> >>>   1 file changed, 32 insertions(+), 18 deletions(-)
> >>>
> >>> diff --git a/drivers/nvme/nvme.c b/drivers/nvme/nvme.c
> >>> index 5d6331ad34..758415a53b 100644
> >>> --- a/drivers/nvme/nvme.c
> >>> +++ b/drivers/nvme/nvme.c
> >>> @@ -53,6 +53,27 @@ struct nvme_queue {
> >>>          unsigned long cmdid_data[];
> >>>   };
> >>>
> >>> +static void nvme_align_dcache_range(void *start, unsigned long size,
> >>> +                                   unsigned long *s, unsigned long *e)
> >>> +{
> >>> +       *s = rounddown((uintptr_t)start, ARCH_DMA_MINALIGN);
> >>> +       *e = roundup((uintptr_t)start + size, ARCH_DMA_MINALIGN);
> >>> +}  
> > 
> > As mentioned in the other email, just rounding the value that we are
> > going to pass to the cache maintenance operation does not make sense,
> > so this function should go.  
> 
> You keep saying that this patch makes no sense, but I don't see any 
> explanation _why_ there is a problem.

My main point is that you merely align the values that you pass to the
cache alignment function, but not the actual buffer addresses.

Not sure if this is clear, but the VAs used in cache maintenance
instructions do NOT need to be cache line aligned. The silicon
implementation will take care of this:
"When a DC instruction requires an address argument this takes the form
of a 64-bit register that holds the VA argument. No alignment
restrictions apply for this address."
(ARMv8 ARM DDI 0487F.c D4.4.8 A64 Cache maintenance instructions: The
data cache maintenance instruction (DC), pg. D4-2505)

"When the data is stated to be an MVA, it does not have to be cache line aligned."
(ARMv7 ARM DDI 0406C.d B4.2.1 Cache and branch predictor maintenance
operations, VMSA; section "MVA", pg. B4-1736)

I think I once chased this back to even ARMv6.

What requires alignment are the buffers used with cache maintenance, to
avoid a cache maintenance operation inadvertently affecting other data.

One dangerous operation is a "pure" invalidate on a *dirty* cache line,
as this may potentially lose data.
The other harm I see is when cleaning a line, when the hardware (some
DMA) has updated the backing memory meanwhile (for instance to update a
status bit or to fill a buffer with data from the hardware.

Avoiding those two situations requires careful alignment and padding of
the *buffers*, blanketly rounding any addresses handed to the cache
maintenance operation will not help. 

> >>> +
> >>> +static void nvme_flush_dcache_range(void *start, unsigned long size)
> >>> +{
> >>> +       unsigned long s, e;
> >>> +       nvme_align_dcache_range(start, size, &s, &e);
> >>> +       flush_dcache_range(s, e);  
> > 
> > There is no good reason for alignment restrictions when it comes to
> > clean (& invalidate), so there is no need for this wrapper.  
> 
> Is that on ARM64-specific or is that applicable in general ? The driver 
> is expected to work on any CPU.

Cache clean (actually: cache clean&invalidate) is what happens on evictions
all of the time, at the cache controller's discretion. So there is no
real harm in that operation per se. When an eviction happens on a
*clean* cache line, this is basically just an invalidate, which is also not
harmful.

There are harmful cases when buffers sharing a cache line are both "just invalidated"
and "cleaned" at different points in time.

> The flushed area start/end address must 
> be cache line aligned, hence the wrapper is correct and needed.

Why? (See below)

> >>> +}
> >>> +
> >>> +static void nvme_invalidate_dcache_range(void *start, unsigned long size)
> >>> +{
> >>> +       unsigned long s, e;
> >>> +       nvme_align_dcache_range(start, size, &s, &e);
> >>> +       invalidate_dcache_range(s, e);  
> > 
> > invalidate is tricky, we could use a wrapper similar to
> > invalidate_dcache_check() in bcmgenet.c. However I am not sure this is
> > very useful here, because most requests are already for a whole buffer
> > size (page or block).
> > 
> > Alignment of the buffer address will be checked by the implementation
> > of invalidate_dcache_range().  
> 
> Both invalidate_dcache_range() and flush_dcache_range() will perform no 
> operation if either start or end address is not cache-line aligned.

Where does this come from? I don't see this. ARMv8 goes directly to
assembly, where there is no check or bail out. It actually aligns the
values passed in:
        /* x2 <- minimal cache line size in cache system */
        sub     x3, x2, #1
        bic     x0, x0, x3
1:      dc      ivac, x0        /* invalidate data or unified cache */
(arch/arm/cpu/armv8/cache.S:__asm_invalidate_dcache_range)
I think it does this to simplify the algorithm to walk over the range.

On v7 there is indeed a check, but just for inval, not for flush.

Can you say what the reason for this v7 check was? I would say it's a
heads up to the programmer, to make sure we invalidate only a certain
buffer, but nothing else.

BUT: just aligning the address for the *cache operation* is missing
the point, as the instruction works on the whole cache line anyway. We
need to make sure that no other data (stack, heap, buffer) is sharing
the same cache line *and* is dirty,@the time when we invalidate.
Just rounding the addresses lets you merely pass the check_cache_range()
function, but will not solve the actual issue.

Take an example:
   nvmeq->cqes = (void *)memalign(4096, NVME_CQ_SIZE(depth))
     will result in this allocation (0x4000 is just some address):
 0x4000       0x4010       0x4020
    |  cqes[0]   |   cqes[1]  |

So struct nvme_completion seems to be exactly 16 bytes long, so both
entries will occupy the same cache line (cache line size >= 32 bytes).
With the common 64 byte size any data after that (at 0x4020) would be
affected as well.
So if you invalidate one entry, the other will also be invalidated. Any
address between 0x4000 and 0x403f will do that, whether this is the
start address of cqes[1], cqes[0] or any of those, rounded down.

We need to be aware of that. Looking again more closely at this
case, it seems fine here, as we only ever seem to *read* from those
(after initialisation), so they would never become dirty. Invalidating
neighboring entries should do no harm then.
As for other data after the buffer sharing the cache line: This is fine
*here* because there is a memalign(4096, ...) after this memalign above,
so the (4096 - 32) bytes after cqes[1] are not used. But honestly this
should either be documented or forced, by blowing up the size
of the allocation to a multiple of the cache line size.

So to summarise: The *whole* of the CQ entries are correctly aligned and
padded, but only because of the *next* memalign has an alignment
bigger than the cache line size.
A *single* CQ entry is not fine, as it's never starting *and* ending at
a cache line boundary. In this case this is fine, as all CQ entries are
only ever read, so invalidating those always clean cache lines does no
harm.

> If a 
> driver passes such unaligned address to either function, then the driver 
> must be fixed, because it will fail to perform cache flush/invalidate 
> ops silently and then fail silently, e.g. like this nvme driver does.

If a driver *uses* such unaligned address for a *pure invalidate*, it
should be carefully investigated if this is safe, or if anything else
is potentially sharing the same cache line. A blanket rounding will not
help with this, and will just paper over the issue.

Cheers,
Andre


> >>> +}
> >>> +
> >>>   static int nvme_wait_ready(struct nvme_dev *dev, bool enabled)
> >>>   {
> >>>          u32 bit = enabled ? NVME_CSTS_RDY : 0;
> >>> @@ -129,8 +150,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, u64 *prp2,
> >>>          }
> >>>          *prp2 = (ulong)dev->prp_pool;
> >>>
> >>> -       flush_dcache_range((ulong)dev->prp_pool, (ulong)dev->prp_pool +
> >>> -                          dev->prp_entry_num * sizeof(u64));
> >>> +       nvme_flush_dcache_range(dev->prp_pool, dev->prp_entry_num * sizeof(u64));  
> > 
> > As mentioned above, there should be no reason to check or round addresses for flush().  
> 
> Again, if the start/end address of the flush/invalidate_dcache_range 
> operation is unaligned, the operation is not performed, hence the 
> alignment is mandatory.
> 
> > It gets a bit more tricky if buffers overlap with buffers that are also
> > invalidated at some point, but that does not seem to be the case here.  
> 
> The buffers are correctly padded already, hence there is no problem with 
> overlap, and hence the rounding is exactly what has to be done.
> 
> >>>
> >>>          return 0;
> >>>   }
> >>> @@ -144,10 +164,8 @@ static __le16 nvme_get_cmd_id(void)
> >>>
> >>>   static u16 nvme_read_completion_status(struct nvme_queue *nvmeq, u16 index)
> >>>   {
> >>> -       u64 start = (ulong)&nvmeq->cqes[index];
> >>> -       u64 stop = start + sizeof(struct nvme_completion);
> >>> -
> >>> -       invalidate_dcache_range(start, stop);  
> > 
> > IIUC, this wants to invalidate a single CQ entry, which is 16 bytes
> > long. So this will also invalidate the neighboring entry, at least
> > (with 64 byte cache line size even three others).
> > Does the hardware require the queue entries to be consecutive?
> > If not, we could blow up struct nvme_completion to have the size of a
> > cache line.
> > If yes, this gets a bit more tricky.  
> 
> The CQ entry is correctly padded, so this concern does not apply.
> Note that this is also mentioned in the commit message.
> 
> > One generic solution would be to use "coherent" memory, which is
> > basically mapped as uncached. And then put the CQ array in there. I
> > think we already have some support in U-Boot for that?  
> 
> Yes, some drivers seems to be misusing coherent memory where it makes 
> zero sense to use it. Here it makes zero sense, since the CQ entries are 
> cacheline aligned and all that needs to be done is correctly apply the 
> cache operations, which is done by this patch.
> 
> >>> +       nvme_invalidate_dcache_range(&nvmeq->cqes[index],
> >>> +                                    sizeof(struct nvme_completion));
> >>>
> >>>          return le16_to_cpu(readw(&(nvmeq->cqes[index].status)));
> >>>   }
> >>> @@ -163,8 +181,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
> >>>          u16 tail = nvmeq->sq_tail;
> >>>
> >>>          memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> >>> -       flush_dcache_range((ulong)&nvmeq->sq_cmds[tail],
> >>> -                          (ulong)&nvmeq->sq_cmds[tail] + sizeof(*cmd));
> >>> +       nvme_flush_dcache_range(&nvmeq->sq_cmds[tail], sizeof(*cmd));
> >>>
> >>>          if (++tail == nvmeq->q_depth)
> >>>                  tail = 0;
> >>> @@ -338,8 +355,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
> >>>          nvmeq->cq_phase = 1;
> >>>          nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
> >>>          memset((void *)nvmeq->cqes, 0, NVME_CQ_SIZE(nvmeq->q_depth));
> >>> -       flush_dcache_range((ulong)nvmeq->cqes,
> >>> -                          (ulong)nvmeq->cqes + NVME_CQ_SIZE(nvmeq->q_depth));
> >>> +       nvme_flush_dcache_range(nvmeq->cqes, NVME_CQ_SIZE(nvmeq->q_depth));  
> > 
> > This will flush multiple CQ entries, same as above. If a neighboring
> > entry is waiting for the status bit to flip (as above), this might be
> > harmful, as it might be overwritten.
> > If we can't blow up a single CQ entry, uncached memory might be better?  
> 
> No, see above.
> 
> [...]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-03 10:42       ` Andre Przywara
@ 2021-02-03 13:08         ` Marek Vasut
  2021-02-04 10:26           ` Andre Przywara
  0 siblings, 1 reply; 22+ messages in thread
From: Marek Vasut @ 2021-02-03 13:08 UTC (permalink / raw)
  To: u-boot

On 2/3/21 11:42 AM, Andre Przywara wrote:

[...]

>>>>>    drivers/nvme/nvme.c | 50 +++++++++++++++++++++++++++++----------------
>>>>>    1 file changed, 32 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/drivers/nvme/nvme.c b/drivers/nvme/nvme.c
>>>>> index 5d6331ad34..758415a53b 100644
>>>>> --- a/drivers/nvme/nvme.c
>>>>> +++ b/drivers/nvme/nvme.c
>>>>> @@ -53,6 +53,27 @@ struct nvme_queue {
>>>>>           unsigned long cmdid_data[];
>>>>>    };
>>>>>
>>>>> +static void nvme_align_dcache_range(void *start, unsigned long size,
>>>>> +                                   unsigned long *s, unsigned long *e)
>>>>> +{
>>>>> +       *s = rounddown((uintptr_t)start, ARCH_DMA_MINALIGN);
>>>>> +       *e = roundup((uintptr_t)start + size, ARCH_DMA_MINALIGN);
>>>>> +}
>>>
>>> As mentioned in the other email, just rounding the value that we are
>>> going to pass to the cache maintenance operation does not make sense,
>>> so this function should go.
>>
>> You keep saying that this patch makes no sense, but I don't see any
>> explanation _why_ there is a problem.
> 
> My main point is that you merely align the values that you pass to the
> cache alignment function, but not the actual buffer addresses.

Yes, which is explained in the commit message, in this driver it is 
sufficient to around the start address down and end address up.

> Not sure if this is clear, but the VAs used in cache maintenance
> instructions do NOT need to be cache line aligned. The silicon
> implementation will take care of this:

No, this is incorrect. The flush/invalidate_dcache_range() functions 
only support cacheline aligned addresses, if the address is not aligned, 
then these functions do nothing. If arm64 has some special behavior, 
then that's irrelevant here, since this is a generic driver and must 
work on all architectures.

> "When a DC instruction requires an address argument this takes the form
> of a 64-bit register that holds the VA argument. No alignment
> restrictions apply for this address."
> (ARMv8 ARM DDI 0487F.c D4.4.8 A64 Cache maintenance instructions: The
> data cache maintenance instruction (DC), pg. D4-2505)
> 
> "When the data is stated to be an MVA, it does not have to be cache line aligned."
> (ARMv7 ARM DDI 0406C.d B4.2.1 Cache and branch predictor maintenance
> operations, VMSA; section "MVA", pg. B4-1736)
> 
> I think I once chased this back to even ARMv6.
> 
> What requires alignment are the buffers used with cache maintenance, to
> avoid a cache maintenance operation inadvertently affecting other data.
> 
> One dangerous operation is a "pure" invalidate on a *dirty* cache line,
> as this may potentially lose data.
> The other harm I see is when cleaning a line, when the hardware (some
> DMA) has updated the backing memory meanwhile (for instance to update a
> status bit or to fill a buffer with data from the hardware.
> 
> Avoiding those two situations requires careful alignment and padding of
> the *buffers*, blanketly rounding any addresses handed to the cache
> maintenance operation will not help.

If you read the patch, you will notice that these conditions either 
happened before too, because the cache operations were in place already 
OR they do not apply at all, because the buffers in question are already 
well aligned.

>>>>> +
>>>>> +static void nvme_flush_dcache_range(void *start, unsigned long size)
>>>>> +{
>>>>> +       unsigned long s, e;
>>>>> +       nvme_align_dcache_range(start, size, &s, &e);
>>>>> +       flush_dcache_range(s, e);
>>>
>>> There is no good reason for alignment restrictions when it comes to
>>> clean (& invalidate), so there is no need for this wrapper.
>>
>> Is that on ARM64-specific or is that applicable in general ? The driver
>> is expected to work on any CPU.
> 
> Cache clean (actually: cache clean&invalidate) is what happens on evictions
> all of the time, at the cache controller's discretion. So there is no
> real harm in that operation per se. When an eviction happens on a
> *clean* cache line, this is basically just an invalidate, which is also not
> harmful.
> 
> There are harmful cases when buffers sharing a cache line are both "just invalidated"
> and "cleaned" at different points in time.

Is that on ARM64-specific or is that applicable in general ? (the above 
does not answer that question)

>> The flushed area start/end address must
>> be cache line aligned, hence the wrapper is correct and needed.
> 
> Why? (See below)
> 
>>>>> +}
>>>>> +
>>>>> +static void nvme_invalidate_dcache_range(void *start, unsigned long size)
>>>>> +{
>>>>> +       unsigned long s, e;
>>>>> +       nvme_align_dcache_range(start, size, &s, &e);
>>>>> +       invalidate_dcache_range(s, e);
>>>
>>> invalidate is tricky, we could use a wrapper similar to
>>> invalidate_dcache_check() in bcmgenet.c. However I am not sure this is
>>> very useful here, because most requests are already for a whole buffer
>>> size (page or block).
>>>
>>> Alignment of the buffer address will be checked by the implementation
>>> of invalidate_dcache_range().
>>
>> Both invalidate_dcache_range() and flush_dcache_range() will perform no
>> operation if either start or end address is not cache-line aligned.
> 
> Where does this come from? I don't see this. ARMv8 goes directly to
> assembly, where there is no check or bail out.

So ARMv8 has custom behavior, but that is not how these functions are 
supposed to work, look at check_cache_range() usage elsewhere. If the 
documentation is lacking, see 397b5697ad ("arm: Move check_cache_range() 
into a common place") and co.

It actually aligns the
> values passed in:
>          /* x2 <- minimal cache line size in cache system */
>          sub     x3, x2, #1
>          bic     x0, x0, x3
> 1:      dc      ivac, x0        /* invalidate data or unified cache */
> (arch/arm/cpu/armv8/cache.S:__asm_invalidate_dcache_range)
> I think it does this to simplify the algorithm to walk over the range.
> 
> On v7 there is indeed a check, but just for inval, not for flush.

Again, arch specific behavior on which you cannot depend in generic driver.

> Can you say what the reason for this v7 check was? I would say it's a
> heads up to the programmer, to make sure we invalidate only a certain
> buffer, but nothing else.

The reason for those checks is because not all architectures can 
flush/invalidate at sub-cache-line granularity, which results in obscure 
problems.

> BUT: just aligning the address for the *cache operation* is missing
> the point, as the instruction works on the whole cache line anyway. We
> need to make sure that no other data (stack, heap, buffer) is sharing
> the same cache line *and* is dirty, at the time when we invalidate.
> Just rounding the addresses lets you merely pass the check_cache_range()
> function, but will not solve the actual issue.
> 
> Take an example:
>     nvmeq->cqes = (void *)memalign(4096, NVME_CQ_SIZE(depth))
>       will result in this allocation (0x4000 is just some address):
>   0x4000       0x4010       0x4020
>      |  cqes[0]   |   cqes[1]  |
> 
> So struct nvme_completion seems to be exactly 16 bytes long, so both
> entries will occupy the same cache line (cache line size >= 32 bytes).
> With the common 64 byte size any data after that (at 0x4020) would be
> affected as well.
> So if you invalidate one entry, the other will also be invalidated. Any
> address between 0x4000 and 0x403f will do that, whether this is the
> start address of cqes[1], cqes[0] or any of those, rounded down.

And is either of the neighboring cges[] used, so does this cause any 
issues ? I believe the answer is no.

> We need to be aware of that. Looking again more closely at this
> case, it seems fine here, as we only ever seem to *read* from those
> (after initialisation), so they would never become dirty. Invalidating
> neighboring entries should do no harm then.
> As for other data after the buffer sharing the cache line: This is fine
> *here* because there is a memalign(4096, ...) after this memalign above,
> so the (4096 - 32) bytes after cqes[1] are not used. But honestly this
> should either be documented or forced, by blowing up the size
> of the allocation to a multiple of the cache line size.
> 
> So to summarise: The *whole* of the CQ entries are correctly aligned and
> padded, but only because of the *next* memalign has an alignment
> bigger than the cache line size.
> A *single* CQ entry is not fine, as it's never starting *and* ending at
> a cache line boundary. In this case this is fine, as all CQ entries are
> only ever read, so invalidating those always clean cache lines does no
> harm.

Good, I am happy to hear we finally reached the same conclusion.

So what do you propose is adjusted in this bugfix ?

>> If a
>> driver passes such unaligned address to either function, then the driver
>> must be fixed, because it will fail to perform cache flush/invalidate
>> ops silently and then fail silently, e.g. like this nvme driver does.
> 
> If a driver *uses* such unaligned address for a *pure invalidate*, it
> should be carefully investigated if this is safe, or if anything else
> is potentially sharing the same cache line. A blanket rounding will not
> help with this, and will just paper over the issue.

See check_cache_range(), the sole reason for its existence was to notify 
driver developers about various cache alignment problems. I believe it 
should be added on armv8 too.

[...]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-03 13:08         ` Marek Vasut
@ 2021-02-04 10:26           ` Andre Przywara
  2021-02-04 16:57             ` Tom Rini
  0 siblings, 1 reply; 22+ messages in thread
From: Andre Przywara @ 2021-02-04 10:26 UTC (permalink / raw)
  To: u-boot

On Wed, 3 Feb 2021 14:08:39 +0100
Marek Vasut <marek.vasut@gmail.com> wrote:

Hi Marek,

I think our opinions actually don't differ that much, but we might be
either misunderstanding ourselves or talk about different things.
See below.

> On 2/3/21 11:42 AM, Andre Przywara wrote:
> 
> [...]
> 
> >>>>>    drivers/nvme/nvme.c | 50 +++++++++++++++++++++++++++++----------------
> >>>>>    1 file changed, 32 insertions(+), 18 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/nvme/nvme.c b/drivers/nvme/nvme.c
> >>>>> index 5d6331ad34..758415a53b 100644
> >>>>> --- a/drivers/nvme/nvme.c
> >>>>> +++ b/drivers/nvme/nvme.c
> >>>>> @@ -53,6 +53,27 @@ struct nvme_queue {
> >>>>>           unsigned long cmdid_data[];
> >>>>>    };
> >>>>>
> >>>>> +static void nvme_align_dcache_range(void *start, unsigned long size,
> >>>>> +                                   unsigned long *s, unsigned long *e)
> >>>>> +{
> >>>>> +       *s = rounddown((uintptr_t)start, ARCH_DMA_MINALIGN);
> >>>>> +       *e = roundup((uintptr_t)start + size, ARCH_DMA_MINALIGN);
> >>>>> +}  
> >>>
> >>> As mentioned in the other email, just rounding the value that we are
> >>> going to pass to the cache maintenance operation does not make sense,
> >>> so this function should go.  
> >>
> >> You keep saying that this patch makes no sense, but I don't see any
> >> explanation _why_ there is a problem.  
> > 
> > My main point is that you merely align the values that you pass to the
> > cache alignment function, but not the actual buffer addresses.  
> 
> Yes, which is explained in the commit message, in this driver it is 
> sufficient to around the start address down and end address up.

I haven't checked the whole driver with scrutiny, but this is totally
non-obvious, and also fragile. For instance single cqes entries are not
padded, and the padding of the whole array just works by chance, due to
the following memalign and an implementation detail of memalign_simple.

So this deserves a comment in the code, for the contentious calls.
 
> > Not sure if this is clear, but the VAs used in cache maintenance
> > instructions do NOT need to be cache line aligned. The silicon
> > implementation will take care of this:  
> 
> No, this is incorrect.
> The flush/invalidate_dcache_range() functions 

We are talking about different things here. The CPU instructions do not
require alignment, so it's a pure U-Boot property.
I see that's it's not the worst idea to demand alignment, to make
driver author's aware of the side effects of cache alignment.
But the ARM CPU instructions have the rounding built-in.

> only support cacheline aligned addresses, if the address is not aligned, 
> then these functions do nothing. If arm64 has some special behavior, 
> then that's irrelevant here, since this is a generic driver and must 
> work on all architectures.

But your bugfix here was particularly for arm64, so what does it do
then? arm64 would work with or without the rounding, in fact the
implementation in cache.S rounds before the CPU instructions (which
would round again themselves).

> > "When a DC instruction requires an address argument this takes the
> > form of a 64-bit register that holds the VA argument. No alignment
> > restrictions apply for this address."
> > (ARMv8 ARM DDI 0487F.c D4.4.8 A64 Cache maintenance instructions:
> > The data cache maintenance instruction (DC), pg. D4-2505)
> > 
> > "When the data is stated to be an MVA, it does not have to be cache
> > line aligned." (ARMv7 ARM DDI 0406C.d B4.2.1 Cache and branch
> > predictor maintenance operations, VMSA; section "MVA", pg. B4-1736)
> > 
> > I think I once chased this back to even ARMv6.
> > 
> > What requires alignment are the buffers used with cache
> > maintenance, to avoid a cache maintenance operation inadvertently
> > affecting other data.
> > 
> > One dangerous operation is a "pure" invalidate on a *dirty* cache
> > line, as this may potentially lose data.
> > The other harm I see is when cleaning a line, when the hardware
> > (some DMA) has updated the backing memory meanwhile (for instance
> > to update a status bit or to fill a buffer with data from the
> > hardware.
> > 
> > Avoiding those two situations requires careful alignment and
> > padding of the *buffers*, blanketly rounding any addresses handed
> > to the cache maintenance operation will not help.  
> 
> If you read the patch, you will notice that these conditions either 
> happened before too, because the cache operations were in place
> already OR they do not apply at all, because the buffers in question
> are already well aligned.

Great! But why do you need to round then? If the buffers are already all
aligned, we just hand their addresses to *_cache_range(), and things
would be fine.
Just looking at the invalidates, we just need to round in
nvme_read_completion_status, because a single entry is *not* aligned.
And in *this particular case* this is fine, because they are only read
by the CPU and the cache line is shared only with other CQ entries (see
above).
Actually I would suggest in this case:
- blow up the cqes allocation size to cover whole cache lines:
  memalign(4096, ALIGN(NVME_CQ_SIZE(depth), ARCH_DMA_MINALIGN)
- have a wrapper that always invalidates the whole region, to not give
  the illusion that invalidating just a single entry is a thing:
  invalidate_all_cqes()
  {
	ulong start = (ulong)&nvmeq->cqes[0];
	ulong end = start + ALIGN(NVME_CQ_SIZE(NVME_Q_DEPTH),
				  ARCH_DMA_MINALIGN);

	invalidate_cache_range(start, stop);
  }

All other invalidates don't need rounding, and my argument is that this
blanket rounding is jeopardizing the very reason you ask for the
alignment in the first place.

> >>>>> +
> >>>>> +static void nvme_flush_dcache_range(void *start, unsigned long size)
> >>>>> +{
> >>>>> +       unsigned long s, e;
> >>>>> +       nvme_align_dcache_range(start, size, &s, &e);
> >>>>> +       flush_dcache_range(s, e);  
> >>>
> >>> There is no good reason for alignment restrictions when it comes to
> >>> clean (& invalidate), so there is no need for this wrapper.  
> >>
> >> Is that on ARM64-specific or is that applicable in general ? The driver
> >> is expected to work on any CPU.  
> > 
> > Cache clean (actually: cache clean&invalidate) is what happens on evictions
> > all of the time, at the cache controller's discretion. So there is no
> > real harm in that operation per se. When an eviction happens on a
> > *clean* cache line, this is basically just an invalidate, which is also not
> > harmful.
> > 
> > There are harmful cases when buffers sharing a cache line are both "just invalidated"
> > and "cleaned" at different points in time.  
> 
> Is that on ARM64-specific or is that applicable in general ? (the above 
> does not answer that question)

I would say that's a property of *every* write-back cache
implementation:
https://en.wikipedia.org/wiki/Cache_(computing)#/media/File:Write-back_with_write-allocation.svg

> 
> >> The flushed area start/end address must
> >> be cache line aligned, hence the wrapper is correct and needed.  
> > 
> > Why? (See below)
> >   
> >>>>> +}
> >>>>> +
> >>>>> +static void nvme_invalidate_dcache_range(void *start, unsigned long size)
> >>>>> +{
> >>>>> +       unsigned long s, e;
> >>>>> +       nvme_align_dcache_range(start, size, &s, &e);
> >>>>> +       invalidate_dcache_range(s, e);  
> >>>
> >>> invalidate is tricky, we could use a wrapper similar to
> >>> invalidate_dcache_check() in bcmgenet.c. However I am not sure this is
> >>> very useful here, because most requests are already for a whole buffer
> >>> size (page or block).
> >>>
> >>> Alignment of the buffer address will be checked by the implementation
> >>> of invalidate_dcache_range().  
> >>
> >> Both invalidate_dcache_range() and flush_dcache_range() will perform no
> >> operation if either start or end address is not cache-line aligned.  
> > 
> > Where does this come from? I don't see this. ARMv8 goes directly to
> > assembly, where there is no check or bail out.  
> 
> So ARMv8 has custom behavior, but that is not how these functions are 
> supposed to work, look at check_cache_range() usage elsewhere. If the 
> documentation is lacking, see 397b5697ad ("arm: Move check_cache_range() 
> into a common place") and co.

Well, check_cache_range() is a separate discussion I would say. I would
argue that it's less useful for flush (because clean&invalidates
happen everytime). Also, in reality, it just pushes drivers to just
round the arguments, to pass this check.

And, I wonder if this: "if (!check_cache_range(start, stop)) return;"
is the right answer to this idea then. I am not sure that skipping the
cache maintenance is really better than allowing an unaligned address.

> 
> It actually aligns the
> > values passed in:
> >          /* x2 <- minimal cache line size in cache system */
> >          sub     x3, x2, #1
> >          bic     x0, x0, x3
> > 1:      dc      ivac, x0        /* invalidate data or unified cache */
> > (arch/arm/cpu/armv8/cache.S:__asm_invalidate_dcache_range)
> > I think it does this to simplify the algorithm to walk over the range.
> > 
> > On v7 there is indeed a check, but just for inval, not for flush.  
> 
> Again, arch specific behavior on which you cannot depend in generic driver.

Your commit message says: " ... which permits the nvme to work on
arm64."

> > Can you say what the reason for this v7 check was? I would say it's a
> > heads up to the programmer, to make sure we invalidate only a certain
> > buffer, but nothing else.  
> 
> The reason for those checks is because not all architectures can 
> flush/invalidate at sub-cache-line granularity, which results in obscure 
> problems.

When you say "can flush/invalidate at sub-cache-line granularity", do
you mean "will fault"? Because by design I think you can only ever
flush/invalidate whole cache lines, everywhere. The question is whether
all architectures permit unaligned addresses (ARM does). This could be
handled by the implementation (I think it actually is).

> 
> > BUT: just aligning the address for the *cache operation* is missing
> > the point, as the instruction works on the whole cache line anyway. We
> > need to make sure that no other data (stack, heap, buffer) is sharing
> > the same cache line *and* is dirty, at the time when we invalidate.
> > Just rounding the addresses lets you merely pass the check_cache_range()
> > function, but will not solve the actual issue.
> > 
> > Take an example:
> >     nvmeq->cqes = (void *)memalign(4096, NVME_CQ_SIZE(depth))
> >       will result in this allocation (0x4000 is just some address):
> >   0x4000       0x4010       0x4020
> >      |  cqes[0]   |   cqes[1]  |
> > 
> > So struct nvme_completion seems to be exactly 16 bytes long, so both
> > entries will occupy the same cache line (cache line size >= 32 bytes).
> > With the common 64 byte size any data after that (at 0x4020) would be
> > affected as well.
> > So if you invalidate one entry, the other will also be invalidated. Any
> > address between 0x4000 and 0x403f will do that, whether this is the
> > start address of cqes[1], cqes[0] or any of those, rounded down.  
> 
> And is either of the neighboring cges[] used, so does this cause any 
> issues ? I believe the answer is no.

I agree, and in *this* case we checked carefully, and can indeed round.
But this should rather be done like I sketched above, by explicitly
invalidating the whole array.

> > We need to be aware of that. Looking again more closely at this
> > case, it seems fine here, as we only ever seem to *read* from those
> > (after initialisation), so they would never become dirty. Invalidating
> > neighboring entries should do no harm then.
> > As for other data after the buffer sharing the cache line: This is fine
> > *here* because there is a memalign(4096, ...) after this memalign above,
> > so the (4096 - 32) bytes after cqes[1] are not used. But honestly this
> > should either be documented or forced, by blowing up the size
> > of the allocation to a multiple of the cache line size.
> > 
> > So to summarise: The *whole* of the CQ entries are correctly aligned and
> > padded, but only because of the *next* memalign has an alignment
> > bigger than the cache line size.
> > A *single* CQ entry is not fine, as it's never starting *and* ending at
> > a cache line boundary. In this case this is fine, as all CQ entries are
> > only ever read, so invalidating those always clean cache lines does no
> > harm.  
> 
> Good, I am happy to hear we finally reached the same conclusion.
> 
> So what do you propose is adjusted in this bugfix ?

That we just do this in nvme_read_completion_status(), ideally as in the
snippet above. The other invalidates don't need rounding, and doing the
blanket rounding papers over any potential bugs, namely unaligned
buffer addresses. They should be already page aligned, AFAICT.

Need to have a closer look at the flush's.
 
> >> If a
> >> driver passes such unaligned address to either function, then the driver
> >> must be fixed, because it will fail to perform cache flush/invalidate
> >> ops silently and then fail silently, e.g. like this nvme driver does.  
> > 
> > If a driver *uses* such unaligned address for a *pure invalidate*, it
> > should be carefully investigated if this is safe, or if anything else
> > is potentially sharing the same cache line. A blanket rounding will not
> > help with this, and will just paper over the issue.  
> 
> See check_cache_range(), the sole reason for its existence was to notify 
> driver developers about various cache alignment problems. I believe it 
> should be added on armv8 too.

That could be a discussion to have, yes, at least for invalidate. But
from what I saw in drivers, they just work around this check, by forcing
alignment on the *arguments*, without carefully checking that the
actual buffers are aligned and padded.

Cheers,
Andre

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-04 10:26           ` Andre Przywara
@ 2021-02-04 16:57             ` Tom Rini
  2021-02-07 18:20               ` Marek Vasut
  0 siblings, 1 reply; 22+ messages in thread
From: Tom Rini @ 2021-02-04 16:57 UTC (permalink / raw)
  To: u-boot

On Thu, Feb 04, 2021 at 10:26:36AM +0000, Andre Przywara wrote:
> On Wed, 3 Feb 2021 14:08:39 +0100
> Marek Vasut <marek.vasut@gmail.com> wrote:
> 
> Hi Marek,
> 
> I think our opinions actually don't differ that much, but we might be
> either misunderstanding ourselves or talk about different things.
> See below.
[snip]
> > >>>>> +static void nvme_flush_dcache_range(void *start, unsigned long size)
> > >>>>> +{
> > >>>>> +       unsigned long s, e;
> > >>>>> +       nvme_align_dcache_range(start, size, &s, &e);
> > >>>>> +       flush_dcache_range(s, e);  
> > >>>
> > >>> There is no good reason for alignment restrictions when it comes to
> > >>> clean (& invalidate), so there is no need for this wrapper.  
> > >>
> > >> Is that on ARM64-specific or is that applicable in general ? The driver
> > >> is expected to work on any CPU.  
> > > 
> > > Cache clean (actually: cache clean&invalidate) is what happens on evictions
> > > all of the time, at the cache controller's discretion. So there is no
> > > real harm in that operation per se. When an eviction happens on a
> > > *clean* cache line, this is basically just an invalidate, which is also not
> > > harmful.
> > > 
> > > There are harmful cases when buffers sharing a cache line are both "just invalidated"
> > > and "cleaned" at different points in time.  
> > 
> > Is that on ARM64-specific or is that applicable in general ? (the above 
> > does not answer that question)
> 
> I would say that's a property of *every* write-back cache
> implementation:
> https://en.wikipedia.org/wiki/Cache_(computing)#/media/File:Write-back_with_write-allocation.svg

I've been reading and digesting the thread as it goes, and the only
thing I do want to chime in on here right now is that yes, U-Boot
does and will continue to support every CPU that someone wants to run it
on, and one of the takeaways I see from this thread is we need some
better documented abstractions around cache, as it's very tricky to get
right all the time.

-- 
Tom
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 659 bytes
Desc: not available
URL: <https://lists.denx.de/pipermail/u-boot/attachments/20210204/7366d237/attachment.sig>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-04 16:57             ` Tom Rini
@ 2021-02-07 18:20               ` Marek Vasut
  2021-02-07 19:13                 ` Tom Rini
  0 siblings, 1 reply; 22+ messages in thread
From: Marek Vasut @ 2021-02-07 18:20 UTC (permalink / raw)
  To: u-boot

On 2/4/21 5:57 PM, Tom Rini wrote:
[...]

>>>>>>>> +static void nvme_flush_dcache_range(void *start, unsigned long size)
>>>>>>>> +{
>>>>>>>> +       unsigned long s, e;
>>>>>>>> +       nvme_align_dcache_range(start, size, &s, &e);
>>>>>>>> +       flush_dcache_range(s, e);
>>>>>>
>>>>>> There is no good reason for alignment restrictions when it comes to
>>>>>> clean (& invalidate), so there is no need for this wrapper.
>>>>>
>>>>> Is that on ARM64-specific or is that applicable in general ? The driver
>>>>> is expected to work on any CPU.
>>>>
>>>> Cache clean (actually: cache clean&invalidate) is what happens on evictions
>>>> all of the time, at the cache controller's discretion. So there is no
>>>> real harm in that operation per se. When an eviction happens on a
>>>> *clean* cache line, this is basically just an invalidate, which is also not
>>>> harmful.
>>>>
>>>> There are harmful cases when buffers sharing a cache line are both "just invalidated"
>>>> and "cleaned" at different points in time.
>>>
>>> Is that on ARM64-specific or is that applicable in general ? (the above
>>> does not answer that question)
>>
>> I would say that's a property of *every* write-back cache
>> implementation:
>> https://en.wikipedia.org/wiki/Cache_(computing)#/media/File:Write-back_with_write-allocation.svg
> 
> I've been reading and digesting the thread as it goes, and the only
> thing I do want to chime in on here right now is that yes, U-Boot
> does and will continue to support every CPU that someone wants to run it
> on, and one of the takeaways I see from this thread is we need some
> better documented abstractions around cache, as it's very tricky to get
> right all the time.

Documenting the u-boot cache function behavior precisely is fine by me, 
but that is somewhat separate topic from this bugfix.

So I will ask a simple question -- is there anything blocking this 
bugfix from being applied ?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-07 18:20               ` Marek Vasut
@ 2021-02-07 19:13                 ` Tom Rini
  2021-02-08 13:32                   ` Andre Przywara
  0 siblings, 1 reply; 22+ messages in thread
From: Tom Rini @ 2021-02-07 19:13 UTC (permalink / raw)
  To: u-boot

On Sun, Feb 07, 2021 at 07:20:14PM +0100, Marek Vasut wrote:
> On 2/4/21 5:57 PM, Tom Rini wrote:
> [...]
> 
> > > > > > > > > +static void nvme_flush_dcache_range(void *start, unsigned long size)
> > > > > > > > > +{
> > > > > > > > > +       unsigned long s, e;
> > > > > > > > > +       nvme_align_dcache_range(start, size, &s, &e);
> > > > > > > > > +       flush_dcache_range(s, e);
> > > > > > > 
> > > > > > > There is no good reason for alignment restrictions when it comes to
> > > > > > > clean (& invalidate), so there is no need for this wrapper.
> > > > > > 
> > > > > > Is that on ARM64-specific or is that applicable in general ? The driver
> > > > > > is expected to work on any CPU.
> > > > > 
> > > > > Cache clean (actually: cache clean&invalidate) is what happens on evictions
> > > > > all of the time, at the cache controller's discretion. So there is no
> > > > > real harm in that operation per se. When an eviction happens on a
> > > > > *clean* cache line, this is basically just an invalidate, which is also not
> > > > > harmful.
> > > > > 
> > > > > There are harmful cases when buffers sharing a cache line are both "just invalidated"
> > > > > and "cleaned" at different points in time.
> > > > 
> > > > Is that on ARM64-specific or is that applicable in general ? (the above
> > > > does not answer that question)
> > > 
> > > I would say that's a property of *every* write-back cache
> > > implementation:
> > > https://en.wikipedia.org/wiki/Cache_(computing)#/media/File:Write-back_with_write-allocation.svg
> > 
> > I've been reading and digesting the thread as it goes, and the only
> > thing I do want to chime in on here right now is that yes, U-Boot
> > does and will continue to support every CPU that someone wants to run it
> > on, and one of the takeaways I see from this thread is we need some
> > better documented abstractions around cache, as it's very tricky to get
> > right all the time.
> 
> Documenting the u-boot cache function behavior precisely is fine by me, but
> that is somewhat separate topic from this bugfix.
> 
> So I will ask a simple question -- is there anything blocking this bugfix
> from being applied ?

While I can fix the typo, I'm waiting for an Ack/Reviewed-by from Bin
(as he's spent the most time on this driver of late) and I'd really like
one from Andre, or at least him agreeing this patch is a step in the
right direction.

-- 
Tom
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 659 bytes
Desc: not available
URL: <https://lists.denx.de/pipermail/u-boot/attachments/20210207/03e386c4/attachment.sig>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-07 19:13                 ` Tom Rini
@ 2021-02-08 13:32                   ` Andre Przywara
  2021-02-08 15:11                     ` Bin Meng
  2021-02-08 15:49                     ` Marek Vasut
  0 siblings, 2 replies; 22+ messages in thread
From: Andre Przywara @ 2021-02-08 13:32 UTC (permalink / raw)
  To: u-boot

On Sun, 7 Feb 2021 14:13:37 -0500
Tom Rini <trini@konsulko.com> wrote:

Hi Tom, Marek,

> On Sun, Feb 07, 2021 at 07:20:14PM +0100, Marek Vasut wrote:
> > On 2/4/21 5:57 PM, Tom Rini wrote:
> > [...]
> >   
> > > > > > > > > > +static void nvme_flush_dcache_range(void *start, unsigned long size)
> > > > > > > > > > +{
> > > > > > > > > > +       unsigned long s, e;
> > > > > > > > > > +       nvme_align_dcache_range(start, size, &s, &e);
> > > > > > > > > > +       flush_dcache_range(s, e);  
> > > > > > > > 
> > > > > > > > There is no good reason for alignment restrictions when it comes to
> > > > > > > > clean (& invalidate), so there is no need for this wrapper.  
> > > > > > > 
> > > > > > > Is that on ARM64-specific or is that applicable in general ? The driver
> > > > > > > is expected to work on any CPU.  
> > > > > > 
> > > > > > Cache clean (actually: cache clean&invalidate) is what happens on evictions
> > > > > > all of the time, at the cache controller's discretion. So there is no
> > > > > > real harm in that operation per se. When an eviction happens on a
> > > > > > *clean* cache line, this is basically just an invalidate, which is also not
> > > > > > harmful.
> > > > > > 
> > > > > > There are harmful cases when buffers sharing a cache line are both "just invalidated"
> > > > > > and "cleaned" at different points in time.  
> > > > > 
> > > > > Is that on ARM64-specific or is that applicable in general ? (the above
> > > > > does not answer that question)  
> > > > 
> > > > I would say that's a property of *every* write-back cache
> > > > implementation:
> > > > https://en.wikipedia.org/wiki/Cache_(computing)#/media/File:Write-back_with_write-allocation.svg  
> > > 
> > > I've been reading and digesting the thread as it goes, and the only
> > > thing I do want to chime in on here right now is that yes, U-Boot
> > > does and will continue to support every CPU that someone wants to run it
> > > on, and one of the takeaways I see from this thread is we need some
> > > better documented abstractions around cache, as it's very tricky to get
> > > right all the time.  
> > 
> > Documenting the u-boot cache function behavior precisely is fine by me, but
> > that is somewhat separate topic from this bugfix.
> > 
> > So I will ask a simple question -- is there anything blocking this bugfix
> > from being applied ?  
> 
> While I can fix the typo, I'm waiting for an Ack/Reviewed-by from Bin
> (as he's spent the most time on this driver of late) and I'd really like
> one from Andre, or at least him agreeing this patch is a step in the
> right direction.

As I said: I don't see how this patch changes anything on arm64, which
the commit messages claims to be the reason for this post.
If someone please can confirm, but invalidate_dcache_range() always
works on arm64, in fact does the very rounding already that this patch
introduces here. So what is the actual fix here?

Plus, I consider this blanket rounding more harmful than helpful, since
it actually just silences the check_cache_range() check.

If we include armv7 here, which in fact would ignore(!) an unaligned
invalidate, this is my analysis (just for invalidate):
nvme_read_completion_status():		NEEDS ALIGNMENT
	size smaller than cache line, cqes[1] base address unaligned.
	fix needed, rounding *could* be a temporary fix with comments
	as of why this is legitimate in this case.
	Better alternative sketched in a previous email.
nvme_identify():			OK
	struct nvme_id_ctrl is 4K, if I counted correctly, so is fine.
	buffer comes the callers, the in-file users pass an aligned
	address, the only external user I see is in nvme_show.c, which
	also explicitly aligns the buffer.
nvme_blk_rw():				OK
	total_len seems to be a multiple of block size, so is hopefully
	already cache line aligned.
	buffer comes from the upper layers, I guess it's page
	aligned already (at least 512b sector aligned)?

I turned my sketch for the cqes[] fix into a proper patch and will send
this in a minute.

Cheers,
Andre

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-08 13:32                   ` Andre Przywara
@ 2021-02-08 15:11                     ` Bin Meng
  2021-02-08 15:51                       ` Marek Vasut
  2021-02-08 15:49                     ` Marek Vasut
  1 sibling, 1 reply; 22+ messages in thread
From: Bin Meng @ 2021-02-08 15:11 UTC (permalink / raw)
  To: u-boot

Hi Andre,

On Mon, Feb 8, 2021 at 9:33 PM Andre Przywara <andre.przywara@arm.com> wrote:
>
> On Sun, 7 Feb 2021 14:13:37 -0500
> Tom Rini <trini@konsulko.com> wrote:
>
> Hi Tom, Marek,
>
> > On Sun, Feb 07, 2021 at 07:20:14PM +0100, Marek Vasut wrote:
> > > On 2/4/21 5:57 PM, Tom Rini wrote:
> > > [...]
> > >
> > > > > > > > > > > +static void nvme_flush_dcache_range(void *start, unsigned long size)
> > > > > > > > > > > +{
> > > > > > > > > > > +       unsigned long s, e;
> > > > > > > > > > > +       nvme_align_dcache_range(start, size, &s, &e);
> > > > > > > > > > > +       flush_dcache_range(s, e);
> > > > > > > > >
> > > > > > > > > There is no good reason for alignment restrictions when it comes to
> > > > > > > > > clean (& invalidate), so there is no need for this wrapper.
> > > > > > > >
> > > > > > > > Is that on ARM64-specific or is that applicable in general ? The driver
> > > > > > > > is expected to work on any CPU.
> > > > > > >
> > > > > > > Cache clean (actually: cache clean&invalidate) is what happens on evictions
> > > > > > > all of the time, at the cache controller's discretion. So there is no
> > > > > > > real harm in that operation per se. When an eviction happens on a
> > > > > > > *clean* cache line, this is basically just an invalidate, which is also not
> > > > > > > harmful.
> > > > > > >
> > > > > > > There are harmful cases when buffers sharing a cache line are both "just invalidated"
> > > > > > > and "cleaned" at different points in time.
> > > > > >
> > > > > > Is that on ARM64-specific or is that applicable in general ? (the above
> > > > > > does not answer that question)
> > > > >
> > > > > I would say that's a property of *every* write-back cache
> > > > > implementation:
> > > > > https://en.wikipedia.org/wiki/Cache_(computing)#/media/File:Write-back_with_write-allocation.svg
> > > >
> > > > I've been reading and digesting the thread as it goes, and the only
> > > > thing I do want to chime in on here right now is that yes, U-Boot
> > > > does and will continue to support every CPU that someone wants to run it
> > > > on, and one of the takeaways I see from this thread is we need some
> > > > better documented abstractions around cache, as it's very tricky to get
> > > > right all the time.
> > >
> > > Documenting the u-boot cache function behavior precisely is fine by me, but
> > > that is somewhat separate topic from this bugfix.
> > >
> > > So I will ask a simple question -- is there anything blocking this bugfix
> > > from being applied ?
> >
> > While I can fix the typo, I'm waiting for an Ack/Reviewed-by from Bin
> > (as he's spent the most time on this driver of late) and I'd really like
> > one from Andre, or at least him agreeing this patch is a step in the
> > right direction.
>
> As I said: I don't see how this patch changes anything on arm64, which
> the commit messages claims to be the reason for this post.
> If someone please can confirm, but invalidate_dcache_range() always
> works on arm64, in fact does the very rounding already that this patch
> introduces here. So what is the actual fix here?
>
> Plus, I consider this blanket rounding more harmful than helpful, since
> it actually just silences the check_cache_range() check.
>
> If we include armv7 here, which in fact would ignore(!) an unaligned
> invalidate, this is my analysis (just for invalidate):
> nvme_read_completion_status():          NEEDS ALIGNMENT
>         size smaller than cache line, cqes[1] base address unaligned.
>         fix needed, rounding *could* be a temporary fix with comments
>         as of why this is legitimate in this case.
>         Better alternative sketched in a previous email.

This is exactly what I mentioned before [1]. The only problematic
routine is the nvme_read_completion_status() but I didn't have time to
experiment with a fix.

> nvme_identify():                        OK
>         struct nvme_id_ctrl is 4K, if I counted correctly, so is fine.
>         buffer comes the callers, the in-file users pass an aligned
>         address, the only external user I see is in nvme_show.c, which
>         also explicitly aligns the buffer.
> nvme_blk_rw():                          OK
>         total_len seems to be a multiple of block size, so is hopefully
>         already cache line aligned.
>         buffer comes from the upper layers, I guess it's page
>         aligned already (at least 512b sector aligned)?
>
> I turned my sketch for the cqes[] fix into a proper patch and will send
> this in a minute.

Thanks for looking into this.

[1] https://lists.denx.de/pipermail/u-boot/2021-February/439580.html

Regards,
Bin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-08 13:32                   ` Andre Przywara
  2021-02-08 15:11                     ` Bin Meng
@ 2021-02-08 15:49                     ` Marek Vasut
  2021-02-08 16:30                       ` Andre Przywara
  1 sibling, 1 reply; 22+ messages in thread
From: Marek Vasut @ 2021-02-08 15:49 UTC (permalink / raw)
  To: u-boot

On 2/8/21 2:32 PM, Andre Przywara wrote:
[...]
>>>>>>>>>>> +static void nvme_flush_dcache_range(void *start, unsigned long size)
>>>>>>>>>>> +{
>>>>>>>>>>> +       unsigned long s, e;
>>>>>>>>>>> +       nvme_align_dcache_range(start, size, &s, &e);
>>>>>>>>>>> +       flush_dcache_range(s, e);
>>>>>>>>>
>>>>>>>>> There is no good reason for alignment restrictions when it comes to
>>>>>>>>> clean (& invalidate), so there is no need for this wrapper.
>>>>>>>>
>>>>>>>> Is that on ARM64-specific or is that applicable in general ? The driver
>>>>>>>> is expected to work on any CPU.
>>>>>>>
>>>>>>> Cache clean (actually: cache clean&invalidate) is what happens on evictions
>>>>>>> all of the time, at the cache controller's discretion. So there is no
>>>>>>> real harm in that operation per se. When an eviction happens on a
>>>>>>> *clean* cache line, this is basically just an invalidate, which is also not
>>>>>>> harmful.
>>>>>>>
>>>>>>> There are harmful cases when buffers sharing a cache line are both "just invalidated"
>>>>>>> and "cleaned" at different points in time.
>>>>>>
>>>>>> Is that on ARM64-specific or is that applicable in general ? (the above
>>>>>> does not answer that question)
>>>>>
>>>>> I would say that's a property of *every* write-back cache
>>>>> implementation:
>>>>> https://en.wikipedia.org/wiki/Cache_(computing)#/media/File:Write-back_with_write-allocation.svg
>>>>
>>>> I've been reading and digesting the thread as it goes, and the only
>>>> thing I do want to chime in on here right now is that yes, U-Boot
>>>> does and will continue to support every CPU that someone wants to run it
>>>> on, and one of the takeaways I see from this thread is we need some
>>>> better documented abstractions around cache, as it's very tricky to get
>>>> right all the time.
>>>
>>> Documenting the u-boot cache function behavior precisely is fine by me, but
>>> that is somewhat separate topic from this bugfix.
>>>
>>> So I will ask a simple question -- is there anything blocking this bugfix
>>> from being applied ?
>>
>> While I can fix the typo, I'm waiting for an Ack/Reviewed-by from Bin
>> (as he's spent the most time on this driver of late) and I'd really like
>> one from Andre, or at least him agreeing this patch is a step in the
>> right direction.
> 
> As I said: I don't see how this patch changes anything on arm64, which
> the commit messages claims to be the reason for this post.
> If someone please can confirm, but invalidate_dcache_range() always
> works on arm64, in fact does the very rounding already that this patch
> introduces here. So what is the actual fix here?

You can NOT depend on the behavior of cache functions on specific 
architecture, U-Boot is universal and this must work on every single 
architecture, not just aarch64.

The entire point of this patch is to call flush/invalidate only with 
cacheline-aligned addresses, which is what they expect, otherwise these 
functions do no operation at all.

> Plus, I consider this blanket rounding more harmful than helpful, since
> it actually just silences the check_cache_range() check.

Can you back this claim with something ? Because I spent my fair share 
of time analyzing the driver and the rounding is exactly sufficient to 
make the cache ops work correctly.

> If we include armv7 here, which in fact would ignore(!) an unaligned
> invalidate

Yes, on armv7 and older, the cache ops behave as they were designed to 
behave.

>, this is my analysis (just for invalidate):
> nvme_read_completion_status():		NEEDS ALIGNMENT
> 	size smaller than cache line, cqes[1] base address unaligned.
> 	fix needed, rounding *could* be a temporary fix with comments
> 	as of why this is legitimate in this case.
> 	Better alternative sketched in a previous email.
> nvme_identify():			OK
> 	struct nvme_id_ctrl is 4K, if I counted correctly, so is fine.
> 	buffer comes the callers, the in-file users pass an aligned
> 	address, the only external user I see is in nvme_show.c, which
> 	also explicitly aligns the buffer.
> nvme_blk_rw():				OK
> 	total_len seems to be a multiple of block size, so is hopefully
> 	already cache line aligned.
> 	buffer comes from the upper layers, I guess it's page
> 	aligned already (at least 512b sector aligned)?
> 
> I turned my sketch for the cqes[] fix into a proper patch and will send
> this in a minute
Please add check_cache_range() to armv8 cache ops and test whether there 
are no warnings from it, thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-08 15:11                     ` Bin Meng
@ 2021-02-08 15:51                       ` Marek Vasut
  0 siblings, 0 replies; 22+ messages in thread
From: Marek Vasut @ 2021-02-08 15:51 UTC (permalink / raw)
  To: u-boot

On 2/8/21 4:11 PM, Bin Meng wrote:
[...]
>> As I said: I don't see how this patch changes anything on arm64, which
>> the commit messages claims to be the reason for this post.
>> If someone please can confirm, but invalidate_dcache_range() always
>> works on arm64, in fact does the very rounding already that this patch
>> introduces here. So what is the actual fix here?
>>
>> Plus, I consider this blanket rounding more harmful than helpful, since
>> it actually just silences the check_cache_range() check.
>>
>> If we include armv7 here, which in fact would ignore(!) an unaligned
>> invalidate, this is my analysis (just for invalidate):
>> nvme_read_completion_status():          NEEDS ALIGNMENT
>>          size smaller than cache line, cqes[1] base address unaligned.
>>          fix needed, rounding *could* be a temporary fix with comments
>>          as of why this is legitimate in this case.
>>          Better alternative sketched in a previous email.
> 
> This is exactly what I mentioned before [1]. The only problematic
> routine is the nvme_read_completion_status() but I didn't have time to
> experiment with a fix.

I believe it is better to have one single function to handle all the 
cache alignment and operations in the driver instead of having copy of 
the same alignment thing all over the driver.
[...]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] nvme: Fix cache alignment
  2021-02-08 15:49                     ` Marek Vasut
@ 2021-02-08 16:30                       ` Andre Przywara
  0 siblings, 0 replies; 22+ messages in thread
From: Andre Przywara @ 2021-02-08 16:30 UTC (permalink / raw)
  To: u-boot

On Mon, 8 Feb 2021 16:49:58 +0100
Marek Vasut <marek.vasut@gmail.com> wrote:

Hi,

> On 2/8/21 2:32 PM, Andre Przywara wrote:
> [...]
> >>>>>>>>>>> +static void nvme_flush_dcache_range(void *start, unsigned long size)
> >>>>>>>>>>> +{
> >>>>>>>>>>> +       unsigned long s, e;
> >>>>>>>>>>> +       nvme_align_dcache_range(start, size, &s, &e);
> >>>>>>>>>>> +       flush_dcache_range(s, e);  
> >>>>>>>>>
> >>>>>>>>> There is no good reason for alignment restrictions when it comes to
> >>>>>>>>> clean (& invalidate), so there is no need for this wrapper.  
> >>>>>>>>
> >>>>>>>> Is that on ARM64-specific or is that applicable in general ? The driver
> >>>>>>>> is expected to work on any CPU.  
> >>>>>>>
> >>>>>>> Cache clean (actually: cache clean&invalidate) is what happens on evictions
> >>>>>>> all of the time, at the cache controller's discretion. So there is no
> >>>>>>> real harm in that operation per se. When an eviction happens on a
> >>>>>>> *clean* cache line, this is basically just an invalidate, which is also not
> >>>>>>> harmful.
> >>>>>>>
> >>>>>>> There are harmful cases when buffers sharing a cache line are both "just invalidated"
> >>>>>>> and "cleaned" at different points in time.  
> >>>>>>
> >>>>>> Is that on ARM64-specific or is that applicable in general ? (the above
> >>>>>> does not answer that question)  
> >>>>>
> >>>>> I would say that's a property of *every* write-back cache
> >>>>> implementation:
> >>>>> https://en.wikipedia.org/wiki/Cache_(computing)#/media/File:Write-back_with_write-allocation.svg  
> >>>>
> >>>> I've been reading and digesting the thread as it goes, and the only
> >>>> thing I do want to chime in on here right now is that yes, U-Boot
> >>>> does and will continue to support every CPU that someone wants to run it
> >>>> on, and one of the takeaways I see from this thread is we need some
> >>>> better documented abstractions around cache, as it's very tricky to get
> >>>> right all the time.  
> >>>
> >>> Documenting the u-boot cache function behavior precisely is fine by me, but
> >>> that is somewhat separate topic from this bugfix.
> >>>
> >>> So I will ask a simple question -- is there anything blocking this bugfix
> >>> from being applied ?  
> >>
> >> While I can fix the typo, I'm waiting for an Ack/Reviewed-by from Bin
> >> (as he's spent the most time on this driver of late) and I'd really like
> >> one from Andre, or at least him agreeing this patch is a step in the
> >> right direction.  
> > 
> > As I said: I don't see how this patch changes anything on arm64, which
> > the commit messages claims to be the reason for this post.
> > If someone please can confirm, but invalidate_dcache_range() always
> > works on arm64, in fact does the very rounding already that this patch
> > introduces here. So what is the actual fix here?  
> 
> You can NOT depend on the behavior of cache functions on specific 
> architecture, U-Boot is universal and this must work on every single 
> architecture, not just aarch64.

I totally understand and support that! See my patch to address the
problem.

What I am wondering is how your patch fixes anything on *arm64*, when
there is no actual change in the argument to the "dc ivac" instruction?
 
> The entire point of this patch is to call flush/invalidate only with 
> cacheline-aligned addresses, which is what they expect, otherwise
> these functions do no operation at all.

First, "doing no operation at all" is some debatable behaviour, I'd
rather see it panic or at least always print an error.
Secondly: this is not the case in this particular case (arm64) that you
mention in the commit message: The invalidate goes through anyway.
 
> > Plus, I consider this blanket rounding more harmful than helpful, since
> > it actually just silences the check_cache_range() check.  
> 
> Can you back this claim with something ? Because I spent my fair share 
> of time analyzing the driver and the rounding is exactly sufficient to 
> make the cache ops work correctly.

So here is my point (again): When we want to invalidate a buffer, it
must be cache line aligned to work correctly - not (only) because of
U-Boot's check, but because you run into problems with invalidating
*other* (potentially dirty) data sharing the same cache line otherwise.
That is the whole point of check_cache_range() in the first place.
And I think we agree on this.

Now: How do you prevent this problem by just rounding the *addresses*
you pass to the cache maintenance instruction?
If a buffer address is not aligned, you want to know about it! Normally
all buffer addresses should be *always* aligned, the driver needs to be
designed this way. So there is no need for rounding.
When now an unaligned address sneaks in - either due to a bug or an
inherent driver design problem - I actually want to see fireworks! At
least the check_cache_range() check should trigger. With blanket
rounding you deliberately disable this check, and invalidate more than
you want to invalidate - without any warnings.
 
> > If we include armv7 here, which in fact would ignore(!) an unaligned
> > invalidate  
> 
> Yes, on armv7 and older, the cache ops behave as they were designed to 
> behave.

Fair enough, we can discuss enabling those checks for armv8 as well,
but at the moment they are not in place yet, so play no role in this
particular problem.

> >, this is my analysis (just for invalidate):
> > nvme_read_completion_status():		NEEDS ALIGNMENT
> > 	size smaller than cache line, cqes[1] base address unaligned.
> > 	fix needed, rounding *could* be a temporary fix with comments
> > 	as of why this is legitimate in this case.
> > 	Better alternative sketched in a previous email.
> > nvme_identify():			OK
> > 	struct nvme_id_ctrl is 4K, if I counted correctly, so is fine.
> > 	buffer comes the callers, the in-file users pass an aligned
> > 	address, the only external user I see is in nvme_show.c, which
> > 	also explicitly aligns the buffer.
> > nvme_blk_rw():				OK
> > 	total_len seems to be a multiple of block size, so is hopefully
> > 	already cache line aligned.
> > 	buffer comes from the upper layers, I guess it's page
> > 	aligned already (at least 512b sector aligned)?
> > 
> > I turned my sketch for the cqes[] fix into a proper patch and will send
> > this in a minute  
> Please add check_cache_range() to armv8 cache ops and test whether there 
> are no warnings from it, thanks.

I don't have a working setup at the moment, I need to first find my
NVMe adaptor and stuff that into a Juno - which in the *middle* of a
pile of boxes :-(

And did *you* do this? Can you report what is the particular problem?
Which of the functions trigger the check?

From how I understand the code, enabling the check (just the check!)
would indeed trigger the warning at the moment, from
nvme_read_completion_status(), but wouldn't change anything otherwise:
cache.S: __asm_invalidate_dcache_range already rounds the address
passed in, so you end up invalidating the same area, before and after.

Cheers,
Andre

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2021-02-08 16:30 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-30 17:53 [PATCH] nvme: Fix cache alignment Marek Vasut
2021-02-02  3:55 ` Bin Meng
2021-02-02  8:05   ` Marek Vasut
2021-02-02  8:54     ` Bin Meng
2021-02-02  9:04       ` Marek Vasut
2021-02-02  9:12         ` Bin Meng
2021-02-02 16:09           ` Marek Vasut
2021-02-02 13:04   ` Andre Przywara
2021-02-02 16:08     ` Marek Vasut
2021-02-02 16:23   ` Andre Przywara
2021-02-02 21:18     ` Marek Vasut
2021-02-03 10:42       ` Andre Przywara
2021-02-03 13:08         ` Marek Vasut
2021-02-04 10:26           ` Andre Przywara
2021-02-04 16:57             ` Tom Rini
2021-02-07 18:20               ` Marek Vasut
2021-02-07 19:13                 ` Tom Rini
2021-02-08 13:32                   ` Andre Przywara
2021-02-08 15:11                     ` Bin Meng
2021-02-08 15:51                       ` Marek Vasut
2021-02-08 15:49                     ` Marek Vasut
2021-02-08 16:30                       ` Andre Przywara

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.