* [PATCH] mm: fix struct page layout on 32-bit systems
@ 2021-05-16 12:18 Matthew Wilcox (Oracle)
  2021-05-16 12:31 ` Matthew Wilcox
  2021-05-16 16:29 ` Linus Torvalds
  0 siblings, 2 replies; 16+ messages in thread
From: Matthew Wilcox (Oracle) @ 2021-05-16 12:18 UTC (permalink / raw)
  To: stable
  Cc: Matthew Wilcox (Oracle),
	Ilias Apalodimas, Jesper Dangaard Brouer, Vlastimil Babka,
	Matteo Croce, Andrew Morton, Linus Torvalds

commit 9ddb3c14afba8bc5950ed297f02d4ae05ff35cd1 upstream

32-bit architectures which expect 8-byte alignment for 8-byte integers and
need 64-bit DMA addresses (arm, mips, ppc) had their struct page
inadvertently expanded in 2019.  When the dma_addr_t was added, it forced
the alignment of the union to 8 bytes, which inserted a 4 byte gap between
'flags' and the union.

Fix this by storing the dma_addr_t in one or two adjacent unsigned longs.
This restores the alignment to that of an unsigned long.  We always
store the low bits in the first word to prevent the PageTail bit from
being inadvertently set on a big endian platform.  If that happened,
get_user_pages_fast() racing against a page which was freed and
reallocated to the page_pool could dereference a bogus compound_head(),
which would be hard to trace back to this cause.

Link: https://lkml.kernel.org/r/20210510153211.1504886-1-willy@infradead.org
Fixes: c25fff7171be ("mm: add dma_addr_t to struct page")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Matteo Croce <mcroce@linux.microsoft.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h |  4 ++--
 include/net/page_pool.h  | 12 +++++++++++-
 net/core/page_pool.c     | 12 +++++++-----
 3 files changed, 20 insertions(+), 8 deletions(-)
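
The hole described above is easy to reproduce in userspace.  Here is a
minimal sketch (not part of the patch; the struct and type names are made
up) of the layout before and after the fix, on a 32-bit ABI that gives
8-byte alignment to 64-bit integers:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in for dma_addr_t on a 32-bit kernel with 64-bit DMA addresses. */
typedef uint64_t demo_dma_addr_t;

struct demo_page_before {		/* layout before this patch */
	unsigned long flags;
	union {
		demo_dma_addr_t dma_addr;	/* forces 8-byte union alignment */
		void *five_words[5];
	};
};

struct demo_page_after {		/* layout after this patch */
	unsigned long flags;
	union {
		unsigned long dma_addr[2];	/* same alignment as 'flags' */
		void *five_words[5];
	};
};

int main(void)
{
	/* On arm/mips/ppc 32-bit builds the first line prints 8 (a 4-byte
	 * hole after 'flags'); the second prints 4.  On 64-bit builds both
	 * print 8, because unsigned long is already 8-byte aligned there. */
	printf("before: union at offset %zu\n", offsetof(struct demo_page_before, dma_addr));
	printf("after:  union at offset %zu\n", offsetof(struct demo_page_after, dma_addr));
	return 0;
}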

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..5aacc1c10a45 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -97,10 +97,10 @@ struct page {
 		};
 		struct {	/* page_pool used by netstack */
 			/**
-			 * @dma_addr: might require a 64-bit value even on
+			 * @dma_addr: might require a 64-bit value on
 			 * 32-bit architectures.
 			 */
-			dma_addr_t dma_addr;
+			unsigned long dma_addr[2];
 		};
 		struct {	/* slab, slob and slub */
 			union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index b5b195305346..e05744b9a1bc 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -198,7 +198,17 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-	return page->dma_addr;
+	dma_addr_t ret = page->dma_addr[0];
+	if (sizeof(dma_addr_t) > sizeof(unsigned long))
+		ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
+	return ret;
+}
+
+static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+	page->dma_addr[0] = addr;
+	if (sizeof(dma_addr_t) > sizeof(unsigned long))
+		page->dma_addr[1] = upper_32_bits(addr);
 }
 
 static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index ad8b0707af04..f014fd8c19a6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -174,8 +174,10 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
 					  struct page *page,
 					  unsigned int dma_sync_size)
 {
+	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
 	dma_sync_size = min(dma_sync_size, pool->p.max_len);
-	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
 					 pool->p.offset, dma_sync_size,
 					 pool->p.dma_dir);
 }
@@ -226,7 +228,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 		put_page(page);
 		return NULL;
 	}
-	page->dma_addr = dma;
+	page_pool_set_dma_addr(page, dma);
 
 	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
@@ -294,13 +296,13 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 		 */
 		goto skip_dma_unmap;
 
-	dma = page->dma_addr;
+	dma = page_pool_get_dma_addr(page);
 
-	/* When page is unmapped, it cannot be returned our pool */
+	/* When page is unmapped, it cannot be returned to our pool */
 	dma_unmap_page_attrs(pool->p.dev, dma,
 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 			     DMA_ATTR_SKIP_CPU_SYNC);
-	page->dma_addr = 0;
+	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
-- 
2.30.2
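
One non-obvious detail in page_pool_get_dma_addr() above is the
"<< 16 << 16" instead of a plain "<< 32".  The upper-half branch is dead
code when dma_addr_t is only 32 bits wide, but the compiler still has to
translate it, and shifting a 32-bit type by its full width is undefined
(and warns) -- which is presumably why the patch avoids it.  A standalone
sketch of the idiom, using a made-up 32-bit stand-in type:

/* demo_dma_addr_t stands in for a config where dma_addr_t is 32-bit. */
typedef unsigned int demo_dma_addr_t;

demo_dma_addr_t demo_get_dma_addr(const unsigned long dma_addr[2])
{
	demo_dma_addr_t ret = dma_addr[0];

	/* Never taken here, but still compiled: "<< 16 << 16" stays
	 * well-defined, whereas "<< 32" on a 32-bit type would not be. */
	if (sizeof(demo_dma_addr_t) > sizeof(unsigned long))
		ret |= (demo_dma_addr_t)dma_addr[1] << 16 << 16;
	return ret;
}

When dma_addr_t really is 64-bit and unsigned long is 32-bit, the two
shifts fold into a single shift by 32, so the generated code matches the
obvious version.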



* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-16 12:18 [PATCH] mm: fix struct page layout on 32-bit systems Matthew Wilcox (Oracle)
@ 2021-05-16 12:31 ` Matthew Wilcox
  2021-05-17 10:06   ` Greg KH
  2021-05-16 16:29 ` Linus Torvalds
  1 sibling, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2021-05-16 12:31 UTC (permalink / raw)
  To: stable

Forgot to label this one -- it applies to 5.10/5.11/5.12.
See other patch for 5.4.



* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-16 12:18 [PATCH] mm: fix struct page layout on 32-bit systems Matthew Wilcox (Oracle)
  2021-05-16 12:31 ` Matthew Wilcox
@ 2021-05-16 16:29 ` Linus Torvalds
  2021-05-16 18:22   ` Matthew Wilcox
  1 sibling, 1 reply; 16+ messages in thread
From: Linus Torvalds @ 2021-05-16 16:29 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Sun, May 16, 2021 at 5:19 AM Matthew Wilcox (Oracle)
<willy@infradead.org> wrote:
>
> 32-bit architectures which expect 8-byte alignment for 8-byte integers and
> need 64-bit DMA addresses (arm, mips, ppc) had their struct page
> inadvertently expanded in 2019.  When the dma_addr_t was added, it forced
> the alignment of the union to 8 bytes, which inserted a 4 byte gap between
> 'flags' and the union.

So I already have this in my tree, but this stable submission made me go "Hmm".

Why do we actually want a full 64-bit DMA address on 32-bit architectures here?

It strikes me that the address is page-aligned, and I suspect we could
just use a 32-bit "DMA page frame number" instead in 'struct page'?

So instead of that odd

+       if (sizeof(dma_addr_t) > sizeof(unsigned long))
+               ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;

maybe we could just do effectively

        ret = (dma_addr_t)page->dma_frame_nr << PAGE_SHIFT;

and simplify this all. We could do it on 64-bit too, just to not have
any odd special cases (even if we'd have the full 64 bits available).

Hmm?

                Linus
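
A rough sketch of that alternative (never merged; "dma_frame_nr" is the
hypothetical field from the mail above, and it assumes the DMA address is
page-aligned with no meaning in the high bits -- the assumptions
questioned in the replies below):

static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
{
	return (dma_addr_t)page->dma_frame_nr << PAGE_SHIFT;
}

static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
{
	/* Only works if 'addr' is page-aligned and its high bits are plain
	 * address bits rather than flags. */
	page->dma_frame_nr = addr >> PAGE_SHIFT;
}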


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-16 16:29 ` Linus Torvalds
@ 2021-05-16 18:22   ` Matthew Wilcox
  2021-05-16 18:27     ` Linus Torvalds
  0 siblings, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2021-05-16 18:22 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Sun, May 16, 2021 at 09:29:41AM -0700, Linus Torvalds wrote:
> On Sun, May 16, 2021 at 5:19 AM Matthew Wilcox (Oracle)
> <willy@infradead.org> wrote:
> >
> > 32-bit architectures which expect 8-byte alignment for 8-byte integers and
> > need 64-bit DMA addresses (arm, mips, ppc) had their struct page
> > inadvertently expanded in 2019.  When the dma_addr_t was added, it forced
> > the alignment of the union to 8 bytes, which inserted a 4 byte gap between
> > 'flags' and the union.
> 
> So I already have this in my tree, but this stable submission made me go "Hmm".
> 
> Why do we actually want a full 64-bit DMA address on 32-bit architectures here?
> 
> It strikes me that the address is page-aligned, and I suspect we could
> just use a 32-bit "DMA page frame number" instead in 'struct page'?

Nobody's been willing to guarantee that all 32-bit architectures keep the
top 20 bits clear for their DMA addresses.  I've certainly seen hardware
(maybe PA-RISC?  MIPS?) which uses the top few bits of the DMA address to
indicate things like "coherent" or "bypasses IOMMU".  Rather than trying
to find out, I thought this was the safer option.  It only affects
32-bit architectures with PAE, and I'd rather not introduce a shift on
64-bit architectures to work around a 32-bit PAE problem.


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-16 18:22   ` Matthew Wilcox
@ 2021-05-16 18:27     ` Linus Torvalds
  2021-05-16 19:09       ` Matthew Wilcox
  0 siblings, 1 reply; 16+ messages in thread
From: Linus Torvalds @ 2021-05-16 18:27 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Sun, May 16, 2021 at 11:22 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> Nobody's been willing to guarantee that all 32-bit architectures keep the
> top 20 bits clear for their DMA addresses.  I've certainly seen hardware
> (maybe PA-RISC?  MIPS?) which uses the top few bits of the DMA address to
> indicate things like "coherent" or "bypasses IOMMU".  Rather than trying
> to find out, I thought this was the safer option.

Fair enough. I just find it somewhat odd.

But I still find this a bit ugly. Maybe we could just have made that
one sub-structure "__aligned(4)", and avoided this all, and let the
compiler generate the split load (or, more likely, just let the
compiler generate a regular load from an unaligned location).

IOW, just

                struct {        /* page_pool used by netstack */
                        /**
                         * @dma_addr: might require a 64-bit value even on
                         * 32-bit architectures.
                         */
                        dma_addr_t dma_addr;
                } __aligned((4));

without the magic shifting games?

              Linus


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-16 18:27     ` Linus Torvalds
@ 2021-05-16 19:09       ` Matthew Wilcox
  2021-05-16 19:22         ` Linus Torvalds
  0 siblings, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2021-05-16 19:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Sun, May 16, 2021 at 11:27:10AM -0700, Linus Torvalds wrote:
> On Sun, May 16, 2021 at 11:22 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > Nobody's been willing to guarantee that all 32-bit architectures keep the
> > top 20 bits clear for their DMA addresses.  I've certainly seen hardware
> > (maybe PA-RISC?  MIPS?) which uses the top few bits of the DMA address to
> > indicate things like "coherent" or "bypasses IOMMU".  Rather than trying
> > to find out, I thought this was the safer option.
> 
> Fair enough. I just find it somewhat odd.
> 
> But I still find this a bit ugly. Maybe we could just have made that
> one sub-structure "__aligned(4)", and avoided this all, and let the
> compiler generate the split load (or, more likely, just let the
> compiler generate a regular load from an unaligned location).
> 
> IOW, just
> 
>                 struct {        /* page_pool used by netstack */
>                         /**
>                          * @dma_addr: might require a 64-bit value even on
>                          * 32-bit architectures.
>                          */
>                         dma_addr_t dma_addr;
>                 } __aligned((4));
> 
> without the magic shifting games?

That was the other problem fixed by this patch -- on big-endian 32-bit
platforms with 64-bit dma_addr_t (mips, ppc), a DMA address with bit 32 set
inadvertently sets the PageTail bit.  So we need to store the low bits
in the first word, even on big-endian platforms.

There's an upcoming patch to move dma_addr out of the union with
compound_head, but that requires changing page_is_pfmemalloc() to
use an indicator other than page->index == -1.  Once we do that,
we can have fun with __aligned().  Since we knew it would have to
be backported, this felt like the least risky patch to start with.
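
To make the endianness point concrete, here is a small userspace sketch
(not from the thread) of how a 64-bit value with bit 32 set lands in the
first 32-bit word on a big-endian machine -- the word that aliases
compound_head, whose bit 0 is PageTail:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t dma = 0x100000000ULL;	/* bit 32 set; low bits page-aligned */
	uint32_t words[2];

	memcpy(words, &dma, sizeof(dma));
	/* Big endian:    words[0] == 0x00000001, i.e. bit 0 (PageTail) set.
	 * Little endian: words[0] == 0x00000000, harmless.
	 * Storing the low half in the first word keeps bit 0 clear on both,
	 * because page-aligned DMA addresses have zero low bits. */
	printf("first word = 0x%08x\n", words[0]);
	return 0;
}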


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-16 19:09       ` Matthew Wilcox
@ 2021-05-16 19:22         ` Linus Torvalds
  2021-05-17  2:42           ` Matthew Wilcox
  0 siblings, 1 reply; 16+ messages in thread
From: Linus Torvalds @ 2021-05-16 19:22 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Sun, May 16, 2021 at 12:09 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> That was the other problem fixed by this patch -- on big-endian 32-bit
> platforms with 64-bit dma_addr_t (mips, ppc), a DMA address with bit 32 set
> inadvertently sets the PageTail bit.  So we need to store the low bits
> in the first word, even on big-endian platforms.

Ouch. And yes, that would have shot down the "dma page frame number" model too.

Oh how I wish PageTail was in "flags". Yes, our compound_head() thing
is "clever", but it's a pain,

That said, that union entry is "5 words", so the dma_addr_t thing
could easily just have had a dummy word at the beginning.

          Linus


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-16 19:22         ` Linus Torvalds
@ 2021-05-17  2:42           ` Matthew Wilcox
  2021-05-17 22:18             ` Linus Torvalds
  0 siblings, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2021-05-17  2:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Sun, May 16, 2021 at 12:22:43PM -0700, Linus Torvalds wrote:
> On Sun, May 16, 2021 at 12:09 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > That was the other problem fixed by this patch -- on big-endian 32-bit
> > platforms with 64-bit dma_addr_t (mips, ppc), a DMA address with bit 32 set
> > inadvertently sets the PageTail bit.  So we need to store the low bits
> > in the first word, even on big-endian platforms.
> 
> Ouch. And yes, that would have shot down the "dma page frame number" model too.
> 
> Oh how I wish PageTail was in "flags". Yes, our compound_head() thing
> is "clever", but it's a pain,
> 
> That said, that union entry is "5 words", so the dma_addr_t thing
> could easily just have had a dummy word at the beginning.

Ah, if you just put one dummy word in front, then dma_addr_t overlaps with
page->mapping, which used to be fine, but now we can map network queues
to userspace, page->mapping has to be NULL.  So there's only two places to
put dma_addr; either overlapping compound_head or overlapping pfmemalloc.

I don't think PageTail is movable -- the issue is needing an atomic read
of both PageTail _and_ the location of the head page.  Even if x86 has
something, there are a lot of architectures that don't.
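
For reference, roughly how the five-word union lines up on 32-bit at this
point (a sketch based on the discussion above, not authoritative), which
is why each candidate slot for dma_addr collides with something:

	word 0:  lru.next   / compound_head (bit 0 = PageTail) / dma_addr[0]
	word 1:  lru.prev   / dma_addr[1]
	word 2:  mapping    (must stay NULL for user-mappable page_pool pages)
	word 3:  index      (doubles as the pfmemalloc indicator)
	word 4:  private

A dummy word in front would make a 64-bit DMA address spill into word 2
(mapping); keeping it in words 0-1 only works because the low,
page-aligned half sits in word 0, leaving PageTail clear.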

While I've got you on the subject of compound_head ... have you had a look
at the folio work?  It decreases the number of calls to compound_head()
by about 25%, as well as shrinking the (compiled size) of the kernel and
laying the groundwork for supporting things like 32kB anonymous pages
and adaptive page sizes in the page cache.  Andrew's a bit nervous of
it, probably because it's such a large change.



* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-16 12:31 ` Matthew Wilcox
@ 2021-05-17 10:06   ` Greg KH
  0 siblings, 0 replies; 16+ messages in thread
From: Greg KH @ 2021-05-17 10:06 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: stable

On Sun, May 16, 2021 at 01:31:05PM +0100, Matthew Wilcox wrote:
> Forgot to label this one -- it applies to 5.10/5.11/5.12.
> See other patch for 5.4.
> 

Thanks, all now queued up.

greg k-h


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-17  2:42           ` Matthew Wilcox
@ 2021-05-17 22:18             ` Linus Torvalds
  2021-05-17 23:02               ` Matthew Wilcox
  0 siblings, 1 reply; 16+ messages in thread
From: Linus Torvalds @ 2021-05-17 22:18 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Sun, May 16, 2021 at 7:42 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> Ah, if you just put one dummy word in front, then dma_addr_t overlaps with
> page->mapping, which used to be fine, but now we can map network queues
> to userspace, page->mapping has to be NULL.

Well, that's a problem in itself. We shouldn't have those kinds of
"oh, it uses one field from one union, and another field from
another".

At least that "low bit of the first word" thing is documented, but
this kind of "oh, it uses dma_addr from one union and mapping from
another" really needs to go away. Because that's just not acceptable.

So that network stack thing should just make the mapping thing explicit then.

Possibly by making mapping/index be the first fields (for both the
page cache and the page_pool thing) and then have the LRU list and the
dma_addr be after that?

> While I've got you on the subject of compound_head ... have you had a look
> at the folio work?  It decreases the number of calls to compound_head()
> by about 25%, as well as shrinking the (compiled size) of the kernel and
> laying the groundwork for supporting things like 32kB anonymous pages
> and adaptive page sizes in the page cache.  Andrew's a bit nervous of
> it, probably because it's such a large change.

I guess I need to take a look. Mind sending me another pointer to the series?

                Linus


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-17 22:18             ` Linus Torvalds
@ 2021-05-17 23:02               ` Matthew Wilcox
  2021-05-17 23:37                 ` Linus Torvalds
  0 siblings, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2021-05-17 23:02 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Mon, May 17, 2021 at 03:18:38PM -0700, Linus Torvalds wrote:
> On Sun, May 16, 2021 at 7:42 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > Ah, if you just put one dummy word in front, then dma_addr_t overlaps with
> > page->mapping, which used to be fine, but now we can map network queues
> > to userspace, page->mapping has to be NULL.
> 
> Well, that's a problem in itself. We shouldn't have those kinds of
> "oh, it uses one field from one union, and another field from
> another".

No, I agree.  I wasn't aware of that particular problem until recently,
and adjusting the contents of struct page always makes me a little
nervous, so I haven't done it yet.

> At least that "low bit of the first word" thing is documented, but
> this kind of "oh, it uses dma_addr from one union and mapping from
> another" really needs to go away. Because that's just not acceptable.
> 
> So that network stack thing should just make the mapping thing explicit then.

One of the pending patches for next merge window has this:

                struct {        /* page_pool used by netstack */
-                       /**
-                        * @dma_addr: might require a 64-bit value on
-                        * 32-bit architectures.
-                        */
+                       unsigned long pp_magic;
+                       struct page_pool *pp;
+                       unsigned long _pp_mapping_pad;
                        unsigned long dma_addr[2];
                };

(people are still bikeshedding the comments, etc, but fundamentally this
is the new struct layout).  It's not the networking stack that uses
page->mapping, it's if somebody decides to do something like call
get_user_pages() on a mapped page_pool page, which then calls
page_mapping() ... if that doesn't return NULL, then things go wrong.

Andrey Ryabinin found the path:
: Some implementation of the flush_dcache_page(), also set_page_dirty()
: can be called on userspace-mapped vmalloc pages during unmap -
: zap_pte_range() -> set_page_dirty()

> Possibly by making mapping/index be the first fields (for both the
> page cache and the page_pool thing) and then have the LRU list and the
> dma_addr be after that?

Unfortunately, we also use the bottom two bits of ->mapping for PageAnon
/ PageKSM / PageMovable.  We could move ->mapping to the fifth word of
the union, where it's less in the way.
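
Those low bits are the PAGE_MAPPING_* flags from
include/linux/page-flags.h, roughly (values as in kernels of this
vintage):

#define PAGE_MAPPING_ANON	0x1
#define PAGE_MAPPING_MOVABLE	0x2
#define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)

so anything that overlays ->mapping has to keep those two bits meaningful
(or zero), not just the pointer value.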

> > While I've got you on the subject of compound_head ... have you had a look
> > at the folio work?  It decreases the number of calls to compound_head()
> > by about 25%, as well as shrinking the (compiled size) of the kernel and
> > laying the groundwork for supporting things like 32kB anonymous pages
> > and adaptive page sizes in the page cache.  Andrew's a bit nervous of
> > it, probably because it's such a large change.
> 
> I guess I need to take a look. Mind sending me another pointer to the series?

This is probably a good place to start:
https://lore.kernel.org/lkml/20210505150628.111735-10-willy@infradead.org/
or if you'd rather look at a git tree,
https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/folio



* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-17 23:02               ` Matthew Wilcox
@ 2021-05-17 23:37                 ` Linus Torvalds
  2021-05-18  0:02                   ` Linus Torvalds
  0 siblings, 1 reply; 16+ messages in thread
From: Linus Torvalds @ 2021-05-17 23:37 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Mon, May 17, 2021 at 4:02 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> or if you'd rather look at a git tree,
> https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/folio

Well, I wanted to check a git tree, but this seems to be based on some
random linux-next commit. Which is a bit annoying.

Looking at it, I get the very strong feeling that the *only* thing
there is "don't call compound_head()".

I have only gone through the beginning of the series, but it really
strikes me as "that's just a page pointer with the rule being that you
always use the head pointer".

I don't mind that rule, but what's the advantage of introducing a new
name for that? IOW, I get the feeling that almost all of this
could/should just be "don't use non-head pages".

Is there something else that appears later?

              Linus


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-17 23:37                 ` Linus Torvalds
@ 2021-05-18  0:02                   ` Linus Torvalds
  2021-05-18  0:53                     ` Matthew Wilcox
  0 siblings, 1 reply; 16+ messages in thread
From: Linus Torvalds @ 2021-05-18  0:02 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Mon, May 17, 2021 at 4:37 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> I don't mind that rule, but what's the advantage of introducing a new
> name for that? IOW, I get the feeling that almost all of this
> could/should just be "don't use non-head pages".

Put another way: I've often wanted to remove the (quite expensive)
"compound_head()" calls out of the core page functions, and move them
into the callers (and in most cases they probably just end up
disappearing entirely, because the callers fundamentally always have a
proper head page).

It feels like this is what the folio patches do, they just spend a
*lot* of effort doing so incrementally by renaming things and
duplicating functionality, rather than just do it (again
incrementally) by just doing one compound_head() movement at a time.

              Linus


* Re: [PATCH] mm: fix struct page layout on 32-bit systems
  2021-05-18  0:02                   ` Linus Torvalds
@ 2021-05-18  0:53                     ` Matthew Wilcox
  0 siblings, 0 replies; 16+ messages in thread
From: Matthew Wilcox @ 2021-05-18  0:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, Matteo Croce, Andrew Morton

On Mon, May 17, 2021 at 05:02:01PM -0700, Linus Torvalds wrote:
> On Mon, May 17, 2021 at 4:37 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > I don't mind that rule, but what's the advantage of introducing a new
> > name for that? IOW, I get the feeling that almost all of this
> > could/should just be "don't use non-head pages".
> 
> Put another way: I've often wanted to remove the (quite expensive)
> "compund_head()" calls out of the code page functions, and move them
> into the callers (and in most cases they probably just end up
> disappearing entirely, because the callers fundamentally always have a
> proper head page).
> 
> It feels like this is what the folio patches do, they just spend a
> *lot* of effort doing so incrementally by renaming things and
> duplicating functionality, rather than just do it (again
> incrementally) by just doing one compound_head() movement at a time..

I tried that, and I ended up in whack-a-mole hell trying to
figure out all the places that weren't expecting to see a
head page.  If you look at the master branch from that repo:
https://git.infradead.org/users/willy/pagecache.git/shortlog

it basically redefines a THP to be arbitrary order and then tries to
ram through using head pages everywhere.  It's definitely missing a few
spots in the writeback code that get the accounting entirely wrong.
So I decided a new type was in order to distinguish between the
places which do need to see a struct page (vmf->page being one)
and those that are dealing with pages for accounting purposes.


* Re: [PATCH] mm: Fix struct page layout on 32-bit systems
  2021-05-10 15:32 [PATCH] mm: Fix " Matthew Wilcox (Oracle)
@ 2021-05-10 16:38 ` Matteo Croce
  0 siblings, 0 replies; 16+ messages in thread
From: Matteo Croce @ 2021-05-10 16:38 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: linux-mm, stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka, netdev

On Mon, 10 May 2021 16:32:11 +0100
"Matthew Wilcox (Oracle)" <willy@infradead.org> wrote:

> 32-bit architectures which expect 8-byte alignment for 8-byte integers
> and need 64-bit DMA addresses (arm, mips, ppc) had their struct page
> inadvertently expanded in 2019.  When the dma_addr_t was added, it
> forced the alignment of the union to 8 bytes, which inserted a 4 byte
> gap between 'flags' and the union.
> 
> Fix this by storing the dma_addr_t in one or two adjacent unsigned
> longs. This restores the alignment to that of an unsigned long.  We
> always store the low bits in the first word to prevent the PageTail
> bit from being inadvertently set on a big endian platform.  If that
> happened, get_user_pages_fast() racing against a page which was freed
> and reallocated to the page_pool could dereference a bogus
> compound_head(), which would be hard to trace back to this cause.
> 
> Fixes: c25fff7171be ("mm: add dma_addr_t to struct page")
> Cc: stable@vger.kernel.org
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  include/linux/mm_types.h |  4 ++--
>  include/net/page_pool.h  | 12 +++++++++++-
>  net/core/page_pool.c     | 12 +++++++-----
>  3 files changed, 20 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 6613b26a8894..5aacc1c10a45 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -97,10 +97,10 @@ struct page {
>  		};
>  		struct {	/* page_pool used by netstack */
>  			/**
> -			 * @dma_addr: might require a 64-bit value even on
> +			 * @dma_addr: might require a 64-bit value on
>  			 * 32-bit architectures.
>  			 */
> -			dma_addr_t dma_addr;
> +			unsigned long dma_addr[2];
>  		};
>  		struct {	/* slab, slob and slub */
>  			union {
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 6d517a37c18b..b4b6de909c93 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -198,7 +198,17 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
>  
>  static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
>  {
> -	return page->dma_addr;
> +	dma_addr_t ret = page->dma_addr[0];
> +	if (sizeof(dma_addr_t) > sizeof(unsigned long))
> +		ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
> +	return ret;
> +}
> +
> +static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
> +{
> +	page->dma_addr[0] = addr;
> +	if (sizeof(dma_addr_t) > sizeof(unsigned long))
> +		page->dma_addr[1] = upper_32_bits(addr);
>  }
>  
>  static inline bool is_page_pool_compiled_in(void)
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 9ec1aa9640ad..3c4c4c7a0402 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -174,8 +174,10 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
>  					  struct page *page,
>  					  unsigned int dma_sync_size)
>  {
> +	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
> +
>  	dma_sync_size = min(dma_sync_size, pool->p.max_len);
> -	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
> +	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
>  					 pool->p.offset, dma_sync_size,
>  					 pool->p.dma_dir);
>  }
> @@ -195,7 +197,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
>  	if (dma_mapping_error(pool->p.dev, dma))
>  		return false;
>  
> -	page->dma_addr = dma;
> +	page_pool_set_dma_addr(page, dma);
>  
>  	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
>  		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
> @@ -331,13 +333,13 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
>  		 */
>  		goto skip_dma_unmap;
>  
> -	dma = page->dma_addr;
> +	dma = page_pool_get_dma_addr(page);
>  
> -	/* When page is unmapped, it cannot be returned our pool */
> +	/* When page is unmapped, it cannot be returned to our pool */
>  	dma_unmap_page_attrs(pool->p.dev, dma,
>  			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
>  			     DMA_ATTR_SKIP_CPU_SYNC);
> -	page->dma_addr = 0;
> +	page_pool_set_dma_addr(page, 0);
>  skip_dma_unmap:
>  	/* This may be the last page returned, releasing the pool, so
>  	 * it is not safe to reference pool afterwards.

I rebased the skb recycling series against this change. I have had it
running for a few days on a machine (MacchiatoBIN Double Shot) which
uses the mvpp2 driver.
No issues so far.

Regards,
-- 
per aspera ad upstream


* [PATCH] mm: Fix struct page layout on 32-bit systems
@ 2021-05-10 15:32 Matthew Wilcox (Oracle)
  2021-05-10 16:38 ` Matteo Croce
  0 siblings, 1 reply; 16+ messages in thread
From: Matthew Wilcox (Oracle) @ 2021-05-10 15:32 UTC (permalink / raw)
  To: linux-mm
  Cc: Matthew Wilcox (Oracle),
	stable, Ilias Apalodimas, Jesper Dangaard Brouer,
	Vlastimil Babka

32-bit architectures which expect 8-byte alignment for 8-byte integers
and need 64-bit DMA addresses (arm, mips, ppc) had their struct page
inadvertently expanded in 2019.  When the dma_addr_t was added, it forced
the alignment of the union to 8 bytes, which inserted a 4 byte gap between
'flags' and the union.

Fix this by storing the dma_addr_t in one or two adjacent unsigned longs.
This restores the alignment to that of an unsigned long.  We always
store the low bits in the first word to prevent the PageTail bit from
being inadvertently set on a big endian platform.  If that happened,
get_user_pages_fast() racing against a page which was freed and
reallocated to the page_pool could dereference a bogus compound_head(),
which would be hard to trace back to this cause.

Fixes: c25fff7171be ("mm: add dma_addr_t to struct page")
Cc: stable@vger.kernel.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm_types.h |  4 ++--
 include/net/page_pool.h  | 12 +++++++++++-
 net/core/page_pool.c     | 12 +++++++-----
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..5aacc1c10a45 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -97,10 +97,10 @@ struct page {
 		};
 		struct {	/* page_pool used by netstack */
 			/**
-			 * @dma_addr: might require a 64-bit value even on
+			 * @dma_addr: might require a 64-bit value on
 			 * 32-bit architectures.
 			 */
-			dma_addr_t dma_addr;
+			unsigned long dma_addr[2];
 		};
 		struct {	/* slab, slob and slub */
 			union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 6d517a37c18b..b4b6de909c93 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -198,7 +198,17 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-	return page->dma_addr;
+	dma_addr_t ret = page->dma_addr[0];
+	if (sizeof(dma_addr_t) > sizeof(unsigned long))
+		ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
+	return ret;
+}
+
+static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+	page->dma_addr[0] = addr;
+	if (sizeof(dma_addr_t) > sizeof(unsigned long))
+		page->dma_addr[1] = upper_32_bits(addr);
 }
 
 static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 9ec1aa9640ad..3c4c4c7a0402 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -174,8 +174,10 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
 					  struct page *page,
 					  unsigned int dma_sync_size)
 {
+	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
 	dma_sync_size = min(dma_sync_size, pool->p.max_len);
-	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
 					 pool->p.offset, dma_sync_size,
 					 pool->p.dma_dir);
 }
@@ -195,7 +197,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	if (dma_mapping_error(pool->p.dev, dma))
 		return false;
 
-	page->dma_addr = dma;
+	page_pool_set_dma_addr(page, dma);
 
 	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
@@ -331,13 +333,13 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 		 */
 		goto skip_dma_unmap;
 
-	dma = page->dma_addr;
+	dma = page_pool_get_dma_addr(page);
 
-	/* When page is unmapped, it cannot be returned our pool */
+	/* When page is unmapped, it cannot be returned to our pool */
 	dma_unmap_page_attrs(pool->p.dev, dma,
 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 			     DMA_ATTR_SKIP_CPU_SYNC);
-	page->dma_addr = 0;
+	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
-- 
2.30.2


