All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data
@ 2018-12-05 17:20 Christoph Hellwig
  2018-12-05 21:50 ` Dave Chinner
  0 siblings, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2018-12-05 17:20 UTC (permalink / raw)
  To: linux-xfs; +Cc: tom.leiming, vkuznets

XFS currently uses a slab allocator for buffers smaller than the page
size to avoid wasting too much memory.  But this can cause a problem
with slub debugging turned on as the allocations might not be naturally
aligned.  On block devices that require sector size alignment this can
lead to data corruption.

Give that our smaller than page size buffers are always sector sized
on a live file system, we can just create a kmem_cache with an
explicitly specified alignment requirement for this case to fix this
case without much effort.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c   | 23 +++++++++--------------
 fs/xfs/xfs_mount.h |  2 ++
 fs/xfs/xfs_super.c | 18 ++++++++++++++++++
 3 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b21ea2ba768d..904d45f93ce7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -339,8 +339,10 @@ xfs_buf_free(
 
 			__free_page(page);
 		}
-	} else if (bp->b_flags & _XBF_KMEM)
-		kmem_free(bp->b_addr);
+	} else if (bp->b_flags & _XBF_KMEM) {
+		kmem_cache_free(bp->b_target->bt_mount->m_sector_cache,
+				bp->b_addr);
+	}
 	_xfs_buf_free_pages(bp);
 	xfs_buf_free_maps(bp);
 	kmem_zone_free(xfs_buf_zone, bp);
@@ -354,6 +356,7 @@ xfs_buf_allocate_memory(
 	xfs_buf_t		*bp,
 	uint			flags)
 {
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	size_t			size;
 	size_t			nbytes, offset;
 	gfp_t			gfp_mask = xb_to_gfp(flags);
@@ -362,25 +365,17 @@ xfs_buf_allocate_memory(
 	int			error;
 
 	/*
-	 * for buffers that are contained within a single page, just allocate
-	 * the memory from the heap - there's no need for the complexity of
-	 * page arrays to keep allocation down to order 0.
+	 * Use a special slab cache for sector sized buffers when sectors are
+	 * small than a page to avoid wasting lots of memory.
 	 */
 	size = BBTOB(bp->b_length);
-	if (size < PAGE_SIZE) {
-		bp->b_addr = kmem_alloc(size, KM_NOFS);
+	if (size < PAGE_SIZE && size == mp->m_sb.sb_sectsize) {
+		bp->b_addr = kmem_cache_alloc(mp->m_sector_cache, GFP_NOFS);
 		if (!bp->b_addr) {
 			/* low memory - use alloc_page loop instead */
 			goto use_alloc_page;
 		}
 
-		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
-		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
-			/* b_addr spans two pages - use alloc_page instead */
-			kmem_free(bp->b_addr);
-			bp->b_addr = NULL;
-			goto use_alloc_page;
-		}
 		bp->b_offset = offset_in_page(bp->b_addr);
 		bp->b_pages = bp->b_page_array;
 		bp->b_pages[0] = virt_to_page(bp->b_addr);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7964513c3128..83d76271c2d4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -93,6 +93,8 @@ typedef struct xfs_mount {
 	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
 	struct xfs_inode	*m_rootip;	/* pointer to root directory */
 	struct xfs_quotainfo	*m_quotainfo;	/* disk quota information */
+	struct kmem_cache	*m_sector_cache;/* sector sized buf data */
+	char			*m_sector_cache_name;
 	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
 	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
 	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f4d34749505e..6b9f3e8d38d5 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -706,6 +706,10 @@ xfs_close_devices(
 		fs_put_dax(dax_rtdev);
 	}
 	xfs_free_buftarg(mp->m_ddev_targp);
+	if (mp->m_sector_cache) {
+		kmem_cache_destroy(mp->m_sector_cache);
+		kfree(mp->m_sector_cache_name);
+	}
 	fs_put_dax(dax_ddev);
 }
 
@@ -804,6 +808,20 @@ xfs_setup_devices(
 {
 	int			error;
 
+	if (mp->m_sb.sb_sectsize < PAGE_SIZE) {
+		mp->m_sector_cache_name = kasprintf(GFP_KERNEL, "%s-sector",
+				mp->m_fsname);
+		if (!mp->m_sector_cache_name)
+			return -ENOMEM;
+		mp->m_sector_cache = kmem_cache_create(mp->m_sector_cache_name,
+				mp->m_sb.sb_sectsize, mp->m_sb.sb_sectsize,
+				0, NULL);
+		if (!mp->m_sector_cache) {
+			kfree(mp->m_sector_cache_name);
+			return -ENOMEM;
+		}
+	}
+
 	error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
 	if (error)
 		return error;
-- 
2.19.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data
  2018-12-05 17:20 [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data Christoph Hellwig
@ 2018-12-05 21:50 ` Dave Chinner
  2018-12-05 21:56   ` Christoph Hellwig
  0 siblings, 1 reply; 7+ messages in thread
From: Dave Chinner @ 2018-12-05 21:50 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, tom.leiming, vkuznets

On Wed, Dec 05, 2018 at 09:20:23AM -0800, Christoph Hellwig wrote:
> XFS currently uses a slab allocator for buffers smaller than the page

Currently uses heap allocated memory?

> size to avoid wasting too much memory.  But this can cause a problem
> with slub debugging turned on as the allocations might not be naturally
> aligned.  On block devices that require sector size alignment this can
> lead to data corruption.
> 
> Give that our smaller than page size buffers are always sector sized
> on a live file system, we can just create a kmem_cache with an
> explicitly specified alignment requirement for this case to fix this
> case without much effort.

I thought you were looking at using the page_frag infrastructure for
this so we didn't need a slab per filesystem for this?

I think there's a problem with this code - we can have different
sector sizes for different devices in the filesystem. e.g. the log
can have a different sector size to the data device, and this may
matter for log recovery which does a lot of single sector IO.

Hence I think this slab really needs to be managed as a property of
the of the buftarg rather than be a global property of the
xfs_mount. in most cases the log and data devices are the same
buftarg, but we should attempt to handle this special case
correctly...

Other than that, the code looks fine to me.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data
  2018-12-05 21:50 ` Dave Chinner
@ 2018-12-05 21:56   ` Christoph Hellwig
  2018-12-05 22:31     ` Dave Chinner
  0 siblings, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2018-12-05 21:56 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, linux-xfs, tom.leiming, vkuznets

On Thu, Dec 06, 2018 at 08:50:00AM +1100, Dave Chinner wrote:
> On Wed, Dec 05, 2018 at 09:20:23AM -0800, Christoph Hellwig wrote:
> > XFS currently uses a slab allocator for buffers smaller than the page
> 
> Currently uses heap allocated memory?

Or kmalloc, yeah.

> I thought you were looking at using the page_frag infrastructure for
> this so we didn't need a slab per filesystem for this?

I did look into it, but using page_frag for buffers creates an inherent
memory leak given that page_frag never reuses a freed fragement.  So
until all fragments in a page are free all the others bits effectively
leak, which is pretty bad on the buffer cache.

> I think there's a problem with this code - we can have different
> sector sizes for different devices in the filesystem. e.g. the log
> can have a different sector size to the data device, and this may
> matter for log recovery which does a lot of single sector IO.

That's fine because log recovery is one I/O at a time, and we always
free the current buffer before allocating the next one, so we'd waste
the memory one way or another.

> Hence I think this slab really needs to be managed as a property of
> the of the buftarg rather than be a global property of the
> xfs_mount. in most cases the log and data devices are the same
> buftarg, but we should attempt to handle this special case
> correctly...

We could waste another slab cache on the block device, but as said
we don't really save any memory by creating a new cache, we actually
waste memory due to the overhead.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data
  2018-12-05 21:56   ` Christoph Hellwig
@ 2018-12-05 22:31     ` Dave Chinner
  2018-12-05 22:33       ` Christoph Hellwig
  0 siblings, 1 reply; 7+ messages in thread
From: Dave Chinner @ 2018-12-05 22:31 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, tom.leiming, vkuznets

On Wed, Dec 05, 2018 at 10:56:51PM +0100, Christoph Hellwig wrote:
> On Thu, Dec 06, 2018 at 08:50:00AM +1100, Dave Chinner wrote:
> > On Wed, Dec 05, 2018 at 09:20:23AM -0800, Christoph Hellwig wrote:
> > > XFS currently uses a slab allocator for buffers smaller than the page
> > 
> > Currently uses heap allocated memory?
> 
> Or kmalloc, yeah.
> 
> > I thought you were looking at using the page_frag infrastructure for
> > this so we didn't need a slab per filesystem for this?
> 
> I did look into it, but using page_frag for buffers creates an inherent
> memory leak given that page_frag never reuses a freed fragement.  So
> until all fragments in a page are free all the others bits effectively
> leak, which is pretty bad on the buffer cache.

Ok, that's definitely a showstopper. I didn't notice this
behaviour when I had a quick look at it back when it was first
mentioned.

> > I think there's a problem with this code - we can have different
> > sector sizes for different devices in the filesystem. e.g. the log
> > can have a different sector size to the data device, and this may
> > matter for log recovery which does a lot of single sector IO.
> 
> That's fine because log recovery is one I/O at a time, and we always
> free the current buffer before allocating the next one, so we'd waste
> the memory one way or another.

True. Can you mention this in the commit message so it gets recorded
with the change and doesn't get lost in the mists of time?

> > Hence I think this slab really needs to be managed as a property of
> > the of the buftarg rather than be a global property of the
> > xfs_mount. in most cases the log and data devices are the same
> > buftarg, but we should attempt to handle this special case
> > correctly...
> 
> We could waste another slab cache on the block device, but as said
> we don't really save any memory by creating a new cache, we actually
> waste memory due to the overhead.

Slub will merge all into the one cache per sector size, anyway, so
there really isn't any wasted memory to speak of here.

Regardless, with a commit message update to explain why a single
cache is fine, then I'm happy with this.

Reviewed-by: Dave Chinner <dchinner@redhat.com>

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data
  2018-12-05 22:31     ` Dave Chinner
@ 2018-12-05 22:33       ` Christoph Hellwig
  2018-12-05 22:35         ` Darrick J. Wong
  2018-12-05 22:51         ` Dave Chinner
  0 siblings, 2 replies; 7+ messages in thread
From: Christoph Hellwig @ 2018-12-05 22:33 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, linux-xfs, tom.leiming, vkuznets

On Thu, Dec 06, 2018 at 09:31:39AM +1100, Dave Chinner wrote:
> > 
> > That's fine because log recovery is one I/O at a time, and we always
> > free the current buffer before allocating the next one, so we'd waste
> > the memory one way or another.
> 
> True. Can you mention this in the commit message so it gets recorded
> with the change and doesn't get lost in the mists of time?

How about a comment instead of the commit message?  Either way I can
document this decision for v2.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data
  2018-12-05 22:33       ` Christoph Hellwig
@ 2018-12-05 22:35         ` Darrick J. Wong
  2018-12-05 22:51         ` Dave Chinner
  1 sibling, 0 replies; 7+ messages in thread
From: Darrick J. Wong @ 2018-12-05 22:35 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Dave Chinner, linux-xfs, tom.leiming, vkuznets

On Wed, Dec 05, 2018 at 11:33:46PM +0100, Christoph Hellwig wrote:
> On Thu, Dec 06, 2018 at 09:31:39AM +1100, Dave Chinner wrote:
> > > 
> > > That's fine because log recovery is one I/O at a time, and we always
> > > free the current buffer before allocating the next one, so we'd waste
> > > the memory one way or another.
> > 
> > True. Can you mention this in the commit message so it gets recorded
> > with the change and doesn't get lost in the mists of time?
> 
> How about a comment instead of the commit message?  Either way I can
> document this decision for v2.

A comment would be very welcome.

--D

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data
  2018-12-05 22:33       ` Christoph Hellwig
  2018-12-05 22:35         ` Darrick J. Wong
@ 2018-12-05 22:51         ` Dave Chinner
  1 sibling, 0 replies; 7+ messages in thread
From: Dave Chinner @ 2018-12-05 22:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, tom.leiming, vkuznets

On Wed, Dec 05, 2018 at 11:33:46PM +0100, Christoph Hellwig wrote:
> On Thu, Dec 06, 2018 at 09:31:39AM +1100, Dave Chinner wrote:
> > > 
> > > That's fine because log recovery is one I/O at a time, and we always
> > > free the current buffer before allocating the next one, so we'd waste
> > > the memory one way or another.
> > 
> > True. Can you mention this in the commit message so it gets recorded
> > with the change and doesn't get lost in the mists of time?
> 
> How about a comment instead of the commit message?  Either way I can
> document this decision for v2.

That's fine by me, too.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-12-05 22:51 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-05 17:20 [PATCH] xfs: use a dedicated SLAB cache for sector sized buffer data Christoph Hellwig
2018-12-05 21:50 ` Dave Chinner
2018-12-05 21:56   ` Christoph Hellwig
2018-12-05 22:31     ` Dave Chinner
2018-12-05 22:33       ` Christoph Hellwig
2018-12-05 22:35         ` Darrick J. Wong
2018-12-05 22:51         ` Dave Chinner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.