stable.vger.kernel.org archive mirror
* [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
@ 2019-07-11 12:58 Jan Kara
  2019-07-12  0:04 ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: Jan Kara @ 2019-07-11 12:58 UTC (permalink / raw)
  To: linux-mm; +Cc: mgorman, mhocko, Andrew Morton, Jan Kara, stable

buffer_migrate_page_norefs() can race with bh users in the following way:

CPU1					CPU2
buffer_migrate_page_norefs()
  buffer_migrate_lock_buffers()
  checks bh refs
  spin_unlock(&mapping->private_lock)
					__find_get_block()
					  spin_lock(&mapping->private_lock)
					  grab bh ref
					  spin_unlock(&mapping->private_lock)
  move page				  do bh work

This can result in various issues such as lost updates to buffers (i.e.
metadata corruption) or use-after-free problems for the old page.

Closing this race window is relatively difficult. We could hold
mapping->private_lock in buffer_migrate_page_norefs() until we are
finished with migrating the page, but the lock hold times would be rather
long. So let's revert to a more careful variant of page migration that requires
eviction of buffers on the migrated page. This is effectively
fallback_migrate_page() that additionally invalidates bh LRUs in case
try_to_free_buffers() fails.

CC: stable@vger.kernel.org
Fixes: 89cb0888ca14 ("mm: migrate: provide buffer_migrate_page_norefs()")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 mm/migrate.c | 161 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 78 insertions(+), 83 deletions(-)

I've lightly tested this with config-workload-thpfioscale, which didn't
show any obvious issues, but the patch probably needs more testing (especially
to verify that memory unplug is still able to succeed in a reasonable time).
That's why this is RFC.

diff --git a/mm/migrate.c b/mm/migrate.c
index e9594bc0d406..893698d37d50 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -697,6 +697,47 @@ int migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(migrate_page);
 
+/*
+ * Writeback a page to clean the dirty state
+ */
+static int writeout(struct address_space *mapping, struct page *page)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = 1,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+		.for_reclaim = 1
+	};
+	int rc;
+
+	if (!mapping->a_ops->writepage)
+		/* No write method for the address space */
+		return -EINVAL;
+
+	if (!clear_page_dirty_for_io(page))
+		/* Someone else already triggered a write */
+		return -EAGAIN;
+
+	/*
+	 * A dirty page may imply that the underlying filesystem has
+	 * the page on some queue. So the page must be clean for
+	 * migration. Writeout may mean we lose the lock and the
+	 * page state is no longer what we checked for earlier.
+	 * At this point we know that the migration attempt cannot
+	 * be successful.
+	 */
+	remove_migration_ptes(page, page, false);
+
+	rc = mapping->a_ops->writepage(page, &wbc);
+
+	if (rc != AOP_WRITEPAGE_ACTIVATE)
+		/* unlocked. Relock */
+		lock_page(page);
+
+	return (rc < 0) ? -EIO : -EAGAIN;
+}
+
 #ifdef CONFIG_BLOCK
 /* Returns true if all buffers are successfully locked */
 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
@@ -736,9 +777,14 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 	return true;
 }
 
-static int __buffer_migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode,
-		bool check_refs)
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist. For example attached buffer heads are accessed only under page lock.
+ */
+int buffer_migrate_page(struct address_space *mapping,
+			struct page *newpage, struct page *page,
+			enum migrate_mode mode)
 {
 	struct buffer_head *bh, *head;
 	int rc;
@@ -756,33 +802,6 @@ static int __buffer_migrate_page(struct address_space *mapping,
 	if (!buffer_migrate_lock_buffers(head, mode))
 		return -EAGAIN;
 
-	if (check_refs) {
-		bool busy;
-		bool invalidated = false;
-
-recheck_buffers:
-		busy = false;
-		spin_lock(&mapping->private_lock);
-		bh = head;
-		do {
-			if (atomic_read(&bh->b_count)) {
-				busy = true;
-				break;
-			}
-			bh = bh->b_this_page;
-		} while (bh != head);
-		spin_unlock(&mapping->private_lock);
-		if (busy) {
-			if (invalidated) {
-				rc = -EAGAIN;
-				goto unlock_buffers;
-			}
-			invalidate_bh_lrus();
-			invalidated = true;
-			goto recheck_buffers;
-		}
-	}
-
 	rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		goto unlock_buffers;
@@ -818,72 +837,48 @@ static int __buffer_migrate_page(struct address_space *mapping,
 
 	return rc;
 }
-
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist. For example attached buffer heads are accessed only under page lock.
- */
-int buffer_migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode)
-{
-	return __buffer_migrate_page(mapping, newpage, page, mode, false);
-}
 EXPORT_SYMBOL(buffer_migrate_page);
 
 /*
- * Same as above except that this variant is more careful and checks that there
- * are also no buffer head references. This function is the right one for
- * mappings where buffer heads are directly looked up and referenced (such as
- * block device mappings).
+ * Same as above except that this variant is more careful.  This function is
+ * the right one for mappings where buffer heads are directly looked up and
+ * referenced (such as block device mappings).
  */
 int buffer_migrate_page_norefs(struct address_space *mapping,
 		struct page *newpage, struct page *page, enum migrate_mode mode)
 {
-	return __buffer_migrate_page(mapping, newpage, page, mode, true);
-}
-#endif
-
-/*
- * Writeback a page to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct page *page)
-{
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_NONE,
-		.nr_to_write = 1,
-		.range_start = 0,
-		.range_end = LLONG_MAX,
-		.for_reclaim = 1
-	};
-	int rc;
-
-	if (!mapping->a_ops->writepage)
-		/* No write method for the address space */
-		return -EINVAL;
+	bool invalidated = false;
 
-	if (!clear_page_dirty_for_io(page))
-		/* Someone else already triggered a write */
-		return -EAGAIN;
+	if (PageDirty(page)) {
+		/* Only writeback pages in full synchronous migration */
+		switch (mode) {
+		case MIGRATE_SYNC:
+		case MIGRATE_SYNC_NO_COPY:
+			break;
+		default:
+			return -EBUSY;
+		}
+		return writeout(mapping, page);
+	}
 
+retry:
 	/*
-	 * A dirty page may imply that the underlying filesystem has
-	 * the page on some queue. So the page must be clean for
-	 * migration. Writeout may mean we loose the lock and the
-	 * page state is no longer what we checked for earlier.
-	 * At this point we know that the migration attempt cannot
-	 * be successful.
+	 * Buffers may be managed in a filesystem specific way.
+	 * We must have no buffers or drop them.
 	 */
-	remove_migration_ptes(page, page, false);
-
-	rc = mapping->a_ops->writepage(page, &wbc);
-
-	if (rc != AOP_WRITEPAGE_ACTIVATE)
-		/* unlocked. Relock */
-		lock_page(page);
+	if (page_has_private(page) &&
+	    !try_to_release_page(page, GFP_KERNEL)) {
+		if (!invalidated) {
+			invalidate_bh_lrus();
+			invalidated = true;
+			goto retry;
+		}
+		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
+	}
 
-	return (rc < 0) ? -EIO : -EAGAIN;
+	return migrate_page(mapping, newpage, page, mode);
 }
+#endif
 
 /*
  * Default handling if a filesystem does not provide a migration function.
-- 
2.16.4



* Re: [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
  2019-07-11 12:58 [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration Jan Kara
@ 2019-07-12  0:04 ` Andrew Morton
  2019-07-12  8:04   ` Mel Gorman
  2019-07-12  9:17   ` Jan Kara
  0 siblings, 2 replies; 9+ messages in thread
From: Andrew Morton @ 2019-07-12  0:04 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-mm, mgorman, mhocko, stable

On Thu, 11 Jul 2019 14:58:38 +0200 Jan Kara <jack@suse.cz> wrote:

> buffer_migrate_page_norefs() can race with bh users in a following way:
> 
> CPU1					CPU2
> buffer_migrate_page_norefs()
>   buffer_migrate_lock_buffers()
>   checks bh refs
>   spin_unlock(&mapping->private_lock)
> 					__find_get_block()
> 					  spin_lock(&mapping->private_lock)
> 					  grab bh ref
> 					  spin_unlock(&mapping->private_lock)
>   move page				  do bh work
> 
> This can result in various issues like lost updates to buffers (i.e.
> metadata corruption) or use after free issues for the old page.
> 
> Closing this race window is relatively difficult. We could hold
> mapping->private_lock in buffer_migrate_page_norefs() until we are
> finished with migrating the page but the lock hold times would be rather
> big. So let's revert to a more careful variant of page migration requiring
> eviction of buffers on migrated page. This is effectively
> fallback_migrate_page() that additionally invalidates bh LRUs in case
> try_to_free_buffers() failed.

Is this premature optimization?  Holding ->private_lock while messing
with the buffers would be the standard way of addressing this.  The
longer hold times *might* be an issue, but we don't know this, do we? 
If there are indeed such problems then they could be improved by, say,
doing more of the newpage preparation prior to taking ->private_lock.



* Re: [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
  2019-07-12  0:04 ` Andrew Morton
@ 2019-07-12  8:04   ` Mel Gorman
  2019-07-12  9:17   ` Jan Kara
  1 sibling, 0 replies; 9+ messages in thread
From: Mel Gorman @ 2019-07-12  8:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Kara, linux-mm, mhocko, stable

On Thu, Jul 11, 2019 at 05:04:55PM -0700, Andrew Morton wrote:
> On Thu, 11 Jul 2019 14:58:38 +0200 Jan Kara <jack@suse.cz> wrote:
> 
> > buffer_migrate_page_norefs() can race with bh users in a following way:
> > 
> > CPU1					CPU2
> > buffer_migrate_page_norefs()
> >   buffer_migrate_lock_buffers()
> >   checks bh refs
> >   spin_unlock(&mapping->private_lock)
> > 					__find_get_block()
> > 					  spin_lock(&mapping->private_lock)
> > 					  grab bh ref
> > 					  spin_unlock(&mapping->private_lock)
> >   move page				  do bh work
> > 
> > This can result in various issues like lost updates to buffers (i.e.
> > metadata corruption) or use after free issues for the old page.
> > 
> > Closing this race window is relatively difficult. We could hold
> > mapping->private_lock in buffer_migrate_page_norefs() until we are
> > finished with migrating the page but the lock hold times would be rather
> > big. So let's revert to a more careful variant of page migration requiring
> > eviction of buffers on migrated page. This is effectively
> > fallback_migrate_page() that additionally invalidates bh LRUs in case
> > try_to_free_buffers() failed.
> 
> Is this premature optimization?  Holding ->private_lock while messing
> with the buffers would be the standard way of addressing this.  The
> longer hold times *might* be an issue, but we don't know this, do we? 
> If there are indeed such problems then they could be improved by, say,
> doing more of the newpage preparation prior to taking ->private_lock.
> 

To some extent, we do not know how much of a problem this patch will
be, nor what the impact of avoiding dirty block pages during migration
would be. So both approaches have their downsides.

However, failing a high-order allocation is typically benign and it is an
inevitable problem that depends on the workload. I don't think we could
ever hit a case where there was enough spinning to cause a soft lockup,
but on the other hand, I don't think there is much scope for doing more
of the preparation steps before acquiring private_lock either.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
  2019-07-12  0:04 ` Andrew Morton
  2019-07-12  8:04   ` Mel Gorman
@ 2019-07-12  9:17   ` Jan Kara
  2019-07-12 10:10     ` Mel Gorman
  1 sibling, 1 reply; 9+ messages in thread
From: Jan Kara @ 2019-07-12  9:17 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Kara, linux-mm, mgorman, mhocko, stable

On Thu 11-07-19 17:04:55, Andrew Morton wrote:
> On Thu, 11 Jul 2019 14:58:38 +0200 Jan Kara <jack@suse.cz> wrote:
> 
> > buffer_migrate_page_norefs() can race with bh users in a following way:
> > 
> > CPU1					CPU2
> > buffer_migrate_page_norefs()
> >   buffer_migrate_lock_buffers()
> >   checks bh refs
> >   spin_unlock(&mapping->private_lock)
> > 					__find_get_block()
> > 					  spin_lock(&mapping->private_lock)
> > 					  grab bh ref
> > 					  spin_unlock(&mapping->private_lock)
> >   move page				  do bh work
> > 
> > This can result in various issues like lost updates to buffers (i.e.
> > metadata corruption) or use after free issues for the old page.
> > 
> > Closing this race window is relatively difficult. We could hold
> > mapping->private_lock in buffer_migrate_page_norefs() until we are
> > finished with migrating the page but the lock hold times would be rather
> > big. So let's revert to a more careful variant of page migration requiring
> > eviction of buffers on migrated page. This is effectively
> > fallback_migrate_page() that additionally invalidates bh LRUs in case
> > try_to_free_buffers() failed.
> 
> Is this premature optimization?  Holding ->private_lock while messing
> with the buffers would be the standard way of addressing this.  The
> longer hold times *might* be an issue, but we don't know this, do we? 
> If there are indeed such problems then they could be improved by, say,
> doing more of the newpage preparation prior to taking ->private_lock.

I didn't check how long the private_lock hold times would actually be, it
just seems there's a lot of work done before the page is fully migrated and
we could release the lock. And since the lock blocks bh lookup,
set_page_dirty(), etc. for the whole device, it just seemed like a bad idea.
I don't think much of the newpage setup can be moved outside of private_lock
- in particular page cache replacement, page copying, and page state migration
all need to be there so that bh code doesn't get confused.

But I guess it's fair to measure at least ballpark numbers of what the lock
hold times would be to get an idea of whether the contention concern is
substantiated or not.

Finally, I guess I should mention there's one more approach to the problem
I was considering: modify the bh code to fully rely on the page lock instead of
private_lock for bh lookup. That would make sense scalability-wise on its
own. The problem with it is that __find_get_block() would become a sleeping
function. There aren't that many places calling the function and most of
them seem fine with that, but it is still a non-trivial amount of work to do the
conversion and it can have some fallout, so it didn't seem like a good
solution for a data-corruption issue that needs to go to stable...
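
To make that alternative a bit more concrete, a minimal sketch of such a
page-lock based lookup could look roughly like the following. This is an
illustration only, not tested code; the helper name
find_get_block_pagelocked() is made up, and it only covers buffer heads
already attached to a block device page, reusing existing page-cache and
buffer-head primitives:

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>

/*
 * Sketch of a bh lookup serialized by the page lock instead of
 * mapping->private_lock. lock_page() may sleep, which is why converting
 * the __find_get_block() callers is the hard part.
 */
static struct buffer_head *find_get_block_pagelocked(struct block_device *bdev,
						     sector_t block, unsigned size)
{
	struct inode *bd_inode = bdev->bd_inode;
	struct address_space *mapping = bd_inode->i_mapping;
	pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
	struct buffer_head *bh = NULL;
	struct page *page;

	page = find_get_page(mapping, index);
	if (!page)
		return NULL;
	lock_page(page);		/* may sleep */
	if (page->mapping == mapping && page_has_buffers(page)) {
		struct buffer_head *cur, *head;

		cur = head = page_buffers(page);
		do {
			if (cur->b_blocknr == block && cur->b_size == size) {
				get_bh(cur);	/* pin bh before dropping the page lock */
				bh = cur;
				break;
			}
			cur = cur->b_this_page;
		} while (cur != head);
	}
	unlock_page(page);
	put_page(page);
	return bh;
}

Since migration holds the page lock for the whole move, a lookup like this
could not race with it in the way described above, but every caller of
__find_get_block() would then have to be able to sleep, which is the
conversion fallout mentioned.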

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
  2019-07-12  9:17   ` Jan Kara
@ 2019-07-12 10:10     ` Mel Gorman
  2019-07-12 11:20       ` Jan Kara
  0 siblings, 1 reply; 9+ messages in thread
From: Mel Gorman @ 2019-07-12 10:10 UTC (permalink / raw)
  To: Jan Kara; +Cc: Andrew Morton, linux-mm, mhocko, stable

On Fri, Jul 12, 2019 at 11:17:46AM +0200, Jan Kara wrote:
> On Thu 11-07-19 17:04:55, Andrew Morton wrote:
> > On Thu, 11 Jul 2019 14:58:38 +0200 Jan Kara <jack@suse.cz> wrote:
> > 
> > > buffer_migrate_page_norefs() can race with bh users in a following way:
> > > 
> > > CPU1					CPU2
> > > buffer_migrate_page_norefs()
> > >   buffer_migrate_lock_buffers()
> > >   checks bh refs
> > >   spin_unlock(&mapping->private_lock)
> > > 					__find_get_block()
> > > 					  spin_lock(&mapping->private_lock)
> > > 					  grab bh ref
> > > 					  spin_unlock(&mapping->private_lock)
> > >   move page				  do bh work
> > > 
> > > This can result in various issues like lost updates to buffers (i.e.
> > > metadata corruption) or use after free issues for the old page.
> > > 
> > > Closing this race window is relatively difficult. We could hold
> > > mapping->private_lock in buffer_migrate_page_norefs() until we are
> > > finished with migrating the page but the lock hold times would be rather
> > > big. So let's revert to a more careful variant of page migration requiring
> > > eviction of buffers on migrated page. This is effectively
> > > fallback_migrate_page() that additionally invalidates bh LRUs in case
> > > try_to_free_buffers() failed.
> > 
> > Is this premature optimization?  Holding ->private_lock while messing
> > with the buffers would be the standard way of addressing this.  The
> > longer hold times *might* be an issue, but we don't know this, do we? 
> > If there are indeed such problems then they could be improved by, say,
> > doing more of the newpage preparation prior to taking ->private_lock.
> 
> I didn't check how long the private_lock hold times would actually be, it
> just seems there's a lot of work done before the page is fully migrated a
> we could release the lock. And since the lock blocks bh lookup,
> set_page_dirty(), etc. for the whole device, it just seemed as a bad idea.
> I don't think much of a newpage setup can be moved outside of private_lock
> - in particular page cache replacement, page copying, page state migration
> all need to be there so that bh code doesn't get confused.
> 
> But I guess it's fair to measure at least ballpark numbers of what the lock
> hold times would be to get idea whether the contention concern is
> substantiated or not.
> 

I think it would be tricky to measure and quantify how much the contention
is an issue. While it would be possible to construct a microbenchmark that
should illustrate the problem, it would tell us relatively little about
how much of a problem it is generally. It would be relatively difficult
to detect the contention, and stalls in block lookups due to migration
would be tricky to spot. Careful use of lock_stat might help, but
enabling that has consequences of its own.

However, a rise in allocation failures due to dirty pages not being
migrated is relatively easy to detect, and the consequences are relatively
benign -- a failed high-order allocation that is usually ok versus a stall
on block lookups that could have a wider general impact.

On that basis, I think the patch you proposed is the more appropriate fix
for the race, which has the potential for data corruption. So;

Acked-by: Mel Gorman <mgorman@techsingularity.net>

> Finally, I guess I should mention there's one more approach to the problem
> I was considering: Modify bh code to fully rely on page lock instead of
> private_lock for bh lookup. That would make sense scalability-wise on its
> own. The problem with it is that __find_get_block() would become a sleeping
> function. There aren't that many places calling the function and most of
> them seem fine with it but still it is non-trivial amount of work to do the
> conversion and it can have some fallout so it didn't seem like a good
> solution for a data-corruption issue that needs to go to stable...
> 

Maybe *if* it's shown there is a major issue with increased high-order
allocation failures, it would be worth looking into, but right now, I
think it's overkill with relatively high risk, and closing the potential
race is more important.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
  2019-07-12 10:10     ` Mel Gorman
@ 2019-07-12 11:20       ` Jan Kara
  2019-07-12 12:39         ` Mel Gorman
  0 siblings, 1 reply; 9+ messages in thread
From: Jan Kara @ 2019-07-12 11:20 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Jan Kara, Andrew Morton, linux-mm, mhocko, stable

[-- Attachment #1: Type: text/plain, Size: 5851 bytes --]

On Fri 12-07-19 11:10:43, Mel Gorman wrote:
> On Fri, Jul 12, 2019 at 11:17:46AM +0200, Jan Kara wrote:
> > On Thu 11-07-19 17:04:55, Andrew Morton wrote:
> > > On Thu, 11 Jul 2019 14:58:38 +0200 Jan Kara <jack@suse.cz> wrote:
> > > 
> > > > buffer_migrate_page_norefs() can race with bh users in a following way:
> > > > 
> > > > CPU1					CPU2
> > > > buffer_migrate_page_norefs()
> > > >   buffer_migrate_lock_buffers()
> > > >   checks bh refs
> > > >   spin_unlock(&mapping->private_lock)
> > > > 					__find_get_block()
> > > > 					  spin_lock(&mapping->private_lock)
> > > > 					  grab bh ref
> > > > 					  spin_unlock(&mapping->private_lock)
> > > >   move page				  do bh work
> > > > 
> > > > This can result in various issues like lost updates to buffers (i.e.
> > > > metadata corruption) or use after free issues for the old page.
> > > > 
> > > > Closing this race window is relatively difficult. We could hold
> > > > mapping->private_lock in buffer_migrate_page_norefs() until we are
> > > > finished with migrating the page but the lock hold times would be rather
> > > > big. So let's revert to a more careful variant of page migration requiring
> > > > eviction of buffers on migrated page. This is effectively
> > > > fallback_migrate_page() that additionally invalidates bh LRUs in case
> > > > try_to_free_buffers() failed.
> > > 
> > > Is this premature optimization?  Holding ->private_lock while messing
> > > with the buffers would be the standard way of addressing this.  The
> > > longer hold times *might* be an issue, but we don't know this, do we? 
> > > If there are indeed such problems then they could be improved by, say,
> > > doing more of the newpage preparation prior to taking ->private_lock.
> > 
> > I didn't check how long the private_lock hold times would actually be, it
> > just seems there's a lot of work done before the page is fully migrated a
> > we could release the lock. And since the lock blocks bh lookup,
> > set_page_dirty(), etc. for the whole device, it just seemed as a bad idea.
> > I don't think much of a newpage setup can be moved outside of private_lock
> > - in particular page cache replacement, page copying, page state migration
> > all need to be there so that bh code doesn't get confused.
> > 
> > But I guess it's fair to measure at least ballpark numbers of what the lock
> > hold times would be to get idea whether the contention concern is
> > substantiated or not.
> > 
> 
> I think it would be tricky to measure and quantify how much the contention
> is an issue. While it would be possible to construct a microbenchmark that
> should illustrate the problem, it would tell us relatively little about
> how much of a problem it is generally. It would be relatively difficult
> to detect the contention and stalls in block lookups due to migration
> would be tricky to spot. Careful use of lock_stat might help but
> enabling that has consequences of its own.
> 
> However, a rise in allocation failures due to dirty pages not being
> migrated is relatively easy to detect and the consequences are relatively
> benign -- failed high-order allocation that is usually ok versus a stall
> on block lookups that could have a wider general impact.
> 
> On that basis, I think the patch you proposed is the more appropriate as
> a fix for the race which has the potential for data corruption. So;
> 
> Acked-by: Mel Gorman <mgorman@techsingularity.net>

Thanks. And I agree with you that detecting failed migrations is generally
easier than detecting private_lock contention. Anyway, out of curiosity, I
did run the thpfioscale workload in mmtests with some additional metadata
workload on the system to increase the proportion of bdev page cache, and added
tracepoints to see how long the relevant part of __buffer_migrate_page()
lasts (patch attached). The longest duration of the critical section was
311 us, which is significant, but that was an outlier by far. Most of the
time the critical section lasted a couple of microseconds. The full histogram is here:

[min - 0.000006]: 2907 93.202950%
(0.000006 - 0.000011]: 105 3.366464%
(0.000011 - 0.000016]: 36 1.154216%
(0.000016 - 0.000021]: 45 1.442770%
(0.000021 - 0.000026]: 13 0.416800%
(0.000026 - 0.000031]: 4 0.128246%
(0.000031 - 0.000036]: 2 0.064123%
(0.000036 - 0.000041]: 2 0.064123%
(0.000041 - 0.000046]: 1 0.032062%
(0.000046 - 0.000050]: 1 0.032062%
(0.000050 - 0.000055]: 0 0.000000%
(0.000055 - 0.000060]: 0 0.000000%
(0.000060 - 0.000065]: 0 0.000000%
(0.000065 - 0.000070]: 0 0.000000%
(0.000070 - 0.000075]: 2 0.064123%
(0.000075 - 0.000080]: 0 0.000000%
(0.000080 - 0.000085]: 0 0.000000%
(0.000085 - 0.000090]: 0 0.000000%
(0.000090 - 0.000095]: 0 0.000000%
(0.000095 - max]: 1 0.032062%

So although I still think that just failing the migration if we cannot
invalidate buffer heads is a safer choice, just extending the private_lock
protected section does not seem as bad as I was afraid.

> > Finally, I guess I should mention there's one more approach to the problem
> > I was considering: Modify bh code to fully rely on page lock instead of
> > private_lock for bh lookup. That would make sense scalability-wise on its
> > own. The problem with it is that __find_get_block() would become a sleeping
> > function. There aren't that many places calling the function and most of
> > them seem fine with it but still it is non-trivial amount of work to do the
> > conversion and it can have some fallout so it didn't seem like a good
> > solution for a data-corruption issue that needs to go to stable...
> > 
> 
> Maybe *if* it's shown there is a major issue with increased high-order
> allocation failures, it would be worth looking into but right now, I
> think it's overkill with relatively high risk and closing the potential
> race is more important.

Agreed.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

[-- Attachment #2: 0001-mm-migrate-Fix-race-with-__find_get_block.patch --]
[-- Type: text/x-patch, Size: 2893 bytes --]

From fd584fb6fa6d48e1fae1077d2ab0021ae9c98edf Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 10 Jul 2019 11:31:01 +0200
Subject: [PATCH] mm: migrate: Fix race with __find_get_block()

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/trace/events/migrate.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 mm/migrate.c                   |  8 +++++++-
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 705b33d1e395..15473a508216 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -70,6 +70,48 @@ TRACE_EVENT(mm_migrate_pages,
 		__print_symbolic(__entry->mode, MIGRATE_MODE),
 		__print_symbolic(__entry->reason, MIGRATE_REASON))
 );
+
+TRACE_EVENT(mm_migrate_buffers_begin,
+
+	TP_PROTO(struct address_space *mapping, unsigned long index),
+
+	TP_ARGS(mapping, index),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,	mapping)
+		__field(unsigned long,	index)
+	),
+
+	TP_fast_assign(
+		__entry->mapping	= (unsigned long)mapping;
+		__entry->index		= index;
+	),
+
+	TP_printk("mapping=%lx index=%lu", __entry->mapping, __entry->index)
+);
+
+TRACE_EVENT(mm_migrate_buffers_end,
+
+	TP_PROTO(struct address_space *mapping, unsigned long index, int rc),
+
+	TP_ARGS(mapping, index, rc),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,	mapping)
+		__field(unsigned long,	index)
+		__field(int,		rc)
+	),
+
+	TP_fast_assign(
+		__entry->mapping	= (unsigned long)mapping;
+		__entry->index		= index;
+		__entry->rc		= rc;
+	),
+
+	TP_printk("mapping=%lx index=%lu rc=%d", __entry->mapping, __entry->index, __entry->rc)
+);
+
+
 #endif /* _TRACE_MIGRATE_H */
 
 /* This part must be outside protection */
diff --git a/mm/migrate.c b/mm/migrate.c
index e9594bc0d406..bce0f1b03a60 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -763,6 +763,7 @@ static int __buffer_migrate_page(struct address_space *mapping,
 recheck_buffers:
 		busy = false;
 		spin_lock(&mapping->private_lock);
+		trace_mm_migrate_buffers_begin(mapping, page->index);
 		bh = head;
 		do {
 			if (atomic_read(&bh->b_count)) {
@@ -771,12 +772,13 @@ static int __buffer_migrate_page(struct address_space *mapping,
 			}
 			bh = bh->b_this_page;
 		} while (bh != head);
-		spin_unlock(&mapping->private_lock);
 		if (busy) {
 			if (invalidated) {
 				rc = -EAGAIN;
 				goto unlock_buffers;
 			}
+			trace_mm_migrate_buffers_end(mapping, page->index, -EAGAIN);
+			spin_unlock(&mapping->private_lock);
 			invalidate_bh_lrus();
 			invalidated = true;
 			goto recheck_buffers;
@@ -809,6 +811,10 @@ static int __buffer_migrate_page(struct address_space *mapping,
 
 	rc = MIGRATEPAGE_SUCCESS;
 unlock_buffers:
+	if (check_refs) {
+		trace_mm_migrate_buffers_end(mapping, page->index, rc);
+		spin_unlock(&mapping->private_lock);
+	}
 	bh = head;
 	do {
 		unlock_buffer(bh);
-- 
2.16.4



* Re: [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
  2019-07-12 11:20       ` Jan Kara
@ 2019-07-12 12:39         ` Mel Gorman
  2019-07-12 21:21           ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: Mel Gorman @ 2019-07-12 12:39 UTC (permalink / raw)
  To: Jan Kara; +Cc: Andrew Morton, linux-mm, mhocko, stable

On Fri, Jul 12, 2019 at 01:20:56PM +0200, Jan Kara wrote:
> On Fri 12-07-19 11:10:43, Mel Gorman wrote:
> > On Fri, Jul 12, 2019 at 11:17:46AM +0200, Jan Kara wrote:
> > > On Thu 11-07-19 17:04:55, Andrew Morton wrote:
> > > > On Thu, 11 Jul 2019 14:58:38 +0200 Jan Kara <jack@suse.cz> wrote:
> > > > 
> > > > > buffer_migrate_page_norefs() can race with bh users in a following way:
> > > > > 
> > > > > CPU1					CPU2
> > > > > buffer_migrate_page_norefs()
> > > > >   buffer_migrate_lock_buffers()
> > > > >   checks bh refs
> > > > >   spin_unlock(&mapping->private_lock)
> > > > > 					__find_get_block()
> > > > > 					  spin_lock(&mapping->private_lock)
> > > > > 					  grab bh ref
> > > > > 					  spin_unlock(&mapping->private_lock)
> > > > >   move page				  do bh work
> > > > > 
> > > > > This can result in various issues like lost updates to buffers (i.e.
> > > > > metadata corruption) or use after free issues for the old page.
> > > > > 
> > > > > Closing this race window is relatively difficult. We could hold
> > > > > mapping->private_lock in buffer_migrate_page_norefs() until we are
> > > > > finished with migrating the page but the lock hold times would be rather
> > > > > big. So let's revert to a more careful variant of page migration requiring
> > > > > eviction of buffers on migrated page. This is effectively
> > > > > fallback_migrate_page() that additionally invalidates bh LRUs in case
> > > > > try_to_free_buffers() failed.
> > > > 
> > > > Is this premature optimization?  Holding ->private_lock while messing
> > > > with the buffers would be the standard way of addressing this.  The
> > > > longer hold times *might* be an issue, but we don't know this, do we? 
> > > > If there are indeed such problems then they could be improved by, say,
> > > > doing more of the newpage preparation prior to taking ->private_lock.
> > > 
> > > I didn't check how long the private_lock hold times would actually be, it
> > > just seems there's a lot of work done before the page is fully migrated a
> > > we could release the lock. And since the lock blocks bh lookup,
> > > set_page_dirty(), etc. for the whole device, it just seemed as a bad idea.
> > > I don't think much of a newpage setup can be moved outside of private_lock
> > > - in particular page cache replacement, page copying, page state migration
> > > all need to be there so that bh code doesn't get confused.
> > > 
> > > But I guess it's fair to measure at least ballpark numbers of what the lock
> > > hold times would be to get idea whether the contention concern is
> > > substantiated or not.
> > > 
> > 
> > I think it would be tricky to measure and quantify how much the contention
> > is an issue. While it would be possible to construct a microbenchmark that
> > should illustrate the problem, it would tell us relatively little about
> > how much of a problem it is generally. It would be relatively difficult
> > to detect the contention and stalls in block lookups due to migration
> > would be tricky to spot. Careful use of lock_stat might help but
> > enabling that has consequences of its own.
> > 
> > However, a rise in allocation failures due to dirty pages not being
> > migrated is relatively easy to detect and the consequences are relatively
> > benign -- failed high-order allocation that is usually ok versus a stall
> > on block lookups that could have a wider general impact.
> > 
> > On that basis, I think the patch you proposed is the more appropriate as
> > a fix for the race which has the potential for data corruption. So;
> > 
> > Acked-by: Mel Gorman <mgorman@techsingularity.net>
> 
> Thanks. And I agree with you that detecting failed migrations is generally
> easier than detecting private_lock contention. Anyway, out of curiosity, I
> did run thpfioscale workload in mmtests with some additional metadata
> workload on the system to increase proportion of bdev page cache and added
> tracepoints to see how long the relevant part of __buffer_migrate_page()
> lasts (patch attached). The longest duration of the critical section was
> 311 us which is significant. But that was an outlier by far. The most of
> times critical section lasted couple of us. The full histogram is here:
> 
> [min - 0.000006]: 2907 93.202950%
> (0.000006 - 0.000011]: 105 3.366464%
> (0.000011 - 0.000016]: 36 1.154216%
> (0.000016 - 0.000021]: 45 1.442770%
> (0.000021 - 0.000026]: 13 0.416800%
> (0.000026 - 0.000031]: 4 0.128246%
> (0.000031 - 0.000036]: 2 0.064123%
> (0.000036 - 0.000041]: 2 0.064123%
> (0.000041 - 0.000046]: 1 0.032062%
> (0.000046 - 0.000050]: 1 0.032062%
> (0.000050 - 0.000055]: 0 0.000000%
> (0.000055 - 0.000060]: 0 0.000000%
> (0.000060 - 0.000065]: 0 0.000000%
> (0.000065 - 0.000070]: 0 0.000000%
> (0.000070 - 0.000075]: 2 0.064123%
> (0.000075 - 0.000080]: 0 0.000000%
> (0.000080 - 0.000085]: 0 0.000000%
> (0.000085 - 0.000090]: 0 0.000000%
> (0.000090 - 0.000095]: 0 0.000000%
> (0.000095 - max]: 1 0.032062%
> 
> So although I still think that just failing the migration if we cannot
> invalidate buffer heads is a safer choice, just extending the private_lock
> protected section does not seem as bad as I was afraid.
> 

That does not seem too bad, and your revised patch looks functionally
fine. I'd leave out the tracepoints though, because a perf probe would have
got roughly the same data and the tracepoints may be too specific to reuse
for another class of problem. Whether or not the tracepoints survive, and
with a changelog added;

Acked-by: Mel Gorman <mgorman@techsingularity.net>

Andrew, which version do you want to go with, the original version or
this one that holds private_lock for slightly longer during migration?

> From fd584fb6fa6d48e1fae1077d2ab0021ae9c98edf Mon Sep 17 00:00:00 2001
> From: Jan Kara <jack@suse.cz>
> Date: Wed, 10 Jul 2019 11:31:01 +0200
> Subject: [PATCH] mm: migrate: Fix race with __find_get_block()
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  include/trace/events/migrate.h | 42 ++++++++++++++++++++++++++++++++++++++++++
>  mm/migrate.c                   |  8 +++++++-
>  2 files changed, 49 insertions(+), 1 deletion(-)
> 
> diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
> index 705b33d1e395..15473a508216 100644
> --- a/include/trace/events/migrate.h
> +++ b/include/trace/events/migrate.h
> @@ -70,6 +70,48 @@ TRACE_EVENT(mm_migrate_pages,
>  		__print_symbolic(__entry->mode, MIGRATE_MODE),
>  		__print_symbolic(__entry->reason, MIGRATE_REASON))
>  );
> +
> +TRACE_EVENT(mm_migrate_buffers_begin,
> +
> +	TP_PROTO(struct address_space *mapping, unsigned long index),
> +
> +	TP_ARGS(mapping, index),
> +
> +	TP_STRUCT__entry(
> +		__field(unsigned long,	mapping)
> +		__field(unsigned long,	index)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->mapping	= (unsigned long)mapping;
> +		__entry->index		= index;
> +	),
> +
> +	TP_printk("mapping=%lx index=%lu", __entry->mapping, __entry->index)
> +);
> +
> +TRACE_EVENT(mm_migrate_buffers_end,
> +
> +	TP_PROTO(struct address_space *mapping, unsigned long index, int rc),
> +
> +	TP_ARGS(mapping, index, rc),
> +
> +	TP_STRUCT__entry(
> +		__field(unsigned long,	mapping)
> +		__field(unsigned long,	index)
> +		__field(int,		rc)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->mapping	= (unsigned long)mapping;
> +		__entry->index		= index;
> +		__entry->rc		= rc;
> +	),
> +
> +	TP_printk("mapping=%lx index=%lu rc=%d", __entry->mapping, __entry->index, __entry->rc)
> +);
> +
> +
>  #endif /* _TRACE_MIGRATE_H */
>  
>  /* This part must be outside protection */
> diff --git a/mm/migrate.c b/mm/migrate.c
> index e9594bc0d406..bce0f1b03a60 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -763,6 +763,7 @@ static int __buffer_migrate_page(struct address_space *mapping,
>  recheck_buffers:
>  		busy = false;
>  		spin_lock(&mapping->private_lock);
> +		trace_mm_migrate_buffers_begin(mapping, page->index);
>  		bh = head;
>  		do {
>  			if (atomic_read(&bh->b_count)) {
> @@ -771,12 +772,13 @@ static int __buffer_migrate_page(struct address_space *mapping,
>  			}
>  			bh = bh->b_this_page;
>  		} while (bh != head);
> -		spin_unlock(&mapping->private_lock);
>  		if (busy) {
>  			if (invalidated) {
>  				rc = -EAGAIN;
>  				goto unlock_buffers;
>  			}
> +			trace_mm_migrate_buffers_end(mapping, page->index, -EAGAIN);
> +			spin_unlock(&mapping->private_lock);
>  			invalidate_bh_lrus();
>  			invalidated = true;
>  			goto recheck_buffers;
> @@ -809,6 +811,10 @@ static int __buffer_migrate_page(struct address_space *mapping,
>  
>  	rc = MIGRATEPAGE_SUCCESS;
>  unlock_buffers:
> +	if (check_refs) {
> +		trace_mm_migrate_buffers_end(mapping, page->index, rc);
> +		spin_unlock(&mapping->private_lock);
> +	}
>  	bh = head;
>  	do {
>  		unlock_buffer(bh);
> -- 
> 2.16.4
> 


-- 
Mel Gorman
SUSE Labs


* Re: [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
  2019-07-12 12:39         ` Mel Gorman
@ 2019-07-12 21:21           ` Andrew Morton
  2019-07-14 21:20             ` Mel Gorman
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2019-07-12 21:21 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Jan Kara, linux-mm, mhocko, stable

On Fri, 12 Jul 2019 13:39:35 +0100 Mel Gorman <mgorman@suse.de> wrote:

> > So although I still think that just failing the migration if we cannot
> > invalidate buffer heads is a safer choice, just extending the private_lock
> > protected section does not seem as bad as I was afraid.
> > 
> 
> That does not seem too bad and your revised patch looks functionally
> fine. I'd leave out the tracepoints though because a perf probe would have
> got roughly the same data and the tracepoint may be too specific to track
> another class of problem. Whether the tracepoint survives or not and
> with a changelog added;
> 
> Acked-by: Mel Gorman <mgorman@techsingularity.net>
> 
> Andrew, which version do you want to go with, the original version or
> this one that holds private_lock for slightly longer during migration?

The revised version looks much more appealing for a -stable backport. 
I expect any mild performance issues can be addressed in the usual
fashion.  My main concern is not to put a large performance regression
into mainline and stable kernels.  How confident are we that this is
(or will be) sufficiently tested from that point of view?




* Re: [PATCH RFC] mm: migrate: Fix races of __find_get_block() and page migration
  2019-07-12 21:21           ` Andrew Morton
@ 2019-07-14 21:20             ` Mel Gorman
  0 siblings, 0 replies; 9+ messages in thread
From: Mel Gorman @ 2019-07-14 21:20 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Kara, linux-mm, mhocko, stable

On Fri, Jul 12, 2019 at 02:21:11PM -0700, Andrew Morton wrote:
> On Fri, 12 Jul 2019 13:39:35 +0100 Mel Gorman <mgorman@suse.de> wrote:
> 
> > > So although I still think that just failing the migration if we cannot
> > > invalidate buffer heads is a safer choice, just extending the private_lock
> > > protected section does not seem as bad as I was afraid.
> > > 
> > 
> > That does not seem too bad and your revised patch looks functionally
> > fine. I'd leave out the tracepoints though because a perf probe would have
> > got roughly the same data and the tracepoint may be too specific to track
> > another class of problem. Whether the tracepoint survives or not and
> > with a changelog added;
> > 
> > Acked-by: Mel Gorman <mgorman@techsingularity.net>
> > 
> > Andrew, which version do you want to go with, the original version or
> > this one that holds private_lock for slightly longer during migration?
> 
> The revised version looks much more appealing for a -stable backport. 
> I expect any mild performance issues can be address in the usual
> fashion.  My main concern is not to put a large performance regression
> into mainline and stable kernels.  How confident are we that this is
> (will be) sufficiently tested from that point of view?
> 

Fairly confident. If we agree on this patch in principle (build tested
only), I'll make sure it gets tested from a functional point of view
and queue up a few migration-intensive tests while a metadata workload
is running in the background to see what falls out. Furthermore, not all
filesystems take this path even in migration-intensive situations, so
some setups will never notice a difference.

---8<---
From a4f07d789ba5742cd2fe6bcfb635502f2a1de004 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 10 Jul 2019 11:31:01 +0200
Subject: [PATCH] mm: migrate: Fix race with __find_get_block()

buffer_migrate_page_norefs() can race with bh users in the following way:

CPU1                                    CPU2
buffer_migrate_page_norefs()
  buffer_migrate_lock_buffers()
  checks bh refs
  spin_unlock(&mapping->private_lock)
                                        __find_get_block()
                                          spin_lock(&mapping->private_lock)
                                          grab bh ref
                                          spin_unlock(&mapping->private_lock)
  move page                               do bh work

This can result in various issues such as lost updates to buffers (i.e.
metadata corruption) or use-after-free problems for the old page.

This patch closes the race by holding mapping->private_lock while the
mapping is being moved to a new page. Ordinarily, a reference can be taken
outside of the private_lock using the per-cpu BH LRU, but the references
are checked and the LRU invalidated if necessary. The private_lock is held
once the references are known, so the buffer lookup slow path will spin
on the private_lock. Between the page lock and private_lock, it should
be impossible for other references to be acquired and for updates to happen
during the migration.

[mgorman@techsingularity.net: Changelog, removed tracing]
Fixes: 89cb0888ca14 ("mm: migrate: provide buffer_migrate_page_norefs()")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/migrate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index e9594bc0d406..a59e4aed6d2e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -771,12 +771,12 @@ static int __buffer_migrate_page(struct address_space *mapping,
 			}
 			bh = bh->b_this_page;
 		} while (bh != head);
-		spin_unlock(&mapping->private_lock);
 		if (busy) {
 			if (invalidated) {
 				rc = -EAGAIN;
 				goto unlock_buffers;
 			}
+			spin_unlock(&mapping->private_lock);
 			invalidate_bh_lrus();
 			invalidated = true;
 			goto recheck_buffers;
@@ -809,6 +809,8 @@ static int __buffer_migrate_page(struct address_space *mapping,
 
 	rc = MIGRATEPAGE_SUCCESS;
 unlock_buffers:
+	if (check_refs)
+		spin_unlock(&mapping->private_lock);
 	bh = head;
 	do {
 		unlock_buffer(bh);

-- 
Mel Gorman
SUSE Labs


