* [PATCH 2/2] dm-snapshot: Reimplement the cow limit.
From: Mikulas Patocka @ 2019-10-02 10:15 UTC (permalink / raw)
  To: Nikos Tsironis, Mike Snitzer, Alasdair Kergon; +Cc: dm-devel, guru2018

Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.

The implementation of this throttling mechanism is prone to a deadlock:

1. One or more threads write to the origin device causing COW, which is
   performed by kcopyd.

2. At some point some of these threads might reach the s->cow_count
   semaphore limit and block in down(&s->cow_count), holding a read lock
   on _origins_lock.

3. Someone tries to acquire a write lock on _origins_lock, e.g.,
   snapshot_ctr(), which blocks because the threads at step (2) already
   hold a read lock on it.

4. A COW operation completes and kcopyd runs dm-snapshot's completion
   callback, which ends up calling pending_complete().
   pending_complete() tries to resubmit any deferred origin bios. This
   requires acquiring a read lock on _origins_lock, which blocks.

   This happens because the read-write semaphore implementation gives
   priority to writers, meaning that as soon as a writer tries to enter
   the critical section, no readers will be allowed in, until all
   writers have completed their work.

   So, pending_complete() waits for the writer at step (3) to acquire
   and release the lock. This writer waits for the readers at step (2)
   to release the read lock and those readers wait for
   pending_complete() (the kcopyd thread) to signal the s->cow_count
   semaphore: DEADLOCK.
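
The cycle can be reproduced outside the kernel. The following is a
minimal userspace sketch of the same ordering (an illustration only,
not kernel code): a glibc rwlock configured for writer preference
stands in for _origins_lock, a semaphore that is already at its limit
stands in for s->cow_count, and the three threads mirror steps 2-4.

#define _GNU_SOURCE
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t origins_lock;	/* stands in for _origins_lock */
static sem_t cow_count;			/* stands in for s->cow_count */

static void *origin_writer(void *arg)	/* step 2 */
{
	pthread_rwlock_rdlock(&origins_lock);
	puts("writer: read lock held, blocking on cow_count");
	sem_wait(&cow_count);		/* sleeps holding the read lock */
	pthread_rwlock_unlock(&origins_lock);
	return NULL;
}

static void *ctr_thread(void *arg)	/* step 3 */
{
	sleep(1);
	puts("ctr: blocking on the write lock");
	pthread_rwlock_wrlock(&origins_lock);	/* queues as a writer */
	pthread_rwlock_unlock(&origins_lock);
	return NULL;
}

static void *kcopyd_callback(void *arg)	/* step 4 */
{
	sleep(2);
	puts("kcopyd: blocking on the read lock behind the queued writer");
	pthread_rwlock_rdlock(&origins_lock);	/* writer preference blocks us */
	sem_post(&cow_count);			/* never reached: deadlock */
	pthread_rwlock_unlock(&origins_lock);
	return NULL;
}

int main(void)
{
	pthread_rwlockattr_t attr;
	pthread_t t[3];
	int i;

	pthread_rwlockattr_init(&attr);
	/* emulate the writer priority of the kernel rwsem */
	pthread_rwlockattr_setkind_np(&attr,
			PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
	pthread_rwlock_init(&origins_lock, &attr);
	sem_init(&cow_count, 0, 0);	/* the COW limit is already reached */

	pthread_create(&t[0], NULL, origin_writer, NULL);
	pthread_create(&t[1], NULL, ctr_thread, NULL);
	pthread_create(&t[2], NULL, kcopyd_callback, NULL);

	alarm(5);			/* SIGALRM terminates the wedged demo */
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}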

In order to fix the bug, I reworked the limiting so that it waits
without holding any locks. The patch adds a variable, in_progress, that
counts how many kcopyd jobs are running, and a function,
wait_for_in_progress(), that sleeps if in_progress is over the limit.
wait_for_in_progress() drops _origins_lock before sleeping, in order to
avoid the deadlock.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org	# v5.0+
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")

---
 drivers/md/dm-snap.c |   69 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 14 deletions(-)

Index: linux-2.6/drivers/md/dm-snap.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-snap.c	2019-10-01 15:23:42.000000000 +0200
+++ linux-2.6/drivers/md/dm-snap.c	2019-10-02 12:01:23.000000000 +0200
@@ -18,7 +18,6 @@
 #include <linux/vmalloc.h>
 #include <linux/log2.h>
 #include <linux/dm-kcopyd.h>
-#include <linux/semaphore.h>
 
 #include "dm.h"
 
@@ -107,8 +106,8 @@ struct dm_snapshot {
 	/* The on disk metadata handler */
 	struct dm_exception_store *store;
 
-	/* Maximum number of in-flight COW jobs. */
-	struct semaphore cow_count;
+	unsigned in_progress;
+	struct wait_queue_head in_progress_wait;
 
 	struct dm_kcopyd_client *kcopyd_client;
 
@@ -162,8 +161,8 @@ struct dm_snapshot {
  */
 #define DEFAULT_COW_THRESHOLD 2048
 
-static int cow_threshold = DEFAULT_COW_THRESHOLD;
-module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644);
+static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
+module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
 MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
 
 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
@@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target
 		goto bad_hash_tables;
 	}
 
-	sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
+	init_waitqueue_head(&s->in_progress_wait);
 
 	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
 	if (IS_ERR(s->kcopyd_client)) {
@@ -1509,17 +1508,46 @@ static void snapshot_dtr(struct dm_targe
 
 	dm_put_device(ti, s->origin);
 
+	WARN_ON(s->in_progress);
+
 	kfree(s);
 }
 
 static void account_start_copy(struct dm_snapshot *s)
 {
-	down(&s->cow_count);
+	spin_lock(&s->in_progress_wait.lock);
+	s->in_progress++;
+	spin_unlock(&s->in_progress_wait.lock);
 }
 
 static void account_end_copy(struct dm_snapshot *s)
 {
-	up(&s->cow_count);
+	spin_lock(&s->in_progress_wait.lock);
+	BUG_ON(!s->in_progress);
+	s->in_progress--;
+	if (likely(s->in_progress <= cow_threshold) && unlikely(waitqueue_active(&s->in_progress_wait)))
+		wake_up_locked(&s->in_progress_wait);
+	spin_unlock(&s->in_progress_wait.lock);
+}
+
+static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
+{
+	if (unlikely(s->in_progress > cow_threshold)) {
+		spin_lock(&s->in_progress_wait.lock);
+		if (likely(s->in_progress > cow_threshold)) {
+			DECLARE_WAITQUEUE(wait, current);
+			__add_wait_queue(&s->in_progress_wait, &wait);
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_unlock(&s->in_progress_wait.lock);
+			if (unlock_origins)
+				up_read(&_origins_lock);
+			io_schedule();
+			remove_wait_queue(&s->in_progress_wait, &wait);
+			return false;
+		}
+		spin_unlock(&s->in_progress_wait.lock);
+	}
+	return true;
 }
 
 /*
@@ -1537,7 +1565,7 @@ static void flush_bios(struct bio *bio)
 	}
 }
 
-static int do_origin(struct dm_dev *origin, struct bio *bio);
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
 
 /*
  * Flush a list of buffers.
@@ -1550,7 +1578,7 @@ static void retry_origin_bios(struct dm_
 	while (bio) {
 		n = bio->bi_next;
 		bio->bi_next = NULL;
-		r = do_origin(s->origin, bio);
+		r = do_origin(s->origin, bio, false);
 		if (r == DM_MAPIO_REMAPPED)
 			generic_make_request(bio);
 		bio = n;
@@ -1926,6 +1954,10 @@ static int snapshot_map(struct dm_target
 	if (!s->valid)
 		return DM_MAPIO_KILL;
 
+	if (bio_data_dir(bio) == WRITE) {
+		while (unlikely(!wait_for_in_progress(s, false))) ;
+	}
+
 	down_read(&s->lock);
 	dm_exception_table_lock(&lock);
 
@@ -2122,7 +2154,7 @@ redirect_to_origin:
 
 	if (bio_data_dir(bio) == WRITE) {
 		up_write(&s->lock);
-		return do_origin(s->origin, bio);
+		return do_origin(s->origin, bio, false);
 	}
 
 out_unlock:
@@ -2497,15 +2529,24 @@ next_snapshot:
 /*
  * Called on a write from the origin driver.
  */
-static int do_origin(struct dm_dev *origin, struct bio *bio)
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
 {
 	struct origin *o;
 	int r = DM_MAPIO_REMAPPED;
 
+again:
 	down_read(&_origins_lock);
 	o = __lookup_origin(origin->bdev);
-	if (o)
+	if (o) {
+		struct dm_snapshot *s;
+		if (limit) {
+			list_for_each_entry(s, &o->snapshots, list)
+				if (unlikely(!wait_for_in_progress(s, true)))
+					goto again;
+		}
+
 		r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
+	}
 	up_read(&_origins_lock);
 
 	return r;
@@ -2618,7 +2659,7 @@ static int origin_map(struct dm_target *
 		dm_accept_partial_bio(bio, available_sectors);
 
 	/* Only tell snapshots if this is a write */
-	return do_origin(o->dev, bio);
+	return do_origin(o->dev, bio, true);
 }
 
 /*


* Re: [PATCH 2/2] dm-snapshot: Reimplement the cow limit.
From: Nikos Tsironis @ 2019-10-02 20:00 UTC (permalink / raw)
  To: Mikulas Patocka, Mike Snitzer, Alasdair Kergon; +Cc: dm-devel, guru2018

Hi Mikulas,

I agree that it's better to avoid holding any locks while waiting for
some pending kcopyd jobs to finish, but please see the comments below.

On 10/2/19 1:15 PM, Mikulas Patocka wrote:
>  static void account_start_copy(struct dm_snapshot *s)
>  {
> -	down(&s->cow_count);
> +	spin_lock(&s->in_progress_wait.lock);
> +	s->in_progress++;
> +	spin_unlock(&s->in_progress_wait.lock);
>  }
>  
>  static void account_end_copy(struct dm_snapshot *s)
>  {
> -	up(&s->cow_count);
> +	spin_lock(&s->in_progress_wait.lock);
> +	BUG_ON(!s->in_progress);
> +	s->in_progress--;
> +	if (likely(s->in_progress <= cow_threshold) && unlikely(waitqueue_active(&s->in_progress_wait)))
> +		wake_up_locked(&s->in_progress_wait);
> +	spin_unlock(&s->in_progress_wait.lock);
> +}
> +
> +static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
> +{
> +	if (unlikely(s->in_progress > cow_threshold)) {
> +		spin_lock(&s->in_progress_wait.lock);
> +		if (likely(s->in_progress > cow_threshold)) {
> +			DECLARE_WAITQUEUE(wait, current);
> +			__add_wait_queue(&s->in_progress_wait, &wait);
> +			__set_current_state(TASK_UNINTERRUPTIBLE);
> +			spin_unlock(&s->in_progress_wait.lock);
> +			if (unlock_origins)
> +				up_read(&_origins_lock);
> +			io_schedule();
> +			remove_wait_queue(&s->in_progress_wait, &wait);
> +			return false;
> +		}
> +		spin_unlock(&s->in_progress_wait.lock);
> +	}
> +	return true;
>  }

wait_for_in_progress() doesn't take into account which chunk is being
written or whether that chunk has already been reallocated or is
currently being reallocated.

This means, if I am not missing something, that both origin_map() and
snapshot_map() might unnecessarily throttle writes that don't need any
COW to take place.

For example, if we have some writes coming in, that trigger COW and
cause the COW limit to be reached, and then some more writes come in for
chunks that have already been reallocated (and before any kcopyd job
finishes and decrements s->in_progress), the latter writes would be
delayed without a reason, as they don't require any COW to be performed.

It seems strange that the COW throttling mechanism might also throttle
writes that don't require any COW.

Moreover, if we have reached the COW limit and we get a write for a
chunk that is currently being reallocated, we will block the thread,
when we could just add the bio to the origin_bios list of the pending
exception and move on.

wait_for_in_progress() could check if the exception is already
reallocated or is being reallocated, but the extra locking in the
critical path might have an adverse effect on performance, especially in
multi-threaded workloads. Maybe some benchmarks would help clarify that.
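
For illustration, such a check might look roughly like the sketch below
(my code, not part of the patch). It reuses the per-chunk exception
table locking that snapshot_map() already takes, and
dm_lookup_exception() and __lookup_pending_exception() are the existing
dm-snap.c lookup helpers:

static bool chunk_needs_cow(struct dm_snapshot *s, chunk_t chunk)
{
	struct dm_exception_table_lock lock;
	bool ret;

	dm_exception_table_lock_init(s, chunk, &lock);
	dm_exception_table_lock(&lock);
	/* a complete or pending exception means no new kcopyd job */
	ret = !dm_lookup_exception(&s->complete, chunk) &&
	      !__lookup_pending_exception(s, chunk);
	dm_exception_table_unlock(&lock);

	return ret;
}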

As a final note, if the devices are slow, there might be many
writers waiting in s->in_progress_wait. When a kcopyd job finishes, all
of them will wake up and in some cases we might end up issuing more COW
jobs than the cow_count limit, as the accounting for new COW jobs
doesn't happen atomically with the check for the cow_count limit in
wait_for_in_progress().

That said, I don't think having a few more COW jobs in flight than the
defined limit will cause any issues.
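
For comparison, a strict variant of the accounting would fold the limit
check and the increment into a single critical section, roughly as
sketched below (my illustration, using the same waitqueue primitives as
the patch). Note that it could not drop _origins_lock while sleeping,
which is exactly what the patch needs to do, so the occasional small
overshoot seems to be the price of the deadlock fix:

static void account_start_copy_strict(struct dm_snapshot *s)
{
	DECLARE_WAITQUEUE(wait, current);

	spin_lock(&s->in_progress_wait.lock);
	while (s->in_progress >= cow_threshold) {
		__add_wait_queue(&s->in_progress_wait, &wait);
		__set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(&s->in_progress_wait.lock);
		io_schedule();
		spin_lock(&s->in_progress_wait.lock);
		__remove_wait_queue(&s->in_progress_wait, &wait);
	}
	/* checked and incremented under the same lock: no overshoot */
	s->in_progress++;
	spin_unlock(&s->in_progress_wait.lock);
}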

Nikos


* Re: [PATCH 2/2] dm-snapshot: Reimplement the cow limit.
From: Mikulas Patocka @ 2019-10-03 20:06 UTC (permalink / raw)
  To: Nikos Tsironis; +Cc: dm-devel, Alasdair Kergon, Mike Snitzer, guru2018



On Wed, 2 Oct 2019, Nikos Tsironis wrote:

> Hi Mikulas,
> 
> I agree that it's better to avoid holding any locks while waiting for
> some pending kcopyd jobs to finish, but please see the comments below.
> 
> wait_for_in_progress() doesn't take into account which chunk is being
> written or whether that chunk has already been reallocated or is
> currently being reallocated.
> 
> This means, if I am not missing something, that both origin_map() and
> snapshot_map() might unnecessarily throttle writes that don't need any
> COW to take place.

I know about it, but I think it's not a serious problem - if there are
2048 outstanding I/Os, the system is already heavily congested. It
doesn't matter whether you allow a few more writes or not.

Mikulas


* Re: [PATCH 2/2] dm-snapshot: Reimplement the cow limit.
From: Mike Snitzer @ 2019-10-04  2:44 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Nikos Tsironis, Alasdair Kergon, guru2018

On Thu, Oct 03 2019 at  4:06P -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> 
> 
> On Wed, 2 Oct 2019, Nikos Tsironis wrote:
> 
> > Hi Mikulas,
> > 
> > I agree that it's better to avoid holding any locks while waiting for
> > some pending kcopyd jobs to finish, but please see the comments below.
> > 
> > On 10/2/19 1:15 PM, Mikulas Patocka wrote:
> > > +
> > > +static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
> > > +{
> > > +	if (unlikely(s->in_progress > cow_threshold)) {
> > > +		spin_lock(&s->in_progress_wait.lock);
> > > +		if (likely(s->in_progress > cow_threshold)) {
> > > +			DECLARE_WAITQUEUE(wait, current);
> > > +			__add_wait_queue(&s->in_progress_wait, &wait);
> > > +			__set_current_state(TASK_UNINTERRUPTIBLE);
> > > +			spin_unlock(&s->in_progress_wait.lock);
> > > +			if (unlock_origins)
> > > +				up_read(&_origins_lock);
> > > +			io_schedule();
> > > +			remove_wait_queue(&s->in_progress_wait, &wait);
> > > +			return false;
> > > +		}
> > > +		spin_unlock(&s->in_progress_wait.lock);
> > > +	}
> > > +	return true;
> > >  }
> > 
> > wait_for_in_progress() doesn't take into account which chunk is being
> > written or whether that chunk has already been reallocated or is
> > currently being reallocated.
> > 
> > This means, if I am not missing something, that both origin_map() and
> > snapshot_map() might unnecessarily throttle writes that don't need any
> > COW to take place.
> 
> I know about it, but I think it's not a serious problem - if there are
> 2048 outstanding I/Os, the system is already heavily congested. It
> doesn't matter whether you allow a few more writes or not.
> 
> Mikulas
> 
> > For example, if we have some writes coming in, that trigger COW and
> > cause the COW limit to be reached, and then some more writes come in for
> > chunks that have already been reallocated (and before any kcopyd job
> > finishes and decrements s->in_progress), the latter writes would be
> > delayed without a reason, as they don't require any COW to be performed.
> > 
> > It seems strange that the COW throttling mechanism might also throttle
> > writes that don't require any COW.
> > 
> > Moreover, if we have reached the COW limit and we get a write for a
> > chunk that is currently being reallocated, we will block the thread,
> > when we could just add the bio to the origin_bios list of the pending
> > exception and move on.
> > 
> > wait_for_in_progress() could check if the exception is already
> > reallocated or is being reallocated, but the extra locking in the
> > critical path might have an adverse effect on performance, especially in
> > multi-threaded workloads. Maybe some benchmarks would help clarify that.
> > 
> > As a final note, if the devices are slow, there might be many
> > writers waiting in s->in_progress_wait. When a kcopyd job finishes, all
> > of them will wake up and in some cases we might end up issuing more COW
> > jobs than the cow_count limit, as the accounting for new COW jobs
> > doesn't happen atomically with the check for the cow_count limit in
> > wait_for_in_progress().
> > 
> > That said, I don't think having a few more COW jobs in flight than the
> > defined limit will cause any issues.

Nikos,

I looked at your concern even before Mikulas replied and found it to be
valid.  But in the end I struggled to imagine how imposing extra
throttling once above the throttle threshold would significantly impact
performance.

So when I saw Mikulas' reply, he definitely reinforced my thinking.  But
please feel free to explore whether further refinement is needed.  I
think your concern about extra locking in the hot path (to check whether
a chunk already triggered an exception) was a great observation, but if
such a check is done I'm hopeful it won't be _that_ costly, because
we'll have already reached the cow threshold and would already be taking
the lock (as the 2nd phase of the double-checked locking).

Anyway, I folded these small tweaks into Mikulas' 2nd patch:

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 560b8cb38026..4fb1a40e68a0 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1525,7 +1525,8 @@ static void account_end_copy(struct dm_snapshot *s)
 	spin_lock(&s->in_progress_wait.lock);
 	BUG_ON(!s->in_progress);
 	s->in_progress--;
-	if (likely(s->in_progress <= cow_threshold) && unlikely(waitqueue_active(&s->in_progress_wait)))
+	if (likely(s->in_progress <= cow_threshold) &&
+	    unlikely(waitqueue_active(&s->in_progress_wait)))
 		wake_up_locked(&s->in_progress_wait);
 	spin_unlock(&s->in_progress_wait.lock);
 }
@@ -1535,6 +1536,13 @@ static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
 	if (unlikely(s->in_progress > cow_threshold)) {
 		spin_lock(&s->in_progress_wait.lock);
 		if (likely(s->in_progress > cow_threshold)) {
+			/*
+			 * NOTE: this throttle doesn't account for whether
+			 * the caller is servicing an IO that will trigger a COW
+			 * so excess throttling may result for chunks not required
+			 * to be COW'd.  But if cow_threshold was reached, extra
+			 * throttling is unlikely to negatively impact performance.
+			 */
 			DECLARE_WAITQUEUE(wait, current);
 			__add_wait_queue(&s->in_progress_wait, &wait);
 			__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1955,7 +1963,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_KILL;
 
 	if (bio_data_dir(bio) == WRITE) {
-		while (unlikely(!wait_for_in_progress(s, false))) ;
+		while (unlikely(!wait_for_in_progress(s, false)))
+			; /* wait_for_in_progress() has slept */
 	}
 
 	down_read(&s->lock);
@@ -2538,8 +2547,8 @@ static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
 	down_read(&_origins_lock);
 	o = __lookup_origin(origin->bdev);
 	if (o) {
-		struct dm_snapshot *s;
 		if (limit) {
+			struct dm_snapshot *s;
 			list_for_each_entry(s, &o->snapshots, list)
 				if (unlikely(!wait_for_in_progress(s, true)))
 					goto again;

and I've pushed the changes to linux-next via linux-dm.git's for-next
(with tweaked commit headers).  But if you or Mikulas find something
that would warrant destaging these changes I'd welcome that feedback.

Thanks,
Mike


* Re: [PATCH 2/2] dm-snapshot: Reimplement the cow limit.
From: Nikos Tsironis @ 2019-10-10 11:42 UTC (permalink / raw)
  To: Mikulas Patocka, Mike Snitzer, Alasdair Kergon; +Cc: dm-devel, guru2018

On 10/2/19 1:15 PM, Mikulas Patocka wrote:
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Cc: stable@vger.kernel.org	# v5.0+
> Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
> 

Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>


* Re: [PATCH 2/2] dm-snapshot: Reimplement the cow limit.
From: Nikos Tsironis @ 2019-10-11 12:43 UTC (permalink / raw)
  To: Mikulas Patocka, Mike Snitzer, Alasdair Kergon; +Cc: dm-devel, guru2018

On 10/2/19 1:15 PM, Mikulas Patocka wrote:
> @@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target
>  		goto bad_hash_tables;
>  	}
>  
> -	sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
> +	init_waitqueue_head(&s->in_progress_wait);
>  

's->in_progress = 0' is missing here.

I totally missed that during the review, and d3775354 ("dm: Use kzalloc
for all structs with embedded biosets/mempools") changed the allocation
of 's' to use kzalloc(), so 'in_progress' was implicitly initialized to
zero and the tests ran fine.

Nikos


* Re: [PATCH 2/2] dm-snapshot: Reimplement the cow limit.
From: Mike Snitzer @ 2019-10-11 16:04 UTC (permalink / raw)
  To: Nikos Tsironis; +Cc: dm-devel, Mikulas Patocka, Alasdair Kergon, guru2018

On Fri, Oct 11 2019 at  8:43am -0400,
Nikos Tsironis <ntsironis@arrikto.com> wrote:

> On 10/2/19 1:15 PM, Mikulas Patocka wrote:
> > @@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target
> >  		goto bad_hash_tables;
> >  	}
> >  
> > -	sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
> > +	init_waitqueue_head(&s->in_progress_wait);
> >  
> 
> 's->in_progress = 0' is missing here.
> 
> I totally missed that during the review, and d3775354 ("dm: Use kzalloc
> for all structs with embedded biosets/mempools") changed the allocation
> of 's' to use kzalloc(), so 'in_progress' was implicitly initialized to
> zero and the tests ran fine.

OK, so the need to explicitly initialize it to zero only exists in
older kernels (e.g. the 4.4-stable kernel).  Either that, or cherry-pick
commit d3775354 (even if only the hunk that modifies dm-snap.c).
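
For stable trees that predate d3775354, where 's' still comes from
kmalloc(), the backport would need the explicit initialization, along
these lines (illustrative; the exact placement in snapshot_ctr() depends
on the tree):

	s->in_progress = 0;
	init_waitqueue_head(&s->in_progress_wait);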

Mike
