* [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
@ 2019-11-11 13:59 Mikulas Patocka
  2019-11-11 16:37 ` Nikos Tsironis
  2019-11-12 15:34 ` Mike Snitzer
  0 siblings, 2 replies; 9+ messages in thread
From: Mikulas Patocka @ 2019-11-11 13:59 UTC (permalink / raw)
  To: Mike Snitzer, Scott Wood; +Cc: dm-devel, Nikos Tsironis, Ilias Tsitsimpis

dm-snapshot doesn't work with realtime kernels since commit f79ae415b64c.
In the realtime kernel, hlist_bl is implemented as a raw spinlock, and the
code takes two non-raw spinlocks while holding the hlist_bl lock (non-raw
spinlocks are blocking mutexes in the realtime kernel, so they can't be
taken inside a raw spinlock).
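
Schematically, the offending pattern is the following (some_lock is only a
placeholder for the non-raw locks actually taken there):

	hlist_bl_lock(slot);		/* a raw, preemption-disabling lock on RT */
	spin_lock(&some_lock);		/* spinlock_t is a sleeping rt_mutex on RT */
	...
	spin_unlock(&some_lock);
	hlist_bl_unlock(slot);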

This patch fixes the problem by using a non-raw spinlock,
exception_table_lock, instead of the hlist_bl lock.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Fixes: f79ae415b64c ("dm snapshot: Make exception tables scalable")

---
 drivers/md/dm-snap.c |   65 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 23 deletions(-)

Index: linux-2.6/drivers/md/dm-snap.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-snap.c	2019-11-08 15:51:42.000000000 +0100
+++ linux-2.6/drivers/md/dm-snap.c	2019-11-08 15:54:58.000000000 +0100
@@ -141,6 +141,10 @@ struct dm_snapshot {
 	 * for them to be committed.
 	 */
 	struct bio_list bios_queued_during_merge;
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+	spinlock_t exception_table_lock;
+#endif
 };
 
 /*
@@ -625,30 +629,42 @@ static uint32_t exception_hash(struct dm
 
 /* Lock to protect access to the completed and pending exception hash tables. */
 struct dm_exception_table_lock {
+#ifndef CONFIG_PREEMPT_RT_BASE
 	struct hlist_bl_head *complete_slot;
 	struct hlist_bl_head *pending_slot;
+#endif
 };
 
 static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
 					 struct dm_exception_table_lock *lock)
 {
+#ifndef CONFIG_PREEMPT_RT_BASE
 	struct dm_exception_table *complete = &s->complete;
 	struct dm_exception_table *pending = &s->pending;
 
 	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
 	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
+#endif
 }
 
-static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
+static void dm_exception_table_lock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
 {
+#ifdef CONFIG_PREEMPT_RT_BASE
+	spin_lock(&s->exception_table_lock);
+#else
 	hlist_bl_lock(lock->complete_slot);
 	hlist_bl_lock(lock->pending_slot);
+#endif
 }
 
-static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
+static void dm_exception_table_unlock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
 {
+#ifdef CONFIG_PREEMPT_RT_BASE
+	spin_unlock(&s->exception_table_lock);
+#else
 	hlist_bl_unlock(lock->pending_slot);
 	hlist_bl_unlock(lock->complete_slot);
+#endif
 }
 
 static int dm_exception_table_init(struct dm_exception_table *et,
@@ -835,9 +851,9 @@ static int dm_add_exception(void *contex
 	 */
 	dm_exception_table_lock_init(s, old, &lock);
 
-	dm_exception_table_lock(&lock);
+	dm_exception_table_lock(s, &lock);
 	dm_insert_exception(&s->complete, e);
-	dm_exception_table_unlock(&lock);
+	dm_exception_table_unlock(s, &lock);
 
 	return 0;
 }
@@ -1318,6 +1334,9 @@ static int snapshot_ctr(struct dm_target
 	s->first_merging_chunk = 0;
 	s->num_merging_chunks = 0;
 	bio_list_init(&s->bios_queued_during_merge);
+#ifdef CONFIG_PREEMPT_RT_BASE
+	spin_lock_init(&s->exception_table_lock);
+#endif
 
 	/* Allocate hash table for COW data */
 	if (init_hash_tables(s)) {
@@ -1651,7 +1670,7 @@ static void pending_complete(void *conte
 		invalidate_snapshot(s, -EIO);
 		error = 1;
 
-		dm_exception_table_lock(&lock);
+		dm_exception_table_lock(s, &lock);
 		goto out;
 	}
 
@@ -1660,13 +1679,13 @@ static void pending_complete(void *conte
 		invalidate_snapshot(s, -ENOMEM);
 		error = 1;
 
-		dm_exception_table_lock(&lock);
+		dm_exception_table_lock(s, &lock);
 		goto out;
 	}
 	*e = pe->e;
 
 	down_read(&s->lock);
-	dm_exception_table_lock(&lock);
+	dm_exception_table_lock(s, &lock);
 	if (!s->valid) {
 		up_read(&s->lock);
 		free_completed_exception(e);
@@ -1687,16 +1706,16 @@ static void pending_complete(void *conte
 
 	/* Wait for conflicting reads to drain */
 	if (__chunk_is_tracked(s, pe->e.old_chunk)) {
-		dm_exception_table_unlock(&lock);
+		dm_exception_table_unlock(s, &lock);
 		__check_for_conflicting_io(s, pe->e.old_chunk);
-		dm_exception_table_lock(&lock);
+		dm_exception_table_lock(s, &lock);
 	}
 
 out:
 	/* Remove the in-flight exception from the list */
 	dm_remove_exception(&pe->e);
 
-	dm_exception_table_unlock(&lock);
+	dm_exception_table_unlock(s, &lock);
 
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
 	origin_bios = bio_list_get(&pe->origin_bios);
@@ -1968,7 +1987,7 @@ static int snapshot_map(struct dm_target
 	}
 
 	down_read(&s->lock);
-	dm_exception_table_lock(&lock);
+	dm_exception_table_lock(s, &lock);
 
 	if (!s->valid || (unlikely(s->snapshot_overflowed) &&
 	    bio_data_dir(bio) == WRITE)) {
@@ -1997,7 +2016,7 @@ static int snapshot_map(struct dm_target
 		remap_exception(s, e, bio, chunk);
 		if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
 		    io_overlaps_chunk(s, bio)) {
-			dm_exception_table_unlock(&lock);
+			dm_exception_table_unlock(s, &lock);
 			up_read(&s->lock);
 			zero_exception(s, e, bio, chunk);
 			r = DM_MAPIO_SUBMITTED; /* discard is not issued */
@@ -2024,9 +2043,9 @@ static int snapshot_map(struct dm_target
 	if (bio_data_dir(bio) == WRITE) {
 		pe = __lookup_pending_exception(s, chunk);
 		if (!pe) {
-			dm_exception_table_unlock(&lock);
+			dm_exception_table_unlock(s, &lock);
 			pe = alloc_pending_exception(s);
-			dm_exception_table_lock(&lock);
+			dm_exception_table_lock(s, &lock);
 
 			e = dm_lookup_exception(&s->complete, chunk);
 			if (e) {
@@ -2037,7 +2056,7 @@ static int snapshot_map(struct dm_target
 
 			pe = __find_pending_exception(s, pe, chunk);
 			if (!pe) {
-				dm_exception_table_unlock(&lock);
+				dm_exception_table_unlock(s, &lock);
 				up_read(&s->lock);
 
 				down_write(&s->lock);
@@ -2063,7 +2082,7 @@ static int snapshot_map(struct dm_target
 		if (!pe->started && io_overlaps_chunk(s, bio)) {
 			pe->started = 1;
 
-			dm_exception_table_unlock(&lock);
+			dm_exception_table_unlock(s, &lock);
 			up_read(&s->lock);
 
 			start_full_bio(pe, bio);
@@ -2076,7 +2095,7 @@ static int snapshot_map(struct dm_target
 			/* this is protected by the exception table lock */
 			pe->started = 1;
 
-			dm_exception_table_unlock(&lock);
+			dm_exception_table_unlock(s, &lock);
 			up_read(&s->lock);
 
 			start_copy(pe);
@@ -2088,7 +2107,7 @@ static int snapshot_map(struct dm_target
 	}
 
 out_unlock:
-	dm_exception_table_unlock(&lock);
+	dm_exception_table_unlock(s, &lock);
 	up_read(&s->lock);
 out:
 	return r;
@@ -2449,7 +2468,7 @@ static int __origin_write(struct list_he
 		dm_exception_table_lock_init(snap, chunk, &lock);
 
 		down_read(&snap->lock);
-		dm_exception_table_lock(&lock);
+		dm_exception_table_lock(snap, &lock);
 
 		/* Only deal with valid and active snapshots */
 		if (!snap->valid || !snap->active)
@@ -2466,9 +2485,9 @@ static int __origin_write(struct list_he
 			if (e)
 				goto next_snapshot;
 
-			dm_exception_table_unlock(&lock);
+			dm_exception_table_unlock(snap, &lock);
 			pe = alloc_pending_exception(snap);
-			dm_exception_table_lock(&lock);
+			dm_exception_table_lock(snap, &lock);
 
 			pe2 = __lookup_pending_exception(snap, chunk);
 
@@ -2481,7 +2500,7 @@ static int __origin_write(struct list_he
 
 				pe = __insert_pending_exception(snap, pe, chunk);
 				if (!pe) {
-					dm_exception_table_unlock(&lock);
+					dm_exception_table_unlock(snap, &lock);
 					up_read(&snap->lock);
 
 					invalidate_snapshot(snap, -ENOMEM);
@@ -2516,7 +2535,7 @@ static int __origin_write(struct list_he
 		}
 
 next_snapshot:
-		dm_exception_table_unlock(&lock);
+		dm_exception_table_unlock(snap, &lock);
 		up_read(&snap->lock);
 
 		if (pe_to_start_now) {


* Re: [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
  2019-11-11 13:59 [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel Mikulas Patocka
@ 2019-11-11 16:37 ` Nikos Tsironis
  2019-11-12  1:14   ` Mike Snitzer
  2019-11-12 15:34 ` Mike Snitzer
  1 sibling, 1 reply; 9+ messages in thread
From: Nikos Tsironis @ 2019-11-11 16:37 UTC (permalink / raw)
  To: Mikulas Patocka, Mike Snitzer, Scott Wood; +Cc: dm-devel, Ilias Tsitsimpis

On 11/11/19 3:59 PM, Mikulas Patocka wrote:
> Snapshot doesn't work with realtime kernels since the commit f79ae415b64c.
> hlist_bl is implemented as a raw spinlock and the code takes two non-raw
> spinlocks while holding hlist_bl (non-raw spinlocks are blocking mutexes
> in the realtime kernel, so they couldn't be taken inside a raw spinlock).
> 
> This patch fixes the problem by using non-raw spinlock
> exception_table_lock instead of the hlist_bl lock.
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Fixes: f79ae415b64c ("dm snapshot: Make exception tables scalable")
> 

Hi Mikulas,

I wasn't aware that hlist_bl is implemented as a raw spinlock in the
realtime kernel. I would have expected it to be a standard non-raw
spinlock, so that everything works as expected. But, after digging further
into the realtime tree, I found commit ad7675b15fd87f1 ("list_bl: Make
list head locking RT safe"), which suggests that such a conversion would
break other parts of the kernel.
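
For reference, that commit makes the list head itself carry a raw spinlock
on RT. Paraphrasing roughly from the realtime tree (simplified, from
memory):

	struct hlist_bl_head {
		struct hlist_bl_node *first;
	#ifdef CONFIG_PREEMPT_RT_BASE
		raw_spinlock_t lock;
	#endif
	};

	static inline void hlist_bl_lock(struct hlist_bl_head *b)
	{
	#ifndef CONFIG_PREEMPT_RT_BASE
		bit_spin_lock(0, (unsigned long *)b);
	#else
		raw_spin_lock(&b->lock);
	#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
		__set_bit(0, (unsigned long *)b);
	#endif
	#endif
	}

So on RT the serialization is a raw spinlock, and anything acquired under
hlist_bl_lock() must itself be raw - which is exactly what dm-snapshot
violates.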

That said,

  Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>

> ---
>  drivers/md/dm-snap.c |   65 ++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 42 insertions(+), 23 deletions(-)
> 
> Index: linux-2.6/drivers/md/dm-snap.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-snap.c	2019-11-08 15:51:42.000000000 +0100
> +++ linux-2.6/drivers/md/dm-snap.c	2019-11-08 15:54:58.000000000 +0100
> @@ -141,6 +141,10 @@ struct dm_snapshot {
>  	 * for them to be committed.
>  	 */
>  	struct bio_list bios_queued_during_merge;
> +
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	spinlock_t exception_table_lock;
> +#endif
>  };
>  
>  /*
> @@ -625,30 +629,42 @@ static uint32_t exception_hash(struct dm
>  
>  /* Lock to protect access to the completed and pending exception hash tables. */
>  struct dm_exception_table_lock {
> +#ifndef CONFIG_PREEMPT_RT_BASE
>  	struct hlist_bl_head *complete_slot;
>  	struct hlist_bl_head *pending_slot;
> +#endif
>  };
>  
>  static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
>  					 struct dm_exception_table_lock *lock)
>  {
> +#ifndef CONFIG_PREEMPT_RT_BASE
>  	struct dm_exception_table *complete = &s->complete;
>  	struct dm_exception_table *pending = &s->pending;
>  
>  	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
>  	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
> +#endif
>  }
>  
> -static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
> +static void dm_exception_table_lock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
>  {
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	spin_lock(&s->exception_table_lock);
> +#else
>  	hlist_bl_lock(lock->complete_slot);
>  	hlist_bl_lock(lock->pending_slot);
> +#endif
>  }
>  
> -static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
> +static void dm_exception_table_unlock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
>  {
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	spin_unlock(&s->exception_table_lock);
> +#else
>  	hlist_bl_unlock(lock->pending_slot);
>  	hlist_bl_unlock(lock->complete_slot);
> +#endif
>  }
>  
>  static int dm_exception_table_init(struct dm_exception_table *et,
> @@ -835,9 +851,9 @@ static int dm_add_exception(void *contex
>  	 */
>  	dm_exception_table_lock_init(s, old, &lock);
>  
> -	dm_exception_table_lock(&lock);
> +	dm_exception_table_lock(s, &lock);
>  	dm_insert_exception(&s->complete, e);
> -	dm_exception_table_unlock(&lock);
> +	dm_exception_table_unlock(s, &lock);
>  
>  	return 0;
>  }
> @@ -1318,6 +1334,9 @@ static int snapshot_ctr(struct dm_target
>  	s->first_merging_chunk = 0;
>  	s->num_merging_chunks = 0;
>  	bio_list_init(&s->bios_queued_during_merge);
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	spin_lock_init(&s->exception_table_lock);
> +#endif
>  
>  	/* Allocate hash table for COW data */
>  	if (init_hash_tables(s)) {
> @@ -1651,7 +1670,7 @@ static void pending_complete(void *conte
>  		invalidate_snapshot(s, -EIO);
>  		error = 1;
>  
> -		dm_exception_table_lock(&lock);
> +		dm_exception_table_lock(s, &lock);
>  		goto out;
>  	}
>  
> @@ -1660,13 +1679,13 @@ static void pending_complete(void *conte
>  		invalidate_snapshot(s, -ENOMEM);
>  		error = 1;
>  
> -		dm_exception_table_lock(&lock);
> +		dm_exception_table_lock(s, &lock);
>  		goto out;
>  	}
>  	*e = pe->e;
>  
>  	down_read(&s->lock);
> -	dm_exception_table_lock(&lock);
> +	dm_exception_table_lock(s, &lock);
>  	if (!s->valid) {
>  		up_read(&s->lock);
>  		free_completed_exception(e);
> @@ -1687,16 +1706,16 @@ static void pending_complete(void *conte
>  
>  	/* Wait for conflicting reads to drain */
>  	if (__chunk_is_tracked(s, pe->e.old_chunk)) {
> -		dm_exception_table_unlock(&lock);
> +		dm_exception_table_unlock(s, &lock);
>  		__check_for_conflicting_io(s, pe->e.old_chunk);
> -		dm_exception_table_lock(&lock);
> +		dm_exception_table_lock(s, &lock);
>  	}
>  
>  out:
>  	/* Remove the in-flight exception from the list */
>  	dm_remove_exception(&pe->e);
>  
> -	dm_exception_table_unlock(&lock);
> +	dm_exception_table_unlock(s, &lock);
>  
>  	snapshot_bios = bio_list_get(&pe->snapshot_bios);
>  	origin_bios = bio_list_get(&pe->origin_bios);
> @@ -1968,7 +1987,7 @@ static int snapshot_map(struct dm_target
>  	}
>  
>  	down_read(&s->lock);
> -	dm_exception_table_lock(&lock);
> +	dm_exception_table_lock(s, &lock);
>  
>  	if (!s->valid || (unlikely(s->snapshot_overflowed) &&
>  	    bio_data_dir(bio) == WRITE)) {
> @@ -1997,7 +2016,7 @@ static int snapshot_map(struct dm_target
>  		remap_exception(s, e, bio, chunk);
>  		if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
>  		    io_overlaps_chunk(s, bio)) {
> -			dm_exception_table_unlock(&lock);
> +			dm_exception_table_unlock(s, &lock);
>  			up_read(&s->lock);
>  			zero_exception(s, e, bio, chunk);
>  			r = DM_MAPIO_SUBMITTED; /* discard is not issued */
> @@ -2024,9 +2043,9 @@ static int snapshot_map(struct dm_target
>  	if (bio_data_dir(bio) == WRITE) {
>  		pe = __lookup_pending_exception(s, chunk);
>  		if (!pe) {
> -			dm_exception_table_unlock(&lock);
> +			dm_exception_table_unlock(s, &lock);
>  			pe = alloc_pending_exception(s);
> -			dm_exception_table_lock(&lock);
> +			dm_exception_table_lock(s, &lock);
>  
>  			e = dm_lookup_exception(&s->complete, chunk);
>  			if (e) {
> @@ -2037,7 +2056,7 @@ static int snapshot_map(struct dm_target
>  
>  			pe = __find_pending_exception(s, pe, chunk);
>  			if (!pe) {
> -				dm_exception_table_unlock(&lock);
> +				dm_exception_table_unlock(s, &lock);
>  				up_read(&s->lock);
>  
>  				down_write(&s->lock);
> @@ -2063,7 +2082,7 @@ static int snapshot_map(struct dm_target
>  		if (!pe->started && io_overlaps_chunk(s, bio)) {
>  			pe->started = 1;
>  
> -			dm_exception_table_unlock(&lock);
> +			dm_exception_table_unlock(s, &lock);
>  			up_read(&s->lock);
>  
>  			start_full_bio(pe, bio);
> @@ -2076,7 +2095,7 @@ static int snapshot_map(struct dm_target
>  			/* this is protected by the exception table lock */
>  			pe->started = 1;
>  
> -			dm_exception_table_unlock(&lock);
> +			dm_exception_table_unlock(s, &lock);
>  			up_read(&s->lock);
>  
>  			start_copy(pe);
> @@ -2088,7 +2107,7 @@ static int snapshot_map(struct dm_target
>  	}
>  
>  out_unlock:
> -	dm_exception_table_unlock(&lock);
> +	dm_exception_table_unlock(s, &lock);
>  	up_read(&s->lock);
>  out:
>  	return r;
> @@ -2449,7 +2468,7 @@ static int __origin_write(struct list_he
>  		dm_exception_table_lock_init(snap, chunk, &lock);
>  
>  		down_read(&snap->lock);
> -		dm_exception_table_lock(&lock);
> +		dm_exception_table_lock(snap, &lock);
>  
>  		/* Only deal with valid and active snapshots */
>  		if (!snap->valid || !snap->active)
> @@ -2466,9 +2485,9 @@ static int __origin_write(struct list_he
>  			if (e)
>  				goto next_snapshot;
>  
> -			dm_exception_table_unlock(&lock);
> +			dm_exception_table_unlock(snap, &lock);
>  			pe = alloc_pending_exception(snap);
> -			dm_exception_table_lock(&lock);
> +			dm_exception_table_lock(snap, &lock);
>  
>  			pe2 = __lookup_pending_exception(snap, chunk);
>  
> @@ -2481,7 +2500,7 @@ static int __origin_write(struct list_he
>  
>  				pe = __insert_pending_exception(snap, pe, chunk);
>  				if (!pe) {
> -					dm_exception_table_unlock(&lock);
> +					dm_exception_table_unlock(snap, &lock);
>  					up_read(&snap->lock);
>  
>  					invalidate_snapshot(snap, -ENOMEM);
> @@ -2516,7 +2535,7 @@ static int __origin_write(struct list_he
>  		}
>  
>  next_snapshot:
> -		dm_exception_table_unlock(&lock);
> +		dm_exception_table_unlock(snap, &lock);
>  		up_read(&snap->lock);
>  
>  		if (pe_to_start_now) {
> 


* Re: [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
  2019-11-11 16:37 ` Nikos Tsironis
@ 2019-11-12  1:14   ` Mike Snitzer
  2019-11-12  7:50     ` Mikulas Patocka
  0 siblings, 1 reply; 9+ messages in thread
From: Mike Snitzer @ 2019-11-12  1:14 UTC (permalink / raw)
  To: Nikos Tsironis; +Cc: dm-devel, Mikulas Patocka, Scott Wood, Ilias Tsitsimpis

On Mon, Nov 11 2019 at 11:37am -0500,
Nikos Tsironis <ntsironis@arrikto.com> wrote:

> On 11/11/19 3:59 PM, Mikulas Patocka wrote:
> > Snapshot doesn't work with realtime kernels since the commit f79ae415b64c.
> > hlist_bl is implemented as a raw spinlock and the code takes two non-raw
> > spinlocks while holding hlist_bl (non-raw spinlocks are blocking mutexes
> > in the realtime kernel, so they couldn't be taken inside a raw spinlock).
> > 
> > This patch fixes the problem by using non-raw spinlock
> > exception_table_lock instead of the hlist_bl lock.
> > 
> > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> > Fixes: f79ae415b64c ("dm snapshot: Make exception tables scalable")
> > 
> 
> Hi Mikulas,
> 
> I wasn't aware that hlist_bl is implemented as a raw spinlock in the
> real time kernel. I would expect it to be a standard non-raw spinlock,
> so everything works as expected. But, after digging further in the real
> time tree, I found commit ad7675b15fd87f1 ("list_bl: Make list head
> locking RT safe") which suggests that such a conversion would break
> other parts of the kernel.

Right, the proper fix is to update list_bl to work on realtime (which I
assume the referenced commit does).  I do not want to take this
dm-snapshot specific workaround that open-codes what should be done
within hlist_bl_{lock,unlock}(), etc.

I'm not yet sure which realtime mailing list and/or maintainers should
be cc'd to further the inclusion of commit ad7675b15fd87f1 -- Nikos, do
you?

Thanks,
Mike


* Re: [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
  2019-11-12  1:14   ` Mike Snitzer
@ 2019-11-12  7:50     ` Mikulas Patocka
  2019-11-12 11:45       ` Nikos Tsironis
  0 siblings, 1 reply; 9+ messages in thread
From: Mikulas Patocka @ 2019-11-12  7:50 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Nikos Tsironis, Scott Wood, Ilias Tsitsimpis, dm-devel,
	linux-kernel, linux-fsdevel



On Mon, 11 Nov 2019, Mike Snitzer wrote:

> On Mon, Nov 11 2019 at 11:37am -0500,
> Nikos Tsironis <ntsironis@arrikto.com> wrote:
> 
> > On 11/11/19 3:59 PM, Mikulas Patocka wrote:
> > > Snapshot doesn't work with realtime kernels since the commit f79ae415b64c.
> > > hlist_bl is implemented as a raw spinlock and the code takes two non-raw
> > > spinlocks while holding hlist_bl (non-raw spinlocks are blocking mutexes
> > > in the realtime kernel, so they couldn't be taken inside a raw spinlock).
> > > 
> > > This patch fixes the problem by using non-raw spinlock
> > > exception_table_lock instead of the hlist_bl lock.
> > > 
> > > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> > > Fixes: f79ae415b64c ("dm snapshot: Make exception tables scalable")
> > > 
> > 
> > Hi Mikulas,
> > 
> > I wasn't aware that hlist_bl is implemented as a raw spinlock in the
> > real time kernel. I would expect it to be a standard non-raw spinlock,
> > so everything works as expected. But, after digging further in the real
> > time tree, I found commit ad7675b15fd87f1 ("list_bl: Make list head
> > locking RT safe") which suggests that such a conversion would break
> > other parts of the kernel.
> 
> Right, the proper fix is to update list_bl to work on realtime (which I
> assume the referenced commit does).  I do not want to take this
> dm-snapshot specific workaround that open-codes what should be done
> within hlist_{bl_lock,unlock}, etc.

If we change list_bl to use a non-raw spinlock, it fails in the dentry
lookup code. The dentry code takes a seqlock (which is implemented as a
preempt-disable section in the realtime kernel) and then takes a list_bl
lock.
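
The problematic nesting is, roughly (simplified from the d_move() /
__d_rehash() path in fs/dcache.c):

	write_seqlock(&rename_lock);	/* seqlock: non-preemptible on RT */
	...
	hlist_bl_lock(b);		/* so this must not become a sleeping lock */
	hlist_bl_add_head_rcu(&dentry->d_hash, b);
	hlist_bl_unlock(b);
	...
	write_sequnlock(&rename_lock);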

This is wrong from the real-time perspective (the chain in the hash could
be arbitrarily long, so using a non-raw spinlock could cause an unbounded
wait), but we can't do anything about it.

I think that fixing dm-snapshot is way easier than fixing the dentry code. 
If you have an idea how to fix the dentry code, tell us.

> I'm not yet sure which realtime mailing list and/or maintainers should
> be cc'd to further the inclussion of commit ad7675b15fd87f1 -- Nikos do
> you?
> 
> Thanks,
> Mike

Mikulas



* Re: [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
  2019-11-12  7:50     ` Mikulas Patocka
@ 2019-11-12 11:45       ` Nikos Tsironis
  2019-11-13  6:01         ` Scott Wood
  0 siblings, 1 reply; 9+ messages in thread
From: Nikos Tsironis @ 2019-11-12 11:45 UTC (permalink / raw)
  To: Mikulas Patocka, Mike Snitzer
  Cc: Scott Wood, Ilias Tsitsimpis, dm-devel, linux-kernel, linux-fsdevel

On 11/12/19 9:50 AM, Mikulas Patocka wrote:
> 
> 
> On Mon, 11 Nov 2019, Mike Snitzer wrote:
> 
>> On Mon, Nov 11 2019 at 11:37am -0500,
>> Nikos Tsironis <ntsironis@arrikto.com> wrote:
>>
>>> On 11/11/19 3:59 PM, Mikulas Patocka wrote:
>>>> Snapshot doesn't work with realtime kernels since the commit f79ae415b64c.
>>>> hlist_bl is implemented as a raw spinlock and the code takes two non-raw
>>>> spinlocks while holding hlist_bl (non-raw spinlocks are blocking mutexes
>>>> in the realtime kernel, so they couldn't be taken inside a raw spinlock).
>>>>
>>>> This patch fixes the problem by using non-raw spinlock
>>>> exception_table_lock instead of the hlist_bl lock.
>>>>
>>>> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>>>> Fixes: f79ae415b64c ("dm snapshot: Make exception tables scalable")
>>>>
>>>
>>> Hi Mikulas,
>>>
>>> I wasn't aware that hlist_bl is implemented as a raw spinlock in the
>>> real time kernel. I would expect it to be a standard non-raw spinlock,
>>> so everything works as expected. But, after digging further in the real
>>> time tree, I found commit ad7675b15fd87f1 ("list_bl: Make list head
>>> locking RT safe") which suggests that such a conversion would break
>>> other parts of the kernel.
>>
>> Right, the proper fix is to update list_bl to work on realtime (which I
>> assume the referenced commit does).  I do not want to take this
>> dm-snapshot specific workaround that open-codes what should be done
>> within hlist_{bl_lock,unlock}, etc.
> 
> If we change list_bl to use non-raw spinlock, it fails in dentry lookup 
> code. The dentry code takes a seqlock (which is implemented as preempt 
> disable in the realtime kernel) and then takes a list_bl lock.
> 
> This is wrong from the real-time perspective (the chain in the hash could 
> be arbitrarily long, so using non-raw spinlock could cause unbounded 
> wait), however we can't do anything with it.
> 
> I think that fixing dm-snapshot is way easier than fixing the dentry code. 
> If you have an idea how to fix the dentry code, tell us.
> 

I too think that it would be better to fix list_bl. dm-snapshot isn't
really broken. One should be able to acquire a spinlock while holding
another spinlock.

Moreover, apart from dm-snapshot, anyone ever using list_bl is at risk
of breaking the realtime kernel, if he or she is not aware of that
particular limitation of list_bl's implementation in the realtime tree.

But I agree that it's a lot easier to "fix" dm-snapshot than to fix the
dentry code.

>> I'm not yet sure which realtime mailing list and/or maintainers should
>> be cc'd to further the inclussion of commit ad7675b15fd87f1 -- Nikos do
>> you?

No, unfortunately, I don't know for sure either. [1] and [2] suggest
that the relevant mailing lists are LKML and linux-rt-users and the
maintainers are Sebastian Siewior, Thomas Gleixner and Steven Rostedt.

I believe they are already Cc'd in the other thread regarding Mikulas'
"realtime: avoid BUG when the list is not locked" patch (for some reason
that thread doesn't appear properly in the dm-devel archives, and my
mails to dm-devel have been failing since yesterday - could there be an
issue with the mailing list?), so maybe we should Cc them in this thread
too.

Nikos

[1] https://wiki.linuxfoundation.org/realtime/communication/mailinglists
[2] https://wiki.linuxfoundation.org/realtime/communication/send_rt_patches

>>
>> Thanks,
>> Mike
> 
> Mikulas
> 


* Re: [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
  2019-11-11 13:59 [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel Mikulas Patocka
  2019-11-11 16:37 ` Nikos Tsironis
@ 2019-11-12 15:34 ` Mike Snitzer
  2019-11-12 15:57   ` Mikulas Patocka
  1 sibling, 1 reply; 9+ messages in thread
From: Mike Snitzer @ 2019-11-12 15:34 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Ilias Tsitsimpis, Scott Wood, Nikos Tsironis

On Mon, Nov 11 2019 at  8:59am -0500,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> Snapshot doesn't work with realtime kernels since the commit f79ae415b64c.
> hlist_bl is implemented as a raw spinlock and the code takes two non-raw
> spinlocks while holding hlist_bl (non-raw spinlocks are blocking mutexes
> in the realtime kernel, so they couldn't be taken inside a raw spinlock).
> 
> This patch fixes the problem by using non-raw spinlock
> exception_table_lock instead of the hlist_bl lock.
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Fixes: f79ae415b64c ("dm snapshot: Make exception tables scalable")
> 
> ---
>  drivers/md/dm-snap.c |   65 ++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 42 insertions(+), 23 deletions(-)
> 
> Index: linux-2.6/drivers/md/dm-snap.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-snap.c	2019-11-08 15:51:42.000000000 +0100
> +++ linux-2.6/drivers/md/dm-snap.c	2019-11-08 15:54:58.000000000 +0100
> @@ -141,6 +141,10 @@ struct dm_snapshot {
>  	 * for them to be committed.
>  	 */
>  	struct bio_list bios_queued_during_merge;
> +
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	spinlock_t exception_table_lock;
> +#endif
>  };
>  
>  /*
> @@ -625,30 +629,42 @@ static uint32_t exception_hash(struct dm
>  
>  /* Lock to protect access to the completed and pending exception hash tables. */
>  struct dm_exception_table_lock {
> +#ifndef CONFIG_PREEMPT_RT_BASE
>  	struct hlist_bl_head *complete_slot;
>  	struct hlist_bl_head *pending_slot;
> +#endif
>  };

Why not put the spinlock_t in 'struct dm_exception_table_lock' with the
member name 'lock'?
  
>  static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
>  					 struct dm_exception_table_lock *lock)
>  {
> +#ifndef CONFIG_PREEMPT_RT_BASE
>  	struct dm_exception_table *complete = &s->complete;
>  	struct dm_exception_table *pending = &s->pending;
>  
>  	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
>  	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
> +#endif
>  }
>  
> -static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
> +static void dm_exception_table_lock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
>  {
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	spin_lock(&s->exception_table_lock);
> +#else
>  	hlist_bl_lock(lock->complete_slot);
>  	hlist_bl_lock(lock->pending_slot);
> +#endif
>  }
>  
> -static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
> +static void dm_exception_table_unlock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
>  {
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	spin_unlock(&s->exception_table_lock);
> +#else
>  	hlist_bl_unlock(lock->pending_slot);
>  	hlist_bl_unlock(lock->complete_slot);
> +#endif
>  }
>  
>  static int dm_exception_table_init(struct dm_exception_table *et,
> @@ -835,9 +851,9 @@ static int dm_add_exception(void *contex
>  	 */
>  	dm_exception_table_lock_init(s, old, &lock);
>  
> -	dm_exception_table_lock(&lock);
> +	dm_exception_table_lock(s, &lock);
>  	dm_insert_exception(&s->complete, e);
> -	dm_exception_table_unlock(&lock);
> +	dm_exception_table_unlock(s, &lock);
>  
>  	return 0;
>  }

That way you don't need the extra 'struct dm_snapshot' arg to all the
various dm_exception_table_{lock,unlock} calls.

> @@ -1318,6 +1334,9 @@ static int snapshot_ctr(struct dm_target
>  	s->first_merging_chunk = 0;
>  	s->num_merging_chunks = 0;
>  	bio_list_init(&s->bios_queued_during_merge);
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	spin_lock_init(&s->exception_table_lock);
> +#endif
>  
>  	/* Allocate hash table for COW data */
>  	if (init_hash_tables(s)) {

And this spin_lock_init() would go in dm_exception_table_lock_init(),
in an appropriate #ifdef, as spin_lock_init(&lock->lock)

Doing it that way would seriously reduce the size of this patch.

Unless I'm missing something, please submit a v2 and cc the linux-rt-users
mailing list and the other direct CCs suggested by others in reply to
patch 2/2.

Thanks,
Mike


* Re: [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
  2019-11-12 15:34 ` Mike Snitzer
@ 2019-11-12 15:57   ` Mikulas Patocka
  2019-11-12 16:06     ` Mike Snitzer
  0 siblings, 1 reply; 9+ messages in thread
From: Mikulas Patocka @ 2019-11-12 15:57 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Ilias Tsitsimpis, Scott Wood, Nikos Tsironis



On Tue, 12 Nov 2019, Mike Snitzer wrote:

> On Mon, Nov 11 2019 at  8:59am -0500,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
> 
> > Snapshot doesn't work with realtime kernels since the commit f79ae415b64c.
> > hlist_bl is implemented as a raw spinlock and the code takes two non-raw
> > spinlocks while holding hlist_bl (non-raw spinlocks are blocking mutexes
> > in the realtime kernel, so they couldn't be taken inside a raw spinlock).
> > 
> > This patch fixes the problem by using non-raw spinlock
> > exception_table_lock instead of the hlist_bl lock.
> > 
> > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> > Fixes: f79ae415b64c ("dm snapshot: Make exception tables scalable")
> > 
> > ---
> >  drivers/md/dm-snap.c |   65 ++++++++++++++++++++++++++++++++-------------------
> >  1 file changed, 42 insertions(+), 23 deletions(-)
> > 
> > Index: linux-2.6/drivers/md/dm-snap.c
> > ===================================================================
> > --- linux-2.6.orig/drivers/md/dm-snap.c	2019-11-08 15:51:42.000000000 +0100
> > +++ linux-2.6/drivers/md/dm-snap.c	2019-11-08 15:54:58.000000000 +0100
> > @@ -141,6 +141,10 @@ struct dm_snapshot {
> >  	 * for them to be committed.
> >  	 */
> >  	struct bio_list bios_queued_during_merge;
> > +
> > +#ifdef CONFIG_PREEMPT_RT_BASE
> > +	spinlock_t exception_table_lock;
> > +#endif
> >  };
> >  
> >  /*
> > @@ -625,30 +629,42 @@ static uint32_t exception_hash(struct dm
> >  
> >  /* Lock to protect access to the completed and pending exception hash tables. */
> >  struct dm_exception_table_lock {
> > +#ifndef CONFIG_PREEMPT_RT_BASE
> >  	struct hlist_bl_head *complete_slot;
> >  	struct hlist_bl_head *pending_slot;
> > +#endif
> >  };
> 
> Why not put the spinlock_t in 'struct dm_exception_table_lock' with the
> member name 'lock'?

struct dm_exception_table_lock is allocated temporarily on the stack - we
can't put locks into it, because every user uses a different structure.

However, I can put a pointer to the spinlock into this structure. It
shortens the patch - because then we don't have to pass a pointer to
struct dm_snapshot to dm_exception_table_lock and
dm_exception_table_unlock.
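
Something like this (an untested sketch of what I have in mind for v2):

	struct dm_exception_table_lock {
	#ifdef CONFIG_PREEMPT_RT_BASE
		spinlock_t *lock;
	#else
		struct hlist_bl_head *complete_slot;
		struct hlist_bl_head *pending_slot;
	#endif
	};

	static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
						 struct dm_exception_table_lock *lock)
	{
	#ifdef CONFIG_PREEMPT_RT_BASE
		lock->lock = &s->exception_table_lock;
	#else
		struct dm_exception_table *complete = &s->complete;
		struct dm_exception_table *pending = &s->pending;

		lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
		lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
	#endif
	}

	static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
	{
	#ifdef CONFIG_PREEMPT_RT_BASE
		spin_lock(lock->lock);
	#else
		hlist_bl_lock(lock->complete_slot);
		hlist_bl_lock(lock->pending_slot);
	#endif
	}

dm_exception_table_unlock() changes analogously, and all the callers keep
the original one-argument form.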

> >  static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
> >  					 struct dm_exception_table_lock *lock)
> >  {
> > +#ifndef CONFIG_PREEMPT_RT_BASE
> >  	struct dm_exception_table *complete = &s->complete;
> >  	struct dm_exception_table *pending = &s->pending;
> >  
> >  	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
> >  	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
> > +#endif
> >  }
> >  
> > -static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
> > +static void dm_exception_table_lock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
> >  {
> > +#ifdef CONFIG_PREEMPT_RT_BASE
> > +	spin_lock(&s->exception_table_lock);
> > +#else
> >  	hlist_bl_lock(lock->complete_slot);
> >  	hlist_bl_lock(lock->pending_slot);
> > +#endif
> >  }
> >  
> > -static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
> > +static void dm_exception_table_unlock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
> >  {
> > +#ifdef CONFIG_PREEMPT_RT_BASE
> > +	spin_unlock(&s->exception_table_lock);
> > +#else
> >  	hlist_bl_unlock(lock->pending_slot);
> >  	hlist_bl_unlock(lock->complete_slot);
> > +#endif
> >  }
> >  
> >  static int dm_exception_table_init(struct dm_exception_table *et,
> > @@ -835,9 +851,9 @@ static int dm_add_exception(void *contex
> >  	 */
> >  	dm_exception_table_lock_init(s, old, &lock);
> >  
> > -	dm_exception_table_lock(&lock);
> > +	dm_exception_table_lock(s, &lock);
> >  	dm_insert_exception(&s->complete, e);
> > -	dm_exception_table_unlock(&lock);
> > +	dm_exception_table_unlock(s, &lock);
> >  
> >  	return 0;
> >  }
> 
> That way you don't need the extra 'struct dm_snapshot' arg to all the
> various dm_exception_table_{lock,unlock} calls.
> 
> > @@ -1318,6 +1334,9 @@ static int snapshot_ctr(struct dm_target
> >  	s->first_merging_chunk = 0;
> >  	s->num_merging_chunks = 0;
> >  	bio_list_init(&s->bios_queued_during_merge);
> > +#ifdef CONFIG_PREEMPT_RT_BASE
> > +	spin_lock_init(&s->exception_table_lock);
> > +#endif
> >  
> >  	/* Allocate hash table for COW data */
> >  	if (init_hash_tables(s)) {
> 
> And this spin_lock_init() would go in dm_exception_table_lock_init()
> in appropriate #ifdef with spin_lock_init(&lock->lock)

dm_exception_table_lock_init initializes an on-stack structure. It can't 
contain locks.

> Doing it that way would seriously reduce the size of this patch.

I reduced the size and I'll send the next version.

> Unless I'm missing something, please submit a v2 and cc linux-rt-user
> mailing list and the other direct CCs suggested by others in reply to
> patch 2/2.
> 
> Thanks,
> Mike

Mikulas


* Re: [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
  2019-11-12 15:57   ` Mikulas Patocka
@ 2019-11-12 16:06     ` Mike Snitzer
  0 siblings, 0 replies; 9+ messages in thread
From: Mike Snitzer @ 2019-11-12 16:06 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Nikos Tsironis, dm-devel, Scott Wood, Ilias Tsitsimpis

On Tue, Nov 12 2019 at 10:57am -0500,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> 
> 
> On Tue, 12 Nov 2019, Mike Snitzer wrote:
> 
> > On Mon, Nov 11 2019 at  8:59am -0500,
> > Mikulas Patocka <mpatocka@redhat.com> wrote:
> > 
> > > Snapshot doesn't work with realtime kernels since the commit f79ae415b64c.
> > > hlist_bl is implemented as a raw spinlock and the code takes two non-raw
> > > spinlocks while holding hlist_bl (non-raw spinlocks are blocking mutexes
> > > in the realtime kernel, so they couldn't be taken inside a raw spinlock).
> > > 
> > > This patch fixes the problem by using non-raw spinlock
> > > exception_table_lock instead of the hlist_bl lock.
> > > 
> > > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> > > Fixes: f79ae415b64c ("dm snapshot: Make exception tables scalable")
> > > 
> > > ---
> > >  drivers/md/dm-snap.c |   65 ++++++++++++++++++++++++++++++++-------------------
> > >  1 file changed, 42 insertions(+), 23 deletions(-)
> > > 
> > > Index: linux-2.6/drivers/md/dm-snap.c
> > > ===================================================================
> > > --- linux-2.6.orig/drivers/md/dm-snap.c	2019-11-08 15:51:42.000000000 +0100
> > > +++ linux-2.6/drivers/md/dm-snap.c	2019-11-08 15:54:58.000000000 +0100
> > > @@ -141,6 +141,10 @@ struct dm_snapshot {
> > >  	 * for them to be committed.
> > >  	 */
> > >  	struct bio_list bios_queued_during_merge;
> > > +
> > > +#ifdef CONFIG_PREEMPT_RT_BASE
> > > +	spinlock_t exception_table_lock;
> > > +#endif
> > >  };
> > >  
> > >  /*
> > > @@ -625,30 +629,42 @@ static uint32_t exception_hash(struct dm
> > >  
> > >  /* Lock to protect access to the completed and pending exception hash tables. */
> > >  struct dm_exception_table_lock {
> > > +#ifndef CONFIG_PREEMPT_RT_BASE
> > >  	struct hlist_bl_head *complete_slot;
> > >  	struct hlist_bl_head *pending_slot;
> > > +#endif
> > >  };
> > 
> > Why not put the spinlock_t in 'struct dm_exception_table_lock' with the
> > member name 'lock'?
> 
> struct dm_exception_table_lock is allocated temporarily on the stack - we 
> can't put locks into it, because every user uses different structurer.
> 
> However, I can put pointer to to the spinlock to this structure. It 
> shortens the patch - because then we don't have to pass a pointer to 
> struct dm_snapshot to dm_exception_table_lock and 
> dm_exception_table_unlock.

OK, I should've looked at the dm-snap.c code with more context, thanks
for clarifying.


> > >  static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
> > >  					 struct dm_exception_table_lock *lock)
> > >  {
> > > +#ifndef CONFIG_PREEMPT_RT_BASE
> > >  	struct dm_exception_table *complete = &s->complete;
> > >  	struct dm_exception_table *pending = &s->pending;
> > >  
> > >  	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
> > >  	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
> > > +#endif
> > >  }
> > >  
> > > -static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
> > > +static void dm_exception_table_lock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
> > >  {
> > > +#ifdef CONFIG_PREEMPT_RT_BASE
> > > +	spin_lock(&s->exception_table_lock);
> > > +#else
> > >  	hlist_bl_lock(lock->complete_slot);
> > >  	hlist_bl_lock(lock->pending_slot);
> > > +#endif
> > >  }
> > >  
> > > -static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
> > > +static void dm_exception_table_unlock(struct dm_snapshot *s, struct dm_exception_table_lock *lock)
> > >  {
> > > +#ifdef CONFIG_PREEMPT_RT_BASE
> > > +	spin_unlock(&s->exception_table_lock);
> > > +#else
> > >  	hlist_bl_unlock(lock->pending_slot);
> > >  	hlist_bl_unlock(lock->complete_slot);
> > > +#endif
> > >  }
> > >  
> > >  static int dm_exception_table_init(struct dm_exception_table *et,
> > > @@ -835,9 +851,9 @@ static int dm_add_exception(void *contex
> > >  	 */
> > >  	dm_exception_table_lock_init(s, old, &lock);
> > >  
> > > -	dm_exception_table_lock(&lock);
> > > +	dm_exception_table_lock(s, &lock);
> > >  	dm_insert_exception(&s->complete, e);
> > > -	dm_exception_table_unlock(&lock);
> > > +	dm_exception_table_unlock(s, &lock);
> > >  
> > >  	return 0;
> > >  }
> > 
> > That way you don't need the extra 'struct dm_snapshot' arg to all the
> > various dm_exception_table_{lock,unlock} calls.
> > 
> > > @@ -1318,6 +1334,9 @@ static int snapshot_ctr(struct dm_target
> > >  	s->first_merging_chunk = 0;
> > >  	s->num_merging_chunks = 0;
> > >  	bio_list_init(&s->bios_queued_during_merge);
> > > +#ifdef CONFIG_PREEMPT_RT_BASE
> > > +	spin_lock_init(&s->exception_table_lock);
> > > +#endif
> > >  
> > >  	/* Allocate hash table for COW data */
> > >  	if (init_hash_tables(s)) {
> > 
> > And this spin_lock_init() would go in dm_exception_table_lock_init()
> > in appropriate #ifdef with spin_lock_init(&lock->lock)
> 
> dm_exception_table_lock_init initializes an on-stack structure. It can't 
> contain locks.
> 
> > Doing it that way would seriously reduce the size of this patch.
> 
> I reduced the size and I'll send next version.
> 
> > Unless I'm missing something, please submit a v2 and cc linux-rt-user
> > mailing list and the other direct CCs suggested by others in reply to
> > patch 2/2.

Sounds good.


* Re: [PATCH 1/2] dm-snapshot: fix crash with the realtime kernel
  2019-11-12 11:45       ` Nikos Tsironis
@ 2019-11-13  6:01         ` Scott Wood
  0 siblings, 0 replies; 9+ messages in thread
From: Scott Wood @ 2019-11-13  6:01 UTC (permalink / raw)
  To: Nikos Tsironis, Mikulas Patocka, Mike Snitzer
  Cc: Ilias Tsitsimpis, dm-devel, linux-kernel, linux-fsdevel,
	Sebastian Andrzej Siewior, Daniel Wagner, tglx, linux-rt-users,
	Paul Gortmaker

On Tue, 2019-11-12 at 13:45 +0200, Nikos Tsironis wrote:
> On 11/12/19 9:50 AM, Mikulas Patocka wrote:
> > 
> > On Mon, 11 Nov 2019, Mike Snitzer wrote:
> > 
> > > On Mon, Nov 11 2019 at 11:37am -0500,
> > > Nikos Tsironis <ntsironis@arrikto.com> wrote:
> > > 
> > > > On 11/11/19 3:59 PM, Mikulas Patocka wrote:
> > > > > Snapshot doesn't work with realtime kernels since the commit
> > > > > f79ae415b64c.
> > > > > hlist_bl is implemented as a raw spinlock and the code takes two
> > > > > non-raw
> > > > > spinlocks while holding hlist_bl (non-raw spinlocks are blocking
> > > > > mutexes
> > > > > in the realtime kernel, so they couldn't be taken inside a raw
> > > > > spinlock).
> > > > > 
> > > > > This patch fixes the problem by using non-raw spinlock
> > > > > exception_table_lock instead of the hlist_bl lock.
> > > > > 
> > > > > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> > > > > Fixes: f79ae415b64c ("dm snapshot: Make exception tables
> > > > > scalable")
> > > > > 
> > > > 
> > > > Hi Mikulas,
> > > > 
> > > > I wasn't aware that hlist_bl is implemented as a raw spinlock in the
> > > > real time kernel. I would expect it to be a standard non-raw
> > > > spinlock,
> > > > so everything works as expected. But, after digging further in the
> > > > real
> > > > time tree, I found commit ad7675b15fd87f1 ("list_bl: Make list head
> > > > locking RT safe") which suggests that such a conversion would break
> > > > other parts of the kernel.
> > > 
> > > Right, the proper fix is to update list_bl to work on realtime (which
> > > I
> > > assume the referenced commit does).  I do not want to take this
> > > dm-snapshot specific workaround that open-codes what should be done
> > > within hlist_{bl_lock,unlock}, etc.
> > 
> > If we change list_bl to use non-raw spinlock, it fails in dentry lookup 
> > code. The dentry code takes a seqlock (which is implemented as preempt 
> > disable in the realtime kernel) and then takes a list_bl lock.
> > 
> > This is wrong from the real-time perspective (the chain in the hash
> > could 
> > be arbitrarily long, so using non-raw spinlock could cause unbounded 
> > wait), however we can't do anything with it.
> > 
> > I think that fixing dm-snapshot is way easier than fixing the dentry
> > code. 
> > If you have an idea how to fix the dentry code, tell us.
> 
> I too think that it would be better to fix list_bl. dm-snapshot isn't
> really broken. One should be able to acquire a spinlock while holding
> another spinlock.

That's not universally true -- even in the absence of RT there are nesting
considerations.  But it would probably be good if raw locks weren't hidden
inside other locking primitives without making it clear (ideally in the
function names) that it's a raw lock.

> Moreover, apart from dm-snapshot, anyone ever using list_bl is at risk
> of breaking the realtime kernel, if he or she is not aware of that
> particular limitation of list_bl's implementation in the realtime tree.

In particular the non-rcu variant seems inherently bad unless you protect
traversal with some other lock (in which case why use bl at all?).  Maybe
fully separate the rcu version of list_bl and keep using raw locks there
(with the name clearly indicating so), with the side benefit that
accidentally mixing rcu and non-rcu operations on the same list would become
a build error, and convert the non-rcu list_bl to use non-raw locks on RT.
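
Roughly what I have in mind - a sketch only, these names are made up and
not an existing API:

	/* RCU-capable variant: traversal under RCU, keeps the raw/bit lock */
	struct hlist_bl_rcu_head {
		struct hlist_bl_node *first;
	};

	/* plain variant: traversal is already serialized by the lock, so a
	 * sleeping lock works on RT */
	struct hlist_bl_head {
		struct hlist_bl_node *first;
	#ifdef CONFIG_PREEMPT_RT_BASE
		spinlock_t lock;
	#endif
	};

With two distinct head types, calling an _rcu list operation on a plain
hlist_bl_head (or vice versa) would no longer compile.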

BTW, I'm wondering what makes bit spinlocks worse than raw spinlocks on
RT...  commit ad7675b15fd87f19 says there's no lockdep visibility, but that
seems orthogonal to RT, and could be addressed by adding a dep_map on debug
builds the same way the raw lock is currently added.  The other bit spinlock
conversion commits that I could find are replacing them with non-raw locks.
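
For the dep_map idea, e.g. something like this (untested):

	struct hlist_bl_head {
		struct hlist_bl_node *first;
	#ifdef CONFIG_DEBUG_LOCK_ALLOC
		/* wired up via lock_acquire()/lock_release() in
		 * hlist_bl_lock()/hlist_bl_unlock() */
		struct lockdep_map dep_map;
	#endif
	};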

-Scott




