linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] bcache: fix panic due to cache_set is null
@ 2020-11-30 11:21 Yi Li
  2020-12-01  4:35 ` Yi Li
  2020-12-01 16:38 ` [PATCH] " Coly Li
  0 siblings, 2 replies; 11+ messages in thread
From: Yi Li @ 2020-11-30 11:21 UTC (permalink / raw)
  To: colyli
  Cc: yilikernel, kent.overstreet, linux-bcache, linux-kernel, Yi Li, Guo Chao

bcache_device_detach will release the cache_set after hotunplug cache
disk. update_writeback_rate should check validate of cache_set.

  IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
  PGD 879620067 PUD 8755d3067 PMD 0
  Oops: 0000 [#1] SMP
  CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
  Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017
  Workqueue: events update_writeback_rate [bcache]
  task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
  RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
  RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
  RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
  RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
  R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
  FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000) knlGS:0000000000000000
  CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
  Stack:
   0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
   ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
   ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
  Call Trace:
   [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
   [<ffffffff81089575>] worker_thread+0x2a5/0x470
   [<ffffffff815a2f58>] ? __schedule+0x648/0x870
   [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
   [<ffffffff8108e3d5>] kthread+0xd5/0xe0
   [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
   [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
   [<ffffffff8108e300>] ? kthread_stop+0x110/0x110

Reported-by: Guo Chao <guochao@winhong.com>
Signed-off-by: Guo Chao <guochao@winhong.com>
Signed-off-by: Yi Li <yili@winhong.com>
---
 drivers/md/bcache/writeback.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 3c74996978da..186c4c6e1607 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -175,7 +175,15 @@ static void update_writeback_rate(struct work_struct *work)
 	struct cached_dev *dc = container_of(to_delayed_work(work),
 					     struct cached_dev,
 					     writeback_rate_update);
-	struct cache_set *c = dc->disk.c;
+	struct cache_set *c = NULL;
+
+	mutex_lock(&bch_register_lock);
+	c = dc->disk.c;
+
+	if (c == NULL) {
+		mutex_unlock(&bch_register_lock);
+		return;
+	}
 
 	/*
 	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
@@ -194,6 +202,7 @@ static void update_writeback_rate(struct work_struct *work)
 		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
 		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
 		smp_mb__after_atomic();
+		mutex_unlock(&bch_register_lock);
 		return;
 	}
 
@@ -230,6 +239,7 @@ static void update_writeback_rate(struct work_struct *work)
 	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
 	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
 	smp_mb__after_atomic();
+	mutex_unlock(&bch_register_lock);
 }
 
 static unsigned int writeback_delay(struct cached_dev *dc,
-- 
2.25.3




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] bcache: fix panic due to cache_set is null
  2020-11-30 11:21 [PATCH] bcache: fix panic due to cache_set is null Yi Li
@ 2020-12-01  4:35 ` Yi Li
  2020-12-01 16:42   ` Coly Li
  2020-12-01 16:38 ` [PATCH] " Coly Li
  1 sibling, 1 reply; 11+ messages in thread
From: Yi Li @ 2020-12-01  4:35 UTC (permalink / raw)
  To: Yi Li; +Cc: colyli, kent.overstreet, linux-bcache, linux-kernel, Guo Chao

sorry, This patch will cause deadlock, i will check and redo it.

On 11/30/20, Yi Li <yili@winhong.com> wrote:
> bcache_device_detach will release the cache_set after hotunplug cache
> disk. update_writeback_rate should check validate of cache_set.
>
>   IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>   PGD 879620067 PUD 8755d3067 PMD 0
>   Oops: 0000 [#1] SMP
>   CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
>   Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017
>   Workqueue: events update_writeback_rate [bcache]
>   task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
>   RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>   RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
>   RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
>   RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
>   RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
>   R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
>   R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
>   FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000)
> knlGS:0000000000000000
>   CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
>   CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
>   Stack:
>    0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
>    ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
>    ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
>   Call Trace:
>    [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
>    [<ffffffff81089575>] worker_thread+0x2a5/0x470
>    [<ffffffff815a2f58>] ? __schedule+0x648/0x870
>    [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
>    [<ffffffff8108e3d5>] kthread+0xd5/0xe0
>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>    [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>
> Reported-by: Guo Chao <guochao@winhong.com>
> Signed-off-by: Guo Chao <guochao@winhong.com>
> Signed-off-by: Yi Li <yili@winhong.com>
> ---
>  drivers/md/bcache/writeback.c | 12 +++++++++++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 3c74996978da..186c4c6e1607 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -175,7 +175,15 @@ static void update_writeback_rate(struct work_struct
> *work)
>  	struct cached_dev *dc = container_of(to_delayed_work(work),
>  					     struct cached_dev,
>  					     writeback_rate_update);
> -	struct cache_set *c = dc->disk.c;
> +	struct cache_set *c = NULL;
> +
> +	mutex_lock(&bch_register_lock);
> +	c = dc->disk.c;
> +
> +	if (c == NULL) {
> +		mutex_unlock(&bch_register_lock);
> +		return;
> +	}
>
>  	/*
>  	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
> @@ -194,6 +202,7 @@ static void update_writeback_rate(struct work_struct
> *work)
>  		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
>  		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
>  		smp_mb__after_atomic();
> +		mutex_unlock(&bch_register_lock);
>  		return;
>  	}
>
> @@ -230,6 +239,7 @@ static void update_writeback_rate(struct work_struct
> *work)
>  	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
>  	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
>  	smp_mb__after_atomic();
> +	mutex_unlock(&bch_register_lock);
>  }
>
>  static unsigned int writeback_delay(struct cached_dev *dc,
> --
> 2.25.3
>
>
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] bcache: fix panic due to cache_set is null
  2020-11-30 11:21 [PATCH] bcache: fix panic due to cache_set is null Yi Li
  2020-12-01  4:35 ` Yi Li
@ 2020-12-01 16:38 ` Coly Li
  1 sibling, 0 replies; 11+ messages in thread
From: Coly Li @ 2020-12-01 16:38 UTC (permalink / raw)
  To: Yi Li; +Cc: yilikernel, kent.overstreet, linux-bcache, linux-kernel, Guo Chao

On 11/30/20 7:21 PM, Yi Li wrote:
> bcache_device_detach will release the cache_set after hotunplug cache
> disk. update_writeback_rate should check validate of cache_set.
> 

I see the kernel is 4.4.0+10, do you try the the v5.9 kernel ? I don't
see your kernel code, it is not easy to response.

Thanks.

Coly LI

>   IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>   PGD 879620067 PUD 8755d3067 PMD 0
>   Oops: 0000 [#1] SMP
>   CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
>   Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017
>   Workqueue: events update_writeback_rate [bcache]
>   task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
>   RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>   RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
>   RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
>   RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
>   RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
>   R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
>   R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
>   FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000) knlGS:0000000000000000
>   CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
>   CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
>   Stack:
>    0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
>    ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
>    ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
>   Call Trace:
>    [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
>    [<ffffffff81089575>] worker_thread+0x2a5/0x470
>    [<ffffffff815a2f58>] ? __schedule+0x648/0x870
>    [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
>    [<ffffffff8108e3d5>] kthread+0xd5/0xe0
>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>    [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
> 
> Reported-by: Guo Chao <guochao@winhong.com>
> Signed-off-by: Guo Chao <guochao@winhong.com>
> Signed-off-by: Yi Li <yili@winhong.com>
> ---
>  drivers/md/bcache/writeback.c | 12 +++++++++++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 3c74996978da..186c4c6e1607 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -175,7 +175,15 @@ static void update_writeback_rate(struct work_struct *work)
>  	struct cached_dev *dc = container_of(to_delayed_work(work),
>  					     struct cached_dev,
>  					     writeback_rate_update);
> -	struct cache_set *c = dc->disk.c;
> +	struct cache_set *c = NULL;
> +
> +	mutex_lock(&bch_register_lock);
> +	c = dc->disk.c;
> +
> +	if (c == NULL) {
> +		mutex_unlock(&bch_register_lock);
> +		return;
> +	}
>  
>  	/*
>  	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
> @@ -194,6 +202,7 @@ static void update_writeback_rate(struct work_struct *work)
>  		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
>  		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
>  		smp_mb__after_atomic();
> +		mutex_unlock(&bch_register_lock);
>  		return;
>  	}
>  
> @@ -230,6 +239,7 @@ static void update_writeback_rate(struct work_struct *work)
>  	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
>  	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
>  	smp_mb__after_atomic();
> +	mutex_unlock(&bch_register_lock);
>  }
>  
>  static unsigned int writeback_delay(struct cached_dev *dc,
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] bcache: fix panic due to cache_set is null
  2020-12-01  4:35 ` Yi Li
@ 2020-12-01 16:42   ` Coly Li
  2020-12-03  6:25     ` Yi Li
  0 siblings, 1 reply; 11+ messages in thread
From: Coly Li @ 2020-12-01 16:42 UTC (permalink / raw)
  To: Yi Li, Yi Li; +Cc: kent.overstreet, linux-bcache, linux-kernel, Guo Chao

On 12/1/20 12:35 PM, Yi Li wrote:
> sorry, This patch will cause deadlock, i will check and redo it.

Can you try latest upstream kernel firstly ? Before spending more time
on the fix.

If I remember correctly, when cancel_writeback_rate_update_dwork() is
not timed out, the cache set memory won't be freed before the
writeback_rate_update worker terminates. It is possible that I miss
something in the code, but I suggest to test with a kernel after v5.3,
and better a v5.8+ kernel.

Coly Li

> 
> On 11/30/20, Yi Li <yili@winhong.com> wrote:
>> bcache_device_detach will release the cache_set after hotunplug cache
>> disk. update_writeback_rate should check validate of cache_set.
>>
>>   IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>>   PGD 879620067 PUD 8755d3067 PMD 0
>>   Oops: 0000 [#1] SMP
>>   CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
>>   Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017
>>   Workqueue: events update_writeback_rate [bcache]
>>   task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
>>   RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>>   RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
>>   RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
>>   RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
>>   RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
>>   R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
>>   R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
>>   FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000)
>> knlGS:0000000000000000
>>   CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
>>   CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
>>   Stack:
>>    0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
>>    ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
>>    ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
>>   Call Trace:
>>    [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
>>    [<ffffffff81089575>] worker_thread+0x2a5/0x470
>>    [<ffffffff815a2f58>] ? __schedule+0x648/0x870
>>    [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
>>    [<ffffffff8108e3d5>] kthread+0xd5/0xe0
>>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>>    [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
>>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>>
>> Reported-by: Guo Chao <guochao@winhong.com>
>> Signed-off-by: Guo Chao <guochao@winhong.com>
>> Signed-off-by: Yi Li <yili@winhong.com>
>> ---
>>  drivers/md/bcache/writeback.c | 12 +++++++++++-
>>  1 file changed, 11 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> index 3c74996978da..186c4c6e1607 100644
>> --- a/drivers/md/bcache/writeback.c
>> +++ b/drivers/md/bcache/writeback.c
>> @@ -175,7 +175,15 @@ static void update_writeback_rate(struct work_struct
>> *work)
>>  	struct cached_dev *dc = container_of(to_delayed_work(work),
>>  					     struct cached_dev,
>>  					     writeback_rate_update);
>> -	struct cache_set *c = dc->disk.c;
>> +	struct cache_set *c = NULL;
>> +
>> +	mutex_lock(&bch_register_lock);
>> +	c = dc->disk.c;
>> +
>> +	if (c == NULL) {
>> +		mutex_unlock(&bch_register_lock);
>> +		return;
>> +	}
>>
>>  	/*
>>  	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
>> @@ -194,6 +202,7 @@ static void update_writeback_rate(struct work_struct
>> *work)
>>  		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
>>  		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
>>  		smp_mb__after_atomic();
>> +		mutex_unlock(&bch_register_lock);
>>  		return;
>>  	}
>>
>> @@ -230,6 +239,7 @@ static void update_writeback_rate(struct work_struct
>> *work)
>>  	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
>>  	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
>>  	smp_mb__after_atomic();
>> +	mutex_unlock(&bch_register_lock);
>>  }
>>
>>  static unsigned int writeback_delay(struct cached_dev *dc,
>> --
>> 2.25.3
>>
>>
>>
>>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] bcache: fix panic due to cache_set is null
  2020-12-01 16:42   ` Coly Li
@ 2020-12-03  6:25     ` Yi Li
  2020-12-03  6:56       ` Coly Li
  0 siblings, 1 reply; 11+ messages in thread
From: Yi Li @ 2020-12-03  6:25 UTC (permalink / raw)
  To: Coly Li; +Cc: Yi Li, kent.overstreet, linux-bcache, linux-kernel, Guo Chao

> On 12/1/20 12:35 PM, Yi Li wrote:
>> sorry, This patch will cause deadlock, i will check and redo it.
>
> Can you try latest upstream kernel firstly ? Before spending more time
> on the fix.
>

This issue just happened three times (xenserver7.5 dom0 kernel) on the
same machine and cannot reproduce it now. and have not reproduce it
using the lastest uptream kernel.

> If I remember correctly, when cancel_writeback_rate_update_dwork() is
> not timed out, the cache set memory won't be freed before the
> writeback_rate_update worker terminates. It is possible that I miss
> something in the code, but I suggest to test with a kernel after v5.3,
> and better a v5.8+ kernel.
>
> Coly Li
>
Thanks.

it is  confused that why writeback_rate_update worker run  again after
cancel_delayed_work_sync( kernel log telled).

Maybe i should dig more.

Attach some debug info as below.

Nov 30 18:00:44 host-126 kernel: [ 1697.906137] bcache:
update_writeback_rate() dc = ffff880818580000
Nov 30 18:00:44 host-126 kernel: [ 1697.906145] bcache:
update_writeback_rate() c = ffff8808182c0000
Nov 30 18:00:44 host-126 kernel: [ 1697.906148] bcache:
update_writeback_rate() dc->disk.flags = ffff8808185800a0
Nov 30 18:00:44 host-126 kernel: [ 1697.906150] bcache:
update_writeback_rate() c->flags = ffff8808182c0368
Nov 30 18:00:45 host-126 kernel: [ 1698.578104] bcache:
update_writeback_rate() dc = ffff880839bd0000
Nov 30 18:00:45 host-126 kernel: [ 1698.578108] bcache:
update_writeback_rate() c = ffff8808182c0000
Nov 30 18:00:45 host-126 kernel: [ 1698.578109] bcache:
update_writeback_rate() dc->disk.flags = ffff880839bd00a0
Nov 30 18:00:45 host-126 kernel: [ 1698.578111] bcache:
update_writeback_rate() c->flags = ffff8808182c0368
Nov 30 18:00:49 host-126 kernel: [ 1702.914126] bcache:
update_writeback_rate() dc = ffff880818580000
Nov 30 18:00:49 host-126 kernel: [ 1702.914130] bcache:
update_writeback_rate() c = ffff8808182c0000
Nov 30 18:00:49 host-126 kernel: [ 1702.914132] bcache:
update_writeback_rate() dc->disk.flags = ffff8808185800a0
Nov 30 18:00:49 host-126 kernel: [ 1702.914133] bcache:
update_writeback_rate() c->flags = ffff8808182c0368
Nov 30 18:00:50 host-126 kernel: [ 1703.586182] bcache:
update_writeback_rate() dc = ffff880839bd0000
Nov 30 18:00:50 host-126 kernel: [ 1703.586188] bcache:
update_writeback_rate() c = ffff8808182c0000
Nov 30 18:00:50 host-126 kernel: [ 1703.586191] bcache:
update_writeback_rate() dc->disk.flags = ffff880839bd00a0
Nov 30 18:00:50 host-126 kernel: [ 1703.586193] bcache:
update_writeback_rate() c->flags = ffff8808182c0368
Nov 30 18:00:54 host-126 kernel: [ 1707.922215] bcache:
update_writeback_rate() dc = ffff880818580000
Nov 30 18:00:54 host-126 kernel: [ 1707.922222] bcache:
update_writeback_rate() c = ffff8808182c0000
Nov 30 18:00:54 host-126 kernel: [ 1707.922225] bcache:
update_writeback_rate() dc->disk.flags = ffff8808185800a0
Nov 30 18:00:54 host-126 kernel: [ 1707.922227] bcache:
update_writeback_rate() c->flags = ffff8808182c0368
Nov 30 18:00:55 host-126 kernel: [ 1708.594202] bcache:
update_writeback_rate() dc = ffff880839bd0000
Nov 30 18:00:55 host-126 kernel: [ 1708.594206] bcache:
update_writeback_rate() c = ffff8808182c0000
Nov 30 18:00:55 host-126 kernel: [ 1708.594208] bcache:
update_writeback_rate() dc->disk.flags = ffff880839bd00a0
Nov 30 18:00:55 host-126 kernel: [ 1708.594210] bcache:
update_writeback_rate() c->flags = ffff8808182c0368
Nov 30 18:00:55 host-126 kernel: [ 1709.118221] sd 0:0:1:0:
device_block, handle(0x0009)
Nov 30 18:00:58 host-126 kernel: [ 1711.368197] sd 0:0:1:0:
device_unblock and setting to running, handle(0x0009)
Nov 30 18:00:58 host-126 kernel: [ 1711.368263] sd 0:0:1:0: [sdb]
tag#0 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
Nov 30 18:00:58 host-126 kernel: [ 1711.368277] sd 0:0:1:0: [sdb]
tag#0 CDB: Synchronize Cache(10) 35 00 00 00 00 00 00 00 00 00
Nov 30 18:00:58 host-126 kernel: [ 1711.368289] blk_update_request:
I/O error, dev sdb, sector 36160
Nov 30 18:00:58 host-126 kernel: [ 1711.368326] bcache: error on
96083de4-6b3e-4ede-81d1-44edc1a93729: journal io error, disabling
caching
Nov 30 18:00:58 host-126 kernel: [ 1711.368372] sd 0:0:1:0: [sdb]
tag#1 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
Nov 30 18:00:58 host-126 kernel: [ 1711.368387] sd 0:0:1:0: [sdb]
tag#1 CDB: Write(10) 2a 00 00 81 b0 40 00 00 08 00
Nov 30 18:00:58 host-126 kernel: [ 1711.368392] blk_update_request:
I/O error, dev sdb, sector 8499264
Nov 30 18:00:58 host-126 kernel: [ 1711.368407] bcache:
bch_count_io_errors() sdb2: IO error on writing data to cache.
Nov 30 18:00:58 host-126 kernel: [ 1711.368447] bcache:
bch_cached_dev_detach() bch_cached_dev_detach start
Nov 30 18:00:58 host-126 kernel: [ 1711.368458] bcache:
bch_cached_dev_detach() bch_cached_dev_detach end
Nov 30 18:00:58 host-126 kernel: [ 1711.368463] bcache:
conditional_stop_bcache_device() stop_when_cache_set_failed of bcache1
is "auto" and cache is dirty, stop it to avoid potential data
corruption.
Nov 30 18:00:58 host-126 kernel: [ 1711.368466] bcache:
bch_cached_dev_detach() bch_cached_dev_detach start
Nov 30 18:00:58 host-126 kernel: [ 1711.368474] bcache:
bch_cached_dev_detach() bch_cached_dev_detach end
Nov 30 18:00:58 host-126 kernel: [ 1711.368479] bcache:
conditional_stop_bcache_device() stop_when_cache_set_failed of bcache0
is "auto" and cache is clean, keep it alive.
Nov 30 18:00:58 host-126 kernel: [ 1711.368494] Buffer I/O error on
dev bcache1, logical block 210979209, lost async page write
Nov 30 18:00:58 host-126 kernel: [ 1711.368585] bcache:
cached_dev_detach_finish() cached_dev_detach_finish start
Nov 30 18:00:58 host-126 kernel: [ 1711.368589] bcache:
cached_dev_detach_finish() dc->disk.flags = f
Nov 30 18:00:58 host-126 kernel: [ 1711.368591] bcache:
cancel_writeback_rate_update_dwork()
cancel_writeback_rate_update_dwork start
Nov 30 18:00:58 host-126 kernel: [ 1711.368594] bcache:
cancel_writeback_rate_update_dwork()
cancel_writeback_rate_update_dwork end
Nov 30 18:00:58 host-126 kernel: [ 1711.368617] bcache:
cached_dev_detach_finish() cached_dev_detach_finish start
Nov 30 18:00:58 host-126 kernel: [ 1711.369022] bcache:
cached_dev_detach_finish() Caching disabled for sda2
Nov 30 18:00:58 host-126 kernel: [ 1711.369025] bcache:
cached_dev_detach_finish() cached_dev_detach_finish end
Nov 30 18:00:58 host-126 kernel: [ 1711.369035] bcache:
cached_dev_detach_finish() dc->disk.flags = a
Nov 30 18:00:58 host-126 kernel: [ 1711.369039] bcache:
cancel_writeback_rate_update_dwork()
cancel_writeback_rate_update_dwork start
Nov 30 18:00:58 host-126 kernel: [ 1711.369042] bcache:
cancel_writeback_rate_update_dwork()
cancel_writeback_rate_update_dwork end
Nov 30 18:00:58 host-126 kernel: [ 1711.369460] bcache:
cached_dev_detach_finish() Caching disabled for sdf2
Nov 30 18:00:58 host-126 kernel: [ 1711.369464] bcache:
cached_dev_detach_finish() cached_dev_detach_finish end
Nov 30 18:00:58 host-126 kernel: [ 1711.374327] sd 0:0:1:0: [sdb]
Synchronizing SCSI cache
Nov 30 18:00:58 host-126 kernel: [ 1711.374516] sd 0:0:1:0: [sdb]
Synchronize Cache(10) failed: Result: hostbyte=DID_NO_CONNECT
driverbyte=DRIVER_OK
Nov 30 18:00:58 host-126 kernel: [ 1711.376907] bcache:
cache_set_free() Cache set 96083de4-6b3e-4ede-81d1-44edc1a93729
unregistered
Nov 30 18:00:58 host-126 kernel: [ 1711.418979] mpt3sas_cm0: removing
handle(0x0009), sas_addr(0x4433221102000000)
Nov 30 18:00:58 host-126 kernel: [ 1711.418984] mpt3sas_cm0: removing
:enclosure logical id(0x5a4bf013514f4000), slot(4)
Nov 30 18:00:58 host-126 kernel: [ 1711.418988] mpt3sas_cm0: removing
:enclosure level(0x0000), connector name(     )
Nov 30 18:01:05 host-126 kernel: [ 1718.873725] bcache:
cached_dev_free() cached_dev_free start
Nov 30 18:01:05 host-126 kernel: [ 1718.873776] bcache:
bcache_device_free() bcache1 stopped
Nov 30 18:01:05 host-126 kernel: [ 1718.926598] bcache:
cached_dev_free() cached_dev_free end
Nov 30 18:01:06 host-126 kernel: [ 1719.668540]  sda: sda1 sda2
Nov 30 18:01:11 host-126 kernel: [ 1725.010305] bcache:
update_writeback_rate() dc = ffff880818580000
Nov 30 18:01:11 host-126 kernel: [ 1725.010309] bcache:
update_writeback_rate() c =           (null)
------------the kernel log show that cache_set is null.
Nov 30 18:01:11 host-126 kernel: [ 1725.010311] bcache:
update_writeback_rate() dc->disk.flags = ffff8808185800a0
Nov 30 18:01:12 host-126 kernel: [ 1725.369791] mpt3sas_cm0:
detecting: handle(0x0009), sas_address(0x4433221102000000), phy(2)
Nov 30 18:01:12 host-126 kernel: [ 1725.369802] mpt3sas_cm0:
REPORT_LUNS: handle(0x0009), retries(0)
Nov 30 18:01:12 host-126 kernel: [ 1725.618138] mpt3sas_cm0:
TEST_UNIT_READY: handle(0x0009), lun(0)
Nov 30 18:01:13 host-126 kernel: [ 1726.619148] mpt3sas_cm0:
detecting: handle(0x0009), sas_address(0x4433221102000000), phy(2)
Nov 30 18:01:13 host-126 kernel: [ 1726.619156] mpt3sas_cm0:
REPORT_LUNS: handle(0x0009), retries(0)
Nov 30 18:01:13 host-126 kernel: [ 1726.619300] mpt3sas_cm0:
TEST_UNIT_READY: handle(0x0009), lun(0)
Nov 30 18:01:13 host-126 kernel: [ 1726.622323] scsi 0:0:6:0:
Direct-Access     ATA      INTEL SSDSC2BB48 0112 PQ: 0 ANSI: 6
Nov 30 18:01:13 host-126 kernel: [ 1726.622839] scsi 0:0:6:0: SATA:
handle(0x0009), sas_addr(0x4433221102000000), phy(2),
device_name(0x55cd2e414d6dcd4b)
Nov 30 18:01:13 host-126 kernel: [ 1726.622845] scsi 0:0:6:0: SATA:
enclosure logical id(0x5a4bf013514f4000), slot(4)
Nov 30 18:01:13 host-126 kernel: [ 1726.622847] scsi 0:0:6:0: SATA:
enclosure level(0x0000), connector name(     )
Nov 30 18:01:13 host-126 kernel: [ 1726.622990] scsi 0:0:6:0:
atapi(n), ncq(y), asyn_notify(n), smart(y), fua(y), sw_preserve(y)
Nov 30 18:01:13 host-126 kernel: [ 1726.623642] scsi 0:0:6:0:
serial_number(PHDV7102003Q480BGN  )
Nov 30 18:01:13 host-126 kernel: [ 1726.626116] sd 0:0:6:0: Attached
scsi generic sg1 type 0
Nov 30 18:01:13 host-126 kernel: [ 1726.626784] sd 0:0:6:0: [sdb]
937703088 512-byte logical blocks: (480 GB/447 GiB)
Nov 30 18:01:13 host-126 kernel: [ 1726.626789] sd 0:0:6:0: [sdb]
4096-byte physical blocks
Nov 30 18:01:13 host-126 kernel: [ 1726.631161] sd 0:0:6:0: [sdb]
Write Protect is off
Nov 30 18:01:13 host-126 kernel: [ 1726.631165] sd 0:0:6:0: [sdb] Mode
Sense: 9b 00 10 08
Nov 30 18:01:13 host-126 kernel: [ 1726.632513] sd 0:0:6:0: [sdb]
Write cache: enabled, read cache: enabled, supports DPO and FUA
Nov 30 18:01:13 host-126 kernel: [ 1726.648068]  sdb: sdb1 sdb2
Nov 30 18:01:13 host-126 kernel: [ 1726.656841] sd 0:0:6:0: [sdb]
Attached SCSI disk

>>
>> On 11/30/20, Yi Li <yili@winhong.com> wrote:
>>> bcache_device_detach will release the cache_set after hotunplug cache
>>> disk. update_writeback_rate should check validate of cache_set.
>>>
>>>   IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>>>   PGD 879620067 PUD 8755d3067 PMD 0
>>>   Oops: 0000 [#1] SMP
>>>   CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
>>>   Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601
>>> 03/21/2017
>>>   Workqueue: events update_writeback_rate [bcache]
>>>   task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
>>>   RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0
>>> [bcache]
>>>   RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
>>>   RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
>>>   RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
>>>   RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
>>>   R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
>>>   R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
>>>   FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000)
>>> knlGS:0000000000000000
>>>   CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>   CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
>>>   Stack:
>>>    0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
>>>    ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
>>>    ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
>>>   Call Trace:
>>>    [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
>>>    [<ffffffff81089575>] worker_thread+0x2a5/0x470
>>>    [<ffffffff815a2f58>] ? __schedule+0x648/0x870
>>>    [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
>>>    [<ffffffff8108e3d5>] kthread+0xd5/0xe0
>>>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>>>    [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
>>>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>>>
>>> Reported-by: Guo Chao <guochao@winhong.com>
>>> Signed-off-by: Guo Chao <guochao@winhong.com>
>>> Signed-off-by: Yi Li <yili@winhong.com>
>>> ---
>>>  drivers/md/bcache/writeback.c | 12 +++++++++++-
>>>  1 file changed, 11 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/md/bcache/writeback.c
>>> b/drivers/md/bcache/writeback.c
>>> index 3c74996978da..186c4c6e1607 100644
>>> --- a/drivers/md/bcache/writeback.c
>>> +++ b/drivers/md/bcache/writeback.c
>>> @@ -175,7 +175,15 @@ static void update_writeback_rate(struct
>>> work_struct
>>> *work)
>>>  	struct cached_dev *dc = container_of(to_delayed_work(work),
>>>  					     struct cached_dev,
>>>  					     writeback_rate_update);
>>> -	struct cache_set *c = dc->disk.c;
>>> +	struct cache_set *c = NULL;
>>> +
>>> +	mutex_lock(&bch_register_lock);
>>> +	c = dc->disk.c;
>>> +
>>> +	if (c == NULL) {
>>> +		mutex_unlock(&bch_register_lock);
>>> +		return;
>>> +	}
>>>
>>>  	/*
>>>  	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
>>> @@ -194,6 +202,7 @@ static void update_writeback_rate(struct work_struct
>>> *work)
>>>  		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
>>>  		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
>>>  		smp_mb__after_atomic();
>>> +		mutex_unlock(&bch_register_lock);
>>>  		return;
>>>  	}
>>>
>>> @@ -230,6 +239,7 @@ static void update_writeback_rate(struct work_struct
>>> *work)
>>>  	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
>>>  	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
>>>  	smp_mb__after_atomic();
>>> +	mutex_unlock(&bch_register_lock);
>>>  }
>>>
>>>  static unsigned int writeback_delay(struct cached_dev *dc,
>>> --
>>> 2.25.3
>>>
>>>
>>>
>>>
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] bcache: fix panic due to cache_set is null
  2020-12-03  6:25     ` Yi Li
@ 2020-12-03  6:56       ` Coly Li
  2020-12-03  9:06         ` Yi Li
  0 siblings, 1 reply; 11+ messages in thread
From: Coly Li @ 2020-12-03  6:56 UTC (permalink / raw)
  To: Yi Li; +Cc: Yi Li, kent.overstreet, linux-bcache, linux-kernel, Guo Chao

On 12/3/20 2:25 PM, Yi Li wrote:
>> On 12/1/20 12:35 PM, Yi Li wrote:
>>> sorry, This patch will cause deadlock, i will check and redo it.
>>
>> Can you try latest upstream kernel firstly ? Before spending more time
>> on the fix.
>>
> 
> This issue just happened three times (xenserver7.5 dom0 kernel) on the
> same machine and cannot reproduce it now. and have not reproduce it
> using the lastest uptream kernel.
> 

Hmm, this is something very probably that I am not able to help. It
seems the kernel is a third-part maintained Linux v4.4 based kernel +
bcache backport, which is out of my view.

If similar problem happens on latest upstream kernel, or at least v5.8+
kernel, I can help to take a look.


>> If I remember correctly, when cancel_writeback_rate_update_dwork() is
>> not timed out, the cache set memory won't be freed before the
>> writeback_rate_update worker terminates. It is possible that I miss
>> something in the code, but I suggest to test with a kernel after v5.3,
>> and better a v5.8+ kernel.
>>
>> Coly Li
>>
> Thanks.
> 
> it is  confused that why writeback_rate_update worker run  again after
> cancel_delayed_work_sync( kernel log telled).
> 

[snipped]

Coly Li

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] bcache: fix panic due to cache_set is null
  2020-12-03  6:56       ` Coly Li
@ 2020-12-03  9:06         ` Yi Li
  2020-12-03  9:47           ` [PATCH v2] " Yi Li
  0 siblings, 1 reply; 11+ messages in thread
From: Yi Li @ 2020-12-03  9:06 UTC (permalink / raw)
  To: Coly Li; +Cc: Yi Li, kent.overstreet, linux-bcache, linux-kernel, Guo Chao

The root cause:

After just  cached_dev_free do cancel_writeback_rate_update_dwork
without bch_register_lock
.
at the same time. Wirting the writeback_percent by sysfs witch
bch_register_lock will insert a writeback_rate_update work.

cached_dev_free with bch_register_lock to do bcache_device_free.

(it is introduce by patch 80265d8dfd77792e133793cef44a21323aac2908)

pls:
1: run the shell script
#!/bin/bash
while [ true ]
do
        echo 0 > /sys/block/bcache0/bcache/writeback_percent
done

2: hotplug the cache disk


On 12/3/20, Coly Li <colyli@suse.de> wrote:
> On 12/3/20 2:25 PM, Yi Li wrote:
>>> On 12/1/20 12:35 PM, Yi Li wrote:
>>>> sorry, This patch will cause deadlock, i will check and redo it.
>>>
>>> Can you try latest upstream kernel firstly ? Before spending more time
>>> on the fix.
>>>
>>
>> This issue just happened three times (xenserver7.5 dom0 kernel) on the
>> same machine and cannot reproduce it now. and have not reproduce it
>> using the lastest uptream kernel.
>>
>
> Hmm, this is something very probably that I am not able to help. It
> seems the kernel is a third-part maintained Linux v4.4 based kernel +
> bcache backport, which is out of my view.
>
> If similar problem happens on latest upstream kernel, or at least v5.8+
> kernel, I can help to take a look.
>
>
>>> If I remember correctly, when cancel_writeback_rate_update_dwork() is
>>> not timed out, the cache set memory won't be freed before the
>>> writeback_rate_update worker terminates. It is possible that I miss
>>> something in the code, but I suggest to test with a kernel after v5.3,
>>> and better a v5.8+ kernel.
>>>
>>> Coly Li
>>>
>> Thanks.
>>
>> it is  confused that why writeback_rate_update worker run  again after
>> cancel_delayed_work_sync( kernel log telled).
>>
>
> [snipped]
>
> Coly Li
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2] bcache: fix panic due to cache_set is null
  2020-12-03  9:06         ` Yi Li
@ 2020-12-03  9:47           ` Yi Li
  2020-12-03 11:27             ` Coly Li
  0 siblings, 1 reply; 11+ messages in thread
From: Yi Li @ 2020-12-03  9:47 UTC (permalink / raw)
  To: colyli
  Cc: yilikernel, kent.overstreet, linux-bcache, linux-kernel, Yi Li, Guo Chao

bcache_device_detach will release the cache_set after hotunplug cache
disk.

Here is how the issue happens.
1) cached_dev_free do cancel_writeback_rate_update_dwork
   without bch_register_lock.
2) Wirting the writeback_percent by sysfs with
   bch_register_lock will insert a writeback_rate_update work.
3) cached_dev_free with bch_register_lock to do bcache_device_free.
   dc->disk.cl will be set NULL
4) update_writeback_rate will crash when access dc->disk.cl

Fixes: 80265d8dfd77 ("bcache: acquire bch_register_lock later in cached_dev_free()")

  IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
  PGD 879620067 PUD 8755d3067 PMD 0
  Oops: 0000 [#1] SMP
  CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
  Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017
  Workqueue: events update_writeback_rate [bcache]
  task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
  RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
  RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
  RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
  RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
  R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
  FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000) knlGS:0000000000000000
  CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
  Stack:
   0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
   ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
   ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
  Call Trace:
   [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
   [<ffffffff81089575>] worker_thread+0x2a5/0x470
   [<ffffffff815a2f58>] ? __schedule+0x648/0x870
   [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
   [<ffffffff8108e3d5>] kthread+0xd5/0xe0
   [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
   [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
   [<ffffffff8108e300>] ? kthread_stop+0x110/0x110

Reported-by: Guo Chao <guochao@winhong.com>
Signed-off-by: Yi Li <yili@winhong.com>
---
 drivers/md/bcache/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 46a00134a36a..8b341f756ac0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1334,9 +1334,6 @@ static void cached_dev_free(struct closure *cl)
 {
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
-	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
-		cancel_writeback_rate_update_dwork(dc);
-
 	if (!IS_ERR_OR_NULL(dc->writeback_thread))
 		kthread_stop(dc->writeback_thread);
 	if (!IS_ERR_OR_NULL(dc->status_update_thread))
@@ -1344,6 +1341,9 @@ static void cached_dev_free(struct closure *cl)
 
 	mutex_lock(&bch_register_lock);
 
+	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+		cancel_writeback_rate_update_dwork(dc);
+
 	if (atomic_read(&dc->running))
 		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
 	bcache_device_free(&dc->disk);
-- 
2.25.3




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] bcache: fix panic due to cache_set is null
  2020-12-03  9:47           ` [PATCH v2] " Yi Li
@ 2020-12-03 11:27             ` Coly Li
  2020-12-04  1:33               ` Yi Li
  0 siblings, 1 reply; 11+ messages in thread
From: Coly Li @ 2020-12-03 11:27 UTC (permalink / raw)
  To: Yi Li; +Cc: yilikernel, kent.overstreet, linux-bcache, linux-kernel, Guo Chao

On 12/3/20 5:47 PM, Yi Li wrote:
> bcache_device_detach will release the cache_set after hotunplug cache
> disk.
> 
> Here is how the issue happens.
> 1) cached_dev_free do cancel_writeback_rate_update_dwork
>    without bch_register_lock.
> 2) Wirting the writeback_percent by sysfs with
>    bch_register_lock will insert a writeback_rate_update work.
> 3) cached_dev_free with bch_register_lock to do bcache_device_free.
>    dc->disk.cl will be set NULL
> 4) update_writeback_rate will crash when access dc->disk.cl

The analysis makes sense, good catch! Thank you for make me understand
the problem.


> 
> Fixes: 80265d8dfd77 ("bcache: acquire bch_register_lock later in cached_dev_free()")
> 
>   IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>   PGD 879620067 PUD 8755d3067 PMD 0
>   Oops: 0000 [#1] SMP
>   CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
>   Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017
>   Workqueue: events update_writeback_rate [bcache]
>   task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
>   RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>   RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
>   RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
>   RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
>   RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
>   R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
>   R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
>   FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000) knlGS:0000000000000000
>   CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
>   CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
>   Stack:
>    0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
>    ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
>    ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
>   Call Trace:
>    [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
>    [<ffffffff81089575>] worker_thread+0x2a5/0x470
>    [<ffffffff815a2f58>] ? __schedule+0x648/0x870
>    [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
>    [<ffffffff8108e3d5>] kthread+0xd5/0xe0
>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>    [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
> 
> Reported-by: Guo Chao <guochao@winhong.com>
> Signed-off-by: Yi Li <yili@winhong.com>
> ---
>  drivers/md/bcache/super.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 46a00134a36a..8b341f756ac0 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1334,9 +1334,6 @@ static void cached_dev_free(struct closure *cl)
>  {
>  	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
>  
> -	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
> -		cancel_writeback_rate_update_dwork(dc);
> -
>  	if (!IS_ERR_OR_NULL(dc->writeback_thread))
>  		kthread_stop(dc->writeback_thread);
>  	if (!IS_ERR_OR_NULL(dc->status_update_thread))
> @@ -1344,6 +1341,9 @@ static void cached_dev_free(struct closure *cl)
>  
>  	mutex_lock(&bch_register_lock);
>  
> +	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
> +		cancel_writeback_rate_update_dwork(dc);
> +
>  	if (atomic_read(&dc->running))
>  		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
>  	bcache_device_free(&dc->disk);
> 

Such change is problematic, the writeback rate kworker mush stopped
before writeback and status_update thread, otherwise you may encounter
other problem.

And when I review your patch I find another similar potential problem.

This is tricky, let me think how to fix it ....

Thank you again, for catch such issue.

Coly Li


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] bcache: fix panic due to cache_set is null
  2020-12-03 11:27             ` Coly Li
@ 2020-12-04  1:33               ` Yi Li
  2020-12-04  5:54                 ` [PATCH v3] " Yi Li
  0 siblings, 1 reply; 11+ messages in thread
From: Yi Li @ 2020-12-04  1:33 UTC (permalink / raw)
  To: Coly Li; +Cc: Yi Li, kent.overstreet, linux-bcache, linux-kernel, Guo Chao

On 12/3/20, Coly Li <colyli@suse.de> wrote:
> On 12/3/20 5:47 PM, Yi Li wrote:
>> bcache_device_detach will release the cache_set after hotunplug cache
>> disk.
>>
>> Here is how the issue happens.
>> 1) cached_dev_free do cancel_writeback_rate_update_dwork
>>    without bch_register_lock.
>> 2) Wirting the writeback_percent by sysfs with
>>    bch_register_lock will insert a writeback_rate_update work.
>> 3) cached_dev_free with bch_register_lock to do bcache_device_free.
>>    dc->disk.cl will be set NULL
>> 4) update_writeback_rate will crash when access dc->disk.cl
>
> The analysis makes sense, good catch! Thank you for make me understand
> the problem.
>
>
>>
>> Fixes: 80265d8dfd77 ("bcache: acquire bch_register_lock later in
>> cached_dev_free()")
>>
>>   IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
>>   PGD 879620067 PUD 8755d3067 PMD 0
>>   Oops: 0000 [#1] SMP
>>   CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
>>   Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601
>> 03/21/2017
>>   Workqueue: events update_writeback_rate [bcache]
>>   task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
>>   RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0
>> [bcache]
>>   RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
>>   RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
>>   RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
>>   RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
>>   R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
>>   R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
>>   FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000)
>> knlGS:0000000000000000
>>   CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
>>   CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
>>   Stack:
>>    0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
>>    ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
>>    ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
>>   Call Trace:
>>    [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
>>    [<ffffffff81089575>] worker_thread+0x2a5/0x470
>>    [<ffffffff815a2f58>] ? __schedule+0x648/0x870
>>    [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
>>    [<ffffffff8108e3d5>] kthread+0xd5/0xe0
>>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>>    [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
>>    [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
>>
>> Reported-by: Guo Chao <guochao@winhong.com>
>> Signed-off-by: Yi Li <yili@winhong.com>
>> ---
>>  drivers/md/bcache/super.c | 6 +++---
>>  1 file changed, 3 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
>> index 46a00134a36a..8b341f756ac0 100644
>> --- a/drivers/md/bcache/super.c
>> +++ b/drivers/md/bcache/super.c
>> @@ -1334,9 +1334,6 @@ static void cached_dev_free(struct closure *cl)
>>  {
>>  	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
>>
>> -	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
>> -		cancel_writeback_rate_update_dwork(dc);
>> -
>>  	if (!IS_ERR_OR_NULL(dc->writeback_thread))
>>  		kthread_stop(dc->writeback_thread);
>>  	if (!IS_ERR_OR_NULL(dc->status_update_thread))
>> @@ -1344,6 +1341,9 @@ static void cached_dev_free(struct closure *cl)
>>
>>  	mutex_lock(&bch_register_lock);
>>
>> +	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
>> +		cancel_writeback_rate_update_dwork(dc);
>> +
>>  	if (atomic_read(&dc->running))
>>  		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
>>  	bcache_device_free(&dc->disk);
>>
>
> Such change is problematic, the writeback rate kworker mush stopped
> before writeback and status_update thread, otherwise you may encounter
> other problem.
>
enn, It is possible that I miss something.

1: writeback_rate_update work will add to the system_wq by
schedule_delayed_work.

2: The issue 80265d8dfd77 --" After moving
mutex_lock(&bch_register_lock) to a later location where before
atomic_read(&dc->running) in cached_dev_free()" .


> And when I review your patch I find another similar potential problem.
>
> This is tricky, let me think how to fix it ....
>
> Thank you again, for catch such issue.
>
> Coly Li
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v3] bcache: fix panic due to cache_set is null
  2020-12-04  1:33               ` Yi Li
@ 2020-12-04  5:54                 ` Yi Li
  0 siblings, 0 replies; 11+ messages in thread
From: Yi Li @ 2020-12-04  5:54 UTC (permalink / raw)
  To: colyli
  Cc: yilikernel, kent.overstreet, linux-bcache, linux-kernel, Yi Li, Guo Chao

bcache_device_detach will release the cache_set after hotunplug cache
disk.

Here is how the issue happens.
1) cached_dev_free do cancel_writeback_rate_update_dwork
   without bch_register_lock.
2) Wirting the writeback_percent by sysfs with
   bch_register_lock will insert a writeback_rate_update work.
3) cached_dev_free with bch_register_lock to do bcache_device_free.
   dc->disk.c will be set NULL.
4) update_writeback_rate will crash when access dc->disk.c.

After Patch:
1) cached_dev_free do cancel_writeback_rate_update_dwork and set dc->disk.c
   to NULL with bch_register_lock.
2) dc->disk.c = NULL will avoid that Wirting the writeback_percent by sysfs
   insert a writeback_rate_update work.

Fixes: 80265d8dfd77 ("bcache: acquire bch_register_lock later in cached_dev_free()")

  IP: [<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
  PGD 879620067 PUD 8755d3067 PMD 0
  Oops: 0000 [#1] SMP
  CPU: 8 PID: 1005702 Comm: kworker/8:0 Tainted: G 4.4.0+10 #1
  Hardware name: Intel BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017
  Workqueue: events update_writeback_rate [bcache]
  task: ffff8808786f3800 ti: ffff88077082c000 task.ti: ffff88077082c000
  RIP: e030:[<ffffffffa03730c9>] update_writeback_rate+0x59/0x3a0 [bcache]
  RSP: e02b:ffff88077082fde0  EFLAGS: 00010202
  RAX: 0000000000000018 RBX: ffff8808047f0b08 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffff88088170dab8 RDI: ffff88088170dab8
  RBP: ffff88077082fe18 R08: 000000000000000a R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000017bc8 R12: 0000000000000000
  R13: ffff8808047f0000 R14: 0000000000000200 R15: ffff8808047f0b08
  FS:  00007f157b6d6700(0000) GS:ffff880881700000(0000) knlGS:0000000000000000
  CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000368 CR3: 0000000875c05000 CR4: 0000000000040660
  Stack:
   0000000000000001 0000000000007ff0 ffff88085ff600c0 ffff880881714e80
   ffff880881719500 0000000000000200 ffff8808047f0b08 ffff88077082fe60
   ffffffff81088c0c 0000000081714e80 0000000000000000 ffff880881714e80
  Call Trace:
   [<ffffffff81088c0c>] process_one_work+0x1fc/0x3b0
   [<ffffffff81089575>] worker_thread+0x2a5/0x470
   [<ffffffff815a2f58>] ? __schedule+0x648/0x870
   [<ffffffff810892d0>] ? rescuer_thread+0x300/0x300
   [<ffffffff8108e3d5>] kthread+0xd5/0xe0
   [<ffffffff8108e300>] ? kthread_stop+0x110/0x110
   [<ffffffff815a704f>] ret_from_fork+0x3f/0x70
   [<ffffffff8108e300>] ? kthread_stop+0x110/0x110

Reported-by: Guo Chao <guochao@winhong.com>
Signed-off-by: Yi Li <yili@winhong.com>
---
 drivers/md/bcache/super.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 46a00134a36a..381f9fbcd765 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1122,9 +1122,6 @@ static void cached_dev_detach_finish(struct work_struct *w)
 	BUG_ON(refcount_read(&dc->count));
 
 
-	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
-		cancel_writeback_rate_update_dwork(dc);
-
 	if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
 		kthread_stop(dc->writeback_thread);
 		dc->writeback_thread = NULL;
@@ -1138,6 +1135,9 @@ static void cached_dev_detach_finish(struct work_struct *w)
 
 	mutex_lock(&bch_register_lock);
 
+	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+		cancel_writeback_rate_update_dwork(dc);
+
 	calc_cached_dev_sectors(dc->disk.c);
 	bcache_device_detach(&dc->disk);
 	list_move(&dc->list, &uncached_devices);
@@ -1334,9 +1334,6 @@ static void cached_dev_free(struct closure *cl)
 {
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
-	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
-		cancel_writeback_rate_update_dwork(dc);
-
 	if (!IS_ERR_OR_NULL(dc->writeback_thread))
 		kthread_stop(dc->writeback_thread);
 	if (!IS_ERR_OR_NULL(dc->status_update_thread))
@@ -1344,6 +1341,9 @@ static void cached_dev_free(struct closure *cl)
 
 	mutex_lock(&bch_register_lock);
 
+	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+		cancel_writeback_rate_update_dwork(dc);
+
 	if (atomic_read(&dc->running))
 		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
 	bcache_device_free(&dc->disk);
-- 
2.25.3




^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2020-12-04  6:01 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-30 11:21 [PATCH] bcache: fix panic due to cache_set is null Yi Li
2020-12-01  4:35 ` Yi Li
2020-12-01 16:42   ` Coly Li
2020-12-03  6:25     ` Yi Li
2020-12-03  6:56       ` Coly Li
2020-12-03  9:06         ` Yi Li
2020-12-03  9:47           ` [PATCH v2] " Yi Li
2020-12-03 11:27             ` Coly Li
2020-12-04  1:33               ` Yi Li
2020-12-04  5:54                 ` [PATCH v3] " Yi Li
2020-12-01 16:38 ` [PATCH] " Coly Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).