nvdimm.lists.linux.dev archive mirror
 help / color / mirror / Atom feed
* task ndctl:5155 blocked for more than 120 seconds observed during pmem/btt/dax switch test
       [not found] <895281518.353931.1493045976821.JavaMail.zimbra@redhat.com>
@ 2017-04-24 15:13 ` Yi Zhang
  2017-04-29  5:35   ` Dan Williams
  0 siblings, 1 reply; 6+ messages in thread
From: Yi Zhang @ 2017-04-24 15:13 UTC (permalink / raw)
  To: linux-nvdimm

Hello

I reproduced an ndctl hang on 4.11.0-rc8. Here are the reproduction steps and the kernel log; could you help check it? Thanks.

Reproduce steps:
function pmem_btt_dax_switch() {
        sector_size_list="512 520 528 4096 4104 4160 4224"
        for sector_size in $sector_size_list; do
                ndctl create-namespace -f -e namespace${1}.0 --mode=sector -l $sector_size
                ndctl create-namespace -f -e namespace${1}.0 --mode=raw
		ndctl create-namespace -f -e namespace${1}.0 --mode=dax
        done
}
for i in 0 1 2 3; do
        pmem_btt_dax_switch $i &
done

kernel log:
[ 6026.482747] INFO: task ndctl:5155 blocked for more than 120 seconds.
[ 6026.514573]       Not tainted 4.11.0-rc8 #1
[ 6026.535467] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 6026.573932] ndctl           D    0  5155   5154 0x00000080
[ 6026.600220] Call Trace:
[ 6026.611766]  __schedule+0x289/0x8f0
[ 6026.628026]  schedule+0x36/0x80
[ 6026.642725]  schedule_preempt_disabled+0xe/0x10
[ 6026.663804]  __mutex_lock.isra.8+0x266/0x500
[ 6026.683820]  ? mntput+0x24/0x40
[ 6026.698596]  __mutex_lock_slowpath+0x13/0x20
[ 6026.718558]  mutex_lock+0x2f/0x40
[ 6026.734046]  region_size_show+0x20/0x70 [dax]
[ 6026.754563]  dev_attr_show+0x20/0x50
[ 6026.771246]  ? mutex_lock+0x12/0x40
[ 6026.787201]  sysfs_kf_seq_show+0xbf/0x1a0
[ 6026.805510]  kernfs_seq_show+0x21/0x30
[ 6026.823174]  seq_read+0x115/0x390
[ 6026.838263]  ? do_filp_open+0xa5/0x100
[ 6026.855906]  kernfs_fop_read+0xff/0x180
[ 6026.873983]  __vfs_read+0x37/0x150
[ 6026.889786]  ? security_file_permission+0x9d/0xc0
[ 6026.911642]  vfs_read+0x8c/0x130
[ 6026.926874]  SyS_read+0x55/0xc0
[ 6026.941636]  do_syscall_64+0x67/0x180
[ 6026.959003]  entry_SYSCALL64_slow_path+0x25/0x25
[ 6026.980692] RIP: 0033:0x7f24eba9c7e0
[ 6026.999534] RSP: 002b:00007fff94cbb658 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[ 6027.035833] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f24eba9c7e0
[ 6027.071099] RDX: 0000000000000400 RSI: 00007fff94cbb680 RDI: 0000000000000004
[ 6027.106350] RBP: 0000000001d784e0 R08: 00007f24eb9fb988 R09: 0000000000000027
[ 6027.141119] R10: 000000000000000a R11: 0000000000000246 R12: 00007fff94cbb680
[ 6027.175009] R13: 0000000001d73270 R14: 00007fff94cbb680 R15: 0000000001d7b333
[ 6027.208899] INFO: task ndctl:5164 blocked for more than 120 seconds.
[ 6027.238487]       Not tainted 4.11.0-rc8 #1
[ 6027.258025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 6027.296084] ndctl           D    0  5164   5163 0x00000080
[ 6027.321726] Call Trace:
[ 6027.333199]  __schedule+0x289/0x8f0
[ 6027.349688]  schedule+0x36/0x80
[ 6027.364463]  schedule_preempt_disabled+0xe/0x10
[ 6027.385667]  __mutex_lock.isra.8+0x266/0x500
[ 6027.405824]  ? refcount_dec_and_test+0x11/0x20
[ 6027.426656]  ? wait_probe_show+0x70/0x70 [libnvdimm]
[ 6027.449966]  __mutex_lock_slowpath+0x13/0x20
[ 6027.470000]  mutex_lock+0x2f/0x40
[ 6027.485369]  flush_regions_dimms+0x1b/0x40 [libnvdimm]
[ 6027.509549]  device_for_each_child+0x50/0x90
[ 6027.529466]  wait_probe_show+0x46/0x70 [libnvdimm]
[ 6027.551543]  dev_attr_show+0x20/0x50
[ 6027.569666]  ? mutex_lock+0x12/0x40
[ 6027.586494]  sysfs_kf_seq_show+0xbf/0x1a0
[ 6027.607243]  kernfs_seq_show+0x21/0x30
[ 6027.625886]  seq_read+0x115/0x390
[ 6027.641497]  ? do_filp_open+0xa5/0x100
[ 6027.659110]  kernfs_fop_read+0xff/0x180
[ 6027.677120]  __vfs_read+0x37/0x150
[ 6027.692972]  ? security_file_permission+0x9d/0xc0
[ 6027.714948]  vfs_read+0x8c/0x130
[ 6027.730083]  SyS_read+0x55/0xc0
[ 6027.745087]  do_syscall_64+0x67/0x180
[ 6027.762273]  entry_SYSCALL64_slow_path+0x25/0x25
[ 6027.784092] RIP: 0033:0x7f08e08527e0
[ 6027.800715] RSP: 002b:00007fff5ffcd358 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[ 6027.836082] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f08e08527e0
[ 6027.869667] RDX: 0000000000000400 RSI: 00007fff5ffcd380 RDI: 0000000000000003
[ 6027.904697] RBP: 0000000000000000 R08: 00007f08e07b1988 R09: 0000000000000046
[ 6027.938016] R10: 0000000000000046 R11: 0000000000000246 R12: 00007fff5ffcd380
[ 6027.970932] R13: 0000000000000000 R14: 0000000000001388 R15: 00007fff5ffcd380
[ 6028.004331] INFO: task ndctl:5172 blocked for more than 120 seconds.
[ 6028.034311]       Not tainted 4.11.0-rc8 #1
[ 6028.053796] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 6028.092317] ndctl           D    0  5172   5171 0x00000080
[ 6028.120694] Call Trace:
[ 6028.132134]  __schedule+0x289/0x8f0
[ 6028.148496]  schedule+0x36/0x80
[ 6028.163221]  schedule_preempt_disabled+0xe/0x10
[ 6028.184498]  __mutex_lock.isra.8+0x266/0x500
[ 6028.204502]  ? refcount_dec_and_test+0x11/0x20
[ 6028.225383]  ? wait_probe_show+0x70/0x70 [libnvdimm]
[ 6028.248818]  __mutex_lock_slowpath+0x13/0x20
[ 6028.268915]  mutex_lock+0x2f/0x40
[ 6028.284572]  flush_regions_dimms+0x1b/0x40 [libnvdimm]
[ 6028.308483]  device_for_each_child+0x50/0x90
[ 6028.328625]  wait_probe_show+0x46/0x70 [libnvdimm]
[ 6028.351106]  dev_attr_show+0x20/0x50
[ 6028.367457]  ? mutex_lock+0x12/0x40
[ 6028.383180]  sysfs_kf_seq_show+0xbf/0x1a0
[ 6028.401459]  kernfs_seq_show+0x21/0x30
[ 6028.418997]  seq_read+0x115/0x390
[ 6028.434451]  ? do_filp_open+0xa5/0x100
[ 6028.451975]  kernfs_fop_read+0xff/0x180
[ 6028.469849]  __vfs_read+0x37/0x150
[ 6028.485746]  ? security_file_permission+0x9d/0xc0
[ 6028.507435]  vfs_read+0x8c/0x130
[ 6028.522452]  SyS_read+0x55/0xc0
[ 6028.537079]  do_syscall_64+0x67/0x180
[ 6028.554153]  entry_SYSCALL64_slow_path+0x25/0x25
[ 6028.575778] RIP: 0033:0x7eff768387e0
[ 6028.592970] RSP: 002b:00007ffcf5367668 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[ 6028.631343] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007eff768387e0
[ 6028.664793] RDX: 0000000000000400 RSI: 00007ffcf5367690 RDI: 0000000000000003
[ 6028.698191] RBP: 0000000000000000 R08: 00007eff76797988 R09: 0000000000000046
[ 6028.731690] R10: 0000000000000046 R11: 0000000000000246 R12: 00007ffcf5367690
[ 6028.765029] R13: 0000000000000000 R14: 0000000000001388 R15: 00007ffcf5367690
[ 6028.798470] INFO: task ndctl:5180 blocked for more than 120 seconds.
[ 6028.828412]       Not tainted 4.11.0-rc8 #1
[ 6028.848058] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 6028.884846] ndctl           D    0  5180   5179 0x00000080
[ 6028.910311] Call Trace:
[ 6028.921891]  __schedule+0x289/0x8f0
[ 6028.938354]  schedule+0x36/0x80
[ 6028.952914]  __kernfs_remove+0x169/0x220
[ 6028.971210]  ? remove_wait_queue+0x60/0x60
[ 6028.990431]  kernfs_remove_by_name_ns+0x43/0xa0
[ 6029.011866]  remove_files.isra.1+0x36/0x70
[ 6029.032520]  sysfs_remove_group+0x44/0x90
[ 6029.051185]  sysfs_remove_groups+0x2e/0x50
[ 6029.070831]  dax_region_unregister+0x21/0x40 [dax]
[ 6029.093260]  devm_action_release+0xf/0x20
[ 6029.113529]  release_nodes+0x218/0x260
[ 6029.132924]  devres_release_all+0x3c/0x60
[ 6029.152249]  device_release_driver_internal+0x151/0x1f0
[ 6029.176701]  device_release_driver+0x12/0x20
[ 6029.196651]  unbind_store+0xba/0xe0
[ 6029.213026]  drv_attr_store+0x24/0x30
[ 6029.229987]  sysfs_kf_write+0x3a/0x50
[ 6029.247412]  kernfs_fop_write+0xff/0x180
[ 6029.265909]  __vfs_write+0x37/0x160
[ 6029.282231]  ? selinux_file_permission+0xe5/0x120
[ 6029.304504]  ? security_file_permission+0x3b/0xc0
[ 6029.326647]  vfs_write+0xb2/0x1b0
[ 6029.341929]  ? syscall_trace_enter+0x1d0/0x2b0
[ 6029.362863]  SyS_write+0x55/0xc0
[ 6029.377955]  do_syscall_64+0x67/0x180
[ 6029.395080]  entry_SYSCALL64_slow_path+0x25/0x25
[ 6029.416677] RIP: 0033:0x7f83a79b7840
[ 6029.433311] RSP: 002b:00007ffca25e4198 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 6029.468729] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f83a79b7840
[ 6029.502221] RDX: 0000000000000007 RSI: 00000000016deb90 RDI: 0000000000000003
[ 6029.535277] RBP: 00000000016deb90 R08: 00007f83a7916988 R09: 0000000000000046
[ 6029.568341] R10: 00007ffca25e3eb0 R11: 0000000000000246 R12: 0000000000000007
[ 6029.601701] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000002



#ps aux | grep ndctl
root      5155  0.0  0.0  41576  3044 pts/0    D+   10:53   0:00 ndctl create-namespace -f -e namespace2.0 --mode=dax
root      5164  0.0  0.0  41576  3040 pts/0    D+   10:53   0:00 ndctl create-namespace -f -e namespace0.0 --mode=dax
root      5172  0.1  0.0  41576  3024 pts/0    D+   10:53   0:00 ndctl create-namespace -f -e namespace3.0 --mode=dax
root      5180  0.0  0.0  41576  3036 pts/0    D+   10:53   0:00 ndctl create-namespace -f -e namespace1.0 --mode=sector -l 528

# cat /proc/5155/stack 
[<ffffffffc096f320>] region_size_show+0x20/0x70 [dax]
[<ffffffffbeae2fb0>] dev_attr_show+0x20/0x50
[<ffffffffbe8ca08f>] sysfs_kf_seq_show+0xbf/0x1a0
[<ffffffffbe8c8741>] kernfs_seq_show+0x21/0x30
[<ffffffffbe866f65>] seq_read+0x115/0x390
[<ffffffffbe8c8ebf>] kernfs_fop_read+0xff/0x180
[<ffffffffbe83ebe7>] __vfs_read+0x37/0x150
[<ffffffffbe83fb2c>] vfs_read+0x8c/0x130
[<ffffffffbe841105>] SyS_read+0x55/0xc0
[<ffffffffbe603a47>] do_syscall_64+0x67/0x180
[<ffffffffbed5602b>] entry_SYSCALL64_slow_path+0x25/0x25
[<ffffffffffffffff>] 0xffffffffffffffff

# cat /proc/5164/stack 
[<ffffffffc0bf720b>] flush_regions_dimms+0x1b/0x40 [libnvdimm]
[<ffffffffbeae2b30>] device_for_each_child+0x50/0x90
[<ffffffffc0bf71c6>] wait_probe_show+0x46/0x70 [libnvdimm]
[<ffffffffbeae2fb0>] dev_attr_show+0x20/0x50
[<ffffffffbe8ca08f>] sysfs_kf_seq_show+0xbf/0x1a0
[<ffffffffbe8c8741>] kernfs_seq_show+0x21/0x30
[<ffffffffbe866f65>] seq_read+0x115/0x390
[<ffffffffbe8c8ebf>] kernfs_fop_read+0xff/0x180
[<ffffffffbe83ebe7>] __vfs_read+0x37/0x150
[<ffffffffbe83fb2c>] vfs_read+0x8c/0x130
[<ffffffffbe841105>] SyS_read+0x55/0xc0
[<ffffffffbe603a47>] do_syscall_64+0x67/0x180
[<ffffffffbed5602b>] entry_SYSCALL64_slow_path+0x25/0x25
[<ffffffffffffffff>] 0xffffffffffffffff

# cat /proc/5172/stack 
[<ffffffffc0bf720b>] flush_regions_dimms+0x1b/0x40 [libnvdimm]
[<ffffffffbeae2b30>] device_for_each_child+0x50/0x90
[<ffffffffc0bf71c6>] wait_probe_show+0x46/0x70 [libnvdimm]
[<ffffffffbeae2fb0>] dev_attr_show+0x20/0x50
[<ffffffffbe8ca08f>] sysfs_kf_seq_show+0xbf/0x1a0
[<ffffffffbe8c8741>] kernfs_seq_show+0x21/0x30
[<ffffffffbe866f65>] seq_read+0x115/0x390
[<ffffffffbe8c8ebf>] kernfs_fop_read+0xff/0x180
[<ffffffffbe83ebe7>] __vfs_read+0x37/0x150
[<ffffffffbe83fb2c>] vfs_read+0x8c/0x130
[<ffffffffbe841105>] SyS_read+0x55/0xc0
[<ffffffffbe603a47>] do_syscall_64+0x67/0x180
[<ffffffffbed5602b>] entry_SYSCALL64_slow_path+0x25/0x25
[<ffffffffffffffff>] 0xffffffffffffffff

# cat /proc/5180/stack 
[<ffffffffbe8c7669>] __kernfs_remove+0x169/0x220
[<ffffffffbe8c8523>] kernfs_remove_by_name_ns+0x43/0xa0
[<ffffffffbe8cad26>] remove_files.isra.1+0x36/0x70
[<ffffffffbe8cb0e4>] sysfs_remove_group+0x44/0x90
[<ffffffffbe8cb1de>] sysfs_remove_groups+0x2e/0x50
[<ffffffffc09700a1>] dax_region_unregister+0x21/0x40 [dax]
[<ffffffffbeaec2ef>] devm_action_release+0xf/0x20
[<ffffffffbeaed038>] release_nodes+0x218/0x260
[<ffffffffbeaed28c>] devres_release_all+0x3c/0x60
[<ffffffffbeae8d71>] device_release_driver_internal+0x151/0x1f0
[<ffffffffbeae8e22>] device_release_driver+0x12/0x20
[<ffffffffbeae6a3a>] unbind_store+0xba/0xe0
[<ffffffffbeae6034>] drv_attr_store+0x24/0x30
[<ffffffffbe8c9c3a>] sysfs_kf_write+0x3a/0x50
[<ffffffffbe8c971f>] kernfs_fop_write+0xff/0x180
[<ffffffffbe83ed37>] __vfs_write+0x37/0x160
[<ffffffffbe83fc82>] vfs_write+0xb2/0x1b0
[<ffffffffbe8411c5>] SyS_write+0x55/0xc0
[<ffffffffbe603a47>] do_syscall_64+0x67/0x180
[<ffffffffbed5602b>] entry_SYSCALL64_slow_path+0x25/0x25
[<ffffffffffffffff>] 0xffffffffffffffff


Best Regards,
  Yi Zhang


_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: task ndctl:5155 blocked for more than 120 seconds observed during pmem/btt/dax switch test
  2017-04-24 15:13 ` task ndctl:5155 blocked for more than 120 seconds observed during pmem/btt/dax switch test Yi Zhang
@ 2017-04-29  5:35   ` Dan Williams
  2017-04-30  9:16     ` Yi Zhang
  0 siblings, 1 reply; 6+ messages in thread
From: Dan Williams @ 2017-04-29  5:35 UTC (permalink / raw)
  To: Yi Zhang; +Cc: linux-nvdimm

[-- Attachment #1: Type: text/plain, Size: 1046 bytes --]

On Mon, Apr 24, 2017 at 8:13 AM, Yi Zhang <yizhan@redhat.com> wrote:
> Hello
>
> I reproduced ndctl blocked issue on 4.11.0-rc8, here is the reproduce steps and kernel log, could you help check it? Thanks.
>
> Reproduce steps:
> function pmem_btt_dax_switch() {
>         sector_size_list="512 520 528 4096 4104 4160 4224"
>         for sector_size in $sector_size_list; do
>                 ndctl create-namespace -f -e namespace${1}.0 --mode=sector -l $sector_size
>                 ndctl create-namespace -f -e namespace${1}.0 --mode=raw
>                 ndctl create-namespace -f -e namespace${1}.0 --mode=dax
>         done
> }
> for i in 0 1 2 3; do
>         pmem_btt_dax_switch $i &
> done

Thanks for the report!

I couldn't run your script directly, do you have 4 memmap= regions
defined, or...?

I was able to find a locking problem with a debug patch that turned on
lockdep coverage for the device_lock(). Can you give the attached
patch a try to see if it resolves your lockup?

This is against latest nvdimm.git/libnvdimm-for-next

[-- Attachment #2: 0001-libnvdimm-fix-nvdimm_bus_lock-vs-device_lock-orderin.patch --]
[-- Type: text/x-patch, Size: 6036 bytes --]

From c71720d805401ebfc5d53ed1c4241ec566554e60 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Apr 2017 22:05:14 -0700
Subject: [PATCH] libnvdimm: fix nvdimm_bus_lock() vs device_lock() ordering

A debug patch to turn the standard device_lock() into something that
lockdep can analyze yielded the following:

 ======================================================
 [ INFO: possible circular locking dependency detected ]
 4.11.0-rc4+ #106 Tainted: G           O
 -------------------------------------------------------
 lt-libndctl/1898 is trying to acquire lock:
  (&dev->nvdimm_mutex/3){+.+.+.}, at: [<ffffffffc023c948>] nd_attach_ndns+0x178/0x1b0 [libnvdimm]

 but task is already holding lock:
  (&nvdimm_bus->reconfig_mutex){+.+.+.}, at: [<ffffffffc022e0b1>] nvdimm_bus_lock+0x21/0x30 [libnvdimm]

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #1 (&nvdimm_bus->reconfig_mutex){+.+.+.}:
        lock_acquire+0xf6/0x1f0
        __mutex_lock+0x88/0x980
        mutex_lock_nested+0x1b/0x20
        nvdimm_bus_lock+0x21/0x30 [libnvdimm]
        nvdimm_namespace_capacity+0x1b/0x40 [libnvdimm]
        nvdimm_namespace_common_probe+0x230/0x510 [libnvdimm]
        nd_pmem_probe+0x14/0x180 [nd_pmem]
        nvdimm_bus_probe+0xa9/0x260 [libnvdimm]

 -> #0 (&dev->nvdimm_mutex/3){+.+.+.}:
        __lock_acquire+0x1107/0x1280
        lock_acquire+0xf6/0x1f0
        __mutex_lock+0x88/0x980
        mutex_lock_nested+0x1b/0x20
        nd_attach_ndns+0x178/0x1b0 [libnvdimm]
        nd_namespace_store+0x308/0x3c0 [libnvdimm]
        namespace_store+0x87/0x220 [libnvdimm]

In this case '&dev->nvdimm_mutex/3' mirrors '&dev->mutex'.

Fix this by replacing the use of device_lock() with nvdimm_bus_lock() to protect
nd_{attach,detach}_ndns() operations.

Cc: <stable@vger.kernel.org>
Fixes: 8c2f7e8658df ("libnvdimm: infrastructure for btt devices")
Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt_devs.c |  2 +-
 drivers/nvdimm/claim.c    | 23 +++++++++++++++--------
 drivers/nvdimm/dax_devs.c |  2 +-
 drivers/nvdimm/pfn_devs.c |  2 +-
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 97dd2925ed6e..4b76af2b8715 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -314,7 +314,7 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns)
 	if (rc < 0) {
 		struct nd_btt *nd_btt = to_nd_btt(btt_dev);
 
-		__nd_detach_ndns(btt_dev, &nd_btt->ndns);
+		nd_detach_ndns(btt_dev, &nd_btt->ndns);
 		put_device(btt_dev);
 	}
 
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 2c19bc7fc056..35b210dc1e56 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -21,8 +21,13 @@
 void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns)
 {
 	struct nd_namespace_common *ndns = *_ndns;
+	struct nvdimm_bus *nvdimm_bus;
 
-	lockdep_assert_held(&ndns->dev.mutex);
+	if (!ndns)
+		return;
+
+	nvdimm_bus = walk_to_nvdimm_bus(&ndns->dev);
+	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
 	dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__);
 	ndns->claim = NULL;
 	*_ndns = NULL;
@@ -37,18 +42,20 @@ void nd_detach_ndns(struct device *dev,
 	if (!ndns)
 		return;
 	get_device(&ndns->dev);
-	device_lock(&ndns->dev);
+	nvdimm_bus_lock(&ndns->dev);
 	__nd_detach_ndns(dev, _ndns);
-	device_unlock(&ndns->dev);
+	nvdimm_bus_unlock(&ndns->dev);
 	put_device(&ndns->dev);
 }
 
 bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
 		struct nd_namespace_common **_ndns)
 {
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&attach->dev);
+
 	if (attach->claim)
 		return false;
-	lockdep_assert_held(&attach->dev.mutex);
+	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
 	dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__);
 	attach->claim = dev;
 	*_ndns = attach;
@@ -61,9 +68,9 @@ bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
 {
 	bool claimed;
 
-	device_lock(&attach->dev);
+	nvdimm_bus_lock(&attach->dev);
 	claimed = __nd_attach_ndns(dev, attach, _ndns);
-	device_unlock(&attach->dev);
+	nvdimm_bus_unlock(&attach->dev);
 	return claimed;
 }
 
@@ -114,7 +121,7 @@ static void nd_detach_and_reset(struct device *dev,
 		struct nd_namespace_common **_ndns)
 {
 	/* detach the namespace and destroy / reset the device */
-	nd_detach_ndns(dev, _ndns);
+	__nd_detach_ndns(dev, _ndns);
 	if (is_idle(dev, *_ndns)) {
 		nd_device_unregister(dev, ND_ASYNC);
 	} else if (is_nd_btt(dev)) {
@@ -184,7 +191,7 @@ ssize_t nd_namespace_store(struct device *dev,
 	}
 
 	WARN_ON_ONCE(!is_nvdimm_bus_locked(dev));
-	if (!nd_attach_ndns(dev, ndns, _ndns)) {
+	if (!__nd_attach_ndns(dev, ndns, _ndns)) {
 		dev_dbg(dev, "%s already claimed\n",
 				dev_name(&ndns->dev));
 		len = -EBUSY;
diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c
index 45fa82cae87c..c1b6556aea6e 100644
--- a/drivers/nvdimm/dax_devs.c
+++ b/drivers/nvdimm/dax_devs.c
@@ -124,7 +124,7 @@ int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns)
 	dev_dbg(dev, "%s: dax: %s\n", __func__,
 			rc == 0 ? dev_name(dax_dev) : "<none>");
 	if (rc < 0) {
-		__nd_detach_ndns(dax_dev, &nd_pfn->ndns);
+		nd_detach_ndns(dax_dev, &nd_pfn->ndns);
 		put_device(dax_dev);
 	} else
 		__nd_device_register(dax_dev);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 6c033c9a2f06..c38566f4da7d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -484,7 +484,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
 	dev_dbg(dev, "%s: pfn: %s\n", __func__,
 			rc == 0 ? dev_name(pfn_dev) : "<none>");
 	if (rc < 0) {
-		__nd_detach_ndns(pfn_dev, &nd_pfn->ndns);
+		nd_detach_ndns(pfn_dev, &nd_pfn->ndns);
 		put_device(pfn_dev);
 	} else
 		__nd_device_register(pfn_dev);
-- 
2.9.3


[-- Attachment #3: Type: text/plain, Size: 151 bytes --]

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: task ndctl:5155 blocked for more than 120 seconds observed during pmem/btt/dax switch test
  2017-04-29  5:35   ` Dan Williams
@ 2017-04-30  9:16     ` Yi Zhang
  2017-04-30 14:21       ` [PATCH] device-dax: fix sysfs attribute deadlock Dan Williams
  0 siblings, 1 reply; 6+ messages in thread
From: Yi Zhang @ 2017-04-30  9:16 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-nvdimm

On 04/29/2017 01:35 PM, Dan Williams wrote:
> On Mon, Apr 24, 2017 at 8:13 AM, Yi Zhang <yizhan@redhat.com> wrote:
>> Hello
>>
>> I reproduced ndctl blocked issue on 4.11.0-rc8, here is the reproduce steps and kernel log, could you help check it? Thanks.
>>
>> Reproduce steps:
>> function pmem_btt_dax_switch() {
>>          sector_size_list="512 520 528 4096 4104 4160 4224"
>>          for sector_size in $sector_size_list; do
>>                  ndctl create-namespace -f -e namespace${1}.0 --mode=sector -l $sector_size
>>                  ndctl create-namespace -f -e namespace${1}.0 --mode=raw
>>                  ndctl create-namespace -f -e namespace${1}.0 --mode=dax
>>          done
>> }
>> for i in 0 1 2 3; do
>>          pmem_btt_dax_switch $i &
>> done
> Thanks for the report!
>
> I couldn't run your script directly, do you have 4 memmap= regions
> defined, or...?
Here is my environment [1]; please try script [2].
[1]
# ndctl list -NB
{
   "provider":"ACPI.NFIT",
   "dev":"ndbus0",
   "namespaces":[
     {
       "dev":"namespace1.0",
       "mode":"dax",
       "size":8453619712,
       "uuid":"83bb25db-4a60-4613-a1c8-89d0f8f68c0d"
     },
     {
       "dev":"namespace3.0",
       "mode":"dax",
       "size":8453619712,
       "uuid":"243ab8f4-1d29-43c3-b5f1-be2e470b3a85"
     },
     {
       "dev":"namespace0.0",
       "mode":"dax",
       "size":8453619712,
       "uuid":"38d3a3ad-b6e2-4e4f-bf58-b793f9c039ea"
     },
     {
       "dev":"namespace2.0",
       "mode":"dax",
       "size":8453619712,
       "uuid":"f1887ded-7f8b-4fe0-bacb-303f88584711"
     }
   ]
}

[2]
#!/bin/bash
function pmem_btt_switch() {
         sector_size_list="512 520 528 4096 4104 4160 4224"
         for sector_size in $sector_size_list; do
                 ndctl create-namespace -f -e namespace${1}.0 --mode=sector -l $sector_size
                 ndctl create-namespace -f -e namespace${1}.0 --mode=raw
                 ndctl create-namespace -f -e namespace${1}.0 --mode=dax
         done
}
num=0
while [ $num -lt 500 ]; do
         for i in 0 1 2 3; do
                 pmem_btt_switch $i &
         done
         wait
         ((num++))
done

> I was able to find a locking problem with a debug patch that turned on
> lockdep coverage for the device_lock(). Can you give the attached
> patch a try to see if it resolves your lockup?
>
> This is against latest nvdimm.git/libnvdimm-for-next
I tried the attached patch against the code below [3]; the issue still reproduces. Please check the log below [4].
[3]
git clone -b libnvdimm-for-next git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git
[4]
[  637.673117] nd_pmem btt3.0: No existing arenas
[  637.673118] nd_pmem btt1.0: No existing arenas
[  637.692493] pmem1s: detected capacity change from 0 to 7582678528
[  637.694980] pmem0: detected capacity change from 0 to 8589934592
[  637.778798] pmem3s: detected capacity change from 0 to 7582678528
[  637.812557] pmem2: detected capacity change from 0 to 8589934592
[  637.868725] nd_pmem btt0.0: No existing arenas
[  637.908019] pmem0s: detected capacity change from 0 to 7582678528
[  637.951486] nd_pmem btt2.0: No existing arenas
[  637.978690] pmem2s: detected capacity change from 0 to 7582678528
[  638.034491] pmem3: detected capacity change from 0 to 8589934592
[  638.068756] pmem1: detected capacity change from 0 to 8589934592
[  638.107573] pmem0: detected capacity change from 0 to 8589934592
[  638.199041] pmem0: detected capacity change from 0 to 8589934592
[  638.199041] pmem3: detected capacity change from 0 to 8589934592
[  638.199098] pmem1: detected capacity change from 0 to 8589934592
[  638.308711] pmem2: detected capacity change from 0 to 8589934592
[  865.258436] INFO: task ndctl:21792 blocked for more than 120 seconds.
[  865.292282]       Not tainted 4.11.0-rc4+ #1
[  865.311971] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  865.347460] ndctl           D    0 21792  21791 0x00000080
[  865.371985] Call Trace:
[  865.382907]  __schedule+0x289/0x8f0
[  865.398556]  schedule+0x36/0x80
[  865.412599]  schedule_preempt_disabled+0xe/0x10
[  865.432906]  __mutex_lock.isra.8+0x266/0x500
[  865.452217]  ? mntput+0x24/0x40
[  865.466262]  __mutex_lock_slowpath+0x13/0x20
[  865.486238]  mutex_lock+0x2f/0x40
[  865.501470]  region_size_show+0x20/0x70 [device_dax]
[  865.523976]  dev_attr_show+0x20/0x50
[  865.539997]  ? mutex_lock+0x12/0x40
[  865.555749]  sysfs_kf_seq_show+0xbf/0x1a0
[  865.573689]  kernfs_seq_show+0x21/0x30
[  865.590477]  seq_read+0x115/0x390
[  865.605305]  ? do_filp_open+0xa5/0x100
[  865.621728]  kernfs_fop_read+0xff/0x180
[  865.638917]  __vfs_read+0x37/0x150
[  865.653763]  ? security_file_permission+0x9d/0xc0
[  865.674791]  vfs_read+0x8c/0x130
[  865.688887]  SyS_read+0x55/0xc0
[  865.703026]  do_syscall_64+0x67/0x180
[  865.719498]  entry_SYSCALL64_slow_path+0x25/0x25
[  865.740944] RIP: 0033:0x7f1fcf5b47e0
[  865.757730] RSP: 002b:00007ffca0272138 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  865.793826] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f1fcf5b47e0
[  865.828479] RDX: 0000000000000400 RSI: 00007ffca0272160 RDI: 0000000000000004
[  865.861314] RBP: 00000000012c54f0 R08: 00007f1fcf513988 R09: 0000000000000027
[  865.893963] R10: 000000000000000a R11: 0000000000000246 R12: 00007ffca0272160
[  865.926206] R13: 00000000012c8260 R14: 00007ffca0272160 R15: 00000000012c8343
[  865.958196] INFO: task ndctl:21815 blocked for more than 120 seconds.
[  865.987044]       Not tainted 4.11.0-rc4+ #1
[  866.006393] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  866.041532] ndctl           D    0 21815  21814 0x00000080
[  866.066062] Call Trace:
[  866.077166]  __schedule+0x289/0x8f0
[  866.093101]  schedule+0x36/0x80
[  866.107348]  schedule_preempt_disabled+0xe/0x10
[  866.128397]  __mutex_lock.isra.8+0x266/0x500
[  866.150235]  ? refcount_dec_and_test+0x11/0x20
[  866.171204]  ? wait_probe_show+0x70/0x70 [libnvdimm]
[  866.195722]  __mutex_lock_slowpath+0x13/0x20
[  866.215658]  mutex_lock+0x2f/0x40
[  866.231182]  flush_regions_dimms+0x1b/0x40 [libnvdimm]
[  866.255614]  device_for_each_child+0x50/0x90
[  866.275489]  wait_probe_show+0x46/0x70 [libnvdimm]
[  866.299264]  dev_attr_show+0x20/0x50
[  866.318946]  ? mutex_lock+0x12/0x40
[  866.335740]  sysfs_kf_seq_show+0xbf/0x1a0
[  866.354608]  kernfs_seq_show+0x21/0x30
[  866.372248]  seq_read+0x115/0x390
[  866.387589]  ? do_filp_open+0xa5/0x100
[  866.405202]  kernfs_fop_read+0xff/0x180
[  866.423176]  __vfs_read+0x37/0x150
[  866.439121]  ? security_file_permission+0x9d/0xc0
[  866.461188]  vfs_read+0x8c/0x130
[  866.476212]  SyS_read+0x55/0xc0
[  866.490824]  do_syscall_64+0x67/0x180
[  866.507892]  entry_SYSCALL64_slow_path+0x25/0x25
[  866.529750] RIP: 0033:0x7f362c9077e0
[  866.546371] RSP: 002b:00007ffcc956d2c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  866.582011] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f362c9077e0
[  866.615492] RDX: 0000000000000400 RSI: 00007ffcc956d2f0 RDI: 0000000000000003
[  866.649106] RBP: 0000000000000000 R08: 00007f362c866988 R09: 0000000000000046
[  866.682500] R10: 0000000000000046 R11: 0000000000000246 R12: 00007ffcc956d2f0
[  866.715817] R13: 0000000000000000 R14: 0000000000001388 R15: 00007ffcc956d2f0
[  866.749426] INFO: task ndctl:21818 blocked for more than 120 seconds.
[  866.779733]       Not tainted 4.11.0-rc4+ #1
[  866.800190] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  866.840889] ndctl           D    0 21818  21816 0x00000080
[  866.866559] Call Trace:
[  866.877997]  __schedule+0x289/0x8f0
[  866.894153]  schedule+0x36/0x80
[  866.909009]  __kernfs_remove+0x169/0x220
[  866.927326]  ? remove_wait_queue+0x60/0x60
[  866.946570]  kernfs_remove_by_name_ns+0x43/0xa0
[  866.967736]  remove_files.isra.1+0x36/0x70
[  866.986964]  sysfs_remove_group+0x44/0x90
[  867.005536]  sysfs_remove_groups+0x2e/0x50
[  867.025658]  dax_region_unregister+0x21/0x40 [device_dax]
[  867.051102]  devm_action_release+0xf/0x20
[  867.069917]  release_nodes+0x218/0x260
[  867.087469]  devres_release_all+0x3c/0x60
[  867.106331]  device_release_driver_internal+0x151/0x1f0
[  867.130551]  device_release_driver+0x12/0x20
[  867.150939]  unbind_store+0xba/0xe0
[  867.167331]  drv_attr_store+0x24/0x30
[  867.184559]  sysfs_kf_write+0x3a/0x50
[  867.201805]  kernfs_fop_write+0xff/0x180
[  867.220111]  __vfs_write+0x37/0x160
[  867.236342]  ? selinux_file_permission+0xe5/0x120
[  867.258348]  ? security_file_permission+0x3b/0xc0
[  867.280189]  vfs_write+0xb2/0x1b0
[  867.295622]  ? syscall_trace_enter+0x1d0/0x2b0
[  867.317865]  SyS_write+0x55/0xc0
[  867.334608]  do_syscall_64+0x67/0x180
[  867.353717]  entry_SYSCALL64_slow_path+0x25/0x25
[  867.375047] RIP: 0033:0x7f6252c56840
[  867.391768] RSP: 002b:00007ffe36b4cc38 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  867.427789] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f6252c56840
[  867.461738] RDX: 0000000000000007 RSI: 0000000000b49ba0 RDI: 0000000000000003
[  867.495312] RBP: 0000000000b49ba0 R08: 00007f6252bb5988 R09: 0000000000000046
[  867.528377] R10: 00007ffe36b4c950 R11: 0000000000000246 R12: 0000000000000007
[  867.561816] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000002
[  867.595562] INFO: task ndctl:21819 blocked for more than 120 seconds.
[  867.625845]       Not tainted 4.11.0-rc4+ #1
[  867.645436] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  867.682041] ndctl           D    0 21819  21817 0x00000080
[  867.708043] Call Trace:
[  867.719637]  __schedule+0x289/0x8f0
[  867.736425]  schedule+0x36/0x80
[  867.751222]  schedule_preempt_disabled+0xe/0x10
[  867.772404]  __mutex_lock.isra.8+0x266/0x500
[  867.792365]  ? refcount_dec_and_test+0x11/0x20
[  867.813359]  ? wait_probe_show+0x70/0x70 [libnvdimm]
[  867.838480]  __mutex_lock_slowpath+0x13/0x20
[  867.859706]  mutex_lock+0x2f/0x40
[  867.875377]  flush_regions_dimms+0x1b/0x40 [libnvdimm]
[  867.899479]  device_for_each_child+0x50/0x90
[  867.919187]  wait_probe_show+0x46/0x70 [libnvdimm]
[  867.940749]  dev_attr_show+0x20/0x50
[  867.957176]  ? mutex_lock+0x12/0x40
[  867.973416]  sysfs_kf_seq_show+0xbf/0x1a0
[  867.992291]  kernfs_seq_show+0x21/0x30
[  868.009818]  seq_read+0x115/0x390
[  868.025295]  ? do_filp_open+0xa5/0x100
[  868.042770]  kernfs_fop_read+0xff/0x180
[  868.060784]  __vfs_read+0x37/0x150
[  868.076791]  ? security_file_permission+0x9d/0xc0
[  868.100003]  vfs_read+0x8c/0x130
[  868.115039]  SyS_read+0x55/0xc0
[  868.129758]  do_syscall_64+0x67/0x180
[  868.146826]  entry_SYSCALL64_slow_path+0x25/0x25
[  868.168842] RIP: 0033:0x7f727cade7e0
[  868.185547] RSP: 002b:00007ffc3adfdd98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  868.220933] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f727cade7e0
[  868.254185] RDX: 0000000000000400 RSI: 00007ffc3adfddc0 RDI: 0000000000000003
[  868.287336] RBP: 000000000093b570 R08: 000000000093ea50 R09: 00007f727d3da310
[  868.321130] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffc3adfddc0
[  868.356791] R13: 0000000000000000 R14: 0000000000001388 R15: 00007ffc3adfddc0

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] device-dax: fix sysfs attribute deadlock
  2017-04-30  9:16     ` Yi Zhang
@ 2017-04-30 14:21       ` Dan Williams
  2017-05-02 10:43         ` Yi Zhang
  0 siblings, 1 reply; 6+ messages in thread
From: Dan Williams @ 2017-04-30 14:21 UTC (permalink / raw)
  To: linux-nvdimm; +Cc: linux-kernel, stable

Usage of device_lock() for dax_region attributes is unnecessary and
deadlock prone. It's unnecessary because the order of registration /
un-registration guarantees that drvdata is always valid. It's deadlock
prone because it sets up this situation:

 ndctl           D    0  2170   2082 0x00000000
 Call Trace:
  __schedule+0x31f/0x980
  schedule+0x3d/0x90
  schedule_preempt_disabled+0x15/0x20
  __mutex_lock+0x402/0x980
  ? __mutex_lock+0x158/0x980
  ? align_show+0x2b/0x80 [dax]
  ? kernfs_seq_start+0x2f/0x90
  mutex_lock_nested+0x1b/0x20
  align_show+0x2b/0x80 [dax]
  dev_attr_show+0x20/0x50

 ndctl           D    0  2186   2079 0x00000000
 Call Trace:
  __schedule+0x31f/0x980
  schedule+0x3d/0x90
  __kernfs_remove+0x1f6/0x340
  ? kernfs_remove_by_name_ns+0x45/0xa0
  ? remove_wait_queue+0x70/0x70
  kernfs_remove_by_name_ns+0x45/0xa0
  remove_files.isra.1+0x35/0x70
  sysfs_remove_group+0x44/0x90
  sysfs_remove_groups+0x2e/0x50
  dax_region_unregister+0x25/0x40 [dax]
  devm_action_release+0xf/0x20
  release_nodes+0x16d/0x2b0
  devres_release_all+0x3c/0x60
  device_release_driver_internal+0x17d/0x220
  device_release_driver+0x12/0x20
  unbind_store+0x112/0x160

ndctl/2170 is trying to acquire the device_lock() to read an attribute,
and ndctl/2186 is holding the device_lock() while trying to drain all
active attribute readers.

Thanks to Yi Zhang for the reproduction script.

Fixes: d7fe1a67f658 ("dax: add region 'id', 'size', and 'align' attributes")
Cc: <stable@vger.kernel.org>
Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.c |   40 ++++++++++++----------------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index ef93aa84622b..5e8302d3a89c 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -36,36 +36,27 @@ static struct kmem_cache *dax_cache __read_mostly;
 static struct super_block *dax_superblock __read_mostly;
 MODULE_PARM_DESC(nr_dax, "max number of device-dax instances");
 
+/*
+ * Rely on the fact that drvdata is set before the attributes are
+ * registered, and that the attributes are unregistered before drvdata
+ * is cleared to assume that drvdata is always valid.
+ */
 static ssize_t id_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%d\n", dax_region->id);
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%d\n", dax_region->id);
 }
 static DEVICE_ATTR_RO(id);
 
 static ssize_t region_size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%llu\n", (unsigned long long)
-				resource_size(&dax_region->res));
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%llu\n", (unsigned long long)
+			resource_size(&dax_region->res));
 }
 static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 		region_size_show, NULL);
@@ -73,16 +64,9 @@ static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 static ssize_t align_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%u\n", dax_region->align);
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%u\n", dax_region->align);
 }
 static DEVICE_ATTR_RO(align);
 

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] device-dax: fix sysfs attribute deadlock
  2017-04-30 14:21       ` [PATCH] device-dax: fix sysfs attribute deadlock Dan Williams
@ 2017-05-02 10:43         ` Yi Zhang
  2017-05-02 16:13           ` Dan Williams
  0 siblings, 1 reply; 6+ messages in thread
From: Yi Zhang @ 2017-05-02 10:43 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-kernel, stable, linux-nvdimm

Verified this patch on 4.11.

Tested-by: Yi Zhang <yizhan@redhat.com>

Best Regards,
  Yi Zhang


----- Original Message -----
From: "Dan Williams" <dan.j.williams@intel.com>
To: linux-nvdimm@lists.01.org
Cc: linux-kernel@vger.kernel.org, stable@vger.kernel.org, "Yi Zhang" <yizhan@redhat.com>
Sent: Sunday, April 30, 2017 10:21:54 PM
Subject: [PATCH] device-dax: fix sysfs attribute deadlock

Usage of device_lock() for dax_region attributes is unnecessary and
deadlock prone. It's unnecessary because the order of registration /
un-registration guarantees that drvdata is always valid. It's deadlock
prone because it sets up this situation:

 ndctl           D    0  2170   2082 0x00000000
 Call Trace:
  __schedule+0x31f/0x980
  schedule+0x3d/0x90
  schedule_preempt_disabled+0x15/0x20
  __mutex_lock+0x402/0x980
  ? __mutex_lock+0x158/0x980
  ? align_show+0x2b/0x80 [dax]
  ? kernfs_seq_start+0x2f/0x90
  mutex_lock_nested+0x1b/0x20
  align_show+0x2b/0x80 [dax]
  dev_attr_show+0x20/0x50

 ndctl           D    0  2186   2079 0x00000000
 Call Trace:
  __schedule+0x31f/0x980
  schedule+0x3d/0x90
  __kernfs_remove+0x1f6/0x340
  ? kernfs_remove_by_name_ns+0x45/0xa0
  ? remove_wait_queue+0x70/0x70
  kernfs_remove_by_name_ns+0x45/0xa0
  remove_files.isra.1+0x35/0x70
  sysfs_remove_group+0x44/0x90
  sysfs_remove_groups+0x2e/0x50
  dax_region_unregister+0x25/0x40 [dax]
  devm_action_release+0xf/0x20
  release_nodes+0x16d/0x2b0
  devres_release_all+0x3c/0x60
  device_release_driver_internal+0x17d/0x220
  device_release_driver+0x12/0x20
  unbind_store+0x112/0x160

ndctl/2170 is trying to acquire the device_lock() to read an attribute,
and ndctl/2186 is holding the device_lock() while trying to drain all
active attribute readers.

Thanks to Yi Zhang for the reproduction script.

Fixes: d7fe1a67f658 ("dax: add region 'id', 'size', and 'align' attributes")
Cc: <stable@vger.kernel.org>
Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax.c |   40 ++++++++++++----------------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index ef93aa84622b..5e8302d3a89c 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -36,36 +36,27 @@ static struct kmem_cache *dax_cache __read_mostly;
 static struct super_block *dax_superblock __read_mostly;
 MODULE_PARM_DESC(nr_dax, "max number of device-dax instances");
 
+/*
+ * Rely on the fact that drvdata is set before the attributes are
+ * registered, and that the attributes are unregistered before drvdata
+ * is cleared to assume that drvdata is always valid.
+ */
 static ssize_t id_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%d\n", dax_region->id);
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%d\n", dax_region->id);
 }
 static DEVICE_ATTR_RO(id);
 
 static ssize_t region_size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%llu\n", (unsigned long long)
-				resource_size(&dax_region->res));
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%llu\n", (unsigned long long)
+			resource_size(&dax_region->res));
 }
 static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 		region_size_show, NULL);
@@ -73,16 +64,9 @@ static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 static ssize_t align_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct dax_region *dax_region;
-	ssize_t rc = -ENXIO;
+	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	device_lock(dev);
-	dax_region = dev_get_drvdata(dev);
-	if (dax_region)
-		rc = sprintf(buf, "%u\n", dax_region->align);
-	device_unlock(dev);
-
-	return rc;
+	return sprintf(buf, "%u\n", dax_region->align);
 }
 static DEVICE_ATTR_RO(align);
 

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] device-dax: fix sysfs attribute deadlock
  2017-05-02 10:43         ` Yi Zhang
@ 2017-05-02 16:13           ` Dan Williams
  0 siblings, 0 replies; 6+ messages in thread
From: Dan Williams @ 2017-05-02 16:13 UTC (permalink / raw)
  To: Yi Zhang; +Cc: linux-kernel, stable, linux-nvdimm

On Tue, May 2, 2017 at 3:43 AM, Yi Zhang <yizhan@redhat.com> wrote:
> Verified this patch on 4.11.
>
> Tested-by: Yi Zhang <yizhan@redhat.com>

Thanks!
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2017-05-02 16:13 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <895281518.353931.1493045976821.JavaMail.zimbra@redhat.com>
2017-04-24 15:13 ` task ndctl:5155 blocked for more than 120 seconds observed during pmem/btt/dax switch test Yi Zhang
2017-04-29  5:35   ` Dan Williams
2017-04-30  9:16     ` Yi Zhang
2017-04-30 14:21       ` [PATCH] device-dax: fix sysfs attribute deadlock Dan Williams
2017-05-02 10:43         ` Yi Zhang
2017-05-02 16:13           ` Dan Williams

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).