linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] block: genhd: don't call probe function with major_names_lock held
@ 2021-06-19  1:05 Tetsuo Handa
  2021-06-19  3:24 ` kernel test robot
                   ` (4 more replies)
  0 siblings, 5 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-06-19  1:05 UTC (permalink / raw)
  To: hch
  Cc: axboe, desmondcheongzx, gregkh, linux-block,
	linux-kernel-mentees, linux-kernel, linux-mtd, miquel.raynal,
	richard, Shuah Khan, syzbot+6a8a0d93c91e8fbf2e80, vigneshr

syzbot is reporting circular locking problem at blk_request_module() [1],
for blk_request_module() is calling probe function with major_names_lock
held while major_names_lock is held during module's __init and __exit
functions.

                                         loop_exit() {
                                           mutex_lock(&loop_ctl_mutex);
  blk_request_module() {
    mutex_lock(&major_names_lock);
    loop_probe() {
      mutex_lock(&loop_ctl_mutex); // Blocked by loop_exit()
      mutex_unlock(&loop_ctl_mutex);
    }
    mutex_unlock(&major_names_lock);
                                           unregister_blkdev() {
                                             mutex_lock(&major_names_lock); // Blocked by blk_request_module()
                                             mutex_unlock(&major_names_lock);
                                           }
                                           mutex_unlock(&loop_ctl_mutex);
  }
                                         }

Based on an assumption that a probe callback passed to __register_blkdev()
belongs to a module which calls __register_blkdev(), drop major_names_lock
before calling probe function by holding a reference to that module which
contains that probe function. If there is a module where this assumption
does not hold, such module can call ____register_blkdev() directly.

  blk_request_module() {
    mutex_lock(&major_names_lock);
    // Block loop_exit()
    mutex_unlock(&major_names_lock);
    loop_probe() {
      mutex_lock(&loop_ctl_mutex);
      mutex_unlock(&loop_ctl_mutex);
    }
    // Unblock loop_exit()
  }
                                         loop_exit() {
                                           mutex_lock(&loop_ctl_mutex);
                                           unregister_blkdev() {
                                             mutex_lock(&major_names_lock);
                                             mutex_unlock(&major_names_lock);
                                           }
                                           mutex_unlock(&loop_ctl_mutex);
                                         }

Note that regardless of this patch, it is up to probe function to
serialize module's __init function and probe function in that module
by using e.g. a mutex. This patch simply makes sure that module's __exit
function won't be called when probe function is about to be called.

While Desmond Cheong Zhi Xi already proposed a patch for breaking ABCD
circular dependency [2], I consider that this patch is still needed for
safely breaking AB-BA dependency upon module unloading. (Note that syzbot
does not test module unloading code because the syzbot kernels are built
with almost all modules built-in. We need manual inspection.)

By doing kmalloc(GFP_KERNEL) in ____register_blkdev() before holding
major_names_lock, we could convert major_names_lock from a mutex to
a spinlock. But that is beyond the scope of this patch.

Link: https://syzkaller.appspot.com/bug?id=7bd106c28e846d1023d4ca915718b1a0905444cb [1]
Link: https://lkml.kernel.org/r/20210617160904.570111-1-desmondcheongzx@gmail.com [2]
Reported-by: syzbot <syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com>
---
 block/genhd.c         | 36 +++++++++++++++++++++++++++---------
 include/linux/genhd.h |  8 +++++---
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 9f8cb7beaad1..9577c70a6bd3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -169,6 +169,7 @@ static struct blk_major_name {
 	int major;
 	char name[16];
 	void (*probe)(dev_t devt);
+	struct module *owner;
 } *major_names[BLKDEV_MAJOR_HASH_SIZE];
 static DEFINE_MUTEX(major_names_lock);
 
@@ -197,7 +198,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
  * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
  *         @major = 0, try to allocate any unused major number.
  * @name: the name of the new block device as a zero terminated string
- * @probe: allback that is called on access to any minor number of @major
+ * @probe: callback that is called on access to any minor number of @major
+ * @owner: the owner of @probe function (i.e. THIS_MODULE or NULL).
  *
  * The @name must be unique within the system.
  *
@@ -214,8 +216,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
  *
  * Use register_blkdev instead for any new code.
  */
-int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt))
+int ____register_blkdev(unsigned int major, const char *name,
+			void (*probe)(dev_t devt), struct module *owner)
 {
 	struct blk_major_name **n, *p;
 	int index, ret = 0;
@@ -255,6 +257,7 @@ int __register_blkdev(unsigned int major, const char *name,
 
 	p->major = major;
 	p->probe = probe;
+	p->owner = owner;
 	strlcpy(p->name, name, sizeof(p->name));
 	p->next = NULL;
 	index = major_to_index(major);
@@ -277,7 +280,7 @@ int __register_blkdev(unsigned int major, const char *name,
 	mutex_unlock(&major_names_lock);
 	return ret;
 }
-EXPORT_SYMBOL(__register_blkdev);
+EXPORT_SYMBOL(____register_blkdev);
 
 void unregister_blkdev(unsigned int major, const char *name)
 {
@@ -676,14 +679,29 @@ void blk_request_module(dev_t devt)
 {
 	unsigned int major = MAJOR(devt);
 	struct blk_major_name **n;
+	void (*probe_fn)(dev_t devt);
 
 	mutex_lock(&major_names_lock);
 	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
-		if ((*n)->major == major && (*n)->probe) {
-			(*n)->probe(devt);
-			mutex_unlock(&major_names_lock);
-			return;
-		}
+		if ((*n)->major != major || !(*n)->probe)
+			continue;
+		if (!try_module_get((*n)->owner))
+			break;
+		/*
+		 * Calling probe function with major_names_lock held causes
+		 * circular locking dependency problem. Thus, call it after
+		 * releasing major_names_lock.
+		 */
+		probe_fn = (*n)->probe;
+		mutex_unlock(&major_names_lock);
+		/*
+		 * Assuming that unregister_blkdev() is called from module's
+		 * __exit function, a module refcount taken above allows us
+		 * to safely call probe function without major_names_lock held.
+		 */
+		probe_fn(devt);
+		module_put((*n)->owner);
+		return;
 	}
 	mutex_unlock(&major_names_lock);
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 6fc26f7bdf71..070b73c043e6 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -277,10 +277,12 @@ extern void put_disk(struct gendisk *disk);
 
 #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE)
 
-int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt));
+int ____register_blkdev(unsigned int major, const char *name,
+			void (*probe)(dev_t devt), struct module *owner);
+#define __register_blkdev(major, name, probe) \
+	____register_blkdev(major, name, probe, THIS_MODULE)
 #define register_blkdev(major, name) \
-	__register_blkdev(major, name, NULL)
+	____register_blkdev(major, name, NULL, NULL)
 void unregister_blkdev(unsigned int major, const char *name);
 
 bool bdev_check_media_change(struct block_device *bdev);
-- 
2.18.4

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] block: genhd: don't call probe function with major_names_lock held
  2021-06-19  1:05 [PATCH v2] block: genhd: don't call probe function with major_names_lock held Tetsuo Handa
@ 2021-06-19  3:24 ` kernel test robot
  2021-06-19  6:14 ` kernel test robot
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: kernel test robot @ 2021-06-19  3:24 UTC (permalink / raw)
  To: Tetsuo Handa, hch
  Cc: kbuild-all, axboe, desmondcheongzx, gregkh, linux-block,
	linux-kernel-mentees, linux-kernel, linux-mtd, miquel.raynal,
	richard

[-- Attachment #1: Type: text/plain, Size: 10902 bytes --]

Hi Tetsuo,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on linus/master v5.13-rc6]
[cannot apply to block/for-next next-20210618]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Tetsuo-Handa/block-genhd-don-t-call-probe-function-with-major_names_lock-held/20210619-090731
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: um-x86_64_defconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/1de14b707f1a3e49fa4412b1eb8391f09747a005
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Tetsuo-Handa/block-genhd-don-t-call-probe-function-with-major_names_lock-held/20210619-090731
        git checkout 1de14b707f1a3e49fa4412b1eb8391f09747a005
        # save the attached .config to linux build tree
        make W=1 ARCH=um SUBARCH=x86_64

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> block/genhd.c:223: warning: expecting prototype for __register_blkdev(). Prototype was for ____register_blkdev() instead


vim +223 block/genhd.c

^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  196  
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  197  /**
e2b6b301871719 block/genhd.c         Christoph Hellwig     2020-11-14  198   * __register_blkdev - register a new block device
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  199   *
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  200   * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  201   *         @major = 0, try to allocate any unused major number.
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  202   * @name: the name of the new block device as a zero terminated string
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  203   * @probe: callback that is called on access to any minor number of @major
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  204   * @owner: the owner of @probe function (i.e. THIS_MODULE or NULL).
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  205   *
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  206   * The @name must be unique within the system.
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  207   *
0e056eb5530da8 block/genhd.c         Mauro Carvalho Chehab 2017-03-30  208   * The return value depends on the @major input parameter:
0e056eb5530da8 block/genhd.c         Mauro Carvalho Chehab 2017-03-30  209   *
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  210   *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  211   *    then the function returns zero on success, or a negative error code
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  212   *  - if any unused major number was requested with @major = 0 parameter
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  213   *    then the return value is the allocated major number in range
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  214   *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  215   *
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  216   * See Documentation/admin-guide/devices.txt for the list of allocated
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  217   * major numbers.
e2b6b301871719 block/genhd.c         Christoph Hellwig     2020-11-14  218   *
e2b6b301871719 block/genhd.c         Christoph Hellwig     2020-11-14  219   * Use register_blkdev instead for any new code.
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  220   */
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  221  int ____register_blkdev(unsigned int major, const char *name,
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  222  			void (*probe)(dev_t devt), struct module *owner)
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16 @223  {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  224  	struct blk_major_name **n, *p;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  225  	int index, ret = 0;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  226  
e49fbbbf0aa14f block/genhd.c         Christoph Hellwig     2020-10-29  227  	mutex_lock(&major_names_lock);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  228  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  229  	/* temporary */
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  230  	if (major == 0) {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  231  		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  232  			if (major_names[index] == NULL)
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  233  				break;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  234  		}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  235  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  236  		if (index == 0) {
dfc76d11dd455a block/genhd.c         Keyur Patel           2019-02-17  237  			printk("%s: failed to get major for %s\n",
dfc76d11dd455a block/genhd.c         Keyur Patel           2019-02-17  238  			       __func__, name);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  239  			ret = -EBUSY;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  240  			goto out;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  241  		}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  242  		major = index;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  243  		ret = major;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  244  	}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  245  
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  246  	if (major >= BLKDEV_MAJOR_MAX) {
dfc76d11dd455a block/genhd.c         Keyur Patel           2019-02-17  247  		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
dfc76d11dd455a block/genhd.c         Keyur Patel           2019-02-17  248  		       __func__, major, BLKDEV_MAJOR_MAX-1, name);
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  249  
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  250  		ret = -EINVAL;
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  251  		goto out;
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  252  	}
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  253  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  254  	p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  255  	if (p == NULL) {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  256  		ret = -ENOMEM;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  257  		goto out;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  258  	}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  259  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  260  	p->major = major;
a160c6159d4a0c block/genhd.c         Christoph Hellwig     2020-10-29  261  	p->probe = probe;
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  262  	p->owner = owner;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  263  	strlcpy(p->name, name, sizeof(p->name));
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  264  	p->next = NULL;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  265  	index = major_to_index(major);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  266  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  267  	for (n = &major_names[index]; *n; n = &(*n)->next) {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  268  		if ((*n)->major == major)
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  269  			break;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  270  	}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  271  	if (!*n)
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  272  		*n = p;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  273  	else
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  274  		ret = -EBUSY;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  275  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  276  	if (ret < 0) {
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  277  		printk("register_blkdev: cannot get major %u for %s\n",
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  278  		       major, name);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  279  		kfree(p);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  280  	}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  281  out:
e49fbbbf0aa14f block/genhd.c         Christoph Hellwig     2020-10-29  282  	mutex_unlock(&major_names_lock);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  283  	return ret;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  284  }
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  285  EXPORT_SYMBOL(____register_blkdev);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  286  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 8634 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] block: genhd: don't call probe function with major_names_lock held
  2021-06-19  1:05 [PATCH v2] block: genhd: don't call probe function with major_names_lock held Tetsuo Handa
  2021-06-19  3:24 ` kernel test robot
@ 2021-06-19  6:14 ` kernel test robot
  2021-06-19  6:44 ` Greg KH
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: kernel test robot @ 2021-06-19  6:14 UTC (permalink / raw)
  To: Tetsuo Handa, hch
  Cc: kbuild-all, clang-built-linux, axboe, desmondcheongzx, gregkh,
	linux-block, linux-kernel-mentees, linux-kernel, linux-mtd,
	miquel.raynal, richard

[-- Attachment #1: Type: text/plain, Size: 11279 bytes --]

Hi Tetsuo,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on linus/master v5.13-rc6]
[cannot apply to block/for-next next-20210618]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Tetsuo-Handa/block-genhd-don-t-call-probe-function-with-major_names_lock-held/20210619-090731
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: x86_64-randconfig-a004-20210618 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project d1baf2895467735ab14f4b3415fce204c0cc8e7f)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://github.com/0day-ci/linux/commit/1de14b707f1a3e49fa4412b1eb8391f09747a005
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Tetsuo-Handa/block-genhd-don-t-call-probe-function-with-major_names_lock-held/20210619-090731
        git checkout 1de14b707f1a3e49fa4412b1eb8391f09747a005
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> block/genhd.c:223: warning: expecting prototype for __register_blkdev(). Prototype was for ____register_blkdev() instead


vim +223 block/genhd.c

^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  196  
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  197  /**
e2b6b301871719 block/genhd.c         Christoph Hellwig     2020-11-14  198   * __register_blkdev - register a new block device
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  199   *
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  200   * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  201   *         @major = 0, try to allocate any unused major number.
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  202   * @name: the name of the new block device as a zero terminated string
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  203   * @probe: callback that is called on access to any minor number of @major
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  204   * @owner: the owner of @probe function (i.e. THIS_MODULE or NULL).
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  205   *
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  206   * The @name must be unique within the system.
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  207   *
0e056eb5530da8 block/genhd.c         Mauro Carvalho Chehab 2017-03-30  208   * The return value depends on the @major input parameter:
0e056eb5530da8 block/genhd.c         Mauro Carvalho Chehab 2017-03-30  209   *
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  210   *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  211   *    then the function returns zero on success, or a negative error code
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  212   *  - if any unused major number was requested with @major = 0 parameter
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  213   *    then the return value is the allocated major number in range
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  214   *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  215   *
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  216   * See Documentation/admin-guide/devices.txt for the list of allocated
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  217   * major numbers.
e2b6b301871719 block/genhd.c         Christoph Hellwig     2020-11-14  218   *
e2b6b301871719 block/genhd.c         Christoph Hellwig     2020-11-14  219   * Use register_blkdev instead for any new code.
9e8c0bccdc944b block/genhd.c         Márton Németh         2009-02-20  220   */
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  221  int ____register_blkdev(unsigned int major, const char *name,
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  222  			void (*probe)(dev_t devt), struct module *owner)
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16 @223  {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  224  	struct blk_major_name **n, *p;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  225  	int index, ret = 0;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  226  
e49fbbbf0aa14f block/genhd.c         Christoph Hellwig     2020-10-29  227  	mutex_lock(&major_names_lock);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  228  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  229  	/* temporary */
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  230  	if (major == 0) {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  231  		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  232  			if (major_names[index] == NULL)
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  233  				break;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  234  		}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  235  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  236  		if (index == 0) {
dfc76d11dd455a block/genhd.c         Keyur Patel           2019-02-17  237  			printk("%s: failed to get major for %s\n",
dfc76d11dd455a block/genhd.c         Keyur Patel           2019-02-17  238  			       __func__, name);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  239  			ret = -EBUSY;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  240  			goto out;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  241  		}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  242  		major = index;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  243  		ret = major;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  244  	}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  245  
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  246  	if (major >= BLKDEV_MAJOR_MAX) {
dfc76d11dd455a block/genhd.c         Keyur Patel           2019-02-17  247  		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
dfc76d11dd455a block/genhd.c         Keyur Patel           2019-02-17  248  		       __func__, major, BLKDEV_MAJOR_MAX-1, name);
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  249  
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  250  		ret = -EINVAL;
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  251  		goto out;
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  252  	}
133d55cdb2f1f9 block/genhd.c         Logan Gunthorpe       2017-06-16  253  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  254  	p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  255  	if (p == NULL) {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  256  		ret = -ENOMEM;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  257  		goto out;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  258  	}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  259  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  260  	p->major = major;
a160c6159d4a0c block/genhd.c         Christoph Hellwig     2020-10-29  261  	p->probe = probe;
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  262  	p->owner = owner;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  263  	strlcpy(p->name, name, sizeof(p->name));
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  264  	p->next = NULL;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  265  	index = major_to_index(major);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  266  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  267  	for (n = &major_names[index]; *n; n = &(*n)->next) {
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  268  		if ((*n)->major == major)
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  269  			break;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  270  	}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  271  	if (!*n)
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  272  		*n = p;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  273  	else
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  274  		ret = -EBUSY;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  275  
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  276  	if (ret < 0) {
f33ff110ef31bd block/genhd.c         Srivatsa S. Bhat      2018-02-05  277  		printk("register_blkdev: cannot get major %u for %s\n",
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  278  		       major, name);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  279  		kfree(p);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  280  	}
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  281  out:
e49fbbbf0aa14f block/genhd.c         Christoph Hellwig     2020-10-29  282  	mutex_unlock(&major_names_lock);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  283  	return ret;
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  284  }
1de14b707f1a3e block/genhd.c         Tetsuo Handa          2021-06-19  285  EXPORT_SYMBOL(____register_blkdev);
^1da177e4c3f41 drivers/block/genhd.c Linus Torvalds        2005-04-16  286  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 44320 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] block: genhd: don't call probe function with major_names_lock held
  2021-06-19  1:05 [PATCH v2] block: genhd: don't call probe function with major_names_lock held Tetsuo Handa
  2021-06-19  3:24 ` kernel test robot
  2021-06-19  6:14 ` kernel test robot
@ 2021-06-19  6:44 ` Greg KH
  2021-06-19  8:47   ` Tetsuo Handa
       [not found]   ` <20210620024403.820-1-hdanton@sina.com>
  2021-06-21  6:18 ` Christoph Hellwig
  2021-08-15  6:52 ` [PATCH v3] " Tetsuo Handa
  4 siblings, 2 replies; 29+ messages in thread
From: Greg KH @ 2021-06-19  6:44 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: hch, axboe, desmondcheongzx, linux-block, linux-kernel-mentees,
	linux-kernel, linux-mtd, miquel.raynal, richard, Shuah Khan,
	syzbot+6a8a0d93c91e8fbf2e80, vigneshr

On Sat, Jun 19, 2021 at 10:05:44AM +0900, Tetsuo Handa wrote:
> syzbot is reporting circular locking problem at blk_request_module() [1],
> for blk_request_module() is calling probe function with major_names_lock
> held while major_names_lock is held during module's __init and __exit
> functions.
> 
>                                          loop_exit() {
>                                            mutex_lock(&loop_ctl_mutex);
>   blk_request_module() {
>     mutex_lock(&major_names_lock);
>     loop_probe() {
>       mutex_lock(&loop_ctl_mutex); // Blocked by loop_exit()
>       mutex_unlock(&loop_ctl_mutex);
>     }
>     mutex_unlock(&major_names_lock);
>                                            unregister_blkdev() {
>                                              mutex_lock(&major_names_lock); // Blocked by blk_request_module()
>                                              mutex_unlock(&major_names_lock);
>                                            }
>                                            mutex_unlock(&loop_ctl_mutex);
>   }
>                                          }
> 
> Based on an assumption that a probe callback passed to __register_blkdev()
> belongs to a module which calls __register_blkdev(), drop major_names_lock
> before calling probe function by holding a reference to that module which
> contains that probe function. If there is a module where this assumption
> does not hold, such module can call ____register_blkdev() directly.
> 
>   blk_request_module() {
>     mutex_lock(&major_names_lock);
>     // Block loop_exit()
>     mutex_unlock(&major_names_lock);
>     loop_probe() {
>       mutex_lock(&loop_ctl_mutex);
>       mutex_unlock(&loop_ctl_mutex);
>     }
>     // Unblock loop_exit()
>   }
>                                          loop_exit() {
>                                            mutex_lock(&loop_ctl_mutex);
>                                            unregister_blkdev() {
>                                              mutex_lock(&major_names_lock);
>                                              mutex_unlock(&major_names_lock);
>                                            }
>                                            mutex_unlock(&loop_ctl_mutex);
>                                          }
> 
> Note that regardless of this patch, it is up to probe function to
> serialize module's __init function and probe function in that module
> by using e.g. a mutex. This patch simply makes sure that module's __exit
> function won't be called when probe function is about to be called.

module init functions ARE serialized.

> 
> While Desmond Cheong Zhi Xi already proposed a patch for breaking ABCD
> circular dependency [2], I consider that this patch is still needed for
> safely breaking AB-BA dependency upon module unloading. (Note that syzbot
> does not test module unloading code because the syzbot kernels are built
> with almost all modules built-in. We need manual inspection.)
> 
> By doing kmalloc(GFP_KERNEL) in ____register_blkdev() before holding
> major_names_lock, we could convert major_names_lock from a mutex to
> a spinlock. But that is beyond the scope of this patch.
> 
> Link: https://syzkaller.appspot.com/bug?id=7bd106c28e846d1023d4ca915718b1a0905444cb [1]
> Link: https://lkml.kernel.org/r/20210617160904.570111-1-desmondcheongzx@gmail.com [2]
> Reported-by: syzbot <syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Tested-by: syzbot <syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com>
> ---
>  block/genhd.c         | 36 +++++++++++++++++++++++++++---------
>  include/linux/genhd.h |  8 +++++---
>  2 files changed, 32 insertions(+), 12 deletions(-)
> 
> diff --git a/block/genhd.c b/block/genhd.c
> index 9f8cb7beaad1..9577c70a6bd3 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -169,6 +169,7 @@ static struct blk_major_name {
>  	int major;
>  	char name[16];
>  	void (*probe)(dev_t devt);
> +	struct module *owner;

The module owner should not matter here.


>  } *major_names[BLKDEV_MAJOR_HASH_SIZE];
>  static DEFINE_MUTEX(major_names_lock);
>  
> @@ -197,7 +198,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
>   * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
>   *         @major = 0, try to allocate any unused major number.
>   * @name: the name of the new block device as a zero terminated string
> - * @probe: allback that is called on access to any minor number of @major
> + * @probe: callback that is called on access to any minor number of @major
> + * @owner: the owner of @probe function (i.e. THIS_MODULE or NULL).

This feels wrong.

>   *
>   * The @name must be unique within the system.
>   *
> @@ -214,8 +216,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
>   *
>   * Use register_blkdev instead for any new code.
>   */
> -int __register_blkdev(unsigned int major, const char *name,
> -		void (*probe)(dev_t devt))
> +int ____register_blkdev(unsigned int major, const char *name,
> +			void (*probe)(dev_t devt), struct module *owner)

That's just a crazy naming scheme, please no.

>  {
>  	struct blk_major_name **n, *p;
>  	int index, ret = 0;
> @@ -255,6 +257,7 @@ int __register_blkdev(unsigned int major, const char *name,
>  
>  	p->major = major;
>  	p->probe = probe;
> +	p->owner = owner;
>  	strlcpy(p->name, name, sizeof(p->name));
>  	p->next = NULL;
>  	index = major_to_index(major);
> @@ -277,7 +280,7 @@ int __register_blkdev(unsigned int major, const char *name,
>  	mutex_unlock(&major_names_lock);
>  	return ret;
>  }
> -EXPORT_SYMBOL(__register_blkdev);
> +EXPORT_SYMBOL(____register_blkdev);

That's crazy.

>  
>  void unregister_blkdev(unsigned int major, const char *name)
>  {
> @@ -676,14 +679,29 @@ void blk_request_module(dev_t devt)
>  {
>  	unsigned int major = MAJOR(devt);
>  	struct blk_major_name **n;
> +	void (*probe_fn)(dev_t devt);
>  
>  	mutex_lock(&major_names_lock);
>  	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
> -		if ((*n)->major == major && (*n)->probe) {
> -			(*n)->probe(devt);
> -			mutex_unlock(&major_names_lock);
> -			return;
> -		}
> +		if ((*n)->major != major || !(*n)->probe)
> +			continue;
> +		if (!try_module_get((*n)->owner))
> +			break;

So you just fail?  Are you sure that is ok?


> +		/*
> +		 * Calling probe function with major_names_lock held causes
> +		 * circular locking dependency problem. Thus, call it after
> +		 * releasing major_names_lock.
> +		 */
> +		probe_fn = (*n)->probe;
> +		mutex_unlock(&major_names_lock);

So you save a pointer off and then unlock?  Why does that lock matter?

> +		/*
> +		 * Assuming that unregister_blkdev() is called from module's
> +		 * __exit function, a module refcount taken above allows us
> +		 * to safely call probe function without major_names_lock held.
> +		 */
> +		probe_fn(devt);
> +		module_put((*n)->owner);

So you are trying to keep the module in memory while the probe is call,
ok.  But module removal is not an issue, you remove modules at your own
risk.  As syzbot isn't even testing this, why is this an issue?


> +		return;
>  	}
>  	mutex_unlock(&major_names_lock);
>  
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 6fc26f7bdf71..070b73c043e6 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -277,10 +277,12 @@ extern void put_disk(struct gendisk *disk);
>  
>  #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE)
>  
> -int __register_blkdev(unsigned int major, const char *name,
> -		void (*probe)(dev_t devt));
> +int ____register_blkdev(unsigned int major, const char *name,
> +			void (*probe)(dev_t devt), struct module *owner);
> +#define __register_blkdev(major, name, probe) \
> +	____register_blkdev(major, name, probe, THIS_MODULE)

This just feels wrong, you should only need 1 level deep of __ at most.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] block: genhd: don't call probe function with major_names_lock held
  2021-06-19  6:44 ` Greg KH
@ 2021-06-19  8:47   ` Tetsuo Handa
       [not found]   ` <20210620024403.820-1-hdanton@sina.com>
  1 sibling, 0 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-06-19  8:47 UTC (permalink / raw)
  To: Greg KH, hch
  Cc: axboe, desmondcheongzx, linux-block, linux-kernel-mentees,
	linux-kernel, linux-mtd, miquel.raynal, richard, Shuah Khan,
	syzbot+6a8a0d93c91e8fbf2e80, vigneshr

On 2021/06/19 15:44, Greg KH wrote:
>> Note that regardless of this patch, it is up to probe function to
>> serialize module's __init function and probe function in that module
>> by using e.g. a mutex. This patch simply makes sure that module's __exit
>> function won't be called when probe function is about to be called.
> 
> module init functions ARE serialized.

The __init functions between module foo and module bar are serialized.
But the __init function for module foo and the probe function for module
foo are not serialized.

> The module owner should not matter here.

The __exit functions between module foo and module bar are serialized.
But the __exit function for module bar and the probe function for module
bar are not serialized.

You can observe a deadlock via the following steps.

(1) Build loop.ko with CONFIG_BLK_DEV_LOOP_MIN_COUNT=0 and
    a delay injection patch shown below.

----------
diff --git a/block/genhd.c b/block/genhd.c
index 9f8cb7beaad1..fe0360dc8c5d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -680,6 +680,11 @@ void blk_request_module(dev_t devt)
 	mutex_lock(&major_names_lock);
 	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
 		if ((*n)->major == major && (*n)->probe) {
+			if (!strcmp(current->comm, "bash")) {
+				pr_info("sleep injection start\n");
+				schedule_timeout_killable(10 * HZ); // Call "rmmod" here.
+				pr_info("sleep injection end\n");
+			}
 			(*n)->probe(devt);
 			mutex_unlock(&major_names_lock);
 			return;
----------

(2) Run the following commands from bash shell.

# modprobe loop
# mknod /dev/loop0 b 7 0
# exec 100<>/dev/loop0 & sleep 1; rmmod loop

(3) See dmesg output.

----------
[   32.260467][ T2873] loop: module loaded
[   32.289961][ T2880] sleep injection start
[   42.484039][ T2880] sleep injection end
[   42.484218][ T2880]
[   42.484248][ T2880] ======================================================
[   42.484381][ T2880] WARNING: possible circular locking dependency detected
[   42.484455][ T2880] 5.13.0-rc6+ #867 Not tainted
[   42.484508][ T2880] ------------------------------------------------------
[   42.484579][ T2880] bash/2880 is trying to acquire lock:
[   42.484638][ T2880] ffffffffc075b468 (loop_ctl_mutex){+.+.}-{3:3}, at: loop_probe+0x44/0x90 [loop]
[   42.484760][ T2880]
[   42.484760][ T2880] but task is already holding lock:
[   42.484836][ T2880] ffffffff873ffe28 (major_names_lock){+.+.}-{3:3}, at: blk_request_module+0x1f/0x100
[   42.484950][ T2880]
[   42.484950][ T2880] which lock already depends on the new lock.
[   42.484950][ T2880]
[   42.485053][ T2880]
[   42.485053][ T2880] the existing dependency chain (in reverse order) is:
[   42.485144][ T2880]
[   42.485144][ T2880] -> #1 (major_names_lock){+.+.}-{3:3}:
[   42.485230][ T2880]        lock_acquire+0xb3/0x380
[   42.485292][ T2880]        __mutex_lock+0x89/0x8f0
[   42.485350][ T2880]        mutex_lock_nested+0x16/0x20
[   42.485410][ T2880]        unregister_blkdev+0x38/0xb0
[   42.485469][ T2880]        loop_exit+0x44/0xd84 [loop]
[   42.485534][ T2880]        __x64_sys_delete_module+0x135/0x210
[   42.485602][ T2880]        do_syscall_64+0x36/0x70
[   42.485660][ T2880]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[   42.485733][ T2880]
[   42.485733][ T2880] -> #0 (loop_ctl_mutex){+.+.}-{3:3}:
[   42.485817][ T2880]        check_prevs_add+0x16a/0x1040
[   42.487091][ T2880]        __lock_acquire+0x118b/0x1580
[   42.488427][ T2880]        lock_acquire+0xb3/0x380
[   42.489701][ T2880]        __mutex_lock+0x89/0x8f0
[   42.490960][ T2880]        mutex_lock_nested+0x16/0x20
[   42.492374][ T2880]        loop_probe+0x44/0x90 [loop]
[   42.493756][ T2880]        blk_request_module+0xa3/0x100
[   42.495207][ T2880]        blkdev_get_no_open+0x93/0xc0
[   42.496516][ T2880]        blkdev_get_by_dev+0x56/0x200
[   42.497735][ T2880]        blkdev_open+0x5d/0xa0
[   42.498919][ T2880]        do_dentry_open+0x163/0x3b0
[   42.500157][ T2880]        vfs_open+0x28/0x30
[   42.501312][ T2880]        path_openat+0x7e6/0xad0
[   42.502443][ T2880]        do_filp_open+0x9f/0x110
[   42.503592][ T2880]        do_sys_openat2+0x245/0x310
[   42.504647][ T2880]        do_sys_open+0x48/0x80
[   42.505689][ T2880]        __x64_sys_open+0x1c/0x20
[   42.506730][ T2880]        do_syscall_64+0x36/0x70
[   42.507795][ T2880]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[   42.508890][ T2880]
[   42.508890][ T2880] other info that might help us debug this:
[   42.508890][ T2880]
[   42.511436][ T2880]  Possible unsafe locking scenario:
[   42.511436][ T2880]
[   42.512303][ T2880]        CPU0                    CPU1
[   42.512727][ T2880]        ----                    ----
[   42.513143][ T2880]   lock(major_names_lock);
[   42.513557][ T2880]                                lock(loop_ctl_mutex);
[   42.513987][ T2880]                                lock(major_names_lock);
[   42.514417][ T2880]   lock(loop_ctl_mutex);
[   42.514841][ T2880]
[   42.514841][ T2880]  *** DEADLOCK ***
[   42.514841][ T2880]
[   42.516112][ T2880] 1 lock held by bash/2880:
[   42.516518][ T2880]  #0: ffffffff873ffe28 (major_names_lock){+.+.}-{3:3}, at: blk_request_module+0x1f/0x100
[   42.517383][ T2880]
[   42.517383][ T2880] stack backtrace:
[   42.518228][ T2880] CPU: 3 PID: 2880 Comm: bash Not tainted 5.13.0-rc6+ #867
[   42.518679][ T2880] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   42.519650][ T2880] Call Trace:
[   42.520128][ T2880]  dump_stack+0x76/0x95
[   42.520608][ T2880]  print_circular_bug.isra.50.cold.71+0x13c/0x141
[   42.521105][ T2880]  check_noncircular+0xfe/0x110
[   42.521607][ T2880]  check_prevs_add+0x16a/0x1040
[   42.522106][ T2880]  __lock_acquire+0x118b/0x1580
[   42.522606][ T2880]  lock_acquire+0xb3/0x380
[   42.523244][ T2880]  ? loop_probe+0x44/0x90 [loop]
[   42.523753][ T2880]  __mutex_lock+0x89/0x8f0
[   42.524250][ T2880]  ? loop_probe+0x44/0x90 [loop]
[   42.524749][ T2880]  ? loop_probe+0x44/0x90 [loop]
[   42.525290][ T2880]  ? blkdev_get_by_dev+0x200/0x200
[   42.525790][ T2880]  ? vprintk_default+0x18/0x20
[   42.526286][ T2880]  ? vprintk+0x56/0x130
[   42.526797][ T2880]  ? blkdev_get_by_dev+0x200/0x200
[   42.527286][ T2880]  mutex_lock_nested+0x16/0x20
[   42.527768][ T2880]  ? mutex_lock_nested+0x16/0x20
[   42.528250][ T2880]  loop_probe+0x44/0x90 [loop]
[   42.528730][ T2880]  blk_request_module+0xa3/0x100
[   42.529209][ T2880]  blkdev_get_no_open+0x93/0xc0
[   42.529691][ T2880]  blkdev_get_by_dev+0x56/0x200
[   42.530176][ T2880]  ? blkdev_get_by_dev+0x200/0x200
[   42.530688][ T2880]  blkdev_open+0x5d/0xa0
[   42.531170][ T2880]  do_dentry_open+0x163/0x3b0
[   42.531656][ T2880]  vfs_open+0x28/0x30
[   42.532132][ T2880]  path_openat+0x7e6/0xad0
[   42.532612][ T2880]  do_filp_open+0x9f/0x110
[   42.533094][ T2880]  ? _raw_spin_unlock+0x1d/0x30
[   42.533582][ T2880]  ? alloc_fd+0x116/0x200
[   42.534069][ T2880]  do_sys_openat2+0x245/0x310
[   42.534566][ T2880]  do_sys_open+0x48/0x80
[   42.535015][ T2880]  __x64_sys_open+0x1c/0x20
[   42.535462][ T2880]  do_syscall_64+0x36/0x70
[   42.535913][ T2880]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   42.536364][ T2880] RIP: 0033:0x7f16f15737f0
[   42.536797][ T2880] Code: 48 8b 15 83 76 2d 00 f7 d8 64 89 02 48 83 c8 ff c3 66 0f 1f 84 00 00 00 00 00 83 3d cd d7 2d 00 00 75 10 b8 02 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 1e cf 01 00 48 89 04 24
[   42.538148][ T2880] RSP: 002b:00007ffde9e2df58 EFLAGS: 00000246 ORIG_RAX: 0000000000000002
[   42.538671][ T2880] RAX: ffffffffffffffda RBX: 00000000021b5020 RCX: 00007f16f15737f0
[   42.539173][ T2880] RDX: 00000000000001b6 RSI: 0000000000000042 RDI: 00000000021bd310
[   42.539681][ T2880] RBP: 00007ffde9e2dfe0 R08: 0000000000000020 R09: 00000000021bd310
[   42.540187][ T2880] R10: 00000000fffffff0 R11: 0000000000000246 R12: 000000000000000b
[   42.540696][ T2880] R13: 0000000000000001 R14: 0000000000000064 R15: 0000000000000000
[  246.772906][   T35] INFO: task bash:2880 blocked for more than 122 seconds.
[  246.774289][   T35]       Not tainted 5.13.0-rc6+ #867
[  246.775478][   T35] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  246.776734][   T35] task:bash            state:D stack:    0 pid: 2880 ppid:  2856 flags:0x00004000
[  246.779550][   T35] Call Trace:
[  246.781054][   T35]  __schedule+0x271/0x9e0
[  246.782381][   T35]  schedule+0x50/0xc0
[  246.783671][   T35]  schedule_preempt_disabled+0x10/0x20
[  246.784978][   T35]  __mutex_lock+0x467/0x8f0
[  246.786277][   T35]  ? loop_probe+0x44/0x90 [loop]
[  246.787582][   T35]  ? blkdev_get_by_dev+0x200/0x200
[  246.789093][   T35]  ? blkdev_get_by_dev+0x200/0x200
[  246.790409][   T35]  mutex_lock_nested+0x16/0x20
[  246.791700][   T35]  ? mutex_lock_nested+0x16/0x20
[  246.792992][   T35]  loop_probe+0x44/0x90 [loop]
[  246.794286][   T35]  blk_request_module+0xa3/0x100
[  246.795757][   T35]  blkdev_get_no_open+0x93/0xc0
[  246.797066][   T35]  blkdev_get_by_dev+0x56/0x200
[  246.798360][   T35]  ? blkdev_get_by_dev+0x200/0x200
[  246.799661][   T35]  blkdev_open+0x5d/0xa0
[  246.800953][   T35]  do_dentry_open+0x163/0x3b0
[  246.802258][   T35]  vfs_open+0x28/0x30
[  246.803559][   T35]  path_openat+0x7e6/0xad0
[  246.805201][   T35]  do_filp_open+0x9f/0x110
[  246.807128][   T35]  ? _raw_spin_unlock+0x1d/0x30
[  246.808461][   T35]  ? alloc_fd+0x116/0x200
[  246.809763][   T35]  do_sys_openat2+0x245/0x310
[  246.811034][   T35]  do_sys_open+0x48/0x80
[  246.812257][   T35]  __x64_sys_open+0x1c/0x20
[  246.813416][   T35]  do_syscall_64+0x36/0x70
[  246.814522][   T35]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  246.815624][   T35] RIP: 0033:0x7f16f15737f0
[  246.816707][   T35] RSP: 002b:00007ffde9e2df58 EFLAGS: 00000246 ORIG_RAX: 0000000000000002
[  246.817863][   T35] RAX: ffffffffffffffda RBX: 00000000021b5020 RCX: 00007f16f15737f0
[  246.819033][   T35] RDX: 00000000000001b6 RSI: 0000000000000042 RDI: 00000000021bd310
[  246.820059][   T35] RBP: 00007ffde9e2dfe0 R08: 0000000000000020 R09: 00000000021bd310
[  246.820687][   T35] R10: 00000000fffffff0 R11: 0000000000000246 R12: 000000000000000b
[  246.821292][   T35] R13: 0000000000000001 R14: 0000000000000064 R15: 0000000000000000
[  246.821882][   T35] INFO: task rmmod:2882 blocked for more than 122 seconds.
[  246.822456][   T35]       Not tainted 5.13.0-rc6+ #867
[  246.823011][   T35] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  246.823599][   T35] task:rmmod           state:D stack:    0 pid: 2882 ppid:  2856 flags:0x00000000
[  246.824799][   T35] Call Trace:
[  246.825405][   T35]  __schedule+0x271/0x9e0
[  246.826018][   T35]  schedule+0x50/0xc0
[  246.826621][   T35]  schedule_preempt_disabled+0x10/0x20
[  246.827232][   T35]  __mutex_lock+0x467/0x8f0
[  246.827854][   T35]  ? unregister_blkdev+0x38/0xb0
[  246.828608][   T35]  mutex_lock_nested+0x16/0x20
[  246.829237][   T35]  ? mutex_lock_nested+0x16/0x20
[  246.829858][   T35]  unregister_blkdev+0x38/0xb0
[  246.830478][   T35]  loop_exit+0x44/0xd84 [loop]
[  246.831112][   T35]  __x64_sys_delete_module+0x135/0x210
[  246.831749][   T35]  do_syscall_64+0x36/0x70
[  246.832379][   T35]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  246.833019][   T35] RIP: 0033:0x7ff4990fcee7
[  246.833655][   T35] RSP: 002b:00007ffe445ecab8 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
[  246.834326][   T35] RAX: ffffffffffffffda RBX: 0000000000a21220 RCX: 00007ff4990fcee7
[  246.835011][   T35] RDX: 00007ff499171a80 RSI: 0000000000000800 RDI: 0000000000a21288
[  246.835794][   T35] RBP: 0000000000000000 R08: 00007ff4993c6060 R09: 00007ff499171a80
[  246.836501][   T35] R10: 00007ffe445ec520 R11: 0000000000000206 R12: 00007ffe445ee840
[  246.837206][   T35] R13: 0000000000000000 R14: 0000000000a21220 R15: 0000000000a21010
[  246.837922][   T35] INFO: lockdep is turned off.
----------

Christoph and Desmond commented

  > > And now you can all probe after it has been freed and/or the module has
  > > been unloaded. The obviously correct fix is to only hold mtd_table_mutex
  > > for the actually required critical section:
  > > 
  > 
  > Thank you for the correction, Christoph. I hadn't thought of the 
  > scenario where the module is unloaded. I'll be more conscientious in the 
  > future.

at https://lkml.kernel.org/r/YMs3O/ce1567cf-bc94-790c-cfc0-e4e429e1a86a@gmail.com
but the fact that AB-BA deadlock is possible was forgotten. Therefore, I propose
this patch for fixing commit a160c6159d4a0cf8 ("block: add an optional probe callback
to major_names") which started calling the probe function without making sure that
the module will not go away.

>> @@ -676,14 +679,29 @@ void blk_request_module(dev_t devt)
>>  {
>>  	unsigned int major = MAJOR(devt);
>>  	struct blk_major_name **n;
>> +	void (*probe_fn)(dev_t devt);
>>  
>>  	mutex_lock(&major_names_lock);
>>  	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
>> -		if ((*n)->major == major && (*n)->probe) {
>> -			(*n)->probe(devt);
>> -			mutex_unlock(&major_names_lock);
>> -			return;
>> -		}
>> +		if ((*n)->major != major || !(*n)->probe)
>> +			continue;
>> +		if (!try_module_get((*n)->owner))
>> +			break;
> 
> So you just fail?  Are you sure that is ok?

After "break;", the control reaches request_module() (which waits for module
unloading to complete and then tries to load that module again).

I think failing open() for once due to racing with module unloading is acceptable
(as long as there is no leak/deadlock/oops etc.), but do we want to immediately
retry this loop after returning from request_module() ?

Also, to make sure that the __init function for module foo and the probe
function for module foo are serialized, should we also verify that
(*n)->owner && (*n)->owner->state == MODULE_STATE_LIVE (which indicates that
the __init function for that module has completed) because module_is_live()
 from try_module_get() is rather weak?

----------
/* FIXME: It'd be nice to isolate modules during init, too, so they
   aren't used before they (may) fail.  But presently too much code
   (IDE & SCSI) require entry into the module during init.*/
static inline bool module_is_live(struct module *mod)
{
	return mod->state != MODULE_STATE_GOING;
}
----------

>> +		/*
>> +		 * Assuming that unregister_blkdev() is called from module's
>> +		 * __exit function, a module refcount taken above allows us
>> +		 * to safely call probe function without major_names_lock held.
>> +		 */
>> +		probe_fn(devt);
>> +		module_put((*n)->owner);
> 
> So you are trying to keep the module in memory while the probe is call,
> ok.  But module removal is not an issue, you remove modules at your own
> risk.  As syzbot isn't even testing this, why is this an issue?

That's a crazy comment. A module removal bug (unless forced unloading is used)
is an issue. Why not fix bugs where syzbot cannot test?


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] block: genhd: don't call probe function with major_names_lock held
       [not found]   ` <20210620024403.820-1-hdanton@sina.com>
@ 2021-06-20 13:54     ` Tetsuo Handa
  2021-06-21  8:54       ` Greg KH
  0 siblings, 1 reply; 29+ messages in thread
From: Tetsuo Handa @ 2021-06-20 13:54 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Greg KH, hch, axboe, desmondcheongzx, linux-block,
	linux-kernel-mentees, linux-kernel, linux-mtd, miquel.raynal,
	richard, Shuah Khan, syzbot+6a8a0d93c91e8fbf2e80, vigneshr

On 2021/06/20 11:44, Hillf Danton wrote:
> Good craft in regard to triggering the ABBA deadlock, but curious why not
> move unregister_blkdev out of and before loop_ctl_mutex, given it will also
> serialise with the prober.
> 

Well, something like this untested diff?

Call unregister_blkdev() as soon as __exit function starts, for calling
probe function after cleanup started will be unexpected for __exit function.

Keep probe function no-op until __init function ends, for probe function
might be called as soon as __register_blkdev() succeeded but calling probe
function before setup completes will be unexpected for __init function.

 drivers/block/ataflop.c |    6 +++++-
 drivers/block/brd.c     |    8 ++++++--
 drivers/block/floppy.c  |    4 ++++
 drivers/block/loop.c    |    4 ++--
 drivers/ide/ide-probe.c |    8 +++++++-
 drivers/md/md.c         |    5 +++++
 drivers/scsi/sd.c       |   10 +---------
 7 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index d601e49f80e0..3681e8c493b1 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1995,6 +1995,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type)
 }
 
 static DEFINE_MUTEX(ataflop_probe_lock);
+static bool module_initialize_completed;
 
 static void ataflop_probe(dev_t dev)
 {
@@ -2006,6 +2007,8 @@ static void ataflop_probe(dev_t dev)
 
 	if (drive >= FD_MAX_UNITS || type >= NUM_DISK_MINORS)
 		return;
+	if (!module_initialize_completed)
+		return;
 	mutex_lock(&ataflop_probe_lock);
 	if (!unit[drive].disk[type]) {
 		if (ataflop_alloc_disk(drive, type) == 0)
@@ -2080,6 +2083,7 @@ static int __init atari_floppy_init (void)
 	       UseTrackbuffer ? "" : "no ");
 	config_types();
 
+	module_initialize_completed = true;
 	return 0;
 
 err:
@@ -2138,6 +2142,7 @@ static void __exit atari_floppy_exit(void)
 {
 	int i, type;
 
+	unregister_blkdev(FLOPPY_MAJOR, "fd");
 	for (i = 0; i < FD_MAX_UNITS; i++) {
 		for (type = 0; type < NUM_DISK_MINORS; type++) {
 			if (!unit[i].disk[type])
@@ -2148,7 +2153,6 @@ static void __exit atari_floppy_exit(void)
 		}
 		blk_mq_free_tag_set(&unit[i].tag_set);
 	}
-	unregister_blkdev(FLOPPY_MAJOR, "fd");
 
 	del_timer_sync(&fd_timer);
 	atari_stram_free( DMABuffer );
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 7562cf30b14e..91a10c938e65 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -371,6 +371,7 @@ __setup("ramdisk_size=", ramdisk_size);
 static LIST_HEAD(brd_devices);
 static DEFINE_MUTEX(brd_devices_mutex);
 static struct dentry *brd_debugfs_dir;
+static bool module_initialize_completed;
 
 static struct brd_device *brd_alloc(int i)
 {
@@ -439,6 +440,8 @@ static void brd_probe(dev_t dev)
 	struct brd_device *brd;
 	int i = MINOR(dev) / max_part;
 
+	if (!module_initialize_completed)
+		return;
 	mutex_lock(&brd_devices_mutex);
 	list_for_each_entry(brd, &brd_devices, brd_list) {
 		if (brd->brd_number == i)
@@ -530,6 +533,7 @@ static int __init brd_init(void)
 	mutex_unlock(&brd_devices_mutex);
 
 	pr_info("brd: module loaded\n");
+	module_initialize_completed = true;
 	return 0;
 
 out_free:
@@ -550,13 +554,13 @@ static void __exit brd_exit(void)
 {
 	struct brd_device *brd, *next;
 
+	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
 	debugfs_remove_recursive(brd_debugfs_dir);
 
 	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
 		brd_del_one(brd);
 
-	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
-
 	pr_info("brd: module unloaded\n");
 }
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8a9d22207c59..37b8e53c183d 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4523,6 +4523,7 @@ static int floppy_alloc_disk(unsigned int drive, unsigned int type)
 }
 
 static DEFINE_MUTEX(floppy_probe_lock);
+static bool module_initialize_completed;
 
 static void floppy_probe(dev_t dev)
 {
@@ -4533,6 +4534,8 @@ static void floppy_probe(dev_t dev)
 	    type >= ARRAY_SIZE(floppy_type))
 		return;
 
+	if (!module_initialize_completed)
+		return;
 	mutex_lock(&floppy_probe_lock);
 	if (!disks[drive][type]) {
 		if (floppy_alloc_disk(drive, type) == 0)
@@ -4705,6 +4708,7 @@ static int __init do_floppy_init(void)
 				NULL);
 	}
 
+	module_initialize_completed = true;
 	return 0;
 
 out_remove_drives:
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 76e12f3482a9..08aef61ab791 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2386,13 +2386,13 @@ static int loop_exit_cb(int id, void *ptr, void *data)
 
 static void __exit loop_exit(void)
 {
+	unregister_blkdev(LOOP_MAJOR, "loop");
+
 	mutex_lock(&loop_ctl_mutex);
 
 	idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
 	idr_destroy(&loop_index_idr);
 
-	unregister_blkdev(LOOP_MAJOR, "loop");
-
 	misc_deregister(&loop_misc);
 
 	mutex_unlock(&loop_ctl_mutex);
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index aefd74c0d862..8c71356cdcfe 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -40,6 +40,8 @@
 #include <linux/uaccess.h>
 #include <asm/io.h>
 
+static bool module_initialize_completed;
+
 /**
  *	generic_id		-	add a generic drive id
  *	@drive:	drive to make an ID block for
@@ -904,6 +906,8 @@ static int init_irq (ide_hwif_t *hwif)
 
 static void ata_probe(dev_t dev)
 {
+	if (!module_initialize_completed)
+		return;
 	request_module("ide-disk");
 	request_module("ide-cd");
 	request_module("ide-tape");
@@ -1475,6 +1479,8 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d,
 		}
 	}
 
+	if (j)
+		module_initialize_completed = true;
 	return j ? 0 : -1;
 }
 EXPORT_SYMBOL_GPL(ide_host_register);
@@ -1539,6 +1545,7 @@ EXPORT_SYMBOL_GPL(ide_port_unregister_devices);
 
 static void ide_unregister(ide_hwif_t *hwif)
 {
+	unregister_blkdev(hwif->major, hwif->name);
 	mutex_lock(&ide_cfg_mtx);
 
 	if (hwif->present) {
@@ -1559,7 +1566,6 @@ static void ide_unregister(ide_hwif_t *hwif)
 	 * Remove us from the kernel's knowledge
 	 */
 	kfree(hwif->sg_table);
-	unregister_blkdev(hwif->major, hwif->name);
 
 	ide_release_dma_engine(hwif);
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 49f897fbb89b..6603900062bc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -68,6 +68,8 @@
 #include "md-bitmap.h"
 #include "md-cluster.h"
 
+static bool module_initialize_completed;
+
 /* pers_list is a list of registered personalities protected
  * by pers_lock.
  * pers_lock does extra service to protect accesses to
@@ -5776,6 +5778,8 @@ static void md_probe(dev_t dev)
 {
 	if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
 		return;
+	if (!module_initialize_completed)
+		return;
 	if (create_on_open)
 		md_alloc(dev, NULL);
 }
@@ -9590,6 +9594,7 @@ static int __init md_init(void)
 	raid_table_header = register_sysctl_table(raid_root_table);
 
 	md_geninit();
+	module_initialize_completed = true;
 	return 0;
 
 err_mdp:
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index cb3c37d1e009..4fc8f4db2ccf 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -629,14 +629,6 @@ static struct scsi_driver sd_template = {
 	.eh_reset		= sd_eh_reset,
 };
 
-/*
- * Don't request a new module, as that could deadlock in multipath
- * environment.
- */
-static void sd_default_probe(dev_t devt)
-{
-}
-
 /*
  * Device no to disk mapping:
  * 
@@ -3715,7 +3707,7 @@ static int __init init_sd(void)
 	SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
 
 	for (i = 0; i < SD_MAJORS; i++) {
-		if (__register_blkdev(sd_major(i), "sd", sd_default_probe))
+		if (register_blkdev(sd_major(i), "sd"))
 			continue;
 		majors++;
 	}


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] block: genhd: don't call probe function with major_names_lock held
  2021-06-19  1:05 [PATCH v2] block: genhd: don't call probe function with major_names_lock held Tetsuo Handa
                   ` (2 preceding siblings ...)
  2021-06-19  6:44 ` Greg KH
@ 2021-06-21  6:18 ` Christoph Hellwig
  2021-08-15  6:52 ` [PATCH v3] " Tetsuo Handa
  4 siblings, 0 replies; 29+ messages in thread
From: Christoph Hellwig @ 2021-06-21  6:18 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: hch, axboe, desmondcheongzx, gregkh, linux-block,
	linux-kernel-mentees, linux-kernel, linux-mtd, miquel.raynal,
	richard, Shuah Khan, syzbot+6a8a0d93c91e8fbf2e80, vigneshr

The obvious fix is to not call unregister_blkdev under loop_ctl_mutex.
Which is pretty pointless anyway - I have a series to fix that but it
might need some reordering to move this to the front.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] block: genhd: don't call probe function with major_names_lock held
  2021-06-20 13:54     ` Tetsuo Handa
@ 2021-06-21  8:54       ` Greg KH
  0 siblings, 0 replies; 29+ messages in thread
From: Greg KH @ 2021-06-21  8:54 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Hillf Danton, hch, axboe, desmondcheongzx, linux-block,
	linux-kernel-mentees, linux-kernel, linux-mtd, miquel.raynal,
	richard, Shuah Khan, syzbot+6a8a0d93c91e8fbf2e80, vigneshr

On Sun, Jun 20, 2021 at 10:54:20PM +0900, Tetsuo Handa wrote:
> On 2021/06/20 11:44, Hillf Danton wrote:
> > Good craft in regard to triggering the ABBA deadlock, but curious why not
> > move unregister_blkdev out of and before loop_ctl_mutex, given it will also
> > serialise with the prober.
> > 
> 
> Well, something like this untested diff?
> 
> Call unregister_blkdev() as soon as __exit function starts, for calling
> probe function after cleanup started will be unexpected for __exit function.
> 
> Keep probe function no-op until __init function ends, for probe function
> might be called as soon as __register_blkdev() succeeded but calling probe
> function before setup completes will be unexpected for __init function.
> 
>  drivers/block/ataflop.c |    6 +++++-
>  drivers/block/brd.c     |    8 ++++++--
>  drivers/block/floppy.c  |    4 ++++
>  drivers/block/loop.c    |    4 ++--
>  drivers/ide/ide-probe.c |    8 +++++++-
>  drivers/md/md.c         |    5 +++++
>  drivers/scsi/sd.c       |   10 +---------
>  7 files changed, 30 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
> index d601e49f80e0..3681e8c493b1 100644
> --- a/drivers/block/ataflop.c
> +++ b/drivers/block/ataflop.c
> @@ -1995,6 +1995,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type)
>  }
>  
>  static DEFINE_MUTEX(ataflop_probe_lock);
> +static bool module_initialize_completed;

This is almost always wrong.

>  
>  static void ataflop_probe(dev_t dev)
>  {
> @@ -2006,6 +2007,8 @@ static void ataflop_probe(dev_t dev)
>  
>  	if (drive >= FD_MAX_UNITS || type >= NUM_DISK_MINORS)
>  		return;
> +	if (!module_initialize_completed)
> +		return;

This is not correct, when you register a callback structure, it can be
instantly called.  Do not expect it to be delayed until later.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v3] block: genhd: don't call probe function with major_names_lock held
  2021-06-19  1:05 [PATCH v2] block: genhd: don't call probe function with major_names_lock held Tetsuo Handa
                   ` (3 preceding siblings ...)
  2021-06-21  6:18 ` Christoph Hellwig
@ 2021-08-15  6:52 ` Tetsuo Handa
  2021-08-15  7:06   ` Greg KH
  2021-08-16  7:33   ` [PATCH v3] " Christoph Hellwig
  4 siblings, 2 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-15  6:52 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig
  Cc: Greg KH, Linus Torvalds, Hannes Reinecke, Chaitanya Kulkarni,
	Hillf Danton, Desmond Cheong Zhi Xi, linux-block

syzbot is reporting circular locking problem at __loop_clr_fd() [1], for
commit a160c6159d4a0cf8 ("block: add an optional probe callback to
major_names") is calling the module's probe function with major_names_lock
held.

When copying content of /proc/devices to another file via sendfile(),

  sb_writers#$N => &p->lock => major_names_lock

dependency is recorded.

When loop_process_work() from WQ context performs a write request,

  (wq_completion)loop$M => (work_completion)&lo->rootcg_work =>
  sb_writers#$N

dependency is recorded.

When flush_workqueue() from drain_workqueue() from destroy_workqueue()
 from __loop_clr_fd() from blkdev_put() from blkdev_close() from __fput()
is called,

  &disk->open_mutex => &lo->lo_mutex => (wq_completion)loop$M

dependency is recorded.

When loop_control_remove() from loop_control_ioctl(LOOP_CTL_REMOVE) is
called,

  loop_ctl_mutex => &lo->lo_mutex

dependency is recorded.

As a result, lockdep thinks that there is

  loop_ctl_mutex => &lo->lo_mutex => (wq_completion)loop$M =>
  (work_completion)&lo->rootcg_work => sb_writers#$N => &p->lock =>
  major_names_lock

dependency chain.

Then, if loop_add() from loop_probe() from blk_request_module() from
blkdev_get_no_open() from blkdev_get_by_dev() from blkdev_open() from
do_dentry_open() from path_openat() from do_filp_open() is called,

  major_names_lock => loop_ctl_mutex

dependency is appended to the dependency chain.

There would be two approaches for breaking this circular dependency.
One is to kill loop_ctl_mutex => &lo->lo_mutex chain. The other is to
kill major_names_lock => loop_ctl_mutex chain. This patch implements
the latter, due to the following reasons.

 (1) sb_writers#$N => &p->lock => major_names_lock chain is unavoidable

 (2) this patch can also fix similar problem in other modules [2] which
     is caused by calling the probe function with major_names_lock held

 (3) I believe that this patch is principally safer than e.g.
     commit bd5c39edad535d9f ("loop: reduce loop_ctl_mutex coverage in
     loop_exit") which waits until the probe function finishes using
     global mutex in order to fix deadlock reproducible by sleep
     injection [3]

This patch adds THIS_MODULE parameter to __register_blkdev(), based on an
assumption that a probe callback passed to __register_blkdev() belongs to
a module which calls __register_blkdev(). Then, drop major_names_lock
before calling probe function by holding a reference to that module which
contains that probe function.

It may sound strange to pass THIS_MODULE as a function argument, but
what this patch is doing is essentially the same with passing e.g.
"struct file_system_type" argument initialized with .owner = THIS_MODULE
to register_filesystem(). To minimize lines changed, this patch does not
define some "struct" for __register_blkdev().

Link: https://syzkaller.appspot.com/bug?id=7bb10e8b62f83e4d445cdf4c13d69e407e629558 [1]
Link: https://syzkaller.appspot.com/bug?id=7bd106c28e846d1023d4ca915718b1a0905444cb [2]
Link: https://lkml.kernel.org/r/c4edf07f-92e1-a350-2743-f0b0234a2b6c@i-love.sakura.ne.jp [3]
Reported-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
Reported-by: syzbot <syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
Fixes: a160c6159d4a0cf8 ("block: add an optional probe callback to major_names")
---
 block/genhd.c           | 32 +++++++++++++++++++++++++-------
 drivers/block/ataflop.c |  2 +-
 drivers/block/brd.c     |  2 +-
 drivers/block/floppy.c  |  2 +-
 drivers/block/loop.c    |  2 +-
 drivers/md/md.c         |  4 ++--
 drivers/scsi/sd.c       |  2 +-
 include/linux/genhd.h   |  4 ++--
 8 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 298ee78c1bda..e7c75c5aa831 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -162,6 +162,7 @@ static struct blk_major_name {
 	int major;
 	char name[16];
 	void (*probe)(dev_t devt);
+	struct module *owner;
 } *major_names[BLKDEV_MAJOR_HASH_SIZE];
 static DEFINE_MUTEX(major_names_lock);
 
@@ -190,7 +191,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
  * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
  *         @major = 0, try to allocate any unused major number.
  * @name: the name of the new block device as a zero terminated string
- * @probe: allback that is called on access to any minor number of @major
+ * @probe: callback that is called on access to any minor number of @major
+ * @owner: THIS_MODULE if @probe is not NULL, ignored if @probe is NULL.
  *
  * The @name must be unique within the system.
  *
@@ -208,7 +210,7 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
  * Use register_blkdev instead for any new code.
  */
 int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt))
+		      void (*probe)(dev_t devt), struct module *owner)
 {
 	struct blk_major_name **n, *p;
 	int index, ret = 0;
@@ -248,6 +250,7 @@ int __register_blkdev(unsigned int major, const char *name,
 
 	p->major = major;
 	p->probe = probe;
+	p->owner = owner;
 	strlcpy(p->name, name, sizeof(p->name));
 	p->next = NULL;
 	index = major_to_index(major);
@@ -653,14 +656,29 @@ void blk_request_module(dev_t devt)
 {
 	unsigned int major = MAJOR(devt);
 	struct blk_major_name **n;
+	void (*probe_fn)(dev_t devt);
 
 	mutex_lock(&major_names_lock);
 	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
-		if ((*n)->major == major && (*n)->probe) {
-			(*n)->probe(devt);
-			mutex_unlock(&major_names_lock);
-			return;
-		}
+		if ((*n)->major != major || !(*n)->probe)
+			continue;
+		if (!try_module_get((*n)->owner))
+			break;
+		/*
+		 * Calling probe function with major_names_lock held causes
+		 * circular locking dependency problem. Thus, call it after
+		 * releasing major_names_lock.
+		 */
+		probe_fn = (*n)->probe;
+		mutex_unlock(&major_names_lock);
+		/*
+		 * Assuming that unregister_blkdev() is called from module's
+		 * __exit function, a module refcount taken above allows us
+		 * to safely call probe function without major_names_lock held.
+		 */
+		probe_fn(devt);
+		module_put((*n)->owner);
+		return;
 	}
 	mutex_unlock(&major_names_lock);
 
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index a093644ac39f..1b7fe10d49e7 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -2016,7 +2016,7 @@ static int __init atari_floppy_init (void)
 		return -ENODEV;
 
 	mutex_lock(&ataflop_probe_lock);
-	ret = __register_blkdev(FLOPPY_MAJOR, "fd", ataflop_probe);
+	ret = __register_blkdev(FLOPPY_MAJOR, "fd", ataflop_probe, THIS_MODULE);
 	if (ret)
 		goto out_unlock;
 
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 95694113e38e..d0bdfd56dfc8 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -487,7 +487,7 @@ static int __init brd_init(void)
 	 *	dynamically.
 	 */
 
-	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe))
+	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe, THIS_MODULE))
 		return -EIO;
 
 	brd_check_and_reset_par();
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 87460e0e5c72..ee33ba03e6bd 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4570,7 +4570,7 @@ static int __init do_floppy_init(void)
 		timer_setup(&motor_off_timer[drive], motor_off_callback, 0);
 	}
 
-	err = __register_blkdev(FLOPPY_MAJOR, "fd", floppy_probe);
+	err = __register_blkdev(FLOPPY_MAJOR, "fd", floppy_probe, THIS_MODULE);
 	if (err)
 		goto out_put_disk;
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f0cdff0c5fbf..d6606c3b7d74 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2564,7 +2564,7 @@ static int __init loop_init(void)
 		goto err_out;
 
 
-	if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) {
+	if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe, THIS_MODULE)) {
 		err = -EIO;
 		goto misc_out;
 	}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ae8fe54ea358..c13f45c0f502 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9584,11 +9584,11 @@ static int __init md_init(void)
 	if (!md_rdev_misc_wq)
 		goto err_rdev_misc_wq;
 
-	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
+	ret = __register_blkdev(MD_MAJOR, "md", md_probe, THIS_MODULE);
 	if (ret < 0)
 		goto err_md;
 
-	ret = __register_blkdev(0, "mdp", md_probe);
+	ret = __register_blkdev(0, "mdp", md_probe, THIS_MODULE);
 	if (ret < 0)
 		goto err_mdp;
 	mdp_major = ret;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index b8d55af763f9..ddd67a1045e7 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3729,7 +3729,7 @@ static int __init init_sd(void)
 	SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
 
 	for (i = 0; i < SD_MAJORS; i++) {
-		if (__register_blkdev(sd_major(i), "sd", sd_default_probe))
+		if (__register_blkdev(sd_major(i), "sd", sd_default_probe, THIS_MODULE))
 			continue;
 		majors++;
 	}
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 13b34177cc85..b0948003071d 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -303,9 +303,9 @@ struct gendisk *__blk_alloc_disk(int node);
 void blk_cleanup_disk(struct gendisk *disk);
 
 int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt));
+		      void (*probe)(dev_t devt), struct module *owner);
 #define register_blkdev(major, name) \
-	__register_blkdev(major, name, NULL)
+	__register_blkdev(major, name, NULL, NULL)
 void unregister_blkdev(unsigned int major, const char *name);
 
 bool bdev_check_media_change(struct block_device *bdev);
-- 
2.18.4



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v3] block: genhd: don't call probe function with major_names_lock held
  2021-08-15  6:52 ` [PATCH v3] " Tetsuo Handa
@ 2021-08-15  7:06   ` Greg KH
  2021-08-15  7:49     ` Tetsuo Handa
  2021-08-16  7:33   ` [PATCH v3] " Christoph Hellwig
  1 sibling, 1 reply; 29+ messages in thread
From: Greg KH @ 2021-08-15  7:06 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Jens Axboe, Christoph Hellwig, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block

On Sun, Aug 15, 2021 at 03:52:45PM +0900, Tetsuo Handa wrote:
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -303,9 +303,9 @@ struct gendisk *__blk_alloc_disk(int node);
>  void blk_cleanup_disk(struct gendisk *disk);
>  
>  int __register_blkdev(unsigned int major, const char *name,
> -		void (*probe)(dev_t devt));
> +		      void (*probe)(dev_t devt), struct module *owner);
>  #define register_blkdev(major, name) \
> -	__register_blkdev(major, name, NULL)
> +	__register_blkdev(major, name, NULL, NULL)
>  void unregister_blkdev(unsigned int major, const char *name);

Do not force modules to put their own THIS_MODULE macro as a parameter,
put it in the .h file so that it happens automagically, much like the
usb_register() define in include/linux/usb.h is created.

If you do that, you can probably get rid of the __register_blkdev()
direct calls as well...

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3] block: genhd: don't call probe function with major_names_lock held
  2021-08-15  7:06   ` Greg KH
@ 2021-08-15  7:49     ` Tetsuo Handa
  2021-08-15  9:19       ` Greg KH
  0 siblings, 1 reply; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-15  7:49 UTC (permalink / raw)
  To: Greg KH
  Cc: Jens Axboe, Christoph Hellwig, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block

On 2021/08/15 16:06, Greg KH wrote:
> On Sun, Aug 15, 2021 at 03:52:45PM +0900, Tetsuo Handa wrote:
>> --- a/include/linux/genhd.h
>> +++ b/include/linux/genhd.h
>> @@ -303,9 +303,9 @@ struct gendisk *__blk_alloc_disk(int node);
>>  void blk_cleanup_disk(struct gendisk *disk);
>>  
>>  int __register_blkdev(unsigned int major, const char *name,
>> -		void (*probe)(dev_t devt));
>> +		      void (*probe)(dev_t devt), struct module *owner);
>>  #define register_blkdev(major, name) \
>> -	__register_blkdev(major, name, NULL)
>> +	__register_blkdev(major, name, NULL, NULL)
>>  void unregister_blkdev(unsigned int major, const char *name);
> 
> Do not force modules to put their own THIS_MODULE macro as a parameter,
> put it in the .h file so that it happens automagically, much like the
> usb_register() define in include/linux/usb.h is created.

Sure. We can do like below.

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 13b34177cc85..70f00641fa11 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -302,10 +302,12 @@ extern void put_disk(struct gendisk *disk);
 struct gendisk *__blk_alloc_disk(int node);
 void blk_cleanup_disk(struct gendisk *disk);
 
-int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt));
+int ____register_blkdev(unsigned int major, const char *name,
+			void (*probe)(dev_t devt), struct module *owner);
+#define __register_blkdev(major, name, probe) \
+	____register_blkdev(major, name, probe, THIS_MODULE)
 #define register_blkdev(major, name) \
-	__register_blkdev(major, name, NULL)
+	____register_blkdev(major, name, NULL, NULL)
 void unregister_blkdev(unsigned int major, const char *name);
 
 bool bdev_check_media_change(struct block_device *bdev);

> 
> If you do that, you can probably get rid of the __register_blkdev()
> direct calls as well...

I assume "automagically" implies "do not patch individual unregister_blkdev() /
__register_blkdev() callers". But "removing __register_blkdev() direct calls"
requires "patching individual __register_blkdev() callers". I didn't catch
what you suggested here.

Anyway, since this patch should be backported to 5.11+ kernels, lines changed
should be kept minimal. We can do whatever remapping after this patch.

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v3] block: genhd: don't call probe function with major_names_lock held
  2021-08-15  7:49     ` Tetsuo Handa
@ 2021-08-15  9:19       ` Greg KH
  2021-08-18 11:07         ` [PATCH v4] " Tetsuo Handa
  0 siblings, 1 reply; 29+ messages in thread
From: Greg KH @ 2021-08-15  9:19 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Jens Axboe, Christoph Hellwig, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block

On Sun, Aug 15, 2021 at 04:49:55PM +0900, Tetsuo Handa wrote:
> On 2021/08/15 16:06, Greg KH wrote:
> > On Sun, Aug 15, 2021 at 03:52:45PM +0900, Tetsuo Handa wrote:
> >> --- a/include/linux/genhd.h
> >> +++ b/include/linux/genhd.h
> >> @@ -303,9 +303,9 @@ struct gendisk *__blk_alloc_disk(int node);
> >>  void blk_cleanup_disk(struct gendisk *disk);
> >>  
> >>  int __register_blkdev(unsigned int major, const char *name,
> >> -		void (*probe)(dev_t devt));
> >> +		      void (*probe)(dev_t devt), struct module *owner);
> >>  #define register_blkdev(major, name) \
> >> -	__register_blkdev(major, name, NULL)
> >> +	__register_blkdev(major, name, NULL, NULL)
> >>  void unregister_blkdev(unsigned int major, const char *name);
> > 
> > Do not force modules to put their own THIS_MODULE macro as a parameter,
> > put it in the .h file so that it happens automagically, much like the
> > usb_register() define in include/linux/usb.h is created.
> 
> Sure. We can do like below.
> 
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 13b34177cc85..70f00641fa11 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -302,10 +302,12 @@ extern void put_disk(struct gendisk *disk);
>  struct gendisk *__blk_alloc_disk(int node);
>  void blk_cleanup_disk(struct gendisk *disk);
>  
> -int __register_blkdev(unsigned int major, const char *name,
> -		void (*probe)(dev_t devt));
> +int ____register_blkdev(unsigned int major, const char *name,
> +			void (*probe)(dev_t devt), struct module *owner);
> +#define __register_blkdev(major, name, probe) \
> +	____register_blkdev(major, name, probe, THIS_MODULE)
>  #define register_blkdev(major, name) \
> -	__register_blkdev(major, name, NULL)
> +	____register_blkdev(major, name, NULL, NULL)

Why stop at 4 _ characters?

{sigh}

I think you need a new naming scheme here...

> > 
> > If you do that, you can probably get rid of the __register_blkdev()
> > direct calls as well...
> 
> I assume "automagically" implies "do not patch individual unregister_blkdev() /
> __register_blkdev() callers". But "removing __register_blkdev() direct calls"
> requires "patching individual __register_blkdev() callers". I didn't catch
> what you suggested here.

Yes, that is a different thing, sorry for the confusion.

> Anyway, since this patch should be backported to 5.11+ kernels, lines changed
> should be kept minimal. We can do whatever remapping after this patch.

Don't worry about backports, fix it properly first.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3] block: genhd: don't call probe function with major_names_lock held
  2021-08-15  6:52 ` [PATCH v3] " Tetsuo Handa
  2021-08-15  7:06   ` Greg KH
@ 2021-08-16  7:33   ` Christoph Hellwig
  2021-08-16 14:44     ` Tetsuo Handa
       [not found]     ` <20210817081045.3609-1-hdanton@sina.com>
  1 sibling, 2 replies; 29+ messages in thread
From: Christoph Hellwig @ 2021-08-16  7:33 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Jens Axboe, Christoph Hellwig, Greg KH, Linus Torvalds,
	Hannes Reinecke, Chaitanya Kulkarni, Hillf Danton,
	Desmond Cheong Zhi Xi, linux-block

This is the wrong way to approach it.  Instead of making things ever
more complex try to make it simpler.  In this case, ensure that the
destroy_workqueue is not held with pointless locks.  Given that the
loop device already is known to not have a reference and marked as in
the rundown state there shouldn't be anything that is required to
be protected by lo_mutex.  So something like this untested patch
should probably do the work:

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fa1c298a8cfb..c734dc768316 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1347,16 +1347,15 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	 * became visible.
 	 */
 
-	mutex_lock(&lo->lo_mutex);
 	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
 		err = -ENXIO;
-		goto out_unlock;
+		goto out;
 	}
 
 	filp = lo->lo_backing_file;
 	if (filp == NULL) {
 		err = -EINVAL;
-		goto out_unlock;
+		goto out;
 	}
 
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
@@ -1366,6 +1365,8 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
+
+	mutex_lock(&lo->lo_mutex);
 	spin_lock_irq(&lo->lo_work_lock);
 	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
 				idle_list) {
@@ -1413,8 +1414,8 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
 	lo_number = lo->lo_number;
 	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
-out_unlock:
 	mutex_unlock(&lo->lo_mutex);
+
 	if (partscan) {
 		/*
 		 * open_mutex has been held already in release path, so don't
@@ -1435,7 +1436,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 		/* Device is gone, no point in returning error */
 		err = 0;
 	}
-
+out:
 	/*
 	 * lo->lo_state is set to Lo_unbound here after above partscan has
 	 * finished.

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v3] block: genhd: don't call probe function with major_names_lock held
  2021-08-16  7:33   ` [PATCH v3] " Christoph Hellwig
@ 2021-08-16 14:44     ` Tetsuo Handa
       [not found]     ` <20210817081045.3609-1-hdanton@sina.com>
  1 sibling, 0 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-16 14:44 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Greg KH, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block

On 2021/08/16 16:33, Christoph Hellwig wrote:
> This is the wrong way to approach it.  Instead of making things ever
> more complex try to make it simpler.  In this case, ensure that the
> destroy_workqueue is not held with pointless locks.  Given that the
> loop device already is known to not have a reference and marked as in
> the rundown state there shouldn't be anything that is required to
> be protected by lo_mutex.  So something like this untested patch
> should probably do the work:

I tested your untested patch, and I confirmed that your untested patch
does not fix the circular locking problem, for you are not seeing the
entire dependency chain.

> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index fa1c298a8cfb..c734dc768316 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1347,16 +1347,15 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
>  	 * became visible.
>  	 */
>  
> -	mutex_lock(&lo->lo_mutex);
>  	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
>  		err = -ENXIO;
> -		goto out_unlock;
> +		goto out;
>  	}

Would you please write your patch as verbose as mine? Since your patch
lacks why it is safe to make such change, I can't review your patch.
I wonder why "goto out;" (which sets lo->lo_state = Lo_unbound) is
the correct choice here.

>  
>  	filp = lo->lo_backing_file;
>  	if (filp == NULL) {
>  		err = -EINVAL;
> -		goto out_unlock;
> +		goto out;
>  	}

I also wonder when lo->lo_backing_file == NULL case is possible.

As far as I can see, "lo->lo_state = Lo_rundown;" is done only when
"lo->lo_state == Lo_bound". And "lo->lo_state = Lo_bound;" is done
only when "lo->lo_backing_file = file;" was done.

I guess this is another sanity check which should use WARN_ON_ONCE().

>  
>  	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
> @@ -1366,6 +1365,8 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
>  	blk_mq_freeze_queue(lo->lo_queue);

I also wonder why it is safe to call blk_mq_freeze_queue() without holding
lo->lo_mutex. loop_change_fd(), loop_set_status(), lo_release() etc. are
calling blk_mq_freeze_queue() with lo->lo_mutex held.

If we need to hold lo->lo_mutex when calling blk_mq_freeze_queue() from
__loop_clr_fd(), why it is safe to once release lo->lo_mutex before
destroy_workqueue() and reacquire lo->lo_mutex after destroy_workqueue() ?

Since there is too little comment regarding what lock protects what resource
and/or operation, nobody can review the correctness of locking in loop module.
The loop module is a labyrinth...

>  
>  	destroy_workqueue(lo->workqueue);
> +
> +	mutex_lock(&lo->lo_mutex);
>  	spin_lock_irq(&lo->lo_work_lock);
>  	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
>  				idle_list) {
> @@ -1413,8 +1414,8 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
>  	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
>  	lo_number = lo->lo_number;
>  	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
> -out_unlock:
>  	mutex_unlock(&lo->lo_mutex);
> +
>  	if (partscan) {
>  		/*
>  		 * open_mutex has been held already in release path, so don't
> @@ -1435,7 +1436,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
>  		/* Device is gone, no point in returning error */
>  		err = 0;
>  	}
> -
> +out:
>  	/*
>  	 * lo->lo_state is set to Lo_unbound here after above partscan has
>  	 * finished.
> 

Anyway, I modified your patch as below and tested on v5.14-rc6.

----------------------------------------
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f0cdff0c5fbf..daa47cea8a32 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1349,17 +1349,12 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	 * became visible.
 	 */
 
-	mutex_lock(&lo->lo_mutex);
-	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
-		err = -ENXIO;
-		goto out_unlock;
-	}
+	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown))
+		return -ENXIO;
 
 	filp = lo->lo_backing_file;
-	if (filp == NULL) {
-		err = -EINVAL;
-		goto out_unlock;
-	}
+	if (WARN_ON_ONCE(!filp))
+		return -EINVAL;
 
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
 		blk_queue_write_cache(lo->lo_queue, false, false);
@@ -1368,6 +1363,8 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
+
+	mutex_lock(&lo->lo_mutex);
 	spin_lock_irq(&lo->lo_work_lock);
 	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
 				idle_list) {
@@ -1415,8 +1412,8 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 
 	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
 	lo_number = lo->lo_number;
-out_unlock:
 	mutex_unlock(&lo->lo_mutex);
+
 	if (partscan) {
 		/*
 		 * open_mutex has been held already in release path, so don't
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 6ce4bc57f919..d02412055e6c 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -525,14 +525,10 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 	if (!blktrans_notifier.list.next)
 		register_mtd_user(&blktrans_notifier);
 
-
-	mutex_lock(&mtd_table_mutex);
-
 	ret = register_blkdev(tr->major, tr->name);
 	if (ret < 0) {
 		printk(KERN_WARNING "Unable to register %s block device on major %d: %d\n",
 		       tr->name, tr->major, ret);
-		mutex_unlock(&mtd_table_mutex);
 		return ret;
 	}
 
@@ -542,12 +538,12 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 	tr->blkshift = ffs(tr->blksize) - 1;
 
 	INIT_LIST_HEAD(&tr->devs);
-	list_add(&tr->list, &blktrans_majors);
 
+	mutex_lock(&mtd_table_mutex);
+	list_add(&tr->list, &blktrans_majors);
 	mtd_for_each_device(mtd)
 		if (mtd->type != MTD_ABSENT)
 			tr->add_mtd(tr, mtd);
-
 	mutex_unlock(&mtd_table_mutex);
 	return 0;
 }
----------------------------------------

And I immediately got the following lockdep warning (as I expected).

----------------------------------------
[  124.519262] loop0: detected capacity change from 0 to 4096
[  124.552028] EXT4-fs error (device loop0): ext4_fill_super:4956: inode #2: comm a.out: iget: root inode unallocated
[  124.556889] EXT4-fs (loop0): get root inode failed
[  124.559074] EXT4-fs (loop0): mount failed

[  124.917463] ======================================================
[  124.919741] WARNING: possible circular locking dependency detected
[  124.922233] 5.14.0-rc6+ #744 Not tainted
[  124.923799] ------------------------------------------------------
[  124.925996] systemd-udevd/6710 is trying to acquire lock:
[  124.928106] ffff88800dc82948 ((wq_completion)loop0){+.+.}-{0:0}, at: flush_workqueue+0x85/0x560
[  124.931100]
               but task is already holding lock:
[  124.933388] ffff88800dc87128 (&disk->open_mutex){+.+.}-{3:3}, at: blkdev_put+0x30/0x180
[  124.936216]
               which lock already depends on the new lock.

[  124.939807]
               the existing dependency chain (in reverse order) is:
[  124.942741]
               -> #6 (&disk->open_mutex){+.+.}-{3:3}:
[  124.945672]        lock_acquire+0xd0/0x300
[  124.947321]        __mutex_lock+0x97/0x950
[  124.948938]        del_gendisk+0x4e/0x1d0
[  124.950666]        loop_remove+0x10/0x40
[  124.952238]        loop_control_ioctl+0x193/0x1a0
[  124.954068]        __x64_sys_ioctl+0x6a/0xa0
[  124.955736]        do_syscall_64+0x35/0xb0
[  124.957326]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  124.959358]
               -> #5 (loop_ctl_mutex){+.+.}-{3:3}:
[  124.962171]        lock_acquire+0xd0/0x300
[  124.963810]        __mutex_lock+0x97/0x950
[  124.965426]        loop_add+0x44/0x2c0
[  124.966890]        blk_request_module+0x63/0xc0
[  124.968526]        blkdev_get_no_open+0x98/0xc0
[  124.970361]        blkdev_get_by_dev+0x54/0x330
[  124.972108]        blkdev_open+0x59/0xa0
[  124.973681]        do_dentry_open+0x14c/0x3a0
[  124.975321]        path_openat+0x78e/0xa50
[  124.977043]        do_filp_open+0xad/0x120
[  124.979067]        do_sys_openat2+0x241/0x310
[  124.980969]        do_sys_open+0x3f/0x80
[  124.982581]        do_syscall_64+0x35/0xb0
[  124.984201]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  124.986265]
               -> #4 (major_names_lock){+.+.}-{3:3}:
[  124.988750]        lock_acquire+0xd0/0x300
[  124.990405]        __mutex_lock+0x97/0x950
[  124.992091]        blkdev_show+0x18/0x90
[  124.993674]        devinfo_show+0x58/0x70
[  124.995404]        seq_read_iter+0x27b/0x3f0
[  124.997072]        proc_reg_read_iter+0x3c/0x60
[  124.998719]        new_sync_read+0x110/0x190
[  125.000428]        vfs_read+0x11d/0x1b0
[  125.001867]        ksys_read+0x63/0xe0
[  125.003429]        do_syscall_64+0x35/0xb0
[  125.005099]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  125.007527]
               -> #3 (&p->lock){+.+.}-{3:3}:
[  125.009796]        lock_acquire+0xd0/0x300
[  125.011675]        __mutex_lock+0x97/0x950
[  125.013469]        seq_read_iter+0x4c/0x3f0
[  125.015295]        generic_file_splice_read+0xf7/0x1a0
[  125.017381]        splice_direct_to_actor+0xc0/0x230
[  125.019268]        do_splice_direct+0x8c/0xd0
[  125.021085]        do_sendfile+0x319/0x5a0
[  125.022737]        __x64_sys_sendfile64+0xad/0xc0
[  125.024571]        do_syscall_64+0x35/0xb0
[  125.026347]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  125.028731]
               -> #2 (sb_writers#3){.+.+}-{0:0}:
[  125.031219]        lock_acquire+0xd0/0x300
[  125.032799]        lo_write_bvec+0xe1/0x260
[  125.034456]        loop_process_work+0x3e5/0xcf0
[  125.036629]        process_one_work+0x2aa/0x600
[  125.038524]        worker_thread+0x48/0x3d0
[  125.040279]        kthread+0x13e/0x170
[  125.041824]        ret_from_fork+0x1f/0x30
[  125.043639]
               -> #1 ((work_completion)(&lo->rootcg_work)){+.+.}-{0:0}:
[  125.047124]        lock_acquire+0xd0/0x300
[  125.048763]        process_one_work+0x27e/0x600
[  125.050544]        worker_thread+0x48/0x3d0
[  125.052206]        kthread+0x13e/0x170
[  125.053703]        ret_from_fork+0x1f/0x30
[  125.055324]
               -> #0 ((wq_completion)loop0){+.+.}-{0:0}:
[  125.057947]        check_prev_add+0x91/0xc00
[  125.059585]        __lock_acquire+0x14a8/0x1f40
[  125.061433]        lock_acquire+0xd0/0x300
[  125.063203]        flush_workqueue+0xa9/0x560
[  125.064891]        drain_workqueue+0x9b/0x100
[  125.066638]        destroy_workqueue+0x2f/0x210
[  125.068305]        __loop_clr_fd+0xa9/0x5b0
[  125.070086]        blkdev_put+0xaf/0x180
[  125.071606]        blkdev_close+0x20/0x30
[  125.073522]        __fput+0xa0/0x240
[  125.074924]        task_work_run+0x57/0xa0
[  125.076564]        exit_to_user_mode_prepare+0x252/0x260
[  125.078666]        syscall_exit_to_user_mode+0x19/0x60
[  125.080784]        do_syscall_64+0x42/0xb0
[  125.082360]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  125.084403]
               other info that might help us debug this:

[  125.087757] Chain exists of:
                 (wq_completion)loop0 --> loop_ctl_mutex --> &disk->open_mutex

[  125.092256]  Possible unsafe locking scenario:

[  125.094994]        CPU0                    CPU1
[  125.096963]        ----                    ----
[  125.098729]   lock(&disk->open_mutex);
[  125.100224]                                lock(loop_ctl_mutex);
[  125.102521]                                lock(&disk->open_mutex);
[  125.104862]   lock((wq_completion)loop0);
[  125.106511]
                *** DEADLOCK ***

[  125.109224] 1 lock held by systemd-udevd/6710:
[  125.111072]  #0: ffff88800dc87128 (&disk->open_mutex){+.+.}-{3:3}, at: blkdev_put+0x30/0x180
[  125.114270]
               stack backtrace:
[  125.116193] CPU: 3 PID: 6710 Comm: systemd-udevd Not tainted 5.14.0-rc6+ #744
[  125.118706] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[  125.122536] Call Trace:
[  125.123604]  dump_stack_lvl+0x57/0x7d
[  125.125064]  check_noncircular+0x114/0x130
[  125.126621]  check_prev_add+0x91/0xc00
[  125.128264]  ? __lock_acquire+0x5c1/0x1f40
[  125.129953]  __lock_acquire+0x14a8/0x1f40
[  125.131544]  lock_acquire+0xd0/0x300
[  125.132936]  ? flush_workqueue+0x85/0x560
[  125.134500]  ? lockdep_init_map_type+0x51/0x220
[  125.136299]  ? lockdep_init_map_type+0x51/0x220
[  125.138086]  flush_workqueue+0xa9/0x560
[  125.139761]  ? flush_workqueue+0x85/0x560
[  125.141318]  ? drain_workqueue+0x9b/0x100
[  125.142822]  drain_workqueue+0x9b/0x100
[  125.144486]  destroy_workqueue+0x2f/0x210
[  125.146190]  __loop_clr_fd+0xa9/0x5b0
[  125.147655]  ? __mutex_unlock_slowpath+0x40/0x2a0
[  125.149516]  blkdev_put+0xaf/0x180
[  125.151042]  blkdev_close+0x20/0x30
[  125.152548]  __fput+0xa0/0x240
[  125.153865]  task_work_run+0x57/0xa0
[  125.155394]  exit_to_user_mode_prepare+0x252/0x260
[  125.157248]  syscall_exit_to_user_mode+0x19/0x60
[  125.158997]  do_syscall_64+0x42/0xb0
[  125.160510]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  125.162554] RIP: 0033:0x7eff14494987
[  125.164075] Code: ff ff e8 9c 11 02 00 66 2e 0f 1f 84 00 00 00 00 00 66 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 c3 5d f8 ff
[  125.170670] RSP: 002b:00007ffde2ab8e58 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
[  125.173563] RAX: 0000000000000000 RBX: 00007eff13f1c788 RCX: 00007eff14494987
[  125.176138] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000006
[  125.178858] RBP: 0000000000000006 R08: 0000562c4155aaf0 R09: 0000000000000000
[  125.181641] R10: 00007eff13f1c788 R11: 0000000000000246 R12: 0000000000000000
[  125.184197] R13: 0000000000000000 R14: 0000000000000000 R15: 0000562c41542635
[  125.187428] loop0: detected capacity change from 0 to 4096
----------------------------------------

I was expecting this result from the beginning.

Before your patch, lockdep thinks that there are

  loop_ctl_mutex => &lo->lo_mutex => (wq_completion)loop$M =>
  (work_completion)&lo->rootcg_work => sb_writers#$N => &p->lock =>
  major_names_lock => loop_ctl_mutex

dependency chain and

  &disk->open_mutex => &lo->lo_mutex => (wq_completion)loop$M =>
  (work_completion)&lo->rootcg_work => sb_writers#$N => &p->lock =>
  major_names_lock => loop_ctl_mutex

dependency chain (as explained in my patch). Since your patch attempts to
take (wq_completion)loop$M without &lo->lo_mutex, the dependency chain
after your patch will become

  &disk->open_mutex => (wq_completion)loop$M =>
  (work_completion)&lo->rootcg_work => sb_writers#$N => &p->lock =>
  major_names_lock => loop_ctl_mutex

but after all

  loop_ctl_mutex => &disk->open_mutex

dependency is appended when loop_control_remove() from
loop_control_ioctl(LOOP_CTL_REMOVE) is called...

Not only it is unclear what lo->lo_mutex lock is protecting, but also it is
unclear and erroneous what loop_ctl_mutex lock is protecting. Nobody noticed
lack of loop_ctl_mutex protection in the following patch.

On 2021/08/15 15:52, Tetsuo Handa wrote:
> There would be two approaches for breaking this circular dependency.
> One is to kill loop_ctl_mutex => &lo->lo_mutex chain. The other is to
> kill major_names_lock => loop_ctl_mutex chain. This patch implements
> the latter, due to the following reasons.

F.Y.I. Below is a patch for implementing the former.
This is less suitable for backport, for this is more difficult to review.

 From 1b95e875071961b22420f69ac6c5a8ae9aa11638 Mon Sep 17 00:00:00 2001From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sat, 14 Aug 2021 14:21:36 +0900
Subject: [PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex

syzbot is reporting circular locking problem at __loop_clr_fd() [1], for
commit a160c6159d4a0cf8 ("block: add an optional probe callback to
major_names") is calling the module's probe function with major_names_lock
held.

Fortunately, since commit 990e78116d38059c ("block: loop: fix deadlock
between open and remove") stopped holding loop_ctl_mutex in lo_open(),
current role of loop_ctl_mutex is to serialize access to loop_index_idr
and loop_add()/loop_remove(); in other words, management of id for IDR.

Since IDR allows idr_replace(), we can reserve id for not-yet-initialized
and to-be-released loop devices, by assigning a dummy pointer to that
reserved id. By using idr_replace(), we can replace loop_ctl_mutex with
loop_idr_spinlock for protecting access to loop_index_idr.

By introducing loop_idr_spinlock, remaining role of loop_ctl_mutex can be
reduced to serialization of loop_control_remove(). Thus, let's rename
loop_ctl_mutex to loop_removal_mutex. Note that removing serialization
between loop_add() and loop_remove() has a side effect

  Thread A:             Thread B:

  loop_remove(1) {
    Lock loop_removal_mutex.
    lo = idr_find(1);
    idr_replace(dummy, 1);
    Unlock loop_removal_mutex.
                        loop_remove(1) {
                          Lock loop_removal_mutex.
                          lo = idr_find(1);
                          Unlock loop_removal_mutex.
                          Fails with -ENODEV due to lo == dummy.
                        }
                        loop_add(1) {
                          Fails with -EEXIST due to idr_alloc(1) failure.
                        }
    loop_remove(lo) {
      idr_remove(1);
    }
    return 0;
  }

which we could argue such sequence as a caller's bug. /dev/loop-control
users are expected to serialize ioctl() calls when passing non-negative
same id value. As long as ioctl() calls with non-negative same id value
are serialized, this approach can minimize serialization by
loop_removal_mutex.

In loop_control_remove(), we might expect that we can get rid of
loop_removal_mutex if we temporarily hide this lo (using idr_replace())
before holding lo_mutex and show this lo again (using idr_replace()) if
loop_remove() cannot be called. But we can't get rid of loop_removal_mutex
in order to close a use-after-free race window explained below.

I found that loop_unregister_transfer() which is called from
cleanup_cryptoloop() lacks serialization between kfree() from
loop_remove() from loop_control_remove() and mutex_lock() from
unregister_transfer_cb().

Both loop_control_remove() and loop_unregister_transfer() hold lo_mutex
of lo found from loop_index_idr, but loop_unregister_transfer() must
synchronously check all lo found from loop_index_idr (and call
loop_release_xfer() as needed) due to being called by module unloading
operation. Temporarily hiding lo on loop_control_remove() side can result
in failing to call loop_release_xfer() from unregister_transfer_cb() from
loop_unregister_transfer().

Given that cryptoloop is not safe for journaled file systems, I wonder
how many cryptoloop users are there. We could consider getting rid of
loop_unregister_transfer() by removing cleanup_cryptoloop() (i.e. make
cryptoloop no longer unloadable) which is the only in-tree caller of
loop_unregister_transfer() function.

For now, we need to hold loop_removal_mutex in loop_unregister_transfer().
In contrast, holding loop_removal_mutex in loop_exit() is pointless, for
all users must close /dev/loop-control and /dev/loop$num (in order to drop
module's refcount to 0) before loop_exit() starts, and nobody can open
/dev/loop-control or /dev/loop$num afterwards.

Link: https://syzkaller.appspot.com/bug?id=7bb10e8b62f83e4d445cdf4c13d69e407e629558 [1]
Reported-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
---

 drivers/block/loop.c | 130 +++++++++++++++++++++++++++++--------------
 1 file changed, 89 insertions(+), 41 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f0cdff0c5fbf..424d90f978d5 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -86,8 +86,15 @@
 
 #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
 
+/*
+ * Used for avoiding -EEXIST error at bdi_register() which happens when id is reused
+ * before bdi_unregister() completes, by preserving specific id for loop_index_idr.
+ */
+#define HIDDEN_LOOP_DEVICE ((struct loop_device *) -1)
+
 static DEFINE_IDR(loop_index_idr);
-static DEFINE_MUTEX(loop_ctl_mutex);
+static DEFINE_SPINLOCK(loop_idr_spinlock);
+static DEFINE_MUTEX(loop_removal_mutex);
 static DEFINE_MUTEX(loop_validate_mutex);
 
 /**
@@ -2113,28 +2120,37 @@ int loop_register_transfer(struct loop_func_table *funcs)
 	return 0;
 }
 
-static int unregister_transfer_cb(int id, void *ptr, void *data)
-{
-	struct loop_device *lo = ptr;
-	struct loop_func_table *xfer = data;
-
-	mutex_lock(&lo->lo_mutex);
-	if (lo->lo_encryption == xfer)
-		loop_release_xfer(lo);
-	mutex_unlock(&lo->lo_mutex);
-	return 0;
-}
-
 int loop_unregister_transfer(int number)
 {
 	unsigned int n = number;
 	struct loop_func_table *xfer;
+	struct loop_device *lo;
+	int id;
 
 	if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
 		return -EINVAL;
 
 	xfer_funcs[n] = NULL;
-	idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
+
+	/*
+	 * Use loop_removal_mutex in order to make sure that
+	 * loop_control_remove() won't call loop_remove().
+	 */
+	mutex_lock(&loop_removal_mutex);
+	spin_lock(&loop_idr_spinlock);
+	idr_for_each_entry(&loop_index_idr, lo, id) {
+		if (lo == HIDDEN_LOOP_DEVICE)
+			continue;
+		spin_unlock(&loop_idr_spinlock);
+		mutex_lock(&lo->lo_mutex);
+		if (lo->lo_encryption == xfer)
+			loop_release_xfer(lo);
+		mutex_unlock(&lo->lo_mutex);
+		spin_lock(&loop_idr_spinlock);
+	}
+	spin_unlock(&loop_idr_spinlock);
+	mutex_unlock(&loop_removal_mutex);
+
 	return 0;
 }
 
@@ -2313,20 +2329,21 @@ static int loop_add(int i)
 		goto out;
 	lo->lo_state = Lo_unbound;
 
-	err = mutex_lock_killable(&loop_ctl_mutex);
-	if (err)
-		goto out_free_dev;
-
 	/* allocate id, if @id >= 0, we're requesting that specific id */
+	idr_preload(GFP_KERNEL);
+	/* Reserve id for this loop device, but keep this loop device hidden. */
+	spin_lock(&loop_idr_spinlock);
 	if (i >= 0) {
-		err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
+		err = idr_alloc(&loop_index_idr, HIDDEN_LOOP_DEVICE, i, i + 1, GFP_ATOMIC);
 		if (err == -ENOSPC)
 			err = -EEXIST;
 	} else {
-		err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
+		err = idr_alloc(&loop_index_idr, HIDDEN_LOOP_DEVICE, 0, 0, GFP_ATOMIC);
 	}
+	spin_unlock(&loop_idr_spinlock);
+	idr_preload_end();
 	if (err < 0)
-		goto out_unlock;
+		goto out_free_dev;
 	i = err;
 
 	err = -ENOMEM;
@@ -2392,16 +2409,21 @@ static int loop_add(int i)
 	disk->private_data	= lo;
 	disk->queue		= lo->lo_queue;
 	sprintf(disk->disk_name, "loop%d", i);
+	/* Make this loop device reachable from pathname. */
 	add_disk(disk);
-	mutex_unlock(&loop_ctl_mutex);
+	/* Show this loop device. */
+	spin_lock(&loop_idr_spinlock);
+	idr_replace(&loop_index_idr, lo, i);
+	spin_unlock(&loop_idr_spinlock);
 	return i;
 
 out_cleanup_tags:
 	blk_mq_free_tag_set(&lo->tag_set);
 out_free_idr:
+	/* Release id reserved for lo->lo_number. */
+	spin_lock(&loop_idr_spinlock);
 	idr_remove(&loop_index_idr, i);
-out_unlock:
-	mutex_unlock(&loop_ctl_mutex);
+	spin_unlock(&loop_idr_spinlock);
 out_free_dev:
 	kfree(lo);
 out:
@@ -2410,9 +2432,15 @@ static int loop_add(int i)
 
 static void loop_remove(struct loop_device *lo)
 {
+	/* Make this loop device unreachable from pathname. */
 	del_gendisk(lo->lo_disk);
 	blk_cleanup_disk(lo->lo_disk);
 	blk_mq_free_tag_set(&lo->tag_set);
+	/* Release id used by lo->lo_number. */
+	spin_lock(&loop_idr_spinlock);
+	idr_remove(&loop_index_idr, lo->lo_number);
+	spin_unlock(&loop_idr_spinlock);
+	/* There is no route which can find this loop device. */
 	mutex_destroy(&lo->lo_mutex);
 	kfree(lo);
 }
@@ -2435,52 +2463,68 @@ static int loop_control_remove(int idx)
 		pr_warn("deleting an unspecified loop device is not supported.\n");
 		return -EINVAL;
 	}
-		
-	ret = mutex_lock_killable(&loop_ctl_mutex);
+
+	/* Serialize concurrent loop_control_remove() and loop_unregister_transfer(). */
+	ret = mutex_lock_killable(&loop_removal_mutex);
 	if (ret)
 		return ret;
 
+	/*
+	 * Identify the loop device to remove. Skip the device if it is owned by
+	 * loop_remove()/loop_add() where it is not safe to access lo_mutex.
+	 */
+	spin_lock(&loop_idr_spinlock);
 	lo = idr_find(&loop_index_idr, idx);
-	if (!lo) {
+	if (!lo || lo == HIDDEN_LOOP_DEVICE) {
+		spin_unlock(&loop_idr_spinlock);
 		ret = -ENODEV;
-		goto out_unlock_ctrl;
+		goto out_unlock;
 	}
+	spin_unlock(&loop_idr_spinlock);
 
 	ret = mutex_lock_killable(&lo->lo_mutex);
 	if (ret)
-		goto out_unlock_ctrl;
+		goto out_unlock;
 	if (lo->lo_state != Lo_unbound ||
 	    atomic_read(&lo->lo_refcnt) > 0) {
 		mutex_unlock(&lo->lo_mutex);
 		ret = -EBUSY;
-		goto out_unlock_ctrl;
+		goto out_unlock;
 	}
+	/* Mark this loop device no longer open()-able. */
 	lo->lo_state = Lo_deleting;
 	mutex_unlock(&lo->lo_mutex);
 
-	idr_remove(&loop_index_idr, lo->lo_number);
+	/* Hide this loop device, but keep lo->lo_number still held. */
+	spin_lock(&loop_idr_spinlock);
+	idr_replace(&loop_index_idr, HIDDEN_LOOP_DEVICE, lo->lo_number);
+	spin_unlock(&loop_idr_spinlock);
+	/* Allow loop_control_remove() and loop_unregister_transfer() to resume. */
+	mutex_unlock(&loop_removal_mutex);
+	/* Remove this loop device. */
 	loop_remove(lo);
-out_unlock_ctrl:
-	mutex_unlock(&loop_ctl_mutex);
+	return 0;
+out_unlock:
+	mutex_unlock(&loop_removal_mutex);
 	return ret;
 }
 
 static int loop_control_get_free(int idx)
 {
 	struct loop_device *lo;
-	int id, ret;
+	int id;
 
-	ret = mutex_lock_killable(&loop_ctl_mutex);
-	if (ret)
-		return ret;
+	spin_lock(&loop_idr_spinlock);
 	idr_for_each_entry(&loop_index_idr, lo, id) {
+		if (lo == HIDDEN_LOOP_DEVICE)
+			continue;
 		if (lo->lo_state == Lo_unbound)
 			goto found;
 	}
-	mutex_unlock(&loop_ctl_mutex);
+	spin_unlock(&loop_idr_spinlock);
 	return loop_add(-1);
 found:
-	mutex_unlock(&loop_ctl_mutex);
+	spin_unlock(&loop_idr_spinlock);
 	return id;
 }
 
@@ -2590,10 +2634,14 @@ static void __exit loop_exit(void)
 	unregister_blkdev(LOOP_MAJOR, "loop");
 	misc_deregister(&loop_misc);
 
-	mutex_lock(&loop_ctl_mutex);
+	/*
+	 * There can't be hidden loop device on loop_index_idr, for loop_add()/loop_remove()
+	 * can't be in progress when this module is unloading. Also, there is no need to use
+	 * loop_idr_spinlock here, for nobody else can access loop_index_idr when this module
+	 * is unloading.
+	 */
 	idr_for_each_entry(&loop_index_idr, lo, id)
 		loop_remove(lo);
-	mutex_unlock(&loop_ctl_mutex);
 
 	idr_destroy(&loop_index_idr);
 }
-- 
2.18.4


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v3] block: genhd: don't call probe function with major_names_lock held
       [not found]     ` <20210817081045.3609-1-hdanton@sina.com>
@ 2021-08-17 10:18       ` Tetsuo Handa
  0 siblings, 0 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-17 10:18 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Christoph Hellwig, Greg KH, Desmond Cheong Zhi Xi, linux-kernel,
	linux-block

On 2021/08/17 17:10, Hillf Danton wrote:
> See if it is safe to kfree(lo) after removing it from idr, with the
> deadlock dissolved.

It is not safe to call loop_remove() after idr_remove(). Please see HIDDEN_LOOP_DEVICE magic
in "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex".

> 
> --- x/drivers/block/loop.c
> +++ y/drivers/block/loop.c
> @@ -2459,7 +2459,9 @@ static int loop_control_remove(int idx)
>  	mutex_unlock(&lo->lo_mutex);
>  
>  	idr_remove(&loop_index_idr, lo->lo_number);
> +	mutex_unlock(&loop_ctl_mutex);
>  	loop_remove(lo);
> +	return 0;
>  out_unlock_ctrl:
>  	mutex_unlock(&loop_ctl_mutex);
>  	return ret;
> --
> 

"[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex" can be a further
improvement after "[PATCH v3] block: genhd: don't call probe function with major_names_lock held".

I really would like to apply "[PATCH v3] block: genhd: don't call probe function with major_names_lock held" first.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-15  9:19       ` Greg KH
@ 2021-08-18 11:07         ` Tetsuo Handa
  2021-08-18 13:27           ` Greg KH
  2021-08-18 13:47           ` [PATCH v4] " Christoph Hellwig
  0 siblings, 2 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-18 11:07 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig
  Cc: Linus Torvalds, Hannes Reinecke, Chaitanya Kulkarni,
	Hillf Danton, Desmond Cheong Zhi Xi, linux-block, Greg KH,
	Tyler Hicks, Pavel Tatashin

syzbot is reporting circular locking problem at __loop_clr_fd() [1], for
commit a160c6159d4a0cf8 ("block: add an optional probe callback to
major_names") is calling the module's probe function with major_names_lock
held.

When copying content of /proc/devices to another file via sendfile(),

  sb_writers#$N => &p->lock => major_names_lock

dependency is recorded.

When loop_process_work() from WQ context performs a write request,

  (wq_completion)loop$M => (work_completion)&lo->rootcg_work =>
  sb_writers#$N

dependency is recorded.

When flush_workqueue() from drain_workqueue() from destroy_workqueue()
 from __loop_clr_fd() from blkdev_put() from blkdev_close() from __fput()
is called,

  &disk->open_mutex => &lo->lo_mutex => (wq_completion)loop$M

dependency is recorded.

When loop_control_remove() from loop_control_ioctl(LOOP_CTL_REMOVE) is
called,

  loop_ctl_mutex => &lo->lo_mutex

dependency is recorded.

As a result, lockdep thinks that there is

  loop_ctl_mutex => &lo->lo_mutex => (wq_completion)loop$M =>
  (work_completion)&lo->rootcg_work => sb_writers#$N => &p->lock =>
  major_names_lock

dependency chain.

Then, if loop_add() from loop_probe() from blk_request_module() from
blkdev_get_no_open() from blkdev_get_by_dev() from blkdev_open() from
do_dentry_open() from path_openat() from do_filp_open() is called,

  major_names_lock => loop_ctl_mutex

dependency is appended to the dependency chain.

There would be two approaches for breaking this circular dependency.
One is to kill loop_ctl_mutex => &lo->lo_mutex chain. The other is to
kill major_names_lock => loop_ctl_mutex chain. This patch implements
the latter, due to the following reasons.

 (1) sb_writers#$N => &p->lock => major_names_lock chain is unavoidable

 (2) this patch can also fix similar problem in other modules [2] which
     is caused by calling the probe function with major_names_lock held

 (3) I believe that this patch is principally safer than e.g.
     commit bd5c39edad535d9f ("loop: reduce loop_ctl_mutex coverage in
     loop_exit") which waits until the probe function finishes using
     global mutex in order to fix deadlock reproducible by sleep
     injection [3]

This patch adds THIS_MODULE parameter to __register_blkdev() as with
usb_register(), and drops major_names_lock before calling probe function
by holding a reference to that module which contains that probe function.

Since cdev uses register_chrdev() and __register_chrdev(), bdev should be
able to preserve register_blkdev() and __register_blkdev() naming scheme.
Thus, use register_blkdev_with_probe() as an inline function for
automagically appending THIS_MODULE parameter.

Link: https://syzkaller.appspot.com/bug?id=7bb10e8b62f83e4d445cdf4c13d69e407e629558 [1]
Link: https://syzkaller.appspot.com/bug?id=7bd106c28e846d1023d4ca915718b1a0905444cb [2]
Link: https://lkml.kernel.org/r/c4edf07f-92e1-a350-2743-f0b0234a2b6c@i-love.sakura.ne.jp [3]
Reported-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
Reported-by: syzbot <syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
Fixes: a160c6159d4a0cf8 ("block: add an optional probe callback to major_names")
---
 block/genhd.c         | 33 ++++++++++++++++++++++++++-------
 include/linux/genhd.h | 11 +++++++++--
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 298ee78c1bda..a3e2e5666457 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -162,6 +162,7 @@ static struct blk_major_name {
 	int major;
 	char name[16];
 	void (*probe)(dev_t devt);
+	struct module *owner;
 } *major_names[BLKDEV_MAJOR_HASH_SIZE];
 static DEFINE_MUTEX(major_names_lock);
 
@@ -190,7 +191,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
  * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
  *         @major = 0, try to allocate any unused major number.
  * @name: the name of the new block device as a zero terminated string
- * @probe: allback that is called on access to any minor number of @major
+ * @probe: callback that is called on access to any minor number of @major
+ * @owner: THIS_MODULE if @probe is not NULL, ignored if @probe is NULL.
  *
  * The @name must be unique within the system.
  *
@@ -207,8 +209,9 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
  *
  * Use register_blkdev instead for any new code.
  */
+#undef __register_blkdev
 int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt))
+		      void (*probe)(dev_t devt), struct module *owner)
 {
 	struct blk_major_name **n, *p;
 	int index, ret = 0;
@@ -248,6 +251,7 @@ int __register_blkdev(unsigned int major, const char *name,
 
 	p->major = major;
 	p->probe = probe;
+	p->owner = owner;
 	strlcpy(p->name, name, sizeof(p->name));
 	p->next = NULL;
 	index = major_to_index(major);
@@ -653,14 +657,29 @@ void blk_request_module(dev_t devt)
 {
 	unsigned int major = MAJOR(devt);
 	struct blk_major_name **n;
+	void (*probe_fn)(dev_t devt);
 
 	mutex_lock(&major_names_lock);
 	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
-		if ((*n)->major == major && (*n)->probe) {
-			(*n)->probe(devt);
-			mutex_unlock(&major_names_lock);
-			return;
-		}
+		if ((*n)->major != major || !(*n)->probe)
+			continue;
+		if (!try_module_get((*n)->owner))
+			break;
+		/*
+		 * Calling probe function with major_names_lock held causes
+		 * circular locking dependency problem. Thus, call it after
+		 * releasing major_names_lock.
+		 */
+		probe_fn = (*n)->probe;
+		mutex_unlock(&major_names_lock);
+		/*
+		 * Assuming that unregister_blkdev() is called from module's
+		 * __exit function, a module refcount taken above allows us
+		 * to safely call probe function without major_names_lock held.
+		 */
+		probe_fn(devt);
+		module_put((*n)->owner);
+		return;
 	}
 	mutex_unlock(&major_names_lock);
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 13b34177cc85..2ed856616d47 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -303,9 +303,16 @@ struct gendisk *__blk_alloc_disk(int node);
 void blk_cleanup_disk(struct gendisk *disk);
 
 int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt));
+		      void (*probe)(dev_t devt), struct module *owner);
+static inline int register_blkdev_with_probe(unsigned int major, const char *name,
+					     void (*probe)(dev_t devt), struct module *owner)
+{
+	return __register_blkdev(major, name, probe, owner);
+}
+#define __register_blkdev(major, name, probe) \
+	register_blkdev_with_probe(major, name, probe, THIS_MODULE)
 #define register_blkdev(major, name) \
-	__register_blkdev(major, name, NULL)
+	register_blkdev_with_probe(major, name, NULL, NULL)
 void unregister_blkdev(unsigned int major, const char *name);
 
 bool bdev_check_media_change(struct block_device *bdev);
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 11:07         ` [PATCH v4] " Tetsuo Handa
@ 2021-08-18 13:27           ` Greg KH
  2021-08-18 14:44             ` Tetsuo Handa
  2021-08-18 13:47           ` [PATCH v4] " Christoph Hellwig
  1 sibling, 1 reply; 29+ messages in thread
From: Greg KH @ 2021-08-18 13:27 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Jens Axboe, Christoph Hellwig, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Tyler Hicks, Pavel Tatashin

On Wed, Aug 18, 2021 at 08:07:32PM +0900, Tetsuo Handa wrote:
> syzbot is reporting circular locking problem at __loop_clr_fd() [1], for
> commit a160c6159d4a0cf8 ("block: add an optional probe callback to
> major_names") is calling the module's probe function with major_names_lock
> held.
> 
> When copying content of /proc/devices to another file via sendfile(),
> 
>   sb_writers#$N => &p->lock => major_names_lock
> 
> dependency is recorded.
> 
> When loop_process_work() from WQ context performs a write request,
> 
>   (wq_completion)loop$M => (work_completion)&lo->rootcg_work =>
>   sb_writers#$N
> 
> dependency is recorded.
> 
> When flush_workqueue() from drain_workqueue() from destroy_workqueue()
>  from __loop_clr_fd() from blkdev_put() from blkdev_close() from __fput()
> is called,
> 
>   &disk->open_mutex => &lo->lo_mutex => (wq_completion)loop$M
> 
> dependency is recorded.
> 
> When loop_control_remove() from loop_control_ioctl(LOOP_CTL_REMOVE) is
> called,
> 
>   loop_ctl_mutex => &lo->lo_mutex
> 
> dependency is recorded.
> 
> As a result, lockdep thinks that there is
> 
>   loop_ctl_mutex => &lo->lo_mutex => (wq_completion)loop$M =>
>   (work_completion)&lo->rootcg_work => sb_writers#$N => &p->lock =>
>   major_names_lock
> 
> dependency chain.
> 
> Then, if loop_add() from loop_probe() from blk_request_module() from
> blkdev_get_no_open() from blkdev_get_by_dev() from blkdev_open() from
> do_dentry_open() from path_openat() from do_filp_open() is called,
> 
>   major_names_lock => loop_ctl_mutex
> 
> dependency is appended to the dependency chain.
> 
> There would be two approaches for breaking this circular dependency.
> One is to kill loop_ctl_mutex => &lo->lo_mutex chain. The other is to
> kill major_names_lock => loop_ctl_mutex chain. This patch implements
> the latter, due to the following reasons.
> 
>  (1) sb_writers#$N => &p->lock => major_names_lock chain is unavoidable
> 
>  (2) this patch can also fix similar problem in other modules [2] which
>      is caused by calling the probe function with major_names_lock held
> 
>  (3) I believe that this patch is principally safer than e.g.
>      commit bd5c39edad535d9f ("loop: reduce loop_ctl_mutex coverage in
>      loop_exit") which waits until the probe function finishes using
>      global mutex in order to fix deadlock reproducible by sleep
>      injection [3]
> 
> This patch adds THIS_MODULE parameter to __register_blkdev() as with
> usb_register(), and drops major_names_lock before calling probe function
> by holding a reference to that module which contains that probe function.
> 
> Since cdev uses register_chrdev() and __register_chrdev(), bdev should be
> able to preserve register_blkdev() and __register_blkdev() naming scheme.

Note, the cdev api is anything but good, so should not be used as an
excuse for anything.  Don't copy it unless you have a very good reason.

Also, what changed in this version?  I see no patch history here :(

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 11:07         ` [PATCH v4] " Tetsuo Handa
  2021-08-18 13:27           ` Greg KH
@ 2021-08-18 13:47           ` Christoph Hellwig
  2021-08-18 14:34             ` Tetsuo Handa
  1 sibling, 1 reply; 29+ messages in thread
From: Christoph Hellwig @ 2021-08-18 13:47 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Jens Axboe, Christoph Hellwig, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Greg KH, Tyler Hicks, Pavel Tatashin

Hi Tetsuo,

I might sound like a broken record, but we need to reduce the locking
complexity, not make it worse and fall down the locking cliff.  I did
send a suggested series this morning, in which you found a bunch of
bugs.  I'd appreciate it if you could use your superior skills to
actually help explain the issue and arrive at a common solution that
actually simplifies things instead of steaming down the locking cliff
full speed.  Thank you very much.

On Wed, Aug 18, 2021 at 08:07:32PM +0900, Tetsuo Handa wrote:
> syzbot is reporting circular locking problem at __loop_clr_fd() [1], for
> commit a160c6159d4a0cf8 ("block: add an optional probe callback to
> major_names") is calling the module's probe function with major_names_lock
> held.
> 
> When copying content of /proc/devices to another file via sendfile(),
> 
>   sb_writers#$N => &p->lock => major_names_lock
> 
> dependency is recorded.
> 
> When loop_process_work() from WQ context performs a write request,
> 
>   (wq_completion)loop$M => (work_completion)&lo->rootcg_work =>
>   sb_writers#$N
> 
> dependency is recorded.
> 
> When flush_workqueue() from drain_workqueue() from destroy_workqueue()
>  from __loop_clr_fd() from blkdev_put() from blkdev_close() from __fput()
> is called,
> 
>   &disk->open_mutex => &lo->lo_mutex => (wq_completion)loop$M
> 
> dependency is recorded.
> 
> When loop_control_remove() from loop_control_ioctl(LOOP_CTL_REMOVE) is
> called,
> 
>   loop_ctl_mutex => &lo->lo_mutex
> 
> dependency is recorded.
> 
> As a result, lockdep thinks that there is
> 
>   loop_ctl_mutex => &lo->lo_mutex => (wq_completion)loop$M =>
>   (work_completion)&lo->rootcg_work => sb_writers#$N => &p->lock =>
>   major_names_lock
> 
> dependency chain.
> 
> Then, if loop_add() from loop_probe() from blk_request_module() from
> blkdev_get_no_open() from blkdev_get_by_dev() from blkdev_open() from
> do_dentry_open() from path_openat() from do_filp_open() is called,
> 
>   major_names_lock => loop_ctl_mutex
> 
> dependency is appended to the dependency chain.
> 
> There would be two approaches for breaking this circular dependency.
> One is to kill loop_ctl_mutex => &lo->lo_mutex chain. The other is to
> kill major_names_lock => loop_ctl_mutex chain. This patch implements
> the latter, due to the following reasons.
> 
>  (1) sb_writers#$N => &p->lock => major_names_lock chain is unavoidable
> 
>  (2) this patch can also fix similar problem in other modules [2] which
>      is caused by calling the probe function with major_names_lock held
> 
>  (3) I believe that this patch is principally safer than e.g.
>      commit bd5c39edad535d9f ("loop: reduce loop_ctl_mutex coverage in
>      loop_exit") which waits until the probe function finishes using
>      global mutex in order to fix deadlock reproducible by sleep
>      injection [3]
> 
> This patch adds THIS_MODULE parameter to __register_blkdev() as with
> usb_register(), and drops major_names_lock before calling probe function
> by holding a reference to that module which contains that probe function.
> 
> Since cdev uses register_chrdev() and __register_chrdev(), bdev should be
> able to preserve register_blkdev() and __register_blkdev() naming scheme.
> Thus, use register_blkdev_with_probe() as an inline function for
> automagically appending THIS_MODULE parameter.
> 
> Link: https://syzkaller.appspot.com/bug?id=7bb10e8b62f83e4d445cdf4c13d69e407e629558 [1]
> Link: https://syzkaller.appspot.com/bug?id=7bd106c28e846d1023d4ca915718b1a0905444cb [2]
> Link: https://lkml.kernel.org/r/c4edf07f-92e1-a350-2743-f0b0234a2b6c@i-love.sakura.ne.jp [3]
> Reported-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
> Reported-by: syzbot <syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Tested-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
> Fixes: a160c6159d4a0cf8 ("block: add an optional probe callback to major_names")
> ---
>  block/genhd.c         | 33 ++++++++++++++++++++++++++-------
>  include/linux/genhd.h | 11 +++++++++--
>  2 files changed, 35 insertions(+), 9 deletions(-)
> 
> diff --git a/block/genhd.c b/block/genhd.c
> index 298ee78c1bda..a3e2e5666457 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -162,6 +162,7 @@ static struct blk_major_name {
>  	int major;
>  	char name[16];
>  	void (*probe)(dev_t devt);
> +	struct module *owner;
>  } *major_names[BLKDEV_MAJOR_HASH_SIZE];
>  static DEFINE_MUTEX(major_names_lock);
>  
> @@ -190,7 +191,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
>   * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
>   *         @major = 0, try to allocate any unused major number.
>   * @name: the name of the new block device as a zero terminated string
> - * @probe: allback that is called on access to any minor number of @major
> + * @probe: callback that is called on access to any minor number of @major
> + * @owner: THIS_MODULE if @probe is not NULL, ignored if @probe is NULL.
>   *
>   * The @name must be unique within the system.
>   *
> @@ -207,8 +209,9 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
>   *
>   * Use register_blkdev instead for any new code.
>   */
> +#undef __register_blkdev
>  int __register_blkdev(unsigned int major, const char *name,
> -		void (*probe)(dev_t devt))
> +		      void (*probe)(dev_t devt), struct module *owner)
>  {
>  	struct blk_major_name **n, *p;
>  	int index, ret = 0;
> @@ -248,6 +251,7 @@ int __register_blkdev(unsigned int major, const char *name,
>  
>  	p->major = major;
>  	p->probe = probe;
> +	p->owner = owner;
>  	strlcpy(p->name, name, sizeof(p->name));
>  	p->next = NULL;
>  	index = major_to_index(major);
> @@ -653,14 +657,29 @@ void blk_request_module(dev_t devt)
>  {
>  	unsigned int major = MAJOR(devt);
>  	struct blk_major_name **n;
> +	void (*probe_fn)(dev_t devt);
>  
>  	mutex_lock(&major_names_lock);
>  	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
> -		if ((*n)->major == major && (*n)->probe) {
> -			(*n)->probe(devt);
> -			mutex_unlock(&major_names_lock);
> -			return;
> -		}
> +		if ((*n)->major != major || !(*n)->probe)
> +			continue;
> +		if (!try_module_get((*n)->owner))
> +			break;
> +		/*
> +		 * Calling probe function with major_names_lock held causes
> +		 * circular locking dependency problem. Thus, call it after
> +		 * releasing major_names_lock.
> +		 */
> +		probe_fn = (*n)->probe;
> +		mutex_unlock(&major_names_lock);
> +		/*
> +		 * Assuming that unregister_blkdev() is called from module's
> +		 * __exit function, a module refcount taken above allows us
> +		 * to safely call probe function without major_names_lock held.
> +		 */
> +		probe_fn(devt);
> +		module_put((*n)->owner);
> +		return;
>  	}
>  	mutex_unlock(&major_names_lock);
>  
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 13b34177cc85..2ed856616d47 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -303,9 +303,16 @@ struct gendisk *__blk_alloc_disk(int node);
>  void blk_cleanup_disk(struct gendisk *disk);
>  
>  int __register_blkdev(unsigned int major, const char *name,
> -		void (*probe)(dev_t devt));
> +		      void (*probe)(dev_t devt), struct module *owner);
> +static inline int register_blkdev_with_probe(unsigned int major, const char *name,
> +					     void (*probe)(dev_t devt), struct module *owner)
> +{
> +	return __register_blkdev(major, name, probe, owner);
> +}
> +#define __register_blkdev(major, name, probe) \
> +	register_blkdev_with_probe(major, name, probe, THIS_MODULE)
>  #define register_blkdev(major, name) \
> -	__register_blkdev(major, name, NULL)
> +	register_blkdev_with_probe(major, name, NULL, NULL)
>  void unregister_blkdev(unsigned int major, const char *name);
>  
>  bool bdev_check_media_change(struct block_device *bdev);
> -- 
> 2.25.1
> 
---end quoted text---

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 13:47           ` [PATCH v4] " Christoph Hellwig
@ 2021-08-18 14:34             ` Tetsuo Handa
  2021-08-18 14:41               ` Greg KH
  2021-08-19  9:19               ` Christoph Hellwig
  0 siblings, 2 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-18 14:34 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Linus Torvalds, Hannes Reinecke, Chaitanya Kulkarni,
	Hillf Danton, Desmond Cheong Zhi Xi, linux-block, Greg KH,
	Tyler Hicks, Pavel Tatashin

On 2021/08/18 22:47, Christoph Hellwig wrote:
> Hi Tetsuo,
> 
> I might sound like a broken record, but we need to reduce the locking
> complexity, not make it worse and fall down the locking cliff.  I did
> send a suggested series this morning, in which you found a bunch of
> bugs.  I'd appreciate it if you could use your superior skills to
> actually help explain the issue and arrive at a common solution that
> actually simplifies things instead of steaming down the locking cliff
> full speed.  Thank you very much.

I posted "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex"
which reduces the locking complexity while fixing bugs, but you ignored it. Instead,
you decided to remove cryptoloop module (where userspace doing "modprobe cryptoloop"
will break). That is, you decided not to provide patches which can be backported.

Please do review my patches carefully instead of posting broken random patch series
at full speed. "[PATCH v3] block: genhd: don't call probe function with major_names_lock
held" is a revertable patch which we can revert in future after all locking dependencies
are simplified enough. Applying this patch does not break userspace.

Then, if you review "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and
loop_removal_mutex" carefully, you can easily figure out why I used HIDDEN_LOOP_DEVICE
magic. You are not aware of why loop_ctl_mutex is held during entire add/remove/lookup
operations. Since reducing the duration of loop_ctl_mutex might break existing userspace,
this patch is not suitable for backporting. We can apply this patch based on agreement
for future kernels and userspace.

After "[PATCH v3] block: genhd: don't call probe function with major_names_lock held"
and "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex"
are applied, you can propose changes which remove cryptoloop module (of course, with
cares added for not to break userspace).


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 14:34             ` Tetsuo Handa
@ 2021-08-18 14:41               ` Greg KH
  2021-08-18 14:51                 ` Tetsuo Handa
  2021-08-19  9:19               ` Christoph Hellwig
  1 sibling, 1 reply; 29+ messages in thread
From: Greg KH @ 2021-08-18 14:41 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Christoph Hellwig, Jens Axboe, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Tyler Hicks, Pavel Tatashin

On Wed, Aug 18, 2021 at 11:34:15PM +0900, Tetsuo Handa wrote:
> On 2021/08/18 22:47, Christoph Hellwig wrote:
> > Hi Tetsuo,
> > 
> > I might sound like a broken record, but we need to reduce the locking
> > complexity, not make it worse and fall down the locking cliff.  I did
> > send a suggested series this morning, in which you found a bunch of
> > bugs.  I'd appreciate it if you could use your superior skills to
> > actually help explain the issue and arrive at a common solution that
> > actually simplifies things instead of steaming down the locking cliff
> > full speed.  Thank you very much.
> 
> I posted "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex"
> which reduces the locking complexity while fixing bugs, but you ignored it. Instead,
> you decided to remove cryptoloop module (where userspace doing "modprobe cryptoloop"
> will break). That is, you decided not to provide patches which can be backported.

Do not ever worry about backporting patches.  Worry about getting the
changes and code correct, stable work can come afterwards, if needed.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 13:27           ` Greg KH
@ 2021-08-18 14:44             ` Tetsuo Handa
  2021-08-18 15:28               ` Greg KH
  0 siblings, 1 reply; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-18 14:44 UTC (permalink / raw)
  To: Greg KH
  Cc: Jens Axboe, Christoph Hellwig, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Tyler Hicks, Pavel Tatashin

On 2021/08/18 22:27, Greg KH wrote:
> On Wed, Aug 18, 2021 at 08:07:32PM +0900, Tetsuo Handa wrote:
>> This patch adds THIS_MODULE parameter to __register_blkdev() as with
>> usb_register(), and drops major_names_lock before calling probe function
>> by holding a reference to that module which contains that probe function.
>>
>> Since cdev uses register_chrdev() and __register_chrdev(), bdev should be
>> able to preserve register_blkdev() and __register_blkdev() naming scheme.
> 
> Note, the cdev api is anything but good, so should not be used as an
> excuse for anything.  Don't copy it unless you have a very good reason.
> 
> Also, what changed in this version?  I see no patch history here :(

Nothing but passing THIS_MODULE automagically using macro, as a response to

  > Do not force modules to put their own THIS_MODULE macro as a parameter,
  > put it in the .h file so that it happens automagically, much like the
  > usb_register() define in include/linux/usb.h is created.

suggestion. You also said

  > Why stop at 4 _ characters?
  > 
  > {sigh}
  > 
  > I think you need a new naming scheme here...

but I think current naming scheme is fine, and I even think v3 patch is fine.
Please suggest alternative candidates for register_blkdev() and __register_blkdev()...


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 14:41               ` Greg KH
@ 2021-08-18 14:51                 ` Tetsuo Handa
  2021-08-19  9:16                   ` Christoph Hellwig
  0 siblings, 1 reply; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-18 14:51 UTC (permalink / raw)
  To: Greg KH
  Cc: Christoph Hellwig, Jens Axboe, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Tyler Hicks, Pavel Tatashin

On 2021/08/18 23:41, Greg KH wrote:
> On Wed, Aug 18, 2021 at 11:34:15PM +0900, Tetsuo Handa wrote:
>> On 2021/08/18 22:47, Christoph Hellwig wrote:
>>> Hi Tetsuo,
>>>
>>> I might sound like a broken record, but we need to reduce the locking
>>> complexity, not make it worse and fall down the locking cliff.  I did
>>> send a suggested series this morning, in which you found a bunch of
>>> bugs.  I'd appreciate it if you could use your superior skills to
>>> actually help explain the issue and arrive at a common solution that
>>> actually simplifies things instead of steaming down the locking cliff
>>> full speed.  Thank you very much.
>>
>> I posted "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex"
>> which reduces the locking complexity while fixing bugs, but you ignored it. Instead,
>> you decided to remove cryptoloop module (where userspace doing "modprobe cryptoloop"
>> will break). That is, you decided not to provide patches which can be backported.
> 
> Do not ever worry about backporting patches.  Worry about getting the
> changes and code correct, stable work can come afterwards, if needed.

The loop module is one of critical components for syzkaller fuzzing.
Bugs which involves the loop module prevents syzkaller from finding/testing
other bugs. But changes are continuously applied without careful review.
Therefore, the changes and code are not getting correct. Stable work cannot
come afterwards...


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 14:44             ` Tetsuo Handa
@ 2021-08-18 15:28               ` Greg KH
  2021-08-21  6:12                 ` [PATCH v5] " Tetsuo Handa
  0 siblings, 1 reply; 29+ messages in thread
From: Greg KH @ 2021-08-18 15:28 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Jens Axboe, Christoph Hellwig, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Tyler Hicks, Pavel Tatashin

On Wed, Aug 18, 2021 at 11:44:14PM +0900, Tetsuo Handa wrote:
> On 2021/08/18 22:27, Greg KH wrote:
> > On Wed, Aug 18, 2021 at 08:07:32PM +0900, Tetsuo Handa wrote:
> >> This patch adds THIS_MODULE parameter to __register_blkdev() as with
> >> usb_register(), and drops major_names_lock before calling probe function
> >> by holding a reference to that module which contains that probe function.
> >>
> >> Since cdev uses register_chrdev() and __register_chrdev(), bdev should be
> >> able to preserve register_blkdev() and __register_blkdev() naming scheme.
> > 
> > Note, the cdev api is anything but good, so should not be used as an
> > excuse for anything.  Don't copy it unless you have a very good reason.
> > 
> > Also, what changed in this version?  I see no patch history here :(
> 
> Nothing but passing THIS_MODULE automagically using macro, as a response to
> 
>   > Do not force modules to put their own THIS_MODULE macro as a parameter,
>   > put it in the .h file so that it happens automagically, much like the
>   > usb_register() define in include/linux/usb.h is created.

Then properly document it as you should when sending new versions of
patches.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 14:51                 ` Tetsuo Handa
@ 2021-08-19  9:16                   ` Christoph Hellwig
  2021-08-19 14:47                     ` Tetsuo Handa
  0 siblings, 1 reply; 29+ messages in thread
From: Christoph Hellwig @ 2021-08-19  9:16 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Greg KH, Christoph Hellwig, Jens Axboe, Linus Torvalds,
	Hannes Reinecke, Chaitanya Kulkarni, Hillf Danton,
	Desmond Cheong Zhi Xi, linux-block, Tyler Hicks, Pavel Tatashin

On Wed, Aug 18, 2021 at 11:51:24PM +0900, Tetsuo Handa wrote:
> The loop module is one of critical components for syzkaller fuzzing.
> Bugs which involves the loop module prevents syzkaller from finding/testing
> other bugs. But changes are continuously applied without careful review.
> Therefore, the changes and code are not getting correct. Stable work cannot
> come afterwards...

I know a very simple solution: review them and provide detailed feedback.
It's not like patches just appear in a tree somewhere.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 14:34             ` Tetsuo Handa
  2021-08-18 14:41               ` Greg KH
@ 2021-08-19  9:19               ` Christoph Hellwig
  2021-08-19 14:23                 ` Tetsuo Handa
  1 sibling, 1 reply; 29+ messages in thread
From: Christoph Hellwig @ 2021-08-19  9:19 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Christoph Hellwig, Jens Axboe, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Greg KH, Tyler Hicks, Pavel Tatashin

On Wed, Aug 18, 2021 at 11:34:15PM +0900, Tetsuo Handa wrote:
> I posted "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex"

Well, you hid it somewhere deep inside a mail deep in a thread.  If you
want proper patch review and actually post it as a patch.

> which reduces the locking complexity while fixing bugs, but you ignored it. Instead,

No, it doesn't.  Adding even more locks does not reduce complexity.

> you decided to remove cryptoloop module (where userspace doing "modprobe cryptoloop"
> will break).

Did you try it?  Because on my system it works just fine thanks to
the MODULE_ALIAS().

> That is, you decided not to provide patches which can be backported.

Please stop this bullshit.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-19  9:19               ` Christoph Hellwig
@ 2021-08-19 14:23                 ` Tetsuo Handa
  2021-08-19 15:10                   ` Greg KH
  0 siblings, 1 reply; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-19 14:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Linus Torvalds, Hannes Reinecke, Chaitanya Kulkarni,
	Hillf Danton, Desmond Cheong Zhi Xi, linux-block, Greg KH,
	Tyler Hicks, Pavel Tatashin

On 2021/08/19 18:19, Christoph Hellwig wrote:
> On Wed, Aug 18, 2021 at 11:34:15PM +0900, Tetsuo Handa wrote:
>> I posted "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and loop_removal_mutex"
> 
> Well, you hid it somewhere deep inside a mail deep in a thread.  If you
> want proper patch review and actually post it as a patch.

I didn't paste it for review. I pasted it only for demonstrating how complicated
it is if we try to address this circular locking problem on the loop module side.

> 
>> which reduces the locking complexity while fixing bugs, but you ignored it. Instead,
> 
> No, it doesn't.  Adding even more locks does not reduce complexity.

I guess that my and your terminology are incompatible.

> 
>> you decided to remove cryptoloop module (where userspace doing "modprobe cryptoloop"
>> will break).
> 
> Did you try it?

No, I didn't. I concluded that this series does not worth trying as soon as
I looked at how your series tries to address and found failing badly.

>                  Because on my system it works just fine thanks to
> the MODULE_ALIAS().

OK, I didn't realize that your series contained MODULE_ALIAS("cryptoloop") line
because I looked only loop add/remove/lookup part and found three bugs.

Strictly speaking, adding MODULE_ALIAS("cryptoloop") might still break userspace.
See the return code difference of "rmmod cryptoloop" shown in below example.

Before your series:
----------------------------------------
root@fuzz:~# truncate -s 16M /tmp/file
root@fuzz:~# modprobe loop
root@fuzz:~# lsmod | grep loop
loop                   40960  0
root@fuzz:~# losetup /dev/loop0 /tmp/file
root@fuzz:~# lsmod | grep loop
loop                   40960  1
root@fuzz:~# modprobe cryptoloop
root@fuzz:~# lsmod | grep loop
cryptoloop             16384  0
loop                   40960  2 cryptoloop
root@fuzz:~# rmmod cryptoloop
root@fuzz:~# lsmod | grep loop
loop                   40960  1
----------------------------------------

After your series:
----------------------------------------
root@fuzz:~# truncate -s 16M /tmp/file
root@fuzz:~# modprobe loop
root@fuzz:~# lsmod | grep loop
loop                   40960  0
root@fuzz:~# losetup /dev/loop0 /tmp/file
root@fuzz:~# lsmod | grep loop
loop                   40960  1
root@fuzz:~# modprobe cryptoloop
root@fuzz:~# lsmod | grep loop
loop                   40960  1
root@fuzz:~# rmmod cryptoloop
rmmod: ERROR: Module cryptoloop is not currently loaded
root@fuzz:~# echo $?
1
root@fuzz:~# lsmod | grep loop
loop                   40960  1
----------------------------------------

If userspace is expecting that "rmmod cryptoloop" succeeds, this is a breakage.

Anyway, I tried your series, and below is the result of your series.

----------------------------------------
fuzz login: [   46.957802][ T6638] loop0: detected capacity change from 0 to 4096
[   46.963189][ T6638] EXT4-fs error (device loop0): ext4_fill_super:4956: inode #2: comm a.out: iget: root inode unallocated
[   46.969001][ T6638] EXT4-fs (loop0): get root inode failed
[   46.972008][ T6638] EXT4-fs (loop0): mount failed
[   47.001030][ T6613] sysfs: cannot create duplicate filename '/devices/virtual/block/loop1'
[   47.005448][ T6613] CPU: 2 PID: 6613 Comm: systemd-udevd Tainted: G            E     5.14.0-rc6+ #747
[   47.009769][ T6613] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   47.014951][ T6613] Call Trace:
[   47.016418][ T6613]  dump_stack_lvl+0x57/0x7d
[   47.018735][ T6613]  sysfs_warn_dup.cold+0x17/0x24
[   47.021035][ T6613]  sysfs_create_dir_ns+0xb7/0xd0
[   47.023299][ T6613]  kobject_add_internal+0xa8/0x290
[   47.025396][ T6613]  kobject_add+0x7e/0xb0
[   47.027276][ T6613]  device_add+0x11d/0x910
[   47.029380][ T6613]  ? dev_set_name+0x4e/0x70
[   47.032238][ T6613]  __device_add_disk+0xbf/0x300
[   47.034167][ T6613]  ? blkdev_get_by_dev+0x330/0x330
[   47.037066][ T6613]  loop_add+0x292/0x320 [loop]
[   47.038716][ T6613]  blk_request_module+0x63/0xc0
[   47.040340][ T6613]  blkdev_get_no_open+0x98/0xc0
[   47.041969][ T6613]  blkdev_get_by_dev+0x54/0x330
[   47.043630][ T6613]  ? blkdev_get_by_dev+0x330/0x330
[   47.045431][ T6613]  blkdev_open+0x59/0xa0
[   47.046944][ T6613]  do_dentry_open+0x14c/0x3a0
[   47.048560][ T6613]  path_openat+0x78e/0xa50
[   47.050129][ T6613]  do_filp_open+0xad/0x120
[   47.051625][ T6613]  ? alloc_fd+0x14c/0x1f0
[   47.053085][ T6613]  do_sys_openat2+0x241/0x310
[   47.054698][ T6613]  do_sys_open+0x3f/0x80
[   47.056133][ T6613]  do_syscall_64+0x35/0xb0
[   47.057612][ T6613]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.059613][ T6613] RIP: 0033:0x7f8f168c7eab
[   47.061137][ T6613] Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25
[   47.067845][ T6613] RSP: 002b:00007ffeb2e28d60 EFLAGS: 00000246 ORIG_RAX: 0000000000000101
[   47.070689][ T6613] RAX: ffffffffffffffda RBX: 000055714bba6d20 RCX: 00007f8f168c7eab
[   47.073379][ T6613] RDX: 00000000000a0800 RSI: 000055714bd070e0 RDI: 00000000ffffff9c
[   47.076091][ T6613] RBP: 000055714bd070e0 R08: 000055714b782270 R09: 00007ffeb2fe3090
[   47.078974][ T6613] R10: 0000000000000000 R11: 0000000000000246 R12: 00000000000a0800
[   47.081669][ T6613] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
[   47.085863][ T6613] kobject_add_internal failed for loop1 with -EEXIST, don't try to register things with the same name in the same directory.
[   47.090403][ T6613] kobject_add_internal failed for queue (error: -2 parent: loop1)
[   47.093848][ T6613] kobject_add_internal failed for integrity (error: -2 parent: loop1)
[   47.108619][ T6612] 
[   47.109437][ T6612] ======================================================
[   47.111786][ T6612] WARNING: possible circular locking dependency detected
[   47.114178][ T6612] 5.14.0-rc6+ #747 Tainted: G            E    
[   47.116181][ T6612] ------------------------------------------------------
[   47.118489][ T6612] systemd-udevd/6612 is trying to acquire lock:
[   47.120548][ T6612] ffff88801b875948 ((wq_completion)loop0){+.+.}-{0:0}, at: flush_workqueue+0x85/0x560
[   47.123687][ T6612] 
[   47.123687][ T6612] but task is already holding lock:
[   47.126176][ T6612] ffff88800cdfbc88 (&lo->lo_mutex){+.+.}-{3:3}, at: __loop_clr_fd+0x5b/0x610 [loop]
[   47.129415][ T6612] 
[   47.129415][ T6612] which lock already depends on the new lock.
[   47.129415][ T6612] 
[   47.132788][ T6612] 
[   47.132788][ T6612] the existing dependency chain (in reverse order) is:
[   47.135661][ T6612] 
[   47.135661][ T6612] -> #6 (&lo->lo_mutex){+.+.}-{3:3}:
[   47.138051][ T6612]        lock_acquire+0xd0/0x300
[   47.139625][ T6612]        __mutex_lock+0x97/0x950
[   47.141566][ T6612]        loop_control_ioctl+0x10a/0x1b0 [loop]
[   47.143602][ T6612]        __x64_sys_ioctl+0x6a/0xa0
[   47.145473][ T6612]        do_syscall_64+0x35/0xb0
[   47.147054][ T6612]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.149295][ T6612] 
[   47.149295][ T6612] -> #5 (loop_ctl_mutex){+.+.}-{3:3}:
[   47.151671][ T6612]        lock_acquire+0xd0/0x300
[   47.153206][ T6612]        __mutex_lock+0x97/0x950
[   47.154698][ T6612]        loop_add+0x43/0x320 [loop]
[   47.156471][ T6612]        blk_request_module+0x63/0xc0
[   47.158153][ T6612]        blkdev_get_no_open+0x98/0xc0
[   47.159848][ T6612]        blkdev_get_by_dev+0x54/0x330
[   47.161721][ T6612]        blkdev_open+0x59/0xa0
[   47.163292][ T6612]        do_dentry_open+0x14c/0x3a0
[   47.165028][ T6612]        path_openat+0x78e/0xa50
[   47.166629][ T6612]        do_filp_open+0xad/0x120
[   47.168341][ T6612]        do_sys_openat2+0x241/0x310
[   47.169956][ T6612]        do_sys_open+0x3f/0x80
[   47.171463][ T6612]        do_syscall_64+0x35/0xb0
[   47.172980][ T6612]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.174897][ T6612] 
[   47.174897][ T6612] -> #4 (major_names_lock){+.+.}-{3:3}:
[   47.177271][ T6612]        lock_acquire+0xd0/0x300
[   47.178930][ T6612]        __mutex_lock+0x97/0x950
[   47.180514][ T6612]        blkdev_show+0x18/0x90
[   47.181961][ T6612]        devinfo_show+0x58/0x70
[   47.183489][ T6612]        seq_read_iter+0x27b/0x3f0
[   47.185044][ T6612]        proc_reg_read_iter+0x3c/0x60
[   47.186665][ T6612]        new_sync_read+0x110/0x190
[   47.188268][ T6612]        vfs_read+0x11d/0x1b0
[   47.189661][ T6612]        ksys_read+0x63/0xe0
[   47.191162][ T6612]        do_syscall_64+0x35/0xb0
[   47.192745][ T6612]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.194831][ T6612] 
[   47.194831][ T6612] -> #3 (&p->lock){+.+.}-{3:3}:
[   47.196966][ T6612]        lock_acquire+0xd0/0x300
[   47.198445][ T6612]        __mutex_lock+0x97/0x950
[   47.200160][ T6612]        seq_read_iter+0x4c/0x3f0
[   47.201690][ T6612]        generic_file_splice_read+0xf7/0x1a0
[   47.203512][ T6612]        splice_direct_to_actor+0xc0/0x230
[   47.205258][ T6612]        do_splice_direct+0x8c/0xd0
[   47.206811][ T6612]        do_sendfile+0x319/0x5a0
[   47.208264][ T6612]        __x64_sys_sendfile64+0xad/0xc0
[   47.209870][ T6612]        do_syscall_64+0x35/0xb0
[   47.211670][ T6612]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.213551][ T6612] 
[   47.213551][ T6612] -> #2 (sb_writers#3){.+.+}-{0:0}:
[   47.215827][ T6612]        lock_acquire+0xd0/0x300
[   47.217277][ T6612]        lo_write_bvec+0xe1/0x260 [loop]
[   47.218964][ T6612]        loop_process_work+0x3fa/0xcf0 [loop]
[   47.220716][ T6612]        process_one_work+0x2aa/0x600
[   47.222343][ T6612]        worker_thread+0x48/0x3d0
[   47.223796][ T6612]        kthread+0x13e/0x170
[   47.225124][ T6612]        ret_from_fork+0x1f/0x30
[   47.226605][ T6612] 
[   47.226605][ T6612] -> #1 ((work_completion)(&lo->rootcg_work)){+.+.}-{0:0}:
[   47.229563][ T6612]        lock_acquire+0xd0/0x300
[   47.231200][ T6612]        process_one_work+0x27e/0x600
[   47.232784][ T6612]        worker_thread+0x48/0x3d0
[   47.234372][ T6612]        kthread+0x13e/0x170
[   47.235704][ T6612]        ret_from_fork+0x1f/0x30
[   47.237136][ T6612] 
[   47.237136][ T6612] -> #0 ((wq_completion)loop0){+.+.}-{0:0}:
[   47.239544][ T6612]        check_prev_add+0x91/0xc00
[   47.241026][ T6612]        __lock_acquire+0x14a8/0x1f40
[   47.242661][ T6612]        lock_acquire+0xd0/0x300
[   47.244133][ T6612]        flush_workqueue+0xa9/0x560
[   47.246009][ T6612]        drain_workqueue+0x9b/0x100
[   47.247518][ T6612]        destroy_workqueue+0x2f/0x210
[   47.249094][ T6612]        __loop_clr_fd+0xbf/0x610 [loop]
[   47.251016][ T6612]        blkdev_put+0xaf/0x180
[   47.252421][ T6612]        blkdev_close+0x20/0x30
[   47.253942][ T6612]        __fput+0xa0/0x240
[   47.255221][ T6612]        task_work_run+0x57/0xa0
[   47.256649][ T6612]        exit_to_user_mode_prepare+0x252/0x260
[   47.258572][ T6612]        syscall_exit_to_user_mode+0x19/0x60
[   47.260300][ T6612]        do_syscall_64+0x42/0xb0
[   47.262051][ T6612]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.263923][ T6612] 
[   47.263923][ T6612] other info that might help us debug this:
[   47.263923][ T6612] 
[   47.267105][ T6612] Chain exists of:
[   47.267105][ T6612]   (wq_completion)loop0 --> loop_ctl_mutex --> &lo->lo_mutex
[   47.267105][ T6612] 
[   47.271208][ T6612]  Possible unsafe locking scenario:
[   47.271208][ T6612] 
[   47.273524][ T6612]        CPU0                    CPU1
[   47.275232][ T6612]        ----                    ----
[   47.276848][ T6612]   lock(&lo->lo_mutex);
[   47.278416][ T6612]                                lock(loop_ctl_mutex);
[   47.280490][ T6612]                                lock(&lo->lo_mutex);
[   47.282600][ T6612]   lock((wq_completion)loop0);
[   47.284026][ T6612] 
[   47.284026][ T6612]  *** DEADLOCK ***
[   47.284026][ T6612] 
[   47.286474][ T6612] 2 locks held by systemd-udevd/6612:
[   47.288032][ T6612]  #0: ffff888009829128 (&disk->open_mutex){+.+.}-{3:3}, at: blkdev_put+0x30/0x180
[   47.290806][ T6612]  #1: ffff88800cdfbc88 (&lo->lo_mutex){+.+.}-{3:3}, at: __loop_clr_fd+0x5b/0x610 [loop]
[   47.293886][ T6612] 
[   47.293886][ T6612] stack backtrace:
[   47.295873][ T6612] CPU: 2 PID: 6612 Comm: systemd-udevd Tainted: G            E     5.14.0-rc6+ #747
[   47.298596][ T6612] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   47.302078][ T6612] Call Trace:
[   47.303038][ T6612]  dump_stack_lvl+0x57/0x7d
[   47.304506][ T6612]  check_noncircular+0x114/0x130
[   47.305936][ T6612]  check_prev_add+0x91/0xc00
[   47.307261][ T6612]  ? __lock_acquire+0x5c1/0x1f40
[   47.308783][ T6612]  __lock_acquire+0x14a8/0x1f40
[   47.310184][ T6612]  lock_acquire+0xd0/0x300
[   47.311765][ T6612]  ? flush_workqueue+0x85/0x560
[   47.313256][ T6612]  ? lockdep_init_map_type+0x51/0x220
[   47.314828][ T6612]  ? lockdep_init_map_type+0x51/0x220
[   47.316459][ T6612]  flush_workqueue+0xa9/0x560
[   47.317824][ T6612]  ? flush_workqueue+0x85/0x560
[   47.319246][ T6612]  ? drain_workqueue+0x9b/0x100
[   47.320768][ T6612]  drain_workqueue+0x9b/0x100
[   47.322124][ T6612]  destroy_workqueue+0x2f/0x210
[   47.323753][ T6612]  __loop_clr_fd+0xbf/0x610 [loop]
[   47.325250][ T6612]  ? __mutex_unlock_slowpath+0x40/0x2a0
[   47.326855][ T6612]  blkdev_put+0xaf/0x180
[   47.328478][ T6612]  blkdev_close+0x20/0x30
[   47.329831][ T6612]  __fput+0xa0/0x240
[   47.330967][ T6612]  task_work_run+0x57/0xa0
[   47.332297][ T6612]  exit_to_user_mode_prepare+0x252/0x260
[   47.333979][ T6612]  syscall_exit_to_user_mode+0x19/0x60
[   47.335654][ T6612]  do_syscall_64+0x42/0xb0
[   47.336939][ T6612]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.338650][ T6612] RIP: 0033:0x7f8f168c8987
[   47.340034][ T6612] Code: ff ff e8 9c 11 02 00 66 2e 0f 1f 84 00 00 00 00 00 66 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 c3 5d f8 ff
[   47.346349][ T6612] RSP: 002b:00007ffeb2e28da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
[   47.349015][ T6612] RAX: 0000000000000000 RBX: 00007f8f16350788 RCX: 00007f8f168c8987
[   47.351533][ T6612] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000006
[   47.353848][ T6612] RBP: 0000000000000006 R08: 000055714b78faf0 R09: 0000000000000000
[   47.356265][ T6612] R10: 00007f8f16350788 R11: 0000000000000246 R12: 0000000000000000
[   47.358566][ T6612] R13: 0000000000000000 R14: 0000000000000000 R15: 000055714b777635
[   47.366005][ T6642] loop0: detected capacity change from 0 to 4096
[   47.369917][ T6642] EXT4-fs error (device loop0): ext4_fill_super:4956: inode #2: comm a.out: iget: root inode unallocated
[   47.373415][ T6642] EXT4-fs (loop0): get root inode failed
[   47.375217][ T6642] EXT4-fs (loop0): mount failed
[   47.402986][ T6650] ------------[ cut here ]------------
[   47.404646][ T6650] kernfs: can not remove 'format', no directory
[   47.406571][ T6650] WARNING: CPU: 3 PID: 6650 at fs/kernfs/dir.c:1508 kernfs_remove_by_name_ns+0x6f/0x80
[   47.409418][ T6650] Modules linked in: loop(E)
[   47.411712][ T6650] CPU: 2 PID: 6650 Comm: a.out Tainted: G            E     5.14.0-rc6+ #747
[   47.414617][ T6650] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   47.418279][ T6650] RIP: 0010:kernfs_remove_by_name_ns+0x6f/0x80
[   47.421282][ T6650] Code: bb 38 01 31 c0 5d 41 5c 41 5d c3 48 c7 c7 e0 7d 7d 83 e8 b4 bb 38 01 b8 fe ff ff ff eb e7 48 c7 c7 18 39 2a 83 e8 41 9a 29 01 <0f> 0b b8 fe ff ff ff eb d2 0f 1f 84 00 00 00 00 00 41 57 41 56 41
[   47.427371][ T6650] RSP: 0018:ffffc90001673df0 EFLAGS: 00010286
[   47.429782][ T6650] RAX: 0000000000000000 RBX: ffffffff83804c28 RCX: 0000000000000027
[   47.432268][ T6650] RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff88807d3e78b8
[   47.434731][ T6650] RBP: 0000000000000000 R08: ffff88807d3e78b0 R09: 0000000000000000
[   47.439351][ T6650] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff833d7a00
[   47.442188][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   47.444833][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   47.447964][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   47.462680][ T6650] CR2: 00005651cf34e6b8 CR3: 0000000007541000 CR4: 00000000001506e0
[   47.465238][ T6650] Call Trace:
[   47.466194][ T6650]  remove_files.isra.0+0x2b/0x60
[   47.467659][ T6650]  sysfs_remove_group+0x38/0x80
[   47.469986][ T6650]  sysfs_remove_groups+0x24/0x40
[   47.471472][ T6650]  __kobject_del+0x1b/0x80
[   47.472828][ T6650]  kobject_del+0xf/0x20
[   47.474071][ T6650]  blk_integrity_del+0x1d/0x30
[   47.475524][ T6650]  del_gendisk+0x3c/0x1d0
[   47.476868][ T6650]  loop_remove+0x10/0x40 [loop]
[   47.478601][ T6650]  loop_control_ioctl+0x1a1/0x1b0 [loop]
[   47.480437][ T6650]  __x64_sys_ioctl+0x6a/0xa0
[   47.481816][ T6650]  do_syscall_64+0x35/0xb0
[   47.483135][ T6650]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.484923][ T6650] RIP: 0033:0x7f2a593f689d
[   47.486240][ T6650] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[   47.492220][ T6650] RSP: 002b:00007ffe1f075b78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   47.494945][ T6650] RAX: ffffffffffffffda RBX: 431bde82d7b634db RCX: 00007f2a593f689d
[   47.497417][ T6650] RDX: 0000000000000001 RSI: 0000000000004c81 RDI: 0000000000000005
[   47.499842][ T6650] RBP: 00007ffe1f075bb0 R08: 0000000000000000 R09: 0000000000000000
[   47.502165][ T6650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe1f075bb0
[   47.504530][ T6650] R13: 000000000000b44f R14: 00007ffe1f075b8c R15: 0000000000000000
[   47.506856][ T6650] irq event stamp: 0
[   47.508086][ T6650] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[   47.510206][ T6650] hardirqs last disabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.513225][ T6650] softirqs last  enabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.516142][ T6650] softirqs last disabled at (0): [<0000000000000000>] 0x0
[   47.518372][ T6650] ---[ end trace 864118b148d36438 ]---
[   47.520079][ T6650] ------------[ cut here ]------------
[   47.521683][ T6650] kernfs: can not remove 'tag_size', no directory
[   47.523607][ T6650] WARNING: CPU: 2 PID: 6650 at fs/kernfs/dir.c:1508 kernfs_remove_by_name_ns+0x6f/0x80
[   47.526414][ T6650] Modules linked in: loop(E)
[   47.527966][ T6650] CPU: 2 PID: 6650 Comm: a.out Tainted: G        W   E     5.14.0-rc6+ #747
[   47.530959][ T6650] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   47.534391][ T6650] RIP: 0010:kernfs_remove_by_name_ns+0x6f/0x80
[   47.536288][ T6650] Code: bb 38 01 31 c0 5d 41 5c 41 5d c3 48 c7 c7 e0 7d 7d 83 e8 b4 bb 38 01 b8 fe ff ff ff eb e7 48 c7 c7 18 39 2a 83 e8 41 9a 29 01 <0f> 0b b8 fe ff ff ff eb d2 0f 1f 84 00 00 00 00 00 41 57 41 56 41
[   47.547625][ T6650] RSP: 0018:ffffc90001673df0 EFLAGS: 00010286
[   47.550209][ T6650] RAX: 0000000000000000 RBX: ffffffff83804c30 RCX: 0000000000000027
[   47.553838][ T6650] RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff88807cfe78b8
[   47.556561][ T6650] RBP: 0000000000000000 R08: ffff88807cfe78b0 R09: 0000000000000000
[   47.559190][ T6650] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff832b69c5
[   47.562085][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   47.564588][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   47.567361][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   47.569384][ T6650] CR2: 00005651cf34e6b8 CR3: 0000000007541000 CR4: 00000000001506e0
[   47.571960][ T6650] Call Trace:
[   47.572947][ T6650]  remove_files.isra.0+0x2b/0x60
[   47.574474][ T6650]  sysfs_remove_group+0x38/0x80
[   47.575927][ T6650]  sysfs_remove_groups+0x24/0x40
[   47.577461][ T6650]  __kobject_del+0x1b/0x80
[   47.579167][ T6650]  kobject_del+0xf/0x20
[   47.580455][ T6650]  blk_integrity_del+0x1d/0x30
[   47.582019][ T6650]  del_gendisk+0x3c/0x1d0
[   47.583455][ T6650]  loop_remove+0x10/0x40 [loop]
[   47.584906][ T6650]  loop_control_ioctl+0x1a1/0x1b0 [loop]
[   47.586743][ T6650]  __x64_sys_ioctl+0x6a/0xa0
[   47.588213][ T6650]  do_syscall_64+0x35/0xb0
[   47.589643][ T6650]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.591405][ T6650] RIP: 0033:0x7f2a593f689d
[   47.592727][ T6650] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[   47.599154][ T6650] RSP: 002b:00007ffe1f075b78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   47.601767][ T6650] RAX: ffffffffffffffda RBX: 431bde82d7b634db RCX: 00007f2a593f689d
[   47.604170][ T6650] RDX: 0000000000000001 RSI: 0000000000004c81 RDI: 0000000000000005
[   47.606578][ T6650] RBP: 00007ffe1f075bb0 R08: 0000000000000000 R09: 0000000000000000
[   47.609213][ T6650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe1f075bb0
[   47.611959][ T6650] R13: 000000000000b44f R14: 00007ffe1f075b8c R15: 0000000000000000
[   47.614393][ T6650] irq event stamp: 0
[   47.615556][ T6650] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[   47.617786][ T6650] hardirqs last disabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.620581][ T6650] softirqs last  enabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.623221][ T6650] softirqs last disabled at (0): [<0000000000000000>] 0x0
[   47.625325][ T6650] ---[ end trace 864118b148d36439 ]---
[   47.626907][ T6650] ------------[ cut here ]------------
[   47.628950][ T6650] kernfs: can not remove 'protection_interval_bytes', no directory
[   47.631422][ T6650] WARNING: CPU: 2 PID: 6650 at fs/kernfs/dir.c:1508 kernfs_remove_by_name_ns+0x6f/0x80
[   47.634336][ T6650] Modules linked in: loop(E)
[   47.635703][ T6650] CPU: 2 PID: 6650 Comm: a.out Tainted: G        W   E     5.14.0-rc6+ #747
[   47.638354][ T6650] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   47.643677][ T6650] RIP: 0010:kernfs_remove_by_name_ns+0x6f/0x80
[   47.645933][ T6650] Code: bb 38 01 31 c0 5d 41 5c 41 5d c3 48 c7 c7 e0 7d 7d 83 e8 b4 bb 38 01 b8 fe ff ff ff eb e7 48 c7 c7 18 39 2a 83 e8 41 9a 29 01 <0f> 0b b8 fe ff ff ff eb d2 0f 1f 84 00 00 00 00 00 41 57 41 56 41
[   47.651920][ T6650] RSP: 0018:ffffc90001673df0 EFLAGS: 00010286
[   47.653721][ T6650] RAX: 0000000000000000 RBX: ffffffff83804c38 RCX: 0000000000000027
[   47.656499][ T6650] RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff88807cfe78b8
[   47.659000][ T6650] RBP: 0000000000000000 R08: ffff88807cfe78b0 R09: 0000000000000000
[   47.661814][ T6650] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff832b69ab
[   47.664364][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   47.666748][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   47.669505][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   47.671563][ T6650] CR2: 00005651cf34e6b8 CR3: 0000000007541000 CR4: 00000000001506e0
[   47.673942][ T6650] Call Trace:
[   47.674922][ T6650]  remove_files.isra.0+0x2b/0x60
[   47.676452][ T6650]  sysfs_remove_group+0x38/0x80
[   47.677970][ T6650]  sysfs_remove_groups+0x24/0x40
[   47.679790][ T6650]  __kobject_del+0x1b/0x80
[   47.681237][ T6650]  kobject_del+0xf/0x20
[   47.682481][ T6650]  blk_integrity_del+0x1d/0x30
[   47.684036][ T6650]  del_gendisk+0x3c/0x1d0
[   47.685432][ T6650]  loop_remove+0x10/0x40 [loop]
[   47.686948][ T6650]  loop_control_ioctl+0x1a1/0x1b0 [loop]
[   47.688652][ T6650]  __x64_sys_ioctl+0x6a/0xa0
[   47.690033][ T6650]  do_syscall_64+0x35/0xb0
[   47.691385][ T6650]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.693112][ T6650] RIP: 0033:0x7f2a593f689d
[   47.694393][ T6650] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[   47.700693][ T6650] RSP: 002b:00007ffe1f075b78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   47.703295][ T6650] RAX: ffffffffffffffda RBX: 431bde82d7b634db RCX: 00007f2a593f689d
[   47.705675][ T6650] RDX: 0000000000000001 RSI: 0000000000004c81 RDI: 0000000000000005
[   47.708199][ T6650] RBP: 00007ffe1f075bb0 R08: 0000000000000000 R09: 0000000000000000
[   47.710668][ T6650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe1f075bb0
[   47.713170][ T6650] R13: 000000000000b44f R14: 00007ffe1f075b8c R15: 0000000000000000
[   47.715608][ T6650] irq event stamp: 0
[   47.716815][ T6650] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[   47.719102][ T6650] hardirqs last disabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.721832][ T6650] softirqs last  enabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.724668][ T6650] softirqs last disabled at (0): [<0000000000000000>] 0x0
[   47.727010][ T6650] ---[ end trace 864118b148d3643a ]---
[   47.728848][ T6650] ------------[ cut here ]------------
[   47.730606][ T6650] kernfs: can not remove 'read_verify', no directory
[   47.732800][ T6650] WARNING: CPU: 2 PID: 6650 at fs/kernfs/dir.c:1508 kernfs_remove_by_name_ns+0x6f/0x80
[   47.735787][ T6650] Modules linked in: loop(E)
[   47.737179][ T6650] CPU: 2 PID: 6650 Comm: a.out Tainted: G        W   E     5.14.0-rc6+ #747
[   47.739905][ T6650] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   47.743602][ T6650] RIP: 0010:kernfs_remove_by_name_ns+0x6f/0x80
[   47.745897][ T6650] Code: bb 38 01 31 c0 5d 41 5c 41 5d c3 48 c7 c7 e0 7d 7d 83 e8 b4 bb 38 01 b8 fe ff ff ff eb e7 48 c7 c7 18 39 2a 83 e8 41 9a 29 01 <0f> 0b b8 fe ff ff ff eb d2 0f 1f 84 00 00 00 00 00 41 57 41 56 41
[   47.752712][ T6650] RSP: 0018:ffffc90001673df0 EFLAGS: 00010286
[   47.754781][ T6650] RAX: 0000000000000000 RBX: ffffffff83804c40 RCX: 0000000000000027
[   47.757805][ T6650] RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff88807cfe78b8
[   47.760520][ T6650] RBP: 0000000000000000 R08: ffff88807cfe78b0 R09: 0000000000000000
[   47.763314][ T6650] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff832b699f
[   47.765957][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   47.768742][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   47.771947][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   47.774165][ T6650] CR2: 00005651cf34e6b8 CR3: 0000000007541000 CR4: 00000000001506e0
[   47.776826][ T6650] Call Trace:
[   47.777929][ T6650]  remove_files.isra.0+0x2b/0x60
[   47.779816][ T6650]  sysfs_remove_group+0x38/0x80
[   47.781543][ T6650]  sysfs_remove_groups+0x24/0x40
[   47.783155][ T6650]  __kobject_del+0x1b/0x80
[   47.784479][ T6650]  kobject_del+0xf/0x20
[   47.785724][ T6650]  blk_integrity_del+0x1d/0x30
[   47.787259][ T6650]  del_gendisk+0x3c/0x1d0
[   47.788687][ T6650]  loop_remove+0x10/0x40 [loop]
[   47.790402][ T6650]  loop_control_ioctl+0x1a1/0x1b0 [loop]
[   47.792090][ T6650]  __x64_sys_ioctl+0x6a/0xa0
[   47.793511][ T6650]  do_syscall_64+0x35/0xb0
[   47.794943][ T6650]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.797167][ T6650] RIP: 0033:0x7f2a593f689d
[   47.798585][ T6650] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[   47.804763][ T6650] RSP: 002b:00007ffe1f075b78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   47.807396][ T6650] RAX: ffffffffffffffda RBX: 431bde82d7b634db RCX: 00007f2a593f689d
[   47.809802][ T6650] RDX: 0000000000000001 RSI: 0000000000004c81 RDI: 0000000000000005
[   47.812567][ T6650] RBP: 00007ffe1f075bb0 R08: 0000000000000000 R09: 0000000000000000
[   47.815142][ T6650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe1f075bb0
[   47.817622][ T6650] R13: 000000000000b44f R14: 00007ffe1f075b8c R15: 0000000000000000
[   47.820127][ T6650] irq event stamp: 0
[   47.821451][ T6650] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[   47.823604][ T6650] hardirqs last disabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.826320][ T6650] softirqs last  enabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.829671][ T6650] softirqs last disabled at (0): [<0000000000000000>] 0x0
[   47.831940][ T6650] ---[ end trace 864118b148d3643b ]---
[   47.833574][ T6650] ------------[ cut here ]------------
[   47.835257][ T6650] kernfs: can not remove 'write_generate', no directory
[   47.837521][ T6650] WARNING: CPU: 2 PID: 6650 at fs/kernfs/dir.c:1508 kernfs_remove_by_name_ns+0x6f/0x80
[   47.840695][ T6650] Modules linked in: loop(E)
[   47.842135][ T6650] CPU: 2 PID: 6650 Comm: a.out Tainted: G        W   E     5.14.0-rc6+ #747
[   47.844971][ T6650] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   47.848917][ T6650] RIP: 0010:kernfs_remove_by_name_ns+0x6f/0x80
[   47.850797][ T6650] Code: bb 38 01 31 c0 5d 41 5c 41 5d c3 48 c7 c7 e0 7d 7d 83 e8 b4 bb 38 01 b8 fe ff ff ff eb e7 48 c7 c7 18 39 2a 83 e8 41 9a 29 01 <0f> 0b b8 fe ff ff ff eb d2 0f 1f 84 00 00 00 00 00 41 57 41 56 41
[   47.856857][ T6650] RSP: 0018:ffffc90001673df0 EFLAGS: 00010286
[   47.858698][ T6650] RAX: 0000000000000000 RBX: ffffffff83804c48 RCX: 0000000000000000
[   47.861422][ T6650] RDX: ffff88807cff7298 RSI: ffff88807cfe78b0 RDI: ffff88807cfe78b0
[   47.864174][ T6650] RBP: 0000000000000000 R08: ffff88807cfe78b0 R09: 0000000000000000
[   47.866632][ T6650] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff832b6990
[   47.869145][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   47.871616][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   47.874333][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   47.876383][ T6650] CR2: 00005651cf34e6b8 CR3: 0000000007541000 CR4: 00000000001506e0
[   47.879038][ T6650] Call Trace:
[   47.880101][ T6650]  remove_files.isra.0+0x2b/0x60
[   47.881725][ T6650]  sysfs_remove_group+0x38/0x80
[   47.883177][ T6650]  sysfs_remove_groups+0x24/0x40
[   47.884789][ T6650]  __kobject_del+0x1b/0x80
[   47.886112][ T6650]  kobject_del+0xf/0x20
[   47.887475][ T6650]  blk_integrity_del+0x1d/0x30
[   47.889081][ T6650]  del_gendisk+0x3c/0x1d0
[   47.890460][ T6650]  loop_remove+0x10/0x40 [loop]
[   47.892233][ T6650]  loop_control_ioctl+0x1a1/0x1b0 [loop]
[   47.893923][ T6650]  __x64_sys_ioctl+0x6a/0xa0
[   47.895566][ T6650]  do_syscall_64+0x35/0xb0
[   47.896910][ T6650]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   47.898759][ T6650] RIP: 0033:0x7f2a593f689d
[   47.900215][ T6650] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[   47.906156][ T6650] RSP: 002b:00007ffe1f075b78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   47.908869][ T6650] RAX: ffffffffffffffda RBX: 431bde82d7b634db RCX: 00007f2a593f689d
[   47.911283][ T6650] RDX: 0000000000000001 RSI: 0000000000004c81 RDI: 0000000000000005
[   47.914069][ T6650] RBP: 00007ffe1f075bb0 R08: 0000000000000000 R09: 0000000000000000
[   47.916676][ T6650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe1f075bb0
[   47.919200][ T6650] R13: 000000000000b44f R14: 00007ffe1f075b8c R15: 0000000000000000
[   47.921709][ T6650] irq event stamp: 0
[   47.922952][ T6650] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[   47.925358][ T6650] hardirqs last disabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.928268][ T6650] softirqs last  enabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   47.931498][ T6650] softirqs last disabled at (0): [<0000000000000000>] 0x0
[   47.933852][ T6650] ---[ end trace 864118b148d3643c ]---
[   47.935591][ T6650] ------------[ cut here ]------------
[   47.937425][ T6650] kernfs: can not remove 'device_is_integrity_capable', no directory
[   47.940207][ T6650] WARNING: CPU: 2 PID: 6650 at fs/kernfs/dir.c:1508 kernfs_remove_by_name_ns+0x6f/0x80
[   47.943277][ T6650] Modules linked in: loop(E)
[   47.944652][ T6650] CPU: 2 PID: 6650 Comm: a.out Tainted: G        W   E     5.14.0-rc6+ #747
[   47.947858][ T6650] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   47.951643][ T6650] RIP: 0010:kernfs_remove_by_name_ns+0x6f/0x80
[   47.953546][ T6650] Code: bb 38 01 31 c0 5d 41 5c 41 5d c3 48 c7 c7 e0 7d 7d 83 e8 b4 bb 38 01 b8 fe ff ff ff eb e7 48 c7 c7 18 39 2a 83 e8 41 9a 29 01 <0f> 0b b8 fe ff ff ff eb d2 0f 1f 84 00 00 00 00 00 41 57 41 56 41
[   47.959582][ T6650] RSP: 0018:ffffc90001673df0 EFLAGS: 00010286
[   47.961695][ T6650] RAX: 0000000000000000 RBX: ffffffff83804c50 RCX: 0000000000000027
[   47.964495][ T6650] RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff88807cfe78b8
[   47.967082][ T6650] RBP: 0000000000000000 R08: ffff88807cfe78b0 R09: 0000000000000000
[   47.970046][ T6650] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff832b6974
[   47.972639][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   47.975300][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   47.978324][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   47.980753][ T6650] CR2: 00005651cf34e6b8 CR3: 0000000007541000 CR4: 00000000001506e0
[   47.983200][ T6650] Call Trace:
[   47.984185][ T6650]  remove_files.isra.0+0x2b/0x60
[   47.985968][ T6650]  sysfs_remove_group+0x38/0x80
[   47.987600][ T6650]  sysfs_remove_groups+0x24/0x40
[   47.989451][ T6650]  __kobject_del+0x1b/0x80
[   47.990779][ T6650]  kobject_del+0xf/0x20
[   47.992091][ T6650]  blk_integrity_del+0x1d/0x30
[   47.993517][ T6650]  del_gendisk+0x3c/0x1d0
[   47.994813][ T6650]  loop_remove+0x10/0x40 [loop]
[   47.996663][ T6650]  loop_control_ioctl+0x1a1/0x1b0 [loop]
[   47.998397][ T6650]  __x64_sys_ioctl+0x6a/0xa0
[   47.999876][ T6650]  do_syscall_64+0x35/0xb0
[   48.001201][ T6650]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   48.003003][ T6650] RIP: 0033:0x7f2a593f689d
[   48.004386][ T6650] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[   48.010505][ T6650] RSP: 002b:00007ffe1f075b78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   48.013686][ T6650] RAX: ffffffffffffffda RBX: 431bde82d7b634db RCX: 00007f2a593f689d
[   48.016190][ T6650] RDX: 0000000000000001 RSI: 0000000000004c81 RDI: 0000000000000005
[   48.018747][ T6650] RBP: 00007ffe1f075bb0 R08: 0000000000000000 R09: 0000000000000000
[   48.021360][ T6650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe1f075bb0
[   48.023806][ T6650] R13: 000000000000b44f R14: 00007ffe1f075b8c R15: 0000000000000000
[   48.026206][ T6650] irq event stamp: 0
[   48.027547][ T6650] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[   48.030407][ T6650] hardirqs last disabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   48.033184][ T6650] softirqs last  enabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   48.035888][ T6650] softirqs last disabled at (0): [<0000000000000000>] 0x0
[   48.038266][ T6650] ---[ end trace 864118b148d3643d ]---
[   48.039939][ T6650] ------------[ cut here ]------------
[   48.041610][ T6650] kernfs: can not remove 'bdi', no directory
[   48.043416][ T6650] WARNING: CPU: 2 PID: 6650 at fs/kernfs/dir.c:1508 kernfs_remove_by_name_ns+0x6f/0x80
[   48.046747][ T6650] Modules linked in: loop(E)
[   48.048240][ T6650] CPU: 2 PID: 6650 Comm: a.out Tainted: G        W   E     5.14.0-rc6+ #747
[   48.050851][ T6650] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   48.054640][ T6650] RIP: 0010:kernfs_remove_by_name_ns+0x6f/0x80
[   48.056488][ T6650] Code: bb 38 01 31 c0 5d 41 5c 41 5d c3 48 c7 c7 e0 7d 7d 83 e8 b4 bb 38 01 b8 fe ff ff ff eb e7 48 c7 c7 18 39 2a 83 e8 41 9a 29 01 <0f> 0b b8 fe ff ff ff eb d2 0f 1f 84 00 00 00 00 00 41 57 41 56 41
[   48.063272][ T6650] RSP: 0018:ffffc90001673e88 EFLAGS: 00010282
[   48.065255][ T6650] RAX: 0000000000000000 RBX: ffff88800cdb1000 RCX: 0000000000000027
[   48.067812][ T6650] RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff88807cfe78b8
[   48.070366][ T6650] RBP: ffff888035d5d840 R08: ffff88807cfe78b0 R09: 0000000000000000
[   48.072797][ T6650] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff8329031f
[   48.075132][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   48.077741][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   48.080951][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   48.082969][ T6650] CR2: 00005651cf34e6b8 CR3: 0000000007541000 CR4: 00000000001506e0
[   48.085432][ T6650] Call Trace:
[   48.086474][ T6650]  del_gendisk+0x1b3/0x1d0
[   48.087839][ T6650]  loop_remove+0x10/0x40 [loop]
[   48.089397][ T6650]  loop_control_ioctl+0x1a1/0x1b0 [loop]
[   48.091101][ T6650]  __x64_sys_ioctl+0x6a/0xa0
[   48.092474][ T6650]  do_syscall_64+0x35/0xb0
[   48.093839][ T6650]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   48.095824][ T6650] RIP: 0033:0x7f2a593f689d
[   48.097205][ T6650] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[   48.103356][ T6650] RSP: 002b:00007ffe1f075b78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   48.105880][ T6650] RAX: ffffffffffffffda RBX: 431bde82d7b634db RCX: 00007f2a593f689d
[   48.108297][ T6650] RDX: 0000000000000001 RSI: 0000000000004c81 RDI: 0000000000000005
[   48.110806][ T6650] RBP: 00007ffe1f075bb0 R08: 0000000000000000 R09: 0000000000000000
[   48.113462][ T6650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe1f075bb0
[   48.115938][ T6650] R13: 000000000000b44f R14: 00007ffe1f075b8c R15: 0000000000000000
[   48.118411][ T6650] irq event stamp: 0
[   48.119606][ T6650] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[   48.121771][ T6650] hardirqs last disabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   48.124858][ T6650] softirqs last  enabled at (0): [<ffffffff811262d4>] copy_process+0x984/0x2010
[   48.127996][ T6650] softirqs last disabled at (0): [<0000000000000000>] 0x0
[   48.130546][ T6650] ---[ end trace 864118b148d3643e ]---
[   48.132559][ T6650] BUG: kernel NULL pointer dereference, address: 0000000000000110
[   48.135053][ T6650] #PF: supervisor read access in kernel mode
[   48.136835][ T6650] #PF: error_code(0x0000) - not-present page
[   48.138685][ T6650] PGD 1f595067 P4D 1f595067 PUD b382067 PMD 0 
[   48.140520][ T6650] Oops: 0000 [#1] PREEMPT SMP
[   48.141990][ T6650] CPU: 2 PID: 6650 Comm: a.out Tainted: G        W   E     5.14.0-rc6+ #747
[   48.144537][ T6650] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   48.148343][ T6650] RIP: 0010:device_del+0x46/0x410
[   48.150045][ T6650] Code: 48 89 ef 65 48 8b 04 25 28 00 00 00 48 89 44 24 10 31 c0 e8 9c fb 07 01 8b 05 42 94 2d 02 85 c0 0f 85 8c 03 00 00 48 8b 53 48 <0f> b6 82 10 01 00 00 a8 01 75 09 83 c8 01 88 82 10 01 00 00 48 89
[   48.155842][ T6650] RSP: 0018:ffffc90001673e80 EFLAGS: 00010246
[   48.157620][ T6650] RAX: 0000000000000000 RBX: ffff888035d5d880 RCX: 0000000000000000
[   48.160000][ T6650] RDX: 0000000000000000 RSI: ffffffff817736a4 RDI: 0000000000000000
[   48.162767][ T6650] RBP: ffff888035d5d9a0 R08: 0000000000000000 R09: 0000000000000001
[   48.166281][ T6650] R10: ffffc90001673e80 R11: ffff88800e8782b8 R12: 0000000000000000
[   48.168662][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   48.171244][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   48.174117][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   48.176102][ T6650] CR2: 0000000000000110 CR3: 0000000007541000 CR4: 00000000001506e0
[   48.178665][ T6650] Call Trace:
[   48.180000][ T6650]  ? kernfs_remove_by_name_ns+0x5c/0x80
[   48.182079][ T6650]  loop_remove+0x10/0x40 [loop]
[   48.183585][ T6650]  loop_control_ioctl+0x1a1/0x1b0 [loop]
[   48.185286][ T6650]  __x64_sys_ioctl+0x6a/0xa0
[   48.186660][ T6650]  do_syscall_64+0x35/0xb0
[   48.188024][ T6650]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   48.189781][ T6650] RIP: 0033:0x7f2a593f689d
[   48.191199][ T6650] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[   48.197405][ T6650] RSP: 002b:00007ffe1f075b78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   48.199943][ T6650] RAX: ffffffffffffffda RBX: 431bde82d7b634db RCX: 00007f2a593f689d
[   48.202364][ T6650] RDX: 0000000000000001 RSI: 0000000000004c81 RDI: 0000000000000005
[   48.204803][ T6650] RBP: 00007ffe1f075bb0 R08: 0000000000000000 R09: 0000000000000000
[   48.207191][ T6650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe1f075bb0
[   48.209799][ T6650] R13: 000000000000b44f R14: 00007ffe1f075b8c R15: 0000000000000000
[   48.212396][ T6650] Modules linked in: loop(E)
[   48.213774][ T6650] CR2: 0000000000000110
[   48.215095][ T6650] ---[ end trace 864118b148d3643f ]---
[   48.217741][ T6650] RIP: 0010:device_del+0x46/0x410
[   48.220866][ T6650] Code: 48 89 ef 65 48 8b 04 25 28 00 00 00 48 89 44 24 10 31 c0 e8 9c fb 07 01 8b 05 42 94 2d 02 85 c0 0f 85 8c 03 00 00 48 8b 53 48 <0f> b6 82 10 01 00 00 a8 01 75 09 83 c8 01 88 82 10 01 00 00 48 89
[   48.232486][ T6650] RSP: 0018:ffffc90001673e80 EFLAGS: 00010246
[   48.236009][ T6650] RAX: 0000000000000000 RBX: ffff888035d5d880 RCX: 0000000000000000
[   48.241064][ T6650] RDX: 0000000000000000 RSI: ffffffff817736a4 RDI: 0000000000000000
[   48.246055][ T6650] RBP: ffff888035d5d9a0 R08: 0000000000000000 R09: 0000000000000001
[   48.250737][ T6650] R10: ffffc90001673e80 R11: ffff88800e8782b8 R12: 0000000000000000
[   48.253477][ T6650] R13: 0000000000000001 R14: ffff888010cba340 R15: 0000000000000000
[   48.256002][ T6650] FS:  00007f2a594ce580(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000
[   48.258903][ T6650] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   48.261021][ T6650] CR2: 0000000000000110 CR3: 0000000007541000 CR4: 00000000001506e0
[   48.263737][ T6650] Kernel panic - not syncing: Fatal exception
[   48.265651][ T6650] Kernel Offset: disabled
[   48.569511][ T6650] Rebooting in 10 seconds..
----------------------------------------

Your series description is

  this series sorts out lock order reversals involving the block layer
  open_mutex by drastically reducing the scope of loop_ctl_mutex.  To do
  so it first merges the cryptoloop module into the main loop driver
  as the unregistrtion of transfers on live loop devices was causing
  some nasty locking interactions.

but the actual result after your patch is

  Circular locking dependency problem was not fixed at all.
  The kernel crashed in 2 seconds due to NULL pointer dereference.
  syzbot will crash earlier due to add_disk()/del_gendisk() racing

> 
>> That is, you decided not to provide patches which can be backported.
> 
> Please stop this bullshit.
> 

Please stop posting broken series.

 b/drivers/block/Kconfig    |    6 +--
 b/drivers/block/Makefile   |    1
 b/drivers/block/loop.c     |  327 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------------------
 b/drivers/block/loop.h     |   30 +++---------------
 drivers/block/cryptoloop.c |  204 ------------------------------------------------------------------------------------------------------------------------------
 5 files changed, 188 insertions(+), 380 deletions(-)

You need to also read Documentation/process/stable-kernel-rules.rst .

 - It must be obviously correct and tested.
 - It cannot be bigger than 100 lines, with context.
 - It must fix only one thing.
 - It must fix a real bug that bothers people (not a, "This could be a
   problem..." type thing).

Your series is far away from conforming to the stable kernel rules.

My "block: genhd: don't call probe function with major_names_lock held" patch is

 block/genhd.c         | 33 ++++++++++++++++++++++++++-------
 include/linux/genhd.h | 11 +++++++++--
 2 files changed, 35 insertions(+), 9 deletions(-)

without any dependent commit, and my "loop: break loop_ctl_mutex into
loop_idr_spinlock and loop_removal_mutex" patch is

 drivers/block/loop.c | 130 +++++++++++++++++++++++++++++--------------
 1 file changed, 89 insertions(+), 41 deletions(-)

which is border line for stable due to dependent commit.
Therefore, I propose "block: genhd: don't call probe function with major_names_lock held"
for 5.14 and stable, then "loop: break loop_ctl_mutex into loop_idr_spinlock and
loop_removal_mutex" for 5.15, and then your patch (which removes cryptoloop module)
for 5.16 and later.

Please read on the paragraphs after the paragraph which you called it "bullshit".
These are suggestions which helps you to write thechnically correct and safe patches.

If you don't like my patches, please do post patches that are thechnically correct
and safe. Can you do it?


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-19  9:16                   ` Christoph Hellwig
@ 2021-08-19 14:47                     ` Tetsuo Handa
  0 siblings, 0 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-19 14:47 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Greg KH, Jens Axboe, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Tyler Hicks, Pavel Tatashin

On 2021/08/19 18:16, Christoph Hellwig wrote:
> On Wed, Aug 18, 2021 at 11:51:24PM +0900, Tetsuo Handa wrote:
>> The loop module is one of critical components for syzkaller fuzzing.
>> Bugs which involves the loop module prevents syzkaller from finding/testing
>> other bugs. But changes are continuously applied without careful review.
>> Therefore, the changes and code are not getting correct. Stable work cannot
>> come afterwards...
> 
> I know a very simple solution: review them and provide detailed feedback.
> It's not like patches just appear in a tree somewhere.
> 

You don't know, and nobody knows, a simple solution that can apply to my case:

  I'm not your exclusive reviewer, and I'm not familiar with the block layer.
  Actually, I'm not a developer living in block layer, and I'm not subscribed
  to linux-block mailing list. I check bugs reported by syzbot, and analyze
  and try to write patches. I can't afford monitoring patches of all mailing
  lists.

  In order words, I can't review your patches and can't provide detailed feedback
  (unless you cc me with detailed explanation of why it is safe to make such change).

Bugs are merged everywhere but I can't prevent bugs from getting merged. I monitor
bugs reported by syzbot. It is exactly like patches just appear in a tree somewhere.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v4] block: genhd: don't call probe function with major_names_lock held
  2021-08-19 14:23                 ` Tetsuo Handa
@ 2021-08-19 15:10                   ` Greg KH
  0 siblings, 0 replies; 29+ messages in thread
From: Greg KH @ 2021-08-19 15:10 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Christoph Hellwig, Jens Axboe, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Tyler Hicks, Pavel Tatashin

On Thu, Aug 19, 2021 at 11:23:36PM +0900, Tetsuo Handa wrote:
> You need to also read Documentation/process/stable-kernel-rules.rst .
> 
>  - It must be obviously correct and tested.
>  - It cannot be bigger than 100 lines, with context.
>  - It must fix only one thing.
>  - It must fix a real bug that bothers people (not a, "This could be a
>    problem..." type thing).
> 
> Your series is far away from conforming to the stable kernel rules.

Again DO NOT WORRY ABOUT STABLE KERNELS when trying to fix a problem
correctly.  Fix it right, and then, if needed, we can worry about the
stable trees.

The first goal of the stable kernels is to NOT cause extra work for the
upstream kernel developers for those that do not want to care about
them.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v5] block: genhd: don't call probe function with major_names_lock held
  2021-08-18 15:28               ` Greg KH
@ 2021-08-21  6:12                 ` Tetsuo Handa
  0 siblings, 0 replies; 29+ messages in thread
From: Tetsuo Handa @ 2021-08-21  6:12 UTC (permalink / raw)
  To: Greg KH, Jens Axboe
  Cc: Christoph Hellwig, Linus Torvalds, Hannes Reinecke,
	Chaitanya Kulkarni, Hillf Danton, Desmond Cheong Zhi Xi,
	linux-block, Tyler Hicks, Pavel Tatashin

syzbot is reporting circular locking problem at __loop_clr_fd() [1], for
commit a160c6159d4a0cf8 ("block: add an optional probe callback to
major_names") is calling the module's probe function with major_names_lock
held.

When copying content of /proc/devices to another file via sendfile(),

  sb_writers#$N => &p->lock => major_names_lock

dependency is recorded.

When loop_process_work() from WQ context performs a write request,

  (wq_completion)loop$M => (work_completion)&lo->rootcg_work =>
  sb_writers#$N

dependency is recorded.

When flush_workqueue() from drain_workqueue() from destroy_workqueue()
 from __loop_clr_fd() from blkdev_put() from blkdev_close() from __fput()
is called,

  &disk->open_mutex => &lo->lo_mutex => (wq_completion)loop$M

dependency is recorded.

When loop_control_remove() from loop_control_ioctl(LOOP_CTL_REMOVE) is
called,

  loop_ctl_mutex => &lo->lo_mutex

dependency is recorded.

As a result, lockdep thinks that there is

  loop_ctl_mutex => &lo->lo_mutex => (wq_completion)loop$M =>
  (work_completion)&lo->rootcg_work => sb_writers#$N => &p->lock =>
  major_names_lock

dependency chain.

Then, if loop_add() from loop_probe() from blk_request_module() from
blkdev_get_no_open() from blkdev_get_by_dev() from blkdev_open() from
do_dentry_open() from path_openat() from do_filp_open() is called,

  major_names_lock => loop_ctl_mutex

dependency is appended to the dependency chain.

There would be two approaches for breaking this circular dependency.
One is to kill loop_ctl_mutex => &lo->lo_mutex chain. The other is to
kill major_names_lock => loop_ctl_mutex chain. This patch implements
the latter, due to the following reasons.

 (1) sb_writers#$N => &p->lock => major_names_lock chain is unavoidable

 (2) this patch can also fix similar problem in other modules [2] which
     is caused by calling the probe function with major_names_lock held

 (3) I believe that this patch is principally safer than e.g.
     commit bd5c39edad535d9f ("loop: reduce loop_ctl_mutex coverage in
     loop_exit") which waits until the probe function finishes using
     global mutex in order to fix deadlock reproducible by sleep
     injection [3]

This patch adds THIS_MODULE parameter to __register_blkdev() as with
usb_register(), and drops major_names_lock before calling probe function
by holding a reference to that module which contains that probe function.

Link: https://syzkaller.appspot.com/bug?id=7bb10e8b62f83e4d445cdf4c13d69e407e629558 [1]
Link: https://syzkaller.appspot.com/bug?id=7bd106c28e846d1023d4ca915718b1a0905444cb [2]
Link: https://lkml.kernel.org/r/c4edf07f-92e1-a350-2743-f0b0234a2b6c@i-love.sakura.ne.jp [3]
Reported-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
Reported-by: syzbot <syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+f61766d5763f9e7a118f@syzkaller.appspotmail.com>
Fixes: a160c6159d4a0cf8 ("block: add an optional probe callback to major_names")
---
Changes in v5:
  Repost of v4, with history added.

  We should admit that the locking dependency complication caused by
  a160c6159d4a0cf8 is currently an untamable beast, for we can't allocate
  resources for finding approaches that can reduce the locking complexity
  of all individual modules using probe function.

  Keeping this locking dependency problem unresolved degrades quality of
  future kernels by "patches are merged and kernels are released without
  testing, for it is impossible to test due to this locking dependency".
  I'm not worrying about only stable kernels. Stopping this untamable
  beast immediately is important for "releasing tested kernels".

  As an example which kills loop_ctl_mutex => &lo->lo_mutex chain, I wrote
  "[PATCH] loop: break loop_ctl_mutex into loop_idr_spinlock and
  loop_removal_mutex"
  ( https://lkml.kernel.org/r/2901b9c2-f798-413e-4073-451259718288@i-love.sakura.ne.jp ).
  However, this approach is not considered as reducing the locking
  complexity
  ( https://lkml.kernel.org/r/20210819091941.GB12883@lst.de ).
  Then, what different approach is possible? We are run out of ideas
  which can stop this untamable beast from the loop module side.

  I consider that "making it possible to release tested kernels" (instead
  of leaving this problem by refusing to convert from multi-purposed
  long-held single lock into single-purposed short-held two locks) is
  also "Fix it right". But if we can't add new locks, we have no choice
  but addressing this problem in the common path.

  Note that "[PATCH v5] block: genhd: don't call probe function with
  major_names_lock held" is easily revetrtable because this patch can
  fix the locking dependency without adding new locks and without any
  dependent commits. We can revert this patch after we succeeded in making
  locking dependency of all individual modules using probe function
  simple enough.

Changes in v4:
  Use a macro and an inline function for transparently passing THIS_MODULE
  argument to all __register_blkdev() users, as a response to "avoid
  directly modifying __register_blkdev() users" feedback.

Changes in v3:
  Directly add THIS_MODULE argument to all __register_blkdev() users,
  as a response to "avoid use of ____ naming scheme" feedback.

Changes in v2:
  Changed the strategy for addressing this problem from individual modules
  to common path. Chose ____ naming scheme for transparently passing
  THIS_MODULE argument.
---
 block/genhd.c         | 33 ++++++++++++++++++++++++++-------
 include/linux/genhd.h | 11 +++++++++--
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 298ee78c1bda..a3e2e5666457 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -162,6 +162,7 @@ static struct blk_major_name {
 	int major;
 	char name[16];
 	void (*probe)(dev_t devt);
+	struct module *owner;
 } *major_names[BLKDEV_MAJOR_HASH_SIZE];
 static DEFINE_MUTEX(major_names_lock);
 
@@ -190,7 +191,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
  * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
  *         @major = 0, try to allocate any unused major number.
  * @name: the name of the new block device as a zero terminated string
- * @probe: allback that is called on access to any minor number of @major
+ * @probe: callback that is called on access to any minor number of @major
+ * @owner: THIS_MODULE if @probe is not NULL, ignored if @probe is NULL.
  *
  * The @name must be unique within the system.
  *
@@ -207,8 +209,9 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
  *
  * Use register_blkdev instead for any new code.
  */
+#undef __register_blkdev
 int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt))
+		      void (*probe)(dev_t devt), struct module *owner)
 {
 	struct blk_major_name **n, *p;
 	int index, ret = 0;
@@ -248,6 +251,7 @@ int __register_blkdev(unsigned int major, const char *name,
 
 	p->major = major;
 	p->probe = probe;
+	p->owner = owner;
 	strlcpy(p->name, name, sizeof(p->name));
 	p->next = NULL;
 	index = major_to_index(major);
@@ -653,14 +657,29 @@ void blk_request_module(dev_t devt)
 {
 	unsigned int major = MAJOR(devt);
 	struct blk_major_name **n;
+	void (*probe_fn)(dev_t devt);
 
 	mutex_lock(&major_names_lock);
 	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
-		if ((*n)->major == major && (*n)->probe) {
-			(*n)->probe(devt);
-			mutex_unlock(&major_names_lock);
-			return;
-		}
+		if ((*n)->major != major || !(*n)->probe)
+			continue;
+		if (!try_module_get((*n)->owner))
+			break;
+		/*
+		 * Calling probe function with major_names_lock held causes
+		 * circular locking dependency problem. Thus, call it after
+		 * releasing major_names_lock.
+		 */
+		probe_fn = (*n)->probe;
+		mutex_unlock(&major_names_lock);
+		/*
+		 * Assuming that unregister_blkdev() is called from module's
+		 * __exit function, a module refcount taken above allows us
+		 * to safely call probe function without major_names_lock held.
+		 */
+		probe_fn(devt);
+		module_put((*n)->owner);
+		return;
 	}
 	mutex_unlock(&major_names_lock);
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 13b34177cc85..2ed856616d47 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -303,9 +303,16 @@ struct gendisk *__blk_alloc_disk(int node);
 void blk_cleanup_disk(struct gendisk *disk);
 
 int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt));
+		      void (*probe)(dev_t devt), struct module *owner);
+static inline int register_blkdev_with_probe(unsigned int major, const char *name,
+					     void (*probe)(dev_t devt), struct module *owner)
+{
+	return __register_blkdev(major, name, probe, owner);
+}
+#define __register_blkdev(major, name, probe) \
+	register_blkdev_with_probe(major, name, probe, THIS_MODULE)
 #define register_blkdev(major, name) \
-	__register_blkdev(major, name, NULL)
+	register_blkdev_with_probe(major, name, NULL, NULL)
 void unregister_blkdev(unsigned int major, const char *name);
 
 bool bdev_check_media_change(struct block_device *bdev);
-- 
2.18.4



^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2021-08-21  6:12 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-19  1:05 [PATCH v2] block: genhd: don't call probe function with major_names_lock held Tetsuo Handa
2021-06-19  3:24 ` kernel test robot
2021-06-19  6:14 ` kernel test robot
2021-06-19  6:44 ` Greg KH
2021-06-19  8:47   ` Tetsuo Handa
     [not found]   ` <20210620024403.820-1-hdanton@sina.com>
2021-06-20 13:54     ` Tetsuo Handa
2021-06-21  8:54       ` Greg KH
2021-06-21  6:18 ` Christoph Hellwig
2021-08-15  6:52 ` [PATCH v3] " Tetsuo Handa
2021-08-15  7:06   ` Greg KH
2021-08-15  7:49     ` Tetsuo Handa
2021-08-15  9:19       ` Greg KH
2021-08-18 11:07         ` [PATCH v4] " Tetsuo Handa
2021-08-18 13:27           ` Greg KH
2021-08-18 14:44             ` Tetsuo Handa
2021-08-18 15:28               ` Greg KH
2021-08-21  6:12                 ` [PATCH v5] " Tetsuo Handa
2021-08-18 13:47           ` [PATCH v4] " Christoph Hellwig
2021-08-18 14:34             ` Tetsuo Handa
2021-08-18 14:41               ` Greg KH
2021-08-18 14:51                 ` Tetsuo Handa
2021-08-19  9:16                   ` Christoph Hellwig
2021-08-19 14:47                     ` Tetsuo Handa
2021-08-19  9:19               ` Christoph Hellwig
2021-08-19 14:23                 ` Tetsuo Handa
2021-08-19 15:10                   ` Greg KH
2021-08-16  7:33   ` [PATCH v3] " Christoph Hellwig
2021-08-16 14:44     ` Tetsuo Handa
     [not found]     ` <20210817081045.3609-1-hdanton@sina.com>
2021-08-17 10:18       ` Tetsuo Handa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).