All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Mikulas Patocka <mpatocka@redhat.com>,
	Mike Snitzer <snitzer@redhat.com>
Subject: [PATCH 3.4 10/24] dm sysfs: fix a module unload race
Date: Tue, 18 Feb 2014 14:46:57 -0800	[thread overview]
Message-ID: <20140218224550.520789201@linuxfoundation.org> (raw)
In-Reply-To: <20140218224550.221535225@linuxfoundation.org>

3.4-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Mikulas Patocka <mpatocka@redhat.com>

commit 2995fa78e423d7193f3b57835f6c1c75006a0315 upstream.

This reverts commit be35f48610 ("dm: wait until embedded kobject is
released before destroying a device") and provides an improved fix.

The kobject release code that calls the completion must be placed in a
non-module file, otherwise there is a module unload race (if the process
calling dm_kobject_release is preempted and the DM module unloaded after
the completion is triggered, but before dm_kobject_release returns).

To fix this race, this patch moves the completion code to dm-builtin.c
which is always compiled directly into the kernel if BLK_DEV_DM is
selected.

The patch introduces a new dm_kobject_holder structure; its purpose is
to keep the completion and kobject in one place, so that they can be
accessed from non-module code without the need to export the layout of
struct mapped_device to that code.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 drivers/md/Kconfig      |    4 +++
 drivers/md/Makefile     |    1 
 drivers/md/dm-builtin.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-sysfs.c   |    5 ----
 drivers/md/dm.c         |   26 ++++--------------------
 drivers/md/dm.h         |   17 +++++++++++++++-
 6 files changed, 76 insertions(+), 27 deletions(-)

--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -185,8 +185,12 @@ config MD_FAULTY
 
 	  In unsure, say N.
 
+config BLK_DEV_DM_BUILTIN
+	boolean
+
 config BLK_DEV_DM
 	tristate "Device mapper support"
+	select BLK_DEV_DM_BUILTIN
 	---help---
 	  Device-mapper is a low level volume manager.  It works by allowing
 	  people to specify mappings for ranges of logical sectors.  Various
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_MD_MULTIPATH)	+= multipath.
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
+obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
 obj-$(CONFIG_DM_BUFIO)		+= dm-bufio.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
--- /dev/null
+++ b/drivers/md/dm-builtin.c
@@ -0,0 +1,50 @@
+#include "dm.h"
+
+#include <linux/export.h>
+
+/*
+ * The kobject release method must not be placed in the module itself,
+ * otherwise we are subject to module unload races.
+ *
+ * The release method is called when the last reference to the kobject is
+ * dropped. It may be called by any other kernel code that drops the last
+ * reference.
+ *
+ * The release method suffers from a module unload race. We may prevent the
+ * module from being unloaded at the start of the release method (using
+ * increased module reference count or synchronizing against the release
+ * method), however there is no way to prevent the module from being
+ * unloaded at the end of the release method.
+ *
+ * If this code were placed in the dm module, the following race may
+ * happen:
+ *  1. Some other process takes a reference to dm kobject
+ *  2. The user issues ioctl function to unload the dm device
+ *  3. dm_sysfs_exit calls kobject_put, however the object is not released
+ *     because of the other reference taken at step 1
+ *  4. dm_sysfs_exit waits on the completion
+ *  5. The other process that took the reference in step 1 drops it,
+ *     dm_kobject_release is called from this process
+ *  6. dm_kobject_release calls complete()
+ *  7. a reschedule happens before dm_kobject_release returns
+ *  8. dm_sysfs_exit continues, the dm device is unloaded, module reference
+ *     count is decremented
+ *  9. The user unloads the dm module
+ * 10. The other process that was rescheduled in step 7 continues to run,
+ *     it is now executing code in unloaded module, so it crashes
+ *
+ * Note that if the process that takes the foreign reference to dm kobject
+ * has a low priority and the system is sufficiently loaded with
+ * higher-priority processes that prevent the low-priority process from
+ * being scheduled long enough, this bug may really happen.
+ *
+ * In order to fix this module unload race, we place the release method
+ * into helper code that is compiled directly into the kernel.
+ */
+
+void dm_kobject_release(struct kobject *kobj)
+{
+	complete(dm_get_completion_from_kobject(kobj));
+}
+
+EXPORT_SYMBOL(dm_kobject_release);
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -79,11 +79,6 @@ static const struct sysfs_ops dm_sysfs_o
 	.show	= dm_attr_show,
 };
 
-static void dm_kobject_release(struct kobject *kobj)
-{
-	complete(dm_get_completion_from_kobject(kobj));
-}
-
 /*
  * dm kobject is embedded in mapped_device structure
  * no need to define release function here
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -191,11 +191,8 @@ struct mapped_device {
 	/* forced geometry settings */
 	struct hd_geometry geometry;
 
-	/* sysfs handle */
-	struct kobject kobj;
-
-	/* wait until the kobject is released */
-	struct completion kobj_completion;
+	/* kobject and completion */
+	struct dm_kobject_holder kobj_holder;
 
 	/* zero-length flush that will be cloned and submitted to targets */
 	struct bio flush_bio;
@@ -1894,7 +1891,7 @@ static struct mapped_device *alloc_dev(i
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);
-	init_completion(&md->kobj_completion);
+	init_completion(&md->kobj_holder.completion);
 
 	md->disk->major = _major;
 	md->disk->first_minor = minor;
@@ -2686,20 +2683,14 @@ struct gendisk *dm_disk(struct mapped_de
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
-	return &md->kobj;
+	return &md->kobj_holder.kobj;
 }
 
-/*
- * struct mapped_device should not be exported outside of dm.c
- * so use this check to verify that kobj is part of md structure
- */
 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
 {
 	struct mapped_device *md;
 
-	md = container_of(kobj, struct mapped_device, kobj);
-	if (&md->kobj != kobj)
-		return NULL;
+	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
 
 	if (test_bit(DMF_FREEING, &md->flags) ||
 	    dm_deleting_md(md))
@@ -2709,13 +2700,6 @@ struct mapped_device *dm_get_from_kobjec
 	return md;
 }
 
-struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
-{
-	struct mapped_device *md = container_of(kobj, struct mapped_device, kobj);
-
-	return &md->kobj_completion;
-}
-
 int dm_suspended_md(struct mapped_device *md)
 {
 	return test_bit(DMF_SUSPENDED, &md->flags);
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -16,6 +16,7 @@
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
 #include <linux/completion.h>
+#include <linux/kobject.h>
 
 /*
  * Suspend feature flags
@@ -120,11 +121,25 @@ void dm_interface_exit(void);
 /*
  * sysfs interface
  */
+struct dm_kobject_holder {
+	struct kobject kobj;
+	struct completion completion;
+};
+
+static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
+{
+	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
+}
+
 int dm_sysfs_init(struct mapped_device *md);
 void dm_sysfs_exit(struct mapped_device *md);
 struct kobject *dm_kobject(struct mapped_device *md);
 struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
-struct completion *dm_get_completion_from_kobject(struct kobject *kobj);
+
+/*
+ * The kobject helper
+ */
+void dm_kobject_release(struct kobject *kobj);
 
 /*
  * Targets for linear and striped mappings



  parent reply	other threads:[~2014-02-18 23:36 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-02-18 22:46 [PATCH 3.4 00/24] 3.4.81-stable review Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 01/24] SELinux: Fix kernel BUG on empty security contexts Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 02/24] mm: __set_page_dirty_nobuffers() uses spin_lock_irqsave() instead of spin_lock_irq() Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 03/24] mm: __set_page_dirty uses spin_lock_irqsave instead of spin_lock_irq Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 04/24] x86, hweight: Fix BUG when booting with CONFIG_GCOV_PROFILE_ALL=y Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 05/24] printk: Fix scheduling-while-atomic problem in console_cpu_notify() Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 06/24] ext4: protect group inode free counting with group lock Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 07/24] drm/i915: kick any firmware framebuffers before claiming the gtt Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 08/24] mm/page_alloc.c: remove pageblock_default_order() Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 09/24] mm: setup pageblock_order before its used by sparsemem Greg Kroah-Hartman
2014-02-18 22:46 ` Greg Kroah-Hartman [this message]
2014-02-18 22:46 ` [PATCH 3.4 11/24] ftrace: Synchronize setting function_trace_op with ftrace_trace_function Greg Kroah-Hartman
2014-02-18 22:46 ` [PATCH 3.4 12/24] ftrace: Fix synchronization location disabling and freeing ftrace_ops Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 13/24] ftrace: Have function graph only trace based on global_ops filters Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 14/24] sched/nohz: Fix rq->cpu_load[] calculations Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 15/24] sched/nohz: Fix rq->cpu_load calculations some more Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 16/24] IB/qib: Convert qib_user_sdma_pin_pages() to use get_user_pages_fast() Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 17/24] target/file: Use O_DSYNC by default for FILEIO backends Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 18/24] target/file: Re-enable optional fd_buffered_io=1 operation Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 19/24] KVM: Fix buffer overflow in kvm_set_irq() Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 20/24] PM / Hibernate: Hibernate/thaw fixes/improvements Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 21/24] Input: synaptics - handle out of bounds values from the hardware Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 22/24] virtio-blk: Use block layer provided spinlock Greg Kroah-Hartman
2014-02-18 22:47   ` Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 23/24] lib/vsprintf.c: kptr_restrict: fix pK-error in SysRq show-all-timers(Q) Greg Kroah-Hartman
2014-02-18 22:47 ` [PATCH 3.4 24/24] nfs: tear down caches in nfs_init_writepagecache when allocation fails Greg Kroah-Hartman
2014-02-19  4:26 ` [PATCH 3.4 00/24] 3.4.81-stable review Guenter Roeck
2014-02-20  0:03 ` Shuah Khan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140218224550.520789201@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mpatocka@redhat.com \
    --cc=snitzer@redhat.com \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.