linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2 v3] nbd: fix possible page fault for nbd disk
@ 2019-08-22  7:59 xiubli
  2019-08-22  7:59 ` [PATCH 1/2 v3] nbd: rename the runtime flags as NBD_RT_ prefixed xiubli
  2019-08-22  7:59 ` [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk xiubli
  0 siblings, 2 replies; 8+ messages in thread
From: xiubli @ 2019-08-22  7:59 UTC (permalink / raw)
  To: josef, axboe; +Cc: mchristi, linux-block, nbd, linux-kernel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

V3:
- fix the case that when the NBD_CFLAG_DESTROY_ON_DISCONNECT bit is not
  set.
- add "nbd: rename the runtime flags as NBD_RT_ prefixed"


Xiubo Li (2):
  nbd: rename the runtime flags as NBD_RT_ prefixed
  nbd: fix possible page fault for nbd disk

 drivers/block/nbd.c | 108 +++++++++++++++++++++++++++++---------------
 1 file changed, 71 insertions(+), 37 deletions(-)

-- 
2.21.0


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/2 v3] nbd: rename the runtime flags as NBD_RT_ prefixed
  2019-08-22  7:59 [PATCH 0/2 v3] nbd: fix possible page fault for nbd disk xiubli
@ 2019-08-22  7:59 ` xiubli
  2019-08-22  7:59 ` [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk xiubli
  1 sibling, 0 replies; 8+ messages in thread
From: xiubli @ 2019-08-22  7:59 UTC (permalink / raw)
  To: josef, axboe; +Cc: mchristi, linux-block, nbd, linux-kernel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

Preparing for the destory when disconnecting crash fixing.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 drivers/block/nbd.c | 74 ++++++++++++++++++++++-----------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index e21d2ded732b..8c2f17b99224 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -71,14 +71,14 @@ struct link_dead_args {
 	int index;
 };
 
-#define NBD_TIMEDOUT			0
-#define NBD_DISCONNECT_REQUESTED	1
-#define NBD_DISCONNECTED		2
-#define NBD_HAS_PID_FILE		3
-#define NBD_HAS_CONFIG_REF		4
-#define NBD_BOUND			5
-#define NBD_DESTROY_ON_DISCONNECT	6
-#define NBD_DISCONNECT_ON_CLOSE 	7
+#define NBD_RT_TIMEDOUT			0
+#define NBD_RT_DISCONNECT_REQUESTED	1
+#define NBD_RT_DISCONNECTED		2
+#define NBD_RT_HAS_PID_FILE		3
+#define NBD_RT_HAS_CONFIG_REF		4
+#define NBD_RT_BOUND			5
+#define NBD_RT_DESTROY_ON_DISCONNECT	6
+#define NBD_RT_DISCONNECT_ON_CLOSE	7
 
 struct nbd_config {
 	u32 flags;
@@ -237,8 +237,8 @@ static void nbd_put(struct nbd_device *nbd)
 
 static int nbd_disconnected(struct nbd_config *config)
 {
-	return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
-		test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+	return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
+		test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
 }
 
 static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
@@ -256,9 +256,9 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 	if (!nsock->dead) {
 		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
 		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
-			if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
+			if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
 					       &nbd->config->runtime_flags)) {
-				set_bit(NBD_DISCONNECTED,
+				set_bit(NBD_RT_DISCONNECTED,
 					&nbd->config->runtime_flags);
 				dev_info(nbd_to_dev(nbd),
 					"Disconnected due to user request.\n");
@@ -332,7 +332,7 @@ static void sock_shutdown(struct nbd_device *nbd)
 
 	if (config->num_connections == 0)
 		return;
-	if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
+	if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return;
 
 	for (i = 0; i < config->num_connections; i++) {
@@ -393,7 +393,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 		dev_err_ratelimited(nbd_to_dev(nbd),
 				    "Connection timed out\n");
 	}
-	set_bit(NBD_TIMEDOUT, &config->runtime_flags);
+	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
 	cmd->status = BLK_STS_IOERR;
 	mutex_unlock(&cmd->lock);
 	sock_shutdown(nbd);
@@ -773,7 +773,7 @@ static int find_fallback(struct nbd_device *nbd, int index)
 	struct nbd_sock *nsock = config->socks[index];
 	int fallback = nsock->fallback_index;
 
-	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return new_index;
 
 	if (config->num_connections <= 1) {
@@ -814,7 +814,7 @@ static int wait_for_reconnect(struct nbd_device *nbd)
 	struct nbd_config *config = nbd->config;
 	if (!config->dead_conn_timeout)
 		return 0;
-	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return 0;
 	return wait_event_timeout(config->conn_wait,
 				  atomic_read(&config->live_connections) > 0,
@@ -947,12 +947,12 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 		return err;
 
 	if (!netlink && !nbd->task_setup &&
-	    !test_bit(NBD_BOUND, &config->runtime_flags))
+	    !test_bit(NBD_RT_BOUND, &config->runtime_flags))
 		nbd->task_setup = current;
 
 	if (!netlink &&
 	    (nbd->task_setup != current ||
-	     test_bit(NBD_BOUND, &config->runtime_flags))) {
+	     test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
 		dev_err(disk_to_dev(nbd->disk),
 			"Device being setup by another task");
 		sockfd_put(sock);
@@ -1031,7 +1031,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
 		mutex_unlock(&nsock->tx_lock);
 		sockfd_put(old);
 
-		clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
+		clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
 
 		/* We take the tx_mutex in an error path in the recv_work, so we
 		 * need to queue_work outside of the tx_mutex.
@@ -1102,7 +1102,7 @@ static int nbd_disconnect(struct nbd_device *nbd)
 	struct nbd_config *config = nbd->config;
 
 	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-	set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
 	send_disconnects(nbd);
 	return 0;
 }
@@ -1121,7 +1121,7 @@ static void nbd_config_put(struct nbd_device *nbd)
 		struct nbd_config *config = nbd->config;
 		nbd_dev_dbg_close(nbd);
 		nbd_size_clear(nbd);
-		if (test_and_clear_bit(NBD_HAS_PID_FILE,
+		if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
 				       &config->runtime_flags))
 			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
 		nbd->task_recv = NULL;
@@ -1175,7 +1175,7 @@ static int nbd_start_device(struct nbd_device *nbd)
 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
 		return error;
 	}
-	set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
+	set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
 
 	nbd_dev_dbg_init(nbd);
 	for (i = 0; i < num_connections; i++) {
@@ -1220,9 +1220,9 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
 	mutex_lock(&nbd->config_lock);
 	nbd_bdev_reset(bdev);
 	/* user requested, ignore socket errors */
-	if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
+	if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
 		ret = 0;
-	if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
+	if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
 		ret = -ETIMEDOUT;
 	return ret;
 }
@@ -1233,7 +1233,7 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
 	sock_shutdown(nbd);
 	__invalidate_device(bdev, true);
 	nbd_bdev_reset(bdev);
-	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
 			       &nbd->config->runtime_flags))
 		nbd_config_put(nbd);
 }
@@ -1324,7 +1324,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 	/* Don't allow ioctl operations on a nbd device that was created with
 	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
 	 */
-	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
 	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
 		error = __nbd_ioctl(bdev, nbd, cmd, arg);
 	else
@@ -1395,7 +1395,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode)
 	struct nbd_device *nbd = disk->private_data;
 	struct block_device *bdev = bdget_disk(disk, 0);
 
-	if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
+	if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
 			bdev->bd_openers == 0)
 		nbd_disconnect_and_put(nbd);
 
@@ -1793,7 +1793,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 		return -ENOMEM;
 	}
 	refcount_set(&nbd->config_refs, 1);
-	set_bit(NBD_BOUND, &config->runtime_flags);
+	set_bit(NBD_RT_BOUND, &config->runtime_flags);
 
 	ret = nbd_genl_size_set(info, nbd);
 	if (ret)
@@ -1815,12 +1815,12 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
-			set_bit(NBD_DESTROY_ON_DISCONNECT,
+			set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 				&config->runtime_flags);
 			put_dev = true;
 		}
 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
-			set_bit(NBD_DISCONNECT_ON_CLOSE,
+			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
 				&config->runtime_flags);
 		}
 	}
@@ -1859,7 +1859,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 out:
 	mutex_unlock(&nbd->config_lock);
 	if (!ret) {
-		set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
+		set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
 		refcount_inc(&nbd->config_refs);
 		nbd_connect_reply(info, nbd->index);
 	}
@@ -1875,7 +1875,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
 	nbd_disconnect(nbd);
 	nbd_clear_sock(nbd);
 	mutex_unlock(&nbd->config_lock);
-	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
 			       &nbd->config->runtime_flags))
 		nbd_config_put(nbd);
 }
@@ -1959,7 +1959,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
 
 	mutex_lock(&nbd->config_lock);
 	config = nbd->config;
-	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
 	    !nbd->task_recv) {
 		dev_err(nbd_to_dev(nbd),
 			"not configured, cannot reconfigure\n");
@@ -1984,20 +1984,20 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
-			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+			if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 					      &config->runtime_flags))
 				put_dev = true;
 		} else {
-			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+			if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 					       &config->runtime_flags))
 				refcount_inc(&nbd->refs);
 		}
 
 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
-			set_bit(NBD_DISCONNECT_ON_CLOSE,
+			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
 					&config->runtime_flags);
 		} else {
-			clear_bit(NBD_DISCONNECT_ON_CLOSE,
+			clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
 					&config->runtime_flags);
 		}
 	}
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk
  2019-08-22  7:59 [PATCH 0/2 v3] nbd: fix possible page fault for nbd disk xiubli
  2019-08-22  7:59 ` [PATCH 1/2 v3] nbd: rename the runtime flags as NBD_RT_ prefixed xiubli
@ 2019-08-22  7:59 ` xiubli
  2019-08-29 23:49   ` Mike Christie
  1 sibling, 1 reply; 8+ messages in thread
From: xiubli @ 2019-08-22  7:59 UTC (permalink / raw)
  To: josef, axboe; +Cc: mchristi, linux-block, nbd, linux-kernel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

When the NBD_CFLAG_DESTROY_ON_DISCONNECT flag is set and at the same
time when the socket is closed due to the server daemon is restarted,
just before the last DISCONNET is totally done if we start a new connection
by using the old nbd_index, there will be crashing randomly, like:

<3>[  110.151949] block nbd1: Receive control failed (result -32)
<1>[  110.152024] BUG: unable to handle page fault for address: 0000058000000840
<1>[  110.152063] #PF: supervisor read access in kernel mode
<1>[  110.152083] #PF: error_code(0x0000) - not-present page
<6>[  110.152094] PGD 0 P4D 0
<4>[  110.152106] Oops: 0000 [#1] SMP PTI
<4>[  110.152120] CPU: 0 PID: 6698 Comm: kworker/u5:1 Kdump: loaded Not tainted 5.3.0-rc4+ #2
<4>[  110.152136] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
<4>[  110.152166] Workqueue: knbd-recv recv_work [nbd]
<4>[  110.152187] RIP: 0010:__dev_printk+0xd/0x67
<4>[  110.152206] Code: 10 e8 c5 fd ff ff 48 8b 4c 24 18 65 48 33 0c 25 28 00 [...]
<4>[  110.152244] RSP: 0018:ffffa41581f13d18 EFLAGS: 00010206
<4>[  110.152256] RAX: ffffa41581f13d30 RBX: ffff96dd7374e900 RCX: 0000000000000000
<4>[  110.152271] RDX: ffffa41581f13d20 RSI: 00000580000007f0 RDI: ffffffff970ec24f
<4>[  110.152285] RBP: ffffa41581f13d80 R08: ffff96dd7fc17908 R09: 0000000000002e56
<4>[  110.152299] R10: ffffffff970ec24f R11: 0000000000000003 R12: ffff96dd7374e900
<4>[  110.152313] R13: 0000000000000000 R14: ffff96dd7374e9d8 R15: ffff96dd6e3b02c8
<4>[  110.152329] FS:  0000000000000000(0000) GS:ffff96dd7fc00000(0000) knlGS:0000000000000000
<4>[  110.152362] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
<4>[  110.152383] CR2: 0000058000000840 CR3: 0000000067cc6002 CR4: 00000000001606f0
<4>[  110.152401] Call Trace:
<4>[  110.152422]  _dev_err+0x6c/0x83
<4>[  110.152435]  nbd_read_stat.cold+0xda/0x578 [nbd]
<4>[  110.152448]  ? __switch_to_asm+0x34/0x70
<4>[  110.152468]  ? __switch_to_asm+0x40/0x70
<4>[  110.152478]  ? __switch_to_asm+0x34/0x70
<4>[  110.152491]  ? __switch_to_asm+0x40/0x70
<4>[  110.152501]  ? __switch_to_asm+0x34/0x70
<4>[  110.152511]  ? __switch_to_asm+0x40/0x70
<4>[  110.152522]  ? __switch_to_asm+0x34/0x70
<4>[  110.152533]  recv_work+0x35/0x9e [nbd]
<4>[  110.152547]  process_one_work+0x19d/0x340
<4>[  110.152558]  worker_thread+0x50/0x3b0
<4>[  110.152568]  kthread+0xfb/0x130
<4>[  110.152577]  ? process_one_work+0x340/0x340
<4>[  110.152609]  ? kthread_park+0x80/0x80
<4>[  110.152637]  ret_from_fork+0x35/0x40

This is very easy to reproduce by running the nbd-runner.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 drivers/block/nbd.c | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 8c2f17b99224..a25b59725c6e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -26,6 +26,7 @@
 #include <linux/ioctl.h>
 #include <linux/mutex.h>
 #include <linux/compiler.h>
+#include <linux/completion.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -80,6 +81,9 @@ struct link_dead_args {
 #define NBD_RT_DESTROY_ON_DISCONNECT	6
 #define NBD_RT_DISCONNECT_ON_CLOSE	7
 
+#define NBD_DESTROY_ON_DISCONNECT	0
+#define NBD_DISCONNECT_REQUESTED	1
+
 struct nbd_config {
 	u32 flags;
 	unsigned long runtime_flags;
@@ -112,6 +116,9 @@ struct nbd_device {
 	struct list_head list;
 	struct task_struct *task_recv;
 	struct task_struct *task_setup;
+
+	struct completion destroy_complete;
+	unsigned long flags;
 };
 
 #define NBD_CMD_REQUEUED	1
@@ -222,6 +229,16 @@ static void nbd_dev_remove(struct nbd_device *nbd)
 		disk->private_data = NULL;
 		put_disk(disk);
 	}
+
+	/*
+	 * Place this in the last just before the nbd is freed to
+	 * make sure that the disk and the related kobject are also
+	 * totally removed to avoid duplicate creation of the same
+	 * one.
+	 */
+	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
+		complete(&nbd->destroy_complete);
+
 	kfree(nbd);
 }
 
@@ -230,8 +247,8 @@ static void nbd_put(struct nbd_device *nbd)
 	if (refcount_dec_and_mutex_lock(&nbd->refs,
 					&nbd_index_mutex)) {
 		idr_remove(&nbd_index_idr, nbd->index);
-		mutex_unlock(&nbd_index_mutex);
 		nbd_dev_remove(nbd);
+		mutex_unlock(&nbd_index_mutex);
 	}
 }
 
@@ -1103,6 +1120,7 @@ static int nbd_disconnect(struct nbd_device *nbd)
 
 	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
 	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
+	set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
 	send_disconnects(nbd);
 	return 0;
 }
@@ -1596,6 +1614,7 @@ static int nbd_dev_add(int index)
 	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
 		BLK_MQ_F_BLOCKING;
 	nbd->tag_set.driver_data = nbd;
+	init_completion(&nbd->destroy_complete);
 
 	err = blk_mq_alloc_tag_set(&nbd->tag_set);
 	if (err)
@@ -1761,6 +1780,16 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 		mutex_unlock(&nbd_index_mutex);
 		return -EINVAL;
 	}
+
+	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
+	    test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
+		mutex_unlock(&nbd_index_mutex);
+
+		/* Wait untill the recv_work exit */
+		wait_for_completion(&nbd->destroy_complete);
+		goto again;
+	}
+
 	if (!refcount_inc_not_zero(&nbd->refs)) {
 		mutex_unlock(&nbd_index_mutex);
 		if (index == -1)
@@ -1817,7 +1846,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
 			set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 				&config->runtime_flags);
+			set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
 			put_dev = true;
+		} else {
+			clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
 		}
 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
 			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
@@ -1987,10 +2019,12 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
 			if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 					      &config->runtime_flags))
 				put_dev = true;
+			set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
 		} else {
 			if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 					       &config->runtime_flags))
 				refcount_inc(&nbd->refs);
+			clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
 		}
 
 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk
  2019-08-22  7:59 ` [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk xiubli
@ 2019-08-29 23:49   ` Mike Christie
  2019-08-30  0:58     ` Xiubo Li
  0 siblings, 1 reply; 8+ messages in thread
From: Mike Christie @ 2019-08-29 23:49 UTC (permalink / raw)
  To: xiubli, josef, axboe; +Cc: linux-block, nbd, linux-kernel

On 08/22/2019 02:59 AM, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> When the NBD_CFLAG_DESTROY_ON_DISCONNECT flag is set and at the same
> time when the socket is closed due to the server daemon is restarted,
> just before the last DISCONNET is totally done if we start a new connection
> by using the old nbd_index, there will be crashing randomly, like:
> 
> <3>[  110.151949] block nbd1: Receive control failed (result -32)
> <1>[  110.152024] BUG: unable to handle page fault for address: 0000058000000840
> <1>[  110.152063] #PF: supervisor read access in kernel mode
> <1>[  110.152083] #PF: error_code(0x0000) - not-present page
> <6>[  110.152094] PGD 0 P4D 0
> <4>[  110.152106] Oops: 0000 [#1] SMP PTI
> <4>[  110.152120] CPU: 0 PID: 6698 Comm: kworker/u5:1 Kdump: loaded Not tainted 5.3.0-rc4+ #2
> <4>[  110.152136] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> <4>[  110.152166] Workqueue: knbd-recv recv_work [nbd]
> <4>[  110.152187] RIP: 0010:__dev_printk+0xd/0x67
> <4>[  110.152206] Code: 10 e8 c5 fd ff ff 48 8b 4c 24 18 65 48 33 0c 25 28 00 [...]
> <4>[  110.152244] RSP: 0018:ffffa41581f13d18 EFLAGS: 00010206
> <4>[  110.152256] RAX: ffffa41581f13d30 RBX: ffff96dd7374e900 RCX: 0000000000000000
> <4>[  110.152271] RDX: ffffa41581f13d20 RSI: 00000580000007f0 RDI: ffffffff970ec24f
> <4>[  110.152285] RBP: ffffa41581f13d80 R08: ffff96dd7fc17908 R09: 0000000000002e56
> <4>[  110.152299] R10: ffffffff970ec24f R11: 0000000000000003 R12: ffff96dd7374e900
> <4>[  110.152313] R13: 0000000000000000 R14: ffff96dd7374e9d8 R15: ffff96dd6e3b02c8
> <4>[  110.152329] FS:  0000000000000000(0000) GS:ffff96dd7fc00000(0000) knlGS:0000000000000000
> <4>[  110.152362] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> <4>[  110.152383] CR2: 0000058000000840 CR3: 0000000067cc6002 CR4: 00000000001606f0
> <4>[  110.152401] Call Trace:
> <4>[  110.152422]  _dev_err+0x6c/0x83
> <4>[  110.152435]  nbd_read_stat.cold+0xda/0x578 [nbd]
> <4>[  110.152448]  ? __switch_to_asm+0x34/0x70
> <4>[  110.152468]  ? __switch_to_asm+0x40/0x70
> <4>[  110.152478]  ? __switch_to_asm+0x34/0x70
> <4>[  110.152491]  ? __switch_to_asm+0x40/0x70
> <4>[  110.152501]  ? __switch_to_asm+0x34/0x70
> <4>[  110.152511]  ? __switch_to_asm+0x40/0x70
> <4>[  110.152522]  ? __switch_to_asm+0x34/0x70
> <4>[  110.152533]  recv_work+0x35/0x9e [nbd]
> <4>[  110.152547]  process_one_work+0x19d/0x340
> <4>[  110.152558]  worker_thread+0x50/0x3b0
> <4>[  110.152568]  kthread+0xfb/0x130
> <4>[  110.152577]  ? process_one_work+0x340/0x340
> <4>[  110.152609]  ? kthread_park+0x80/0x80
> <4>[  110.152637]  ret_from_fork+0x35/0x40
> 
> This is very easy to reproduce by running the nbd-runner.
> 
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  drivers/block/nbd.c | 36 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index 8c2f17b99224..a25b59725c6e 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -26,6 +26,7 @@
>  #include <linux/ioctl.h>
>  #include <linux/mutex.h>
>  #include <linux/compiler.h>
> +#include <linux/completion.h>
>  #include <linux/err.h>
>  #include <linux/kernel.h>
>  #include <linux/slab.h>
> @@ -80,6 +81,9 @@ struct link_dead_args {
>  #define NBD_RT_DESTROY_ON_DISCONNECT	6
>  #define NBD_RT_DISCONNECT_ON_CLOSE	7
>  
> +#define NBD_DESTROY_ON_DISCONNECT	0
> +#define NBD_DISCONNECT_REQUESTED	1
> +
>  struct nbd_config {
>  	u32 flags;
>  	unsigned long runtime_flags;
> @@ -112,6 +116,9 @@ struct nbd_device {
>  	struct list_head list;
>  	struct task_struct *task_recv;
>  	struct task_struct *task_setup;
> +
> +	struct completion destroy_complete;
> +	unsigned long flags;
>  };
>  
>  #define NBD_CMD_REQUEUED	1
> @@ -222,6 +229,16 @@ static void nbd_dev_remove(struct nbd_device *nbd)
>  		disk->private_data = NULL;
>  		put_disk(disk);
>  	}
> +
> +	/*
> +	 * Place this in the last just before the nbd is freed to
> +	 * make sure that the disk and the related kobject are also
> +	 * totally removed to avoid duplicate creation of the same
> +	 * one.
> +	 */
> +	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
> +		complete(&nbd->destroy_complete);
> +
>  	kfree(nbd);
>  }
>  
> @@ -230,8 +247,8 @@ static void nbd_put(struct nbd_device *nbd)
>  	if (refcount_dec_and_mutex_lock(&nbd->refs,
>  					&nbd_index_mutex)) {
>  		idr_remove(&nbd_index_idr, nbd->index);
> -		mutex_unlock(&nbd_index_mutex);
>  		nbd_dev_remove(nbd);
> +		mutex_unlock(&nbd_index_mutex);
>  	}
>  }
>  
> @@ -1103,6 +1120,7 @@ static int nbd_disconnect(struct nbd_device *nbd)
>  
>  	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
>  	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
> +	set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
>  	send_disconnects(nbd);
>  	return 0;
>  }
> @@ -1596,6 +1614,7 @@ static int nbd_dev_add(int index)
>  	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
>  		BLK_MQ_F_BLOCKING;
>  	nbd->tag_set.driver_data = nbd;
> +	init_completion(&nbd->destroy_complete);
>  
>  	err = blk_mq_alloc_tag_set(&nbd->tag_set);
>  	if (err)
> @@ -1761,6 +1780,16 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
>  		mutex_unlock(&nbd_index_mutex);
>  		return -EINVAL;
>  	}
> +
> +	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&

Why does this have to be set? If this is not set would you end up
hitting the config_refs check:

if (refcount_read(&nbd->config_refs)) {

and possibly returning failure?

If you moved the complete() to nbd_config_put would it work if this bit
was set or not?


> +	    test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
> +		mutex_unlock(&nbd_index_mutex);
> +
> +		/* Wait untill the recv_work exit */

If that is all you need we could do a flush_workqueue(nbd->recv_workq)
(you would need Jens's for next branch which has some work queue changes
in nbd).

I think that might be too messy with how we do the puts for the
nbd_device and config and the locking though.


> +		wait_for_completion(&nbd->destroy_complete);

The completion is allocated as part of the nbd device struct. Right
after the other thread does a complete() we will free the nbd device
struct, and we could still access the destroy_complete completion in
this thread in wait_for_completion.

You would want to do DECLARE_COMPLETION_ONSTACK in this function, make
destroy_complete a pointer and set the destroy_complete pointer to the
completion declared in this function.

In nbd_dev_remove you would then just check if destroy_complete is set
to a non-NULL pointer.




> +		goto again;
> +	}
> +
>  	if (!refcount_inc_not_zero(&nbd->refs)) {
>  		mutex_unlock(&nbd_index_mutex);
>  		if (index == -1)
> @@ -1817,7 +1846,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
>  		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
>  			set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
>  				&config->runtime_flags);
> +			set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
>  			put_dev = true;
> +		} else {
> +			clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
>  		}
>  		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
>  			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
> @@ -1987,10 +2019,12 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
>  			if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
>  					      &config->runtime_flags))
>  				put_dev = true;
> +			set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
>  		} else {
>  			if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT,
>  					       &config->runtime_flags))
>  				refcount_inc(&nbd->refs);
> +			clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
>  		}
>  
>  		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk
  2019-08-29 23:49   ` Mike Christie
@ 2019-08-30  0:58     ` Xiubo Li
  2019-08-30  5:22       ` Xiubo Li
  2019-09-02 21:30       ` Mike Christie
  0 siblings, 2 replies; 8+ messages in thread
From: Xiubo Li @ 2019-08-30  0:58 UTC (permalink / raw)
  To: Mike Christie, josef, axboe; +Cc: linux-block, nbd, linux-kernel

On 2019/8/30 7:49, Mike Christie wrote:
> On 08/22/2019 02:59 AM, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> When the NBD_CFLAG_DESTROY_ON_DISCONNECT flag is set and at the same
>> time when the socket is closed due to the server daemon is restarted,
>> just before the last DISCONNET is totally done if we start a new connection
>> by using the old nbd_index, there will be crashing randomly, like:
>>
>> <3>[  110.151949] block nbd1: Receive control failed (result -32)
>> <1>[  110.152024] BUG: unable to handle page fault for address: 0000058000000840
>> <1>[  110.152063] #PF: supervisor read access in kernel mode
>> <1>[  110.152083] #PF: error_code(0x0000) - not-present page
>> <6>[  110.152094] PGD 0 P4D 0
>> <4>[  110.152106] Oops: 0000 [#1] SMP PTI
>> <4>[  110.152120] CPU: 0 PID: 6698 Comm: kworker/u5:1 Kdump: loaded Not tainted 5.3.0-rc4+ #2
>> <4>[  110.152136] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
>> <4>[  110.152166] Workqueue: knbd-recv recv_work [nbd]
>> <4>[  110.152187] RIP: 0010:__dev_printk+0xd/0x67
>> <4>[  110.152206] Code: 10 e8 c5 fd ff ff 48 8b 4c 24 18 65 48 33 0c 25 28 00 [...]
>> <4>[  110.152244] RSP: 0018:ffffa41581f13d18 EFLAGS: 00010206
>> <4>[  110.152256] RAX: ffffa41581f13d30 RBX: ffff96dd7374e900 RCX: 0000000000000000
>> <4>[  110.152271] RDX: ffffa41581f13d20 RSI: 00000580000007f0 RDI: ffffffff970ec24f
>> <4>[  110.152285] RBP: ffffa41581f13d80 R08: ffff96dd7fc17908 R09: 0000000000002e56
>> <4>[  110.152299] R10: ffffffff970ec24f R11: 0000000000000003 R12: ffff96dd7374e900
>> <4>[  110.152313] R13: 0000000000000000 R14: ffff96dd7374e9d8 R15: ffff96dd6e3b02c8
>> <4>[  110.152329] FS:  0000000000000000(0000) GS:ffff96dd7fc00000(0000) knlGS:0000000000000000
>> <4>[  110.152362] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> <4>[  110.152383] CR2: 0000058000000840 CR3: 0000000067cc6002 CR4: 00000000001606f0
>> <4>[  110.152401] Call Trace:
>> <4>[  110.152422]  _dev_err+0x6c/0x83
>> <4>[  110.152435]  nbd_read_stat.cold+0xda/0x578 [nbd]
>> <4>[  110.152448]  ? __switch_to_asm+0x34/0x70
>> <4>[  110.152468]  ? __switch_to_asm+0x40/0x70
>> <4>[  110.152478]  ? __switch_to_asm+0x34/0x70
>> <4>[  110.152491]  ? __switch_to_asm+0x40/0x70
>> <4>[  110.152501]  ? __switch_to_asm+0x34/0x70
>> <4>[  110.152511]  ? __switch_to_asm+0x40/0x70
>> <4>[  110.152522]  ? __switch_to_asm+0x34/0x70
>> <4>[  110.152533]  recv_work+0x35/0x9e [nbd]
>> <4>[  110.152547]  process_one_work+0x19d/0x340
>> <4>[  110.152558]  worker_thread+0x50/0x3b0
>> <4>[  110.152568]  kthread+0xfb/0x130
>> <4>[  110.152577]  ? process_one_work+0x340/0x340
>> <4>[  110.152609]  ? kthread_park+0x80/0x80
>> <4>[  110.152637]  ret_from_fork+0x35/0x40
>>
>> This is very easy to reproduce by running the nbd-runner.
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   drivers/block/nbd.c | 36 +++++++++++++++++++++++++++++++++++-
>>   1 file changed, 35 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
>> index 8c2f17b99224..a25b59725c6e 100644
>> --- a/drivers/block/nbd.c
>> +++ b/drivers/block/nbd.c
>> @@ -26,6 +26,7 @@
>>   #include <linux/ioctl.h>
>>   #include <linux/mutex.h>
>>   #include <linux/compiler.h>
>> +#include <linux/completion.h>
>>   #include <linux/err.h>
>>   #include <linux/kernel.h>
>>   #include <linux/slab.h>
>> @@ -80,6 +81,9 @@ struct link_dead_args {
>>   #define NBD_RT_DESTROY_ON_DISCONNECT	6
>>   #define NBD_RT_DISCONNECT_ON_CLOSE	7
>>   
>> +#define NBD_DESTROY_ON_DISCONNECT	0
>> +#define NBD_DISCONNECT_REQUESTED	1
>> +
>>   struct nbd_config {
>>   	u32 flags;
>>   	unsigned long runtime_flags;
>> @@ -112,6 +116,9 @@ struct nbd_device {
>>   	struct list_head list;
>>   	struct task_struct *task_recv;
>>   	struct task_struct *task_setup;
>> +
>> +	struct completion destroy_complete;
>> +	unsigned long flags;
>>   };
>>   
>>   #define NBD_CMD_REQUEUED	1
>> @@ -222,6 +229,16 @@ static void nbd_dev_remove(struct nbd_device *nbd)
>>   		disk->private_data = NULL;
>>   		put_disk(disk);
>>   	}
>> +
>> +	/*
>> +	 * Place this in the last just before the nbd is freed to
>> +	 * make sure that the disk and the related kobject are also
>> +	 * totally removed to avoid duplicate creation of the same
>> +	 * one.
>> +	 */
>> +	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
>> +		complete(&nbd->destroy_complete);
>> +
>>   	kfree(nbd);
>>   }
>>   
>> @@ -230,8 +247,8 @@ static void nbd_put(struct nbd_device *nbd)
>>   	if (refcount_dec_and_mutex_lock(&nbd->refs,
>>   					&nbd_index_mutex)) {
>>   		idr_remove(&nbd_index_idr, nbd->index);
>> -		mutex_unlock(&nbd_index_mutex);
>>   		nbd_dev_remove(nbd);
>> +		mutex_unlock(&nbd_index_mutex);
>>   	}
>>   }
>>   
>> @@ -1103,6 +1120,7 @@ static int nbd_disconnect(struct nbd_device *nbd)
>>   
>>   	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
>>   	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
>> +	set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
>>   	send_disconnects(nbd);
>>   	return 0;
>>   }
>> @@ -1596,6 +1614,7 @@ static int nbd_dev_add(int index)
>>   	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
>>   		BLK_MQ_F_BLOCKING;
>>   	nbd->tag_set.driver_data = nbd;
>> +	init_completion(&nbd->destroy_complete);
>>   
>>   	err = blk_mq_alloc_tag_set(&nbd->tag_set);
>>   	if (err)
>> @@ -1761,6 +1780,16 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
>>   		mutex_unlock(&nbd_index_mutex);
>>   		return -EINVAL;
>>   	}
>> +
>> +	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
> Why does this have to be set? If this is not set would you end up
> hitting the config_refs check:
>
> if (refcount_read(&nbd->config_refs)) {
>
> and possibly returning failure?

Yeah, this is a good question. Before I have also tried to fix it with 
this, but it still won't work for me.

 From my test cases almost more than 50% times, the crash will be hit in 
the gap just after the nbd->config already been released, and before the 
nbd itself not yet, so the nbd->config_refs will be 0.


>
> If you moved the complete() to nbd_config_put would it work if this bit
> was set or not?

Tried it already, it still won't work.

There is one case that when disconnecting the nbd device, the userspace 
service will do the open()/release()  things, please see [1], and the 
sequence is not the same every time, if the 
NBD_CFLAG_DESTROY_ON_DISCONNECT bit is set the crash still exists.

So sometimes when the nbd_put() called from the nbd_config_put(), the 
&nbd->refs in nbd_put won't be 0, it could be 1. And it will be 0 just 
after the release() is triggered later.

So I just place the complete() before "free(nbd);", or there will be 
another Call trace will be seen very often:

    2489 Aug 20 18:10:04 lxbfd2 kernel: sysfs: cannot create duplicate 
filename '/devices/virtual/block/nbd0'
    2490 Aug 20 18:10:04 lxbfd2 kernel: CPU: 0 PID: 8635 Comm: nbd-clid 
Kdump: loaded Tainted: G      D 5.1.18-300.fc30.x86_64 #1
    2491 Aug 20 18:10:04 lxbfd2 kernel: Hardware name: Red Hat KVM, BIOS 
0.5.1 01/01/2011
    2492 Aug 20 18:10:04 lxbfd2 kernel: Call Trace:
    2493 Aug 20 18:10:04 lxbfd2 kernel: dump_stack+0x5c/0x80
    2494 Aug 20 18:10:04 lxbfd2 kernel: sysfs_warn_dup.cold+0x17/0x2d
    2495 Aug 20 18:10:04 lxbfd2 kernel: sysfs_create_dir_ns+0xb6/0xd0
    2496 Aug 20 18:10:04 lxbfd2 kernel: kobject_add_internal+0xb7/0x280
    2497 Aug 20 18:10:04 lxbfd2 kernel: kobject_add+0x7e/0xb0
    2498 Aug 20 18:10:04 lxbfd2 kernel: ? _cond_resched+0x15/0x30
    2499 Aug 20 18:10:04 lxbfd2 kernel: device_add+0x12b/0x690
    2500 Aug 20 18:10:04 lxbfd2 kernel: __device_add_disk+0x1b5/0x470
    2501 Aug 20 18:10:04 lxbfd2 kernel: nbd_dev_add+0x21d/0x2b0 [nbd]
    2502 Aug 20 18:10:04 lxbfd2 kernel: nbd_genl_connect+0x16e/0x630 [nbd]
    2503 Aug 20 18:10:04 lxbfd2 kernel: genl_family_rcv_msg+0x1a9/0x3b0
    2504 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x35/0x70
    2505 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x41/0x70
    2506 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to+0x11f/0x4c0
    2507 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x41/0x70
    [...]


[1] https://lkml.org/lkml/2019/5/29/125


>
>> +	    test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
>> +		mutex_unlock(&nbd_index_mutex);
>> +
>> +		/* Wait untill the recv_work exit */
> If that is all you need we could do a flush_workqueue(nbd->recv_workq)
> (you would need Jens's for next branch which has some work queue changes
> in nbd).

Yeah, this makes sense.

> I think that might be too messy with how we do the puts for the
> nbd_device and config and the locking though.
>
>
>> +		wait_for_completion(&nbd->destroy_complete);
> The completion is allocated as part of the nbd device struct. Right
> after the other thread does a complete() we will free the nbd device
> struct, and we could still access the destroy_complete completion in
> this thread in wait_for_completion.
>
> You would want to do DECLARE_COMPLETION_ONSTACK in this function, make
> destroy_complete a pointer and set the destroy_complete pointer to the
> completion declared in this function.
>
> In nbd_dev_remove you would then just check if destroy_complete is set
> to a non-NULL pointer.

Hmm, good catch.

Will fix it.

Thanks Mike.
BRs
Xiubo


>
>
>
>> +		goto again;
>> +	}
>> +
>>   	if (!refcount_inc_not_zero(&nbd->refs)) {
>>   		mutex_unlock(&nbd_index_mutex);
>>   		if (index == -1)
>> @@ -1817,7 +1846,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
>>   		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
>>   			set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
>>   				&config->runtime_flags);
>> +			set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
>>   			put_dev = true;
>> +		} else {
>> +			clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
>>   		}
>>   		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
>>   			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
>> @@ -1987,10 +2019,12 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
>>   			if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
>>   					      &config->runtime_flags))
>>   				put_dev = true;
>> +			set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
>>   		} else {
>>   			if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT,
>>   					       &config->runtime_flags))
>>   				refcount_inc(&nbd->refs);
>> +			clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
>>   		}
>>   
>>   		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
>>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk
  2019-08-30  0:58     ` Xiubo Li
@ 2019-08-30  5:22       ` Xiubo Li
  2019-09-02 21:30       ` Mike Christie
  1 sibling, 0 replies; 8+ messages in thread
From: Xiubo Li @ 2019-08-30  5:22 UTC (permalink / raw)
  To: Mike Christie, josef, axboe; +Cc: linux-block, nbd, linux-kernel

On 2019/8/30 8:58, Xiubo Li wrote:
> On 2019/8/30 7:49, Mike Christie wrote:
>> On 08/22/2019 02:59 AM, xiubli@redhat.com wrote:
[...]
>>
>>> + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
>>> +        mutex_unlock(&nbd_index_mutex);
>>> +
>>> +        /* Wait untill the recv_work exit */
>> If that is all you need we could do a flush_workqueue(nbd->recv_workq)
>> (you would need Jens's for next branch which has some work queue changes
>> in nbd).
>
> Yeah, this makes sense.

This has already been done in nbd_disconnect_and_put() in the Jen's for 
next branch code. So here it will make no sense.

I will rebase this patch set to that branch.

Thanks.
BRs
Xiubo

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk
  2019-08-30  0:58     ` Xiubo Li
  2019-08-30  5:22       ` Xiubo Li
@ 2019-09-02 21:30       ` Mike Christie
  2019-09-03  0:45         ` Xiubo Li
  1 sibling, 1 reply; 8+ messages in thread
From: Mike Christie @ 2019-09-02 21:30 UTC (permalink / raw)
  To: Xiubo Li, josef, axboe; +Cc: linux-block, nbd, linux-kernel

On 08/29/2019 07:58 PM, Xiubo Li wrote:
> On 2019/8/30 7:49, Mike Christie wrote:
>> On 08/22/2019 02:59 AM, xiubli@redhat.com wrote:
>>> From: Xiubo Li <xiubli@redhat.com>
>>>
>>> When the NBD_CFLAG_DESTROY_ON_DISCONNECT flag is set and at the same
>>> time when the socket is closed due to the server daemon is restarted,
>>> just before the last DISCONNET is totally done if we start a new
>>> connection
>>> by using the old nbd_index, there will be crashing randomly, like:
>>>
>>> <3>[  110.151949] block nbd1: Receive control failed (result -32)
>>> <1>[  110.152024] BUG: unable to handle page fault for address:
>>> 0000058000000840
>>> <1>[  110.152063] #PF: supervisor read access in kernel mode
>>> <1>[  110.152083] #PF: error_code(0x0000) - not-present page
>>> <6>[  110.152094] PGD 0 P4D 0
>>> <4>[  110.152106] Oops: 0000 [#1] SMP PTI
>>> <4>[  110.152120] CPU: 0 PID: 6698 Comm: kworker/u5:1 Kdump: loaded
>>> Not tainted 5.3.0-rc4+ #2
>>> <4>[  110.152136] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
>>> <4>[  110.152166] Workqueue: knbd-recv recv_work [nbd]
>>> <4>[  110.152187] RIP: 0010:__dev_printk+0xd/0x67
>>> <4>[  110.152206] Code: 10 e8 c5 fd ff ff 48 8b 4c 24 18 65 48 33 0c
>>> 25 28 00 [...]
>>> <4>[  110.152244] RSP: 0018:ffffa41581f13d18 EFLAGS: 00010206
>>> <4>[  110.152256] RAX: ffffa41581f13d30 RBX: ffff96dd7374e900 RCX:
>>> 0000000000000000
>>> <4>[  110.152271] RDX: ffffa41581f13d20 RSI: 00000580000007f0 RDI:
>>> ffffffff970ec24f
>>> <4>[  110.152285] RBP: ffffa41581f13d80 R08: ffff96dd7fc17908 R09:
>>> 0000000000002e56
>>> <4>[  110.152299] R10: ffffffff970ec24f R11: 0000000000000003 R12:
>>> ffff96dd7374e900
>>> <4>[  110.152313] R13: 0000000000000000 R14: ffff96dd7374e9d8 R15:
>>> ffff96dd6e3b02c8
>>> <4>[  110.152329] FS:  0000000000000000(0000)
>>> GS:ffff96dd7fc00000(0000) knlGS:0000000000000000
>>> <4>[  110.152362] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> <4>[  110.152383] CR2: 0000058000000840 CR3: 0000000067cc6002 CR4:
>>> 00000000001606f0
>>> <4>[  110.152401] Call Trace:
>>> <4>[  110.152422]  _dev_err+0x6c/0x83
>>> <4>[  110.152435]  nbd_read_stat.cold+0xda/0x578 [nbd]
>>> <4>[  110.152448]  ? __switch_to_asm+0x34/0x70
>>> <4>[  110.152468]  ? __switch_to_asm+0x40/0x70
>>> <4>[  110.152478]  ? __switch_to_asm+0x34/0x70
>>> <4>[  110.152491]  ? __switch_to_asm+0x40/0x70
>>> <4>[  110.152501]  ? __switch_to_asm+0x34/0x70
>>> <4>[  110.152511]  ? __switch_to_asm+0x40/0x70
>>> <4>[  110.152522]  ? __switch_to_asm+0x34/0x70
>>> <4>[  110.152533]  recv_work+0x35/0x9e [nbd]
>>> <4>[  110.152547]  process_one_work+0x19d/0x340
>>> <4>[  110.152558]  worker_thread+0x50/0x3b0
>>> <4>[  110.152568]  kthread+0xfb/0x130
>>> <4>[  110.152577]  ? process_one_work+0x340/0x340
>>> <4>[  110.152609]  ? kthread_park+0x80/0x80
>>> <4>[  110.152637]  ret_from_fork+0x35/0x40
>>>
>>> This is very easy to reproduce by running the nbd-runner.
>>>
>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>> ---
>>>   drivers/block/nbd.c | 36 +++++++++++++++++++++++++++++++++++-
>>>   1 file changed, 35 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
>>> index 8c2f17b99224..a25b59725c6e 100644
>>> --- a/drivers/block/nbd.c
>>> +++ b/drivers/block/nbd.c
>>> @@ -26,6 +26,7 @@
>>>   #include <linux/ioctl.h>
>>>   #include <linux/mutex.h>
>>>   #include <linux/compiler.h>
>>> +#include <linux/completion.h>
>>>   #include <linux/err.h>
>>>   #include <linux/kernel.h>
>>>   #include <linux/slab.h>
>>> @@ -80,6 +81,9 @@ struct link_dead_args {
>>>   #define NBD_RT_DESTROY_ON_DISCONNECT    6
>>>   #define NBD_RT_DISCONNECT_ON_CLOSE    7
>>>   +#define NBD_DESTROY_ON_DISCONNECT    0
>>> +#define NBD_DISCONNECT_REQUESTED    1
>>> +
>>>   struct nbd_config {
>>>       u32 flags;
>>>       unsigned long runtime_flags;
>>> @@ -112,6 +116,9 @@ struct nbd_device {
>>>       struct list_head list;
>>>       struct task_struct *task_recv;
>>>       struct task_struct *task_setup;
>>> +
>>> +    struct completion destroy_complete;
>>> +    unsigned long flags;
>>>   };
>>>     #define NBD_CMD_REQUEUED    1
>>> @@ -222,6 +229,16 @@ static void nbd_dev_remove(struct nbd_device *nbd)
>>>           disk->private_data = NULL;
>>>           put_disk(disk);
>>>       }
>>> +
>>> +    /*
>>> +     * Place this in the last just before the nbd is freed to
>>> +     * make sure that the disk and the related kobject are also
>>> +     * totally removed to avoid duplicate creation of the same
>>> +     * one.
>>> +     */
>>> +    if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
>>> +        complete(&nbd->destroy_complete);
>>> +
>>>       kfree(nbd);
>>>   }
>>>   @@ -230,8 +247,8 @@ static void nbd_put(struct nbd_device *nbd)
>>>       if (refcount_dec_and_mutex_lock(&nbd->refs,
>>>                       &nbd_index_mutex)) {
>>>           idr_remove(&nbd_index_idr, nbd->index);
>>> -        mutex_unlock(&nbd_index_mutex);
>>>           nbd_dev_remove(nbd);
>>> +        mutex_unlock(&nbd_index_mutex);
>>>       }
>>>   }
>>>   @@ -1103,6 +1120,7 @@ static int nbd_disconnect(struct nbd_device
>>> *nbd)
>>>         dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
>>>       set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
>>> +    set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
>>>       send_disconnects(nbd);
>>>       return 0;
>>>   }
>>> @@ -1596,6 +1614,7 @@ static int nbd_dev_add(int index)
>>>       nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
>>>           BLK_MQ_F_BLOCKING;
>>>       nbd->tag_set.driver_data = nbd;
>>> +    init_completion(&nbd->destroy_complete);
>>>         err = blk_mq_alloc_tag_set(&nbd->tag_set);
>>>       if (err)
>>> @@ -1761,6 +1780,16 @@ static int nbd_genl_connect(struct sk_buff
>>> *skb, struct genl_info *info)
>>>           mutex_unlock(&nbd_index_mutex);
>>>           return -EINVAL;
>>>       }
>>> +
>>> +    if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
>> Why does this have to be set? If this is not set would you end up
>> hitting the config_refs check:
>>
>> if (refcount_read(&nbd->config_refs)) {
>>
>> and possibly returning failure?
> 
> Yeah, this is a good question. Before I have also tried to fix it with
> this, but it still won't work for me.
> 
> From my test cases almost more than 50% times, the crash will be hit in
> the gap just after the nbd->config already been released, and before the
> nbd itself not yet, so the nbd->config_refs will be 0.
> 
> 
>>
>> If you moved the complete() to nbd_config_put would it work if this bit
>> was set or not?
> 
> Tried it already, it still won't work.
> 
> There is one case that when disconnecting the nbd device, the userspace
> service will do the open()/release()  things, please see [1], and the
> sequence is not the same every time, if the
> NBD_CFLAG_DESTROY_ON_DISCONNECT bit is set the crash still exists.
> 
> So sometimes when the nbd_put() called from the nbd_config_put(), the
> &nbd->refs in nbd_put won't be 0, it could be 1. And it will be 0 just
> after the release() is triggered later.
> 
> So I just place the complete() before "free(nbd);", or there will be
> another Call trace will be seen very often:

Did this happen because you race with

nbd_put->nbd_dev_remove->del_gendisk->device_del->sysfs_remove_dir

? If so, does that still happen after you moved

mutex_unlock(&nbd_index_mutex);

in nbd_put? It seems before that part of your patch was added we could
hit this race and got the duplicate sysfs entry trace below:

1. nbd_put -> idr_remove.
2. nbd_put drops mutex.
3. nbd_genl_connect takes mutex (index != -1 for this call).
4. nbd_genl_connect-> idr_find fails due to remove in #1.
5. nbd_genl_connect->nbd_dev_add is then able to try to add the device
to sysfs before the nbd_put->nbd_dev_remove path has deleted the device.


When you now do the idr and sysfs removal and idr addition/search and
sysfs addition all under the nbd_index_mutex it shouldn't happen anymore.




> 
>    2489 Aug 20 18:10:04 lxbfd2 kernel: sysfs: cannot create duplicate
> filename '/devices/virtual/block/nbd0'
>    2490 Aug 20 18:10:04 lxbfd2 kernel: CPU: 0 PID: 8635 Comm: nbd-clid
> Kdump: loaded Tainted: G      D 5.1.18-300.fc30.x86_64 #1
>    2491 Aug 20 18:10:04 lxbfd2 kernel: Hardware name: Red Hat KVM, BIOS
> 0.5.1 01/01/2011
>    2492 Aug 20 18:10:04 lxbfd2 kernel: Call Trace:
>    2493 Aug 20 18:10:04 lxbfd2 kernel: dump_stack+0x5c/0x80
>    2494 Aug 20 18:10:04 lxbfd2 kernel: sysfs_warn_dup.cold+0x17/0x2d
>    2495 Aug 20 18:10:04 lxbfd2 kernel: sysfs_create_dir_ns+0xb6/0xd0
>    2496 Aug 20 18:10:04 lxbfd2 kernel: kobject_add_internal+0xb7/0x280
>    2497 Aug 20 18:10:04 lxbfd2 kernel: kobject_add+0x7e/0xb0
>    2498 Aug 20 18:10:04 lxbfd2 kernel: ? _cond_resched+0x15/0x30
>    2499 Aug 20 18:10:04 lxbfd2 kernel: device_add+0x12b/0x690
>    2500 Aug 20 18:10:04 lxbfd2 kernel: __device_add_disk+0x1b5/0x470
>    2501 Aug 20 18:10:04 lxbfd2 kernel: nbd_dev_add+0x21d/0x2b0 [nbd]
>    2502 Aug 20 18:10:04 lxbfd2 kernel: nbd_genl_connect+0x16e/0x630 [nbd]
>    2503 Aug 20 18:10:04 lxbfd2 kernel: genl_family_rcv_msg+0x1a9/0x3b0
>    2504 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x35/0x70
>    2505 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x41/0x70
>    2506 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to+0x11f/0x4c0
>    2507 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x41/0x70
>    [...]
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk
  2019-09-02 21:30       ` Mike Christie
@ 2019-09-03  0:45         ` Xiubo Li
  0 siblings, 0 replies; 8+ messages in thread
From: Xiubo Li @ 2019-09-03  0:45 UTC (permalink / raw)
  To: Mike Christie, josef, axboe; +Cc: linux-block, nbd, linux-kernel

On 2019/9/3 5:30, Mike Christie wrote:
> On 08/29/2019 07:58 PM, Xiubo Li wrote:
>> On 2019/8/30 7:49, Mike Christie wrote:
>>> On 08/22/2019 02:59 AM, xiubli@redhat.com wrote:
[...]
>>> @@ -1596,6 +1614,7 @@ static int nbd_dev_add(int index)
>>>        nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
>>>            BLK_MQ_F_BLOCKING;
>>>        nbd->tag_set.driver_data = nbd;
>>> +    init_completion(&nbd->destroy_complete);
>>>          err = blk_mq_alloc_tag_set(&nbd->tag_set);
>>>        if (err)
>>> @@ -1761,6 +1780,16 @@ static int nbd_genl_connect(struct sk_buff
>>> *skb, struct genl_info *info)
>>>            mutex_unlock(&nbd_index_mutex);
>>>            return -EINVAL;
>>>        }
>>> +
>>> +    if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
>>> Why does this have to be set? If this is not set would you end up
>>> hitting the config_refs check:
>>>
>>> if (refcount_read(&nbd->config_refs)) {
>>>
>>> and possibly returning failure?
>> Yeah, this is a good question. Before I have also tried to fix it with
>> this, but it still won't work for me.
>>
>>  From my test cases almost more than 50% times, the crash will be hit in
>> the gap just after the nbd->config already been released, and before the
>> nbd itself not yet, so the nbd->config_refs will be 0.
>>
>>
>>> If you moved the complete() to nbd_config_put would it work if this bit
>>> was set or not?
>> Tried it already, it still won't work.
>>
>> There is one case that when disconnecting the nbd device, the userspace
>> service will do the open()/release()  things, please see [1], and the
>> sequence is not the same every time, if the
>> NBD_CFLAG_DESTROY_ON_DISCONNECT bit is set the crash still exists.
>>
>> So sometimes when the nbd_put() called from the nbd_config_put(), the
>> &nbd->refs in nbd_put won't be 0, it could be 1. And it will be 0 just
>> after the release() is triggered later.
>>
>> So I just place the complete() before "free(nbd);", or there will be
>> another Call trace will be seen very often:
> Did this happen because you race with
>
> nbd_put->nbd_dev_remove->del_gendisk->device_del->sysfs_remove_dir
>
> ? If so, does that still happen after you moved
>
> mutex_unlock(&nbd_index_mutex);
>
> in nbd_put?

Currently with this fix, there is no any Call Traces anymore from my 
test cases. I ran the test for almost a whole night long without any 
problem.


>   It seems before that part of your patch was added we could
> hit this race and got the duplicate sysfs entry trace below:
>
> 1. nbd_put -> idr_remove.
> 2. nbd_put drops mutex.
> 3. nbd_genl_connect takes mutex (index != -1 for this call).
> 4. nbd_genl_connect-> idr_find fails due to remove in #1.
> 5. nbd_genl_connect->nbd_dev_add is then able to try to add the device
> to sysfs before the nbd_put->nbd_dev_remove path has deleted the device.

Yeah, it is. Just before the old stale sysfs entry is totally 
removed/released, the nbd driver is trying to create it again with the 
same nbd_index.


>
> When you now do the idr and sysfs removal and idr addition/search and
> sysfs addition all under the nbd_index_mutex it shouldn't happen anymore.
>
Correctly.

Thanks,
BRs


>
>
>>     2489 Aug 20 18:10:04 lxbfd2 kernel: sysfs: cannot create duplicate
>> filename '/devices/virtual/block/nbd0'
>>     2490 Aug 20 18:10:04 lxbfd2 kernel: CPU: 0 PID: 8635 Comm: nbd-clid
>> Kdump: loaded Tainted: G      D 5.1.18-300.fc30.x86_64 #1
>>     2491 Aug 20 18:10:04 lxbfd2 kernel: Hardware name: Red Hat KVM, BIOS
>> 0.5.1 01/01/2011
>>     2492 Aug 20 18:10:04 lxbfd2 kernel: Call Trace:
>>     2493 Aug 20 18:10:04 lxbfd2 kernel: dump_stack+0x5c/0x80
>>     2494 Aug 20 18:10:04 lxbfd2 kernel: sysfs_warn_dup.cold+0x17/0x2d
>>     2495 Aug 20 18:10:04 lxbfd2 kernel: sysfs_create_dir_ns+0xb6/0xd0
>>     2496 Aug 20 18:10:04 lxbfd2 kernel: kobject_add_internal+0xb7/0x280
>>     2497 Aug 20 18:10:04 lxbfd2 kernel: kobject_add+0x7e/0xb0
>>     2498 Aug 20 18:10:04 lxbfd2 kernel: ? _cond_resched+0x15/0x30
>>     2499 Aug 20 18:10:04 lxbfd2 kernel: device_add+0x12b/0x690
>>     2500 Aug 20 18:10:04 lxbfd2 kernel: __device_add_disk+0x1b5/0x470
>>     2501 Aug 20 18:10:04 lxbfd2 kernel: nbd_dev_add+0x21d/0x2b0 [nbd]
>>     2502 Aug 20 18:10:04 lxbfd2 kernel: nbd_genl_connect+0x16e/0x630 [nbd]
>>     2503 Aug 20 18:10:04 lxbfd2 kernel: genl_family_rcv_msg+0x1a9/0x3b0
>>     2504 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x35/0x70
>>     2505 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x41/0x70
>>     2506 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to+0x11f/0x4c0
>>     2507 Aug 20 18:10:04 lxbfd2 kernel: ? __switch_to_asm+0x41/0x70
>>     [...]
>>


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-09-03  0:45 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-22  7:59 [PATCH 0/2 v3] nbd: fix possible page fault for nbd disk xiubli
2019-08-22  7:59 ` [PATCH 1/2 v3] nbd: rename the runtime flags as NBD_RT_ prefixed xiubli
2019-08-22  7:59 ` [PATCH 2/2 v3] nbd: fix possible page fault for nbd disk xiubli
2019-08-29 23:49   ` Mike Christie
2019-08-30  0:58     ` Xiubo Li
2019-08-30  5:22       ` Xiubo Li
2019-09-02 21:30       ` Mike Christie
2019-09-03  0:45         ` Xiubo Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).