* [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
@ 2023-03-20 13:33 Sagi Grimberg
2023-03-21 5:12 ` Chaitanya Kulkarni
2023-03-22 8:20 ` Christoph Hellwig
0 siblings, 2 replies; 11+ messages in thread
From: Sagi Grimberg @ 2023-03-20 13:33 UTC (permalink / raw)
To: linux-nvme
Cc: Christoph Hellwig, Keith Busch, Chaitanya Kulkarni,
Hannes Reinecke, Yanjun Zhang
When we allocate a nvme-tcp queue, we set the data_ready callback before
we actually need to use it. This creates the potential that if a stray
controller sends us data on the socket before we connect, we can trigger
the io_work and start consuming the socket.
In this case reported: we failed to allocate one of the io queues, and
as we start releasing the queues that we already allocated, we get
a UAF [1] from the io_work which is running before it should really.
Fix this by setting the socket ops callbacks only before we start the
queue, so that we can't accidentally schedule the io_work in the
initialization phase before the queue started. While we are at it,
rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops.
[1]:
[16802.107284] nvme nvme4: starting error recovery
[16802.109166] nvme nvme4: Reconnecting in 10 seconds...
[16812.173535] nvme nvme4: failed to connect socket: -111
[16812.173745] nvme nvme4: Failed reconnect attempt 1
[16812.173747] nvme nvme4: Reconnecting in 10 seconds...
[16822.413555] nvme nvme4: failed to connect socket: -111
[16822.413762] nvme nvme4: Failed reconnect attempt 2
[16822.413765] nvme nvme4: Reconnecting in 10 seconds...
[16832.661274] nvme nvme4: creating 32 I/O queues.
[16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088
[16833.920068] nvme nvme4: Failed reconnect attempt 3
[16833.920094] #PF: supervisor write access in kernel mode
[16833.920261] nvme nvme4: Reconnecting in 10 seconds...
[16833.920368] #PF: error_code(0x0002) - not-present page
[16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp]
[16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30
...
[16833.923138] Call Trace:
[16833.923271] <TASK>
[16833.923402] lock_sock_nested+0x1e/0x50
[16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp]
[16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp]
[16833.923824] process_one_work+0x1e8/0x390
[16833.923969] worker_thread+0x53/0x3d0
[16833.924104] ? process_one_work+0x390/0x390
[16833.924240] kthread+0x124/0x150
[16833.924376] ? set_kthread_struct+0x50/0x50
[16833.924518] ret_from_fork+0x1f/0x30
[16833.924655] </TASK>
Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
Changes from v2:
- Move sock ops assignment to its own function (Hannes)
Changes from v1:
- Fix silly compilation error
Yanjun, I'll be waiting for your Tested-by tag. We need it
in order to apply this fix.
drivers/nvme/host/tcp.c | 46 +++++++++++++++++++++++------------------
1 file changed, 26 insertions(+), 20 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 42c0598c31f2..49c9e7bc9116 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1620,22 +1620,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
if (ret)
goto err_init_connect;
- queue->rd_enabled = true;
set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
- nvme_tcp_init_recv_ctx(queue);
-
- write_lock_bh(&queue->sock->sk->sk_callback_lock);
- queue->sock->sk->sk_user_data = queue;
- queue->state_change = queue->sock->sk->sk_state_change;
- queue->data_ready = queue->sock->sk->sk_data_ready;
- queue->write_space = queue->sock->sk->sk_write_space;
- queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
- queue->sock->sk->sk_state_change = nvme_tcp_state_change;
- queue->sock->sk->sk_write_space = nvme_tcp_write_space;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- queue->sock->sk->sk_ll_usec = 1;
-#endif
- write_unlock_bh(&queue->sock->sk->sk_callback_lock);
return 0;
@@ -1655,7 +1640,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
return ret;
}
-static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
+static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
{
struct socket *sock = queue->sock;
@@ -1670,7 +1655,7 @@ static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
{
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
- nvme_tcp_restore_sock_calls(queue);
+ nvme_tcp_restore_sock_ops(queue);
cancel_work_sync(&queue->io_work);
}
@@ -1688,21 +1673,42 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
mutex_unlock(&queue->queue_lock);
}
+static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
+{
+ write_lock_bh(&queue->sock->sk->sk_callback_lock);
+ queue->sock->sk->sk_user_data = queue;
+ queue->state_change = queue->sock->sk->sk_state_change;
+ queue->data_ready = queue->sock->sk->sk_data_ready;
+ queue->write_space = queue->sock->sk->sk_write_space;
+ queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
+ queue->sock->sk->sk_state_change = nvme_tcp_state_change;
+ queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ queue->sock->sk->sk_ll_usec = 1;
+#endif
+ write_unlock_bh(&queue->sock->sk->sk_callback_lock);
+}
+
static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvme_tcp_queue *queue = &ctrl->queues[idx];
int ret;
+ queue->rd_enabled = true;
+ nvme_tcp_init_recv_ctx(queue);
+ nvme_tcp_setup_sock_ops(queue);
+
if (idx)
ret = nvmf_connect_io_queue(nctrl, idx);
else
ret = nvmf_connect_admin_queue(nctrl);
if (!ret) {
- set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
+ set_bit(NVME_TCP_Q_LIVE, &queue->flags);
} else {
- if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
- __nvme_tcp_stop_queue(&ctrl->queues[idx]);
+ if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
+ __nvme_tcp_stop_queue(queue);
dev_err(nctrl->device,
"failed to connect queue: %d ret=%d\n", idx, ret);
}
--
2.34.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
2023-03-20 13:33 [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue Sagi Grimberg
@ 2023-03-21 5:12 ` Chaitanya Kulkarni
2023-03-22 8:15 ` Christoph Hellwig
2023-03-22 8:20 ` Christoph Hellwig
1 sibling, 1 reply; 11+ messages in thread
From: Chaitanya Kulkarni @ 2023-03-21 5:12 UTC (permalink / raw)
To: Sagi Grimberg
Cc: Christoph Hellwig, linux-nvme, Keith Busch, Chaitanya Kulkarni,
Hannes Reinecke, Yanjun Zhang
> +static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
> +{
> + write_lock_bh(&queue->sock->sk->sk_callback_lock);
> + queue->sock->sk->sk_user_data = queue;
> + queue->state_change = queue->sock->sk->sk_state_change;
> + queue->data_ready = queue->sock->sk->sk_data_ready;
> + queue->write_space = queue->sock->sk->sk_write_space;
> + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
> + queue->sock->sk->sk_state_change = nvme_tcp_state_change;
> + queue->sock->sk->sk_write_space = nvme_tcp_write_space;
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> + queue->sock->sk->sk_ll_usec = 1;
> +#endif
> + write_unlock_bh(&queue->sock->sk->sk_callback_lock);
> +}
> +
since it is not in the fast path, is there a particular reason not
to use the following?
if (IS_ENABLED(CONFIG_NET_RX_BUSY_POLL))
queue->sock->sk->sk_ll_usec = 1;
-ck
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
2023-03-21 5:12 ` Chaitanya Kulkarni
@ 2023-03-22 8:15 ` Christoph Hellwig
2023-03-22 8:18 ` Sagi Grimberg
0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2023-03-22 8:15 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: Sagi Grimberg, Christoph Hellwig, linux-nvme, Keith Busch,
Chaitanya Kulkarni, Hannes Reinecke, Yanjun Zhang
On Tue, Mar 21, 2023 at 05:12:57AM +0000, Chaitanya Kulkarni wrote:
> > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > + queue->sock->sk->sk_ll_usec = 1;
> > +#endif
> > + write_unlock_bh(&queue->sock->sk->sk_callback_lock);
> > +}
> > +
>
> since its is not in the fast path, is there a particular reason not
> to use following ?
>
> if (IS_ENABLED(CONFIG_NET_RX_BUSY_POLL)
> queue->sock->sk->sk_ll_usec = 1;
That won't compile, as the sk_ll_usec is not defined without
CONFIG_NET_RX_BUSY_POLL.
Note that this has nothing to do with a fast path - for cases where
IS_ENABLED works there is no performance benefit to use an ifdef
given that IS_ENABLED is specifically designed to lead to compiler
dead code elimination.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
2023-03-22 8:15 ` Christoph Hellwig
@ 2023-03-22 8:18 ` Sagi Grimberg
0 siblings, 0 replies; 11+ messages in thread
From: Sagi Grimberg @ 2023-03-22 8:18 UTC (permalink / raw)
To: Christoph Hellwig, Chaitanya Kulkarni
Cc: linux-nvme, Keith Busch, Chaitanya Kulkarni, Hannes Reinecke,
Yanjun Zhang
>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>> + queue->sock->sk->sk_ll_usec = 1;
>>> +#endif
>>> + write_unlock_bh(&queue->sock->sk->sk_callback_lock);
>>> +}
>>> +
>>
>> since its is not in the fast path, is there a particular reason not
>> to use following ?
>>
>> if (IS_ENABLED(CONFIG_NET_RX_BUSY_POLL)
>> queue->sock->sk->sk_ll_usec = 1;
>
> That won't compile, as the sk_ll_usec is not defined without
> CONFIG_NET_RX_BUSY_POLL.
I didn't check with CONFIG_NET_RX_BUSY_POLL=n, so you can
disregard v4 if that's the case.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
2023-03-20 13:33 [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue Sagi Grimberg
2023-03-21 5:12 ` Chaitanya Kulkarni
@ 2023-03-22 8:20 ` Christoph Hellwig
2023-03-22 8:24 ` Sagi Grimberg
1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2023-03-22 8:20 UTC (permalink / raw)
To: Sagi Grimberg
Cc: linux-nvme, Christoph Hellwig, Keith Busch, Chaitanya Kulkarni,
Hannes Reinecke, Yanjun Zhang
Thanks,
applied to nvme-6.3.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
2023-03-22 8:20 ` Christoph Hellwig
@ 2023-03-22 8:24 ` Sagi Grimberg
2023-03-22 8:27 ` Christoph Hellwig
0 siblings, 1 reply; 11+ messages in thread
From: Sagi Grimberg @ 2023-03-22 8:24 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-nvme, Keith Busch, Chaitanya Kulkarni, Hannes Reinecke,
Yanjun Zhang
> Thanks,
>
> applied to nvme-6.3.
Actually, I would like to wait for Yanjun to explicitly
give out his Tested-by.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
2023-03-22 8:24 ` Sagi Grimberg
@ 2023-03-22 8:27 ` Christoph Hellwig
2023-03-22 13:06 ` Sagi Grimberg
0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2023-03-22 8:27 UTC (permalink / raw)
To: Sagi Grimberg
Cc: Christoph Hellwig, linux-nvme, Keith Busch, Chaitanya Kulkarni,
Hannes Reinecke, Yanjun Zhang
On Wed, Mar 22, 2023 at 10:24:15AM +0200, Sagi Grimberg wrote:
>
>> Thanks,
>>
>> applied to nvme-6.3.
>
> Actually, I would like to wait for Yanjun to explicitly
> give out his Tested-by.
I can drop it for now, but that basically guarantees that we're going
to miss -rc4.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
2023-03-22 8:27 ` Christoph Hellwig
@ 2023-03-22 13:06 ` Sagi Grimberg
0 siblings, 0 replies; 11+ messages in thread
From: Sagi Grimberg @ 2023-03-22 13:06 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-nvme, Keith Busch, Chaitanya Kulkarni, Hannes Reinecke,
Yanjun Zhang
>>> Thanks,
>>>
>>> applied to nvme-6.3.
>>
>> Actually, I would like to wait for Yanjun to explicitly
>> give out his Tested-by.
>
> I can drop it for now, but that basically guarantees that we're going
> to miss -rc4.
This has been buggy for a while now, we can wait and then have stable
pick it up.
Thanks
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
2023-03-29 6:23 zhangyanjun
@ 2023-03-30 2:27 ` Christoph Hellwig
0 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2023-03-30 2:27 UTC (permalink / raw)
To: zhangyanjun
Cc: sagi, hch, kbusch, Chaitanya.Kulkarni, hare, linux-nvme, Yanjun Zhang
Thanks,
I've added this to nvme-6.3.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
@ 2023-03-30 2:14 zhangyanjun
0 siblings, 0 replies; 11+ messages in thread
From: zhangyanjun @ 2023-03-30 2:14 UTC (permalink / raw)
To: sagi, hch
Cc: kbusch, Chaitanya.Kulkarni, hare, linux-nvme, Yanjun Zhang, Yanjun Zhang
From: Yanjun Zhang <zhangyanjun@cestc.cn>
> When we allocate a nvme-tcp queue, we set the data_ready callback before
> we actually need to use it. This creates the potential that if a stray
> controller sends us data on the socket before we connect, we can trigger
> the io_work and start consuming the socket.
>
> In this case reported: we failed to allocate one of the io queues, and
> as we start releasing the queues that we already allocated, we get
> a UAF [1] from the io_work which is running before it should really.
>
> Fix this by setting the socket ops callbacks only before we start the
> queue, so that we can't accidentally schedule the io_work in the
> initialization phase before the queue started. While we are at it,
> rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops.
>
> [1]:
> [16802.107284] nvme nvme4: starting error recovery
> [16802.109166] nvme nvme4: Reconnecting in 10 seconds...
> [16812.173535] nvme nvme4: failed to connect socket: -111
> [16812.173745] nvme nvme4: Failed reconnect attempt 1
> [16812.173747] nvme nvme4: Reconnecting in 10 seconds...
> [16822.413555] nvme nvme4: failed to connect socket: -111
> [16822.413762] nvme nvme4: Failed reconnect attempt 2
> [16822.413765] nvme nvme4: Reconnecting in 10 seconds...
> [16832.661274] nvme nvme4: creating 32 I/O queues.
> [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088
> [16833.920068] nvme nvme4: Failed reconnect attempt 3
> [16833.920094] #PF: supervisor write access in kernel mode
> [16833.920261] nvme nvme4: Reconnecting in 10 seconds...
> [16833.920368] #PF: error_code(0x0002) - not-present page
> [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp]
> [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30
> ...
> [16833.923138] Call Trace:
> [16833.923271] <TASK>
> [16833.923402] lock_sock_nested+0x1e/0x50
> [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp]
> [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp]
> [16833.923824] process_one_work+0x1e8/0x390
> [16833.923969] worker_thread+0x53/0x3d0
> [16833.924104] ? process_one_work+0x390/0x390
> [16833.924240] kthread+0x124/0x150
> [16833.924376] ? set_kthread_struct+0x50/0x50
> [16833.924518] ret_from_fork+0x1f/0x30
> [16833.924655] </TASK>
>
> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
> Changes from v2:
> - Move sock ops assignment to its own function (Hannes)
>
> Changes from v1:
> - Fix silly compilation error
>
> Yanjun, I'll be waiting for your Tested-by tag. We need it
> in order to apply this fix.
>
> drivers/nvme/host/tcp.c | 46 +++++++++++++++++++++++------------------
> 1 file changed, 26 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 42c0598c31f2..49c9e7bc9116 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1620,22 +1620,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
> if (ret)
> goto err_init_connect;
>
> - queue->rd_enabled = true;
> set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
> - nvme_tcp_init_recv_ctx(queue);
> -
> - write_lock_bh(&queue->sock->sk->sk_callback_lock);
> - queue->sock->sk->sk_user_data = queue;
> - queue->state_change = queue->sock->sk->sk_state_change;
> - queue->data_ready = queue->sock->sk->sk_data_ready;
> - queue->write_space = queue->sock->sk->sk_write_space;
> - queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
> - queue->sock->sk->sk_state_change = nvme_tcp_state_change;
> - queue->sock->sk->sk_write_space = nvme_tcp_write_space;
> -#ifdef CONFIG_NET_RX_BUSY_POLL
> - queue->sock->sk->sk_ll_usec = 1;
> -#endif
> - write_unlock_bh(&queue->sock->sk->sk_callback_lock);
>
> return 0;
>
> @@ -1655,7 +1640,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
> return ret;
> }
>
> -static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
> +static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
> {
> struct socket *sock = queue->sock;
>
> @@ -1670,7 +1655,7 @@ static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
> static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> {
> kernel_sock_shutdown(queue->sock, SHUT_RDWR);
> - nvme_tcp_restore_sock_calls(queue);
> + nvme_tcp_restore_sock_ops(queue);
> cancel_work_sync(&queue->io_work);
> }
>
> @@ -1688,21 +1673,42 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> mutex_unlock(&queue->queue_lock);
> }
>
> +static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
> +{
> + write_lock_bh(&queue->sock->sk->sk_callback_lock);
> + queue->sock->sk->sk_user_data = queue;
> + queue->state_change = queue->sock->sk->sk_state_change;
> + queue->data_ready = queue->sock->sk->sk_data_ready;
> + queue->write_space = queue->sock->sk->sk_write_space;
> + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
> + queue->sock->sk->sk_state_change = nvme_tcp_state_change;
> + queue->sock->sk->sk_write_space = nvme_tcp_write_space;
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> + queue->sock->sk->sk_ll_usec = 1;
> +#endif
> + write_unlock_bh(&queue->sock->sk->sk_callback_lock);
> +}
> +
> static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
> {
> struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
> + struct nvme_tcp_queue *queue = &ctrl->queues[idx];
> int ret;
>
> + queue->rd_enabled = true;
> + nvme_tcp_init_recv_ctx(queue);
> + nvme_tcp_setup_sock_ops(queue);
> +
> if (idx)
> ret = nvmf_connect_io_queue(nctrl, idx);
> else
> ret = nvmf_connect_admin_queue(nctrl);
>
> if (!ret) {
> - set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
> + set_bit(NVME_TCP_Q_LIVE, &queue->flags);
> } else {
> - if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
> - __nvme_tcp_stop_queue(&ctrl->queues[idx]);
> + if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> + __nvme_tcp_stop_queue(queue);
> dev_err(nctrl->device,
> "failed to connect queue: %d ret=%d\n", idx, ret);
> }
> --
> 2.34.1
With this patch our cluster nodes have not shown up the same problem recently. I think it is solved.
Tested-by: Yanjun Zhang <zhangyanjun@cestc.com>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue
@ 2023-03-29 6:23 zhangyanjun
2023-03-30 2:27 ` Christoph Hellwig
0 siblings, 1 reply; 11+ messages in thread
From: zhangyanjun @ 2023-03-29 6:23 UTC (permalink / raw)
To: sagi, hch
Cc: kbusch, Chaitanya.Kulkarni, hare, linux-nvme, Yanjun Zhang, Yanjun Zhang
From: Yanjun Zhang <zhangyanjun@cestc.cn>
> When we allocate a nvme-tcp queue, we set the data_ready callback before
> we actually need to use it. This creates the potential that if a stray
> controller sends us data on the socket before we connect, we can trigger
> the io_work and start consuming the socket.
>
> In this case reported: we failed to allocate one of the io queues, and
> as we start releasing the queues that we already allocated, we get
> a UAF [1] from the io_work which is running before it should really.
>
> Fix this by setting the socket ops callbacks only before we start the
> queue, so that we can't accidentally schedule the io_work in the
> initialization phase before the queue started. While we are at it,
> rename nvme_tcp_restore_sock_calls to pair with nvme_tcp_setup_sock_ops.
>
> [1]:
> [16802.107284] nvme nvme4: starting error recovery
> [16802.109166] nvme nvme4: Reconnecting in 10 seconds...
> [16812.173535] nvme nvme4: failed to connect socket: -111
> [16812.173745] nvme nvme4: Failed reconnect attempt 1
> [16812.173747] nvme nvme4: Reconnecting in 10 seconds...
> [16822.413555] nvme nvme4: failed to connect socket: -111
> [16822.413762] nvme nvme4: Failed reconnect attempt 2
> [16822.413765] nvme nvme4: Reconnecting in 10 seconds...
> [16832.661274] nvme nvme4: creating 32 I/O queues.
> [16833.919887] BUG: kernel NULL pointer dereference, address: 0000000000000088
> [16833.920068] nvme nvme4: Failed reconnect attempt 3
> [16833.920094] #PF: supervisor write access in kernel mode
> [16833.920261] nvme nvme4: Reconnecting in 10 seconds...
> [16833.920368] #PF: error_code(0x0002) - not-present page
> [16833.921086] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp]
> [16833.921191] RIP: 0010:_raw_spin_lock_bh+0x17/0x30
> ...
> [16833.923138] Call Trace:
> [16833.923271] <TASK>
> [16833.923402] lock_sock_nested+0x1e/0x50
> [16833.923545] nvme_tcp_try_recv+0x40/0xa0 [nvme_tcp]
> [16833.923685] nvme_tcp_io_work+0x68/0xa0 [nvme_tcp]
> [16833.923824] process_one_work+0x1e8/0x390
> [16833.923969] worker_thread+0x53/0x3d0
> [16833.924104] ? process_one_work+0x390/0x390
> [16833.924240] kthread+0x124/0x150
> [16833.924376] ? set_kthread_struct+0x50/0x50
> [16833.924518] ret_from_fork+0x1f/0x30
> [16833.924655] </TASK>
>
> Reported-by: Yanjun Zhang <zhangyanjun@cestc.cn>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
> Changes from v2:
> - Move sock ops assignment to its own function (Hannes)
>
> Changes from v1:
> - Fix silly compilation error
>
> Yanjun, I'll be waiting for your Tested-by tag. We need it
> in order to apply this fix.
>
> drivers/nvme/host/tcp.c | 46 +++++++++++++++++++++++------------------
> 1 file changed, 26 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 42c0598c31f2..49c9e7bc9116 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1620,22 +1620,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
> if (ret)
> goto err_init_connect;
>
> - queue->rd_enabled = true;
> set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
> - nvme_tcp_init_recv_ctx(queue);
> -
> - write_lock_bh(&queue->sock->sk->sk_callback_lock);
> - queue->sock->sk->sk_user_data = queue;
> - queue->state_change = queue->sock->sk->sk_state_change;
> - queue->data_ready = queue->sock->sk->sk_data_ready;
> - queue->write_space = queue->sock->sk->sk_write_space;
> - queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
> - queue->sock->sk->sk_state_change = nvme_tcp_state_change;
> - queue->sock->sk->sk_write_space = nvme_tcp_write_space;
> -#ifdef CONFIG_NET_RX_BUSY_POLL
> - queue->sock->sk->sk_ll_usec = 1;
> -#endif
> - write_unlock_bh(&queue->sock->sk->sk_callback_lock);
>
> return 0;
>
> @@ -1655,7 +1640,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
> return ret;
> }
>
> -static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
> +static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
> {
> struct socket *sock = queue->sock;
>
> @@ -1670,7 +1655,7 @@ static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
> static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> {
> kernel_sock_shutdown(queue->sock, SHUT_RDWR);
> - nvme_tcp_restore_sock_calls(queue);
> + nvme_tcp_restore_sock_ops(queue);
> cancel_work_sync(&queue->io_work);
> }
>
> @@ -1688,21 +1673,42 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> mutex_unlock(&queue->queue_lock);
> }
>
> +static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
> +{
> + write_lock_bh(&queue->sock->sk->sk_callback_lock);
> + queue->sock->sk->sk_user_data = queue;
> + queue->state_change = queue->sock->sk->sk_state_change;
> + queue->data_ready = queue->sock->sk->sk_data_ready;
> + queue->write_space = queue->sock->sk->sk_write_space;
> + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
> + queue->sock->sk->sk_state_change = nvme_tcp_state_change;
> + queue->sock->sk->sk_write_space = nvme_tcp_write_space;
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> + queue->sock->sk->sk_ll_usec = 1;
> +#endif
> + write_unlock_bh(&queue->sock->sk->sk_callback_lock);
> +}
> +
> static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
> {
> struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
> + struct nvme_tcp_queue *queue = &ctrl->queues[idx];
> int ret;
>
> + queue->rd_enabled = true;
> + nvme_tcp_init_recv_ctx(queue);
> + nvme_tcp_setup_sock_ops(queue);
> +
> if (idx)
> ret = nvmf_connect_io_queue(nctrl, idx);
> else
> ret = nvmf_connect_admin_queue(nctrl);
>
> if (!ret) {
> - set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
> + set_bit(NVME_TCP_Q_LIVE, &queue->flags);
> } else {
> - if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
> - __nvme_tcp_stop_queue(&ctrl->queues[idx]);
> + if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> + __nvme_tcp_stop_queue(queue);
> dev_err(nctrl->device,
> "failed to connect queue: %d ret=%d\n", idx, ret);
> }
> --
> 2.34.1
With this patch our cluster nodes have not shown up the same problem recently. I think it is solved.
Tested-by: Yanjun Zhang <zhangyanjun@cestc.com>
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2023-04-03 14:40 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-20 13:33 [PATCH v3] nvme-tcp: fix a possible UAF when failing to allocate an io queue Sagi Grimberg
2023-03-21 5:12 ` Chaitanya Kulkarni
2023-03-22 8:15 ` Christoph Hellwig
2023-03-22 8:18 ` Sagi Grimberg
2023-03-22 8:20 ` Christoph Hellwig
2023-03-22 8:24 ` Sagi Grimberg
2023-03-22 8:27 ` Christoph Hellwig
2023-03-22 13:06 ` Sagi Grimberg
2023-03-29 6:23 zhangyanjun
2023-03-30 2:27 ` Christoph Hellwig
2023-03-30 2:14 zhangyanjun
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.