io-uring.vger.kernel.org archive mirror
* Kernel BUG when registering the ring
@ 2020-02-11  1:22 Glauber Costa
  2020-02-11  3:25 ` Jens Axboe
  0 siblings, 1 reply; 10+ messages in thread
From: Glauber Costa @ 2020-02-11  1:22 UTC (permalink / raw)
  To: io-uring, Avi Kivity, Jens Axboe

[-- Attachment #1: Type: text/plain, Size: 430 bytes --]

Hello my dear io_uring friends

Today I tried to run my tests on different, more powerful hardware
(70+ hyperthreads), and it crashed while creating the ring.

I don't recall anything fancy in my code for creating the ring -
except maybe that I size the cq ring differently from the sq ring.
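
Roughly like this - a minimal sketch using liburing's
IORING_SETUP_CQSIZE, with made-up sizes, not my exact code:

#include <liburing.h>
#include <string.h>

int setup_ring(struct io_uring *ring)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_CQSIZE;	/* size the CQ ring independently */
	p.cq_entries = 4096;		/* hypothetical: CQ larger than SQ */

	/* 128 SQ entries; the kernel rounds both rings up to powers of two */
	return io_uring_queue_init_params(128, ring, &p);
}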

The attached backtrace leads me to believe we just dereferenced a
NULL struct somewhere.

Hash is ba2db2d4d262f7ccf6fe86b00c3538056d7c5218

[-- Attachment #2: creation.txt --]
[-- Type: text/plain, Size: 5666 bytes --]

[  894.918927] XFS (nvme0n1p1): Mounting V5 Filesystem
[  894.928964] XFS (nvme0n1p1): Ending clean mount
[  894.930111] xfs filesystem being mounted at /var/disk1 supports timestamps until 2038 (0x7fffffff)
[  901.000820] BUG: unable to handle page fault for address: 0000000000002088
[  901.000887] #PF: supervisor read access in kernel mode
[  901.000927] #PF: error_code(0x0000) - not-present page
[  901.000969] PGD 174101b067 P4D 174101b067 PUD 17c30cd067 PMD 0 
[  901.001019] Oops: 0000 [#1] SMP NOPTI
[  901.001052] CPU: 40 PID: 2144 Comm: io_tester Not tainted 5.5.0+ #6
[  901.001101] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
[  901.001187] RIP: 0010:__alloc_pages_nodemask+0x132/0x340
[  901.001231] Code: 18 01 75 04 41 80 ce 80 89 e8 48 8b 54 24 08 8b 74 24 1c c1 e8 0c 48 8b 3c 24 83 e0 01 88 44 24 20 48 85 d2 0f 85 74 01 00 00 <3b> 77 08 0f 82 6b 01 00 00 48 89 7c 24 10 89 ea 48 8b 07 b9 00 02
[  901.001370] RSP: 0018:ffffb8be4d0b7c28 EFLAGS: 00010246
[  901.001413] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000e8e8
[  901.001466] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080
[  901.001499] RBP: 0000000000012cc0 R08: 0000000000000000 R09: 0000000000000002
[  901.001516] R10: 0000000000000dc0 R11: ffff995c60400100 R12: 0000000000000000
[  901.001534] R13: 0000000000012cc0 R14: 0000000000000001 R15: ffff995c60db00f0
[  901.001552] FS:  00007f4d115ca900(0000) GS:ffff995c60d80000(0000) knlGS:0000000000000000
[  901.001572] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  901.001586] CR2: 0000000000002088 CR3: 00000017cca66002 CR4: 00000000007606e0
[  901.001604] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  901.001622] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  901.001640] PKRU: 55555554
[  901.001647] Call Trace:
[  901.001663]  alloc_slab_page+0x46/0x320
[  901.001676]  new_slab+0x9d/0x4e0
[  901.001687]  ___slab_alloc+0x507/0x6a0
[  901.001702]  ? io_wq_create+0xb4/0x2a0
[  901.001713]  __slab_alloc+0x1c/0x30
[  901.001725]  kmem_cache_alloc_node_trace+0xa6/0x260
[  901.001738]  io_wq_create+0xb4/0x2a0
[  901.001750]  io_uring_setup+0x97f/0xaa0
[  901.001762]  ? io_remove_personalities+0x30/0x30
[  901.001776]  ? io_poll_trigger_evfd+0x30/0x30
[  901.001791]  do_syscall_64+0x5b/0x1c0
[  901.001806]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  901.001820] RIP: 0033:0x7f4d116cb1ed
[  901.001831] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6b 5c 0c 00 f7 d8 64 89 01 48
[  901.001875] RSP: 002b:00007fff641ddf58 EFLAGS: 00000202 ORIG_RAX: 00000000000001a9
[  901.001894] RAX: ffffffffffffffda RBX: 00007fff641de0f0 RCX: 00007f4d116cb1ed
[  901.001911] RDX: 0000000000000000 RSI: 00007fff641ddfb0 RDI: 0000000000000080
[  901.001928] RBP: 0000000000000080 R08: 0000000000000000 R09: 0000600000081c20
[  901.002471] R10: 00007f4d115c9800 R11: 0000000000000202 R12: 00007fff641ddfb0
[  901.002971] R13: 00007fff641de0f0 R14: 00007fff641de0c0 R15: 00007fff641de4e8
[  901.003470] Modules linked in: ip6t_REJECT nf_reject_ipv6 ip6t_rpfilter ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_nat iptable_mangle iptable_raw iptable_security nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nfnetlink ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ib_isert iscsi_target_mod ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib vfat fat ib_umad intel_rapl_msr intel_rapl_common rpcrdma isst_if_common sunrpc rdma_ucm ib_iser skx_edac rdma_cm x86_pkg_temp_thermal intel_powerclamp iw_cm coretemp kvm_intel ib_cm libiscsi scsi_transport_iscsi kvm irqbypass crct10dif_pclmul crc32_pclmul i40iw ghash_clmulni_intel ib_uverbs iTCO_wdt iTCO_vendor_support intel_cstate ib_core ipmi_ssif intel_uncore joydev intel_rapl_perf mei_me i2c_i801 ioatdma switchtec pcspkr lpc_ich mei ipmi_si dca ipmi_devintf ipmi_msghandler dax_pmem dax_pmem_core acpi_power_meter acpi_pad
[  901.003504]  ip_tables xfs libcrc32c rfkill nd_pmem nd_btt ast i2c_algo_bit drm_vram_helper drm_ttm_helper ttm drm_kms_helper cec drm i40e megaraid_sas crc32c_intel nvme nvme_core nfit libnvdimm wmi pkcs8_key_parser
[  901.009213] CR2: 0000000000002088
[  901.009814] ---[ end trace 2bb8b12f7dc58981 ]---
[  901.109907] RIP: 0010:__alloc_pages_nodemask+0x132/0x340
[  901.110546] Code: 18 01 75 04 41 80 ce 80 89 e8 48 8b 54 24 08 8b 74 24 1c c1 e8 0c 48 8b 3c 24 83 e0 01 88 44 24 20 48 85 d2 0f 85 74 01 00 00 <3b> 77 08 0f 82 6b 01 00 00 48 89 7c 24 10 89 ea 48 8b 07 b9 00 02
[  901.111761] RSP: 0018:ffffb8be4d0b7c28 EFLAGS: 00010246
[  901.112368] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000e8e8
[  901.112982] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080
[  901.113596] RBP: 0000000000012cc0 R08: 0000000000000000 R09: 0000000000000002
[  901.114206] R10: 0000000000000dc0 R11: ffff995c60400100 R12: 0000000000000000
[  901.114821] R13: 0000000000012cc0 R14: 0000000000000001 R15: ffff995c60db00f0
[  901.115418] FS:  00007f4d115ca900(0000) GS:ffff995c60d80000(0000) knlGS:0000000000000000
[  901.116019] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  901.116619] CR2: 0000000000002088 CR3: 00000017cca66002 CR4: 00000000007606e0
[  901.117233] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  901.117839] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  901.118444] PKRU: 55555554



* Re: Kernel BUG when registering the ring
  2020-02-11  1:22 Kernel BUG when registering the ring Glauber Costa
@ 2020-02-11  3:25 ` Jens Axboe
  2020-02-11  3:45   ` Glauber Costa
  0 siblings, 1 reply; 10+ messages in thread
From: Jens Axboe @ 2020-02-11  3:25 UTC (permalink / raw)
  To: Glauber Costa, io-uring, Avi Kivity

On 2/10/20 6:22 PM, Glauber Costa wrote:
> Hello my dear io_uring friends
> 
> Today I tried to run my tests on different, more powerful hardware
> (70+ hyperthreads), and it crashed while creating the ring.
> 
> I don't recall anything fancy in my code for creating the ring -
> except maybe that I size the cq ring differently from the sq ring.
> 
> The attached backtrace leads me to believe we just dereferenced a
> NULL struct somewhere.

Yeah, but the crash is in the allocator, not in io-wq/io_uring itself.
Here's where it is crashing:

struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
	[...]
	for_each_node(node) {
		struct io_wqe *wqe;

		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);

I guess the node isn't online, and that's why it's crashing. Try the
below for starters; it should get things going.
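
If I'm reading the oops right, the fault address even lines up: a node
without a pgdat has NODE_DATA(node) == NULL, and the zonelist lookup
dereferences it. Sketch of the suspected path (my reading of
include/linux/gfp.h, not verified line-by-line):

/* kzalloc_node() ends up in __alloc_pages_nodemask(), which does: */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}

/*
 * With NODE_DATA(nid) == NULL that's a read at a small offset into the
 * NULL page, matching CR2 = 0000000000002088 in the trace.
 */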


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 182aa17dc2ca..3898165baccb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -1115,8 +1116,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 	for_each_node(node) {
 		struct io_wqe *wqe;
+		int alloc_node = node;
 
-		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
+		if (!node_online(alloc_node))
+			alloc_node = NUMA_NO_NODE;
+		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
 		if (!wqe)
 			goto err;
 		wq->wqes[node] = wqe;

-- 
Jens Axboe



* Re: Kernel BUG when registering the ring
  2020-02-11  3:25 ` Jens Axboe
@ 2020-02-11  3:45   ` Glauber Costa
  2020-02-11  3:50     ` Jens Axboe
  0 siblings, 1 reply; 10+ messages in thread
From: Glauber Costa @ 2020-02-11  3:45 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, Avi Kivity

[-- Attachment #1: Type: text/plain, Size: 1826 bytes --]

It crashes all the same.

New backtrace attached - looks very similar to the old one, although
not identical.

On Mon, Feb 10, 2020 at 10:25 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 2/10/20 6:22 PM, Glauber Costa wrote:
> > Hello my dear io_uring friends
> >
> > Today I tried to run my tests on different, more powerful hardware
> > (70+ hyperthreads), and it crashed while creating the ring.
> >
> > I don't recall anything fancy in my code for creating the ring -
> > except maybe that I size the cq ring differently from the sq ring.
> >
> > The attached backtrace leads me to believe we just dereferenced a
> > NULL struct somewhere.
>
> Yeah, but the crash is in the allocator, not in io-wq/io_uring itself.
> Here's where it is crashing:
>
> struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
> {
>         [...]
>         for_each_node(node) {
>                 struct io_wqe *wqe;
>
>                 wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
>
> I guess the node isn't online, and that's why it's crashing. Try the
> below for starters; it should get things going.
>
> [...]

[-- Attachment #2: newtrace.txt --]
[-- Type: text/plain, Size: 4601 bytes --]

[   52.976723] Oops: 0000 [#1] SMP NOPTI
[   52.976763] CPU: 56 PID: 2107 Comm: io_wq_manager Not tainted 5.5.0+ #7
[   52.976826] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
[   52.976923] RIP: 0010:__alloc_pages_nodemask+0x132/0x340
[   52.976975] Code: 18 01 75 04 41 80 ce 80 89 e8 48 8b 54 24 08 8b 74 24 1c c1 e8 0c 48 8b 3c 24 83 e0 01 88 44 24 20 48 85 d2 0f 85 74 01 00 00 <3b> 77 08 0f 82 6b 01 00 00 48 89 7c 24 10 89 ea 48 8b 07 b9 00 02
[   52.977144] RSP: 0018:ffffa48ece75bc88 EFLAGS: 00010246
[   52.977198] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000e8e8
[   52.977265] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080
[   52.977332] RBP: 0000000000012cc0 R08: ffff8b54d83a8a00 R09: 0000000000000002
[   52.977399] R10: 0000000000000dc0 R11: ffff8b54e0800100 R12: 0000000000000000
[   52.977465] R13: 0000000000012cc0 R14: 0000000000000001 R15: ffff8b54e11300f0
[   52.977530] FS:  0000000000000000(0000) GS:ffff8b54e1100000(0000) knlGS:0000000000000000
[   52.977605] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   52.977660] CR2: 0000000000002088 CR3: 000000d7e660a004 CR4: 00000000007606e0
[   52.977725] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   52.977791] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   52.977858] PKRU: 55555554
[   52.977887] Call Trace:
[   52.977926]  alloc_slab_page+0x46/0x320
[   52.977967]  new_slab+0x9d/0x4e0
[   52.978004]  ? account_entity_enqueue+0x9c/0xd0
[   52.978055]  ___slab_alloc+0x507/0x6a0
[   52.978097]  ? create_io_worker.isra.0+0x35/0x180
[   52.978147]  ? activate_task+0x7a/0x160
[   52.978187]  ? check_preempt_curr+0x4a/0x90
[   52.978230]  ? ttwu_do_wakeup+0x19/0x140
[   52.978273]  __slab_alloc+0x1c/0x30
[   52.978312]  kmem_cache_alloc_node_trace+0xa6/0x260
[   52.978364]  create_io_worker.isra.0+0x35/0x180
[   52.978412]  io_wq_manager+0xa4/0x250
[   52.978448]  kthread+0xf9/0x130
[   52.978479]  ? create_io_worker.isra.0+0x180/0x180
[   52.978521]  ? kthread_park+0x90/0x90
[   52.978560]  ret_from_fork+0x1f/0x40
[   52.978598] Modules linked in: ip6t_REJECT nf_reject_ipv6 ip6t_rpfilter ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_nat iptable_mangle iptable_raw iptable_security nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nfnetlink ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ib_isert iscsi_target_mod ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib vfat fat ib_umad rpcrdma sunrpc rdma_ucm ib_iser rdma_cm intel_rapl_msr intel_rapl_common iw_cm ib_cm isst_if_common libiscsi scsi_transport_iscsi skx_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass i40iw crct10dif_pclmul crc32_pclmul iTCO_wdt ghash_clmulni_intel ib_uverbs iTCO_vendor_support intel_cstate ipmi_ssif ib_core intel_uncore joydev intel_rapl_perf ioatdma mei_me pcspkr lpc_ich i2c_i801 mei switchtec ipmi_si dca ipmi_devintf ipmi_msghandler dax_pmem dax_pmem_core acpi_power_meter acpi_pad
[   52.978655]  ip_tables xfs libcrc32c rfkill nd_pmem nd_btt ast i2c_algo_bit drm_vram_helper drm_ttm_helper ttm drm_kms_helper cec drm i40e megaraid_sas crc32c_intel nvme nvme_core nfit libnvdimm wmi pkcs8_key_parser
[   52.994250] CR2: 0000000000002088
[   52.995607] ---[ end trace 56b95aaef917fdfe ]---
[   53.087074] RIP: 0010:__alloc_pages_nodemask+0x132/0x340
[   53.087807] Code: 18 01 75 04 41 80 ce 80 89 e8 48 8b 54 24 08 8b 74 24 1c c1 e8 0c 48 8b 3c 24 83 e0 01 88 44 24 20 48 85 d2 0f 85 74 01 00 00 <3b> 77 08 0f 82 6b 01 00 00 48 89 7c 24 10 89 ea 48 8b 07 b9 00 02
[   53.089181] RSP: 0018:ffffa48ece75bc88 EFLAGS: 00010246
[   53.089845] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000e8e8
[   53.090502] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080
[   53.091141] RBP: 0000000000012cc0 R08: ffff8b54d83a8a00 R09: 0000000000000002
[   53.091770] R10: 0000000000000dc0 R11: ffff8b54e0800100 R12: 0000000000000000
[   53.092391] R13: 0000000000012cc0 R14: 0000000000000001 R15: ffff8b54e11300f0
[   53.093020] FS:  0000000000000000(0000) GS:ffff8b54e1100000(0000) knlGS:0000000000000000
[   53.093659] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   53.094298] CR2: 0000000000002088 CR3: 000000d7e660a004 CR4: 00000000007606e0
[   53.094947] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   53.095588] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400



* Re: Kernel BUG when registering the ring
  2020-02-11  3:45   ` Glauber Costa
@ 2020-02-11  3:50     ` Jens Axboe
  2020-02-11 13:01       ` Glauber Costa
  0 siblings, 1 reply; 10+ messages in thread
From: Jens Axboe @ 2020-02-11  3:50 UTC (permalink / raw)
  To: Glauber Costa; +Cc: io-uring, Avi Kivity

On 2/10/20 8:45 PM, Glauber Costa wrote:
> It crashes all the same.
> 
> New backtrace attached - looks very similar to the old one, although
> not identical.

I missed the other spot where we do the same thing... Try this.
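
The manager thread allocates its workers against wqe->node as well, and
the first patch left that field set to the offline node - which is why
your new trace faults under create_io_worker(). Trimmed excerpt (from my
tree, abbreviated):

static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
	struct io_worker *worker;

	/* wqe->node still names the offline node here */
	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
	if (!worker)
		return false;
	/* ... */
}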


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 182aa17dc2ca..b8ef5a5483de 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -1115,12 +1116,15 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 	for_each_node(node) {
 		struct io_wqe *wqe;
+		int alloc_node = node;
 
-		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
+		if (!node_online(alloc_node))
+			alloc_node = NUMA_NO_NODE;
+		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
 		if (!wqe)
 			goto err;
 		wq->wqes[node] = wqe;
-		wqe->node = node;
+		wqe->node = alloc_node;
 		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
 		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
 		if (wq->user) {
@@ -1128,7 +1132,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 					task_rlimit(current, RLIMIT_NPROC);
 		}
 		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
-		wqe->node = node;
 		wqe->wq = wq;
 		spin_lock_init(&wqe->lock);
 		INIT_WQ_LIST(&wqe->work_list);

-- 
Jens Axboe



* Re: Kernel BUG when registering the ring
  2020-02-11  3:50     ` Jens Axboe
@ 2020-02-11 13:01       ` Glauber Costa
  2020-02-11 18:58         ` Jens Axboe
  0 siblings, 1 reply; 10+ messages in thread
From: Glauber Costa @ 2020-02-11 13:01 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, Avi Kivity

This works.

Thanks

On Mon, Feb 10, 2020 at 10:50 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 2/10/20 8:45 PM, Glauber Costa wrote:
> > It crashes all the same.
> >
> > New backtrace attached - looks very similar to the old one, although
> > not identical.
>
> I missed the other spot where we do the same thing... Try this.
>
> [...]


* Re: Kernel BUG when registering the ring
  2020-02-11 13:01       ` Glauber Costa
@ 2020-02-11 18:58         ` Jens Axboe
  2020-02-11 19:23           ` Glauber Costa
  2020-02-12 22:31           ` Jann Horn
  0 siblings, 2 replies; 10+ messages in thread
From: Jens Axboe @ 2020-02-11 18:58 UTC (permalink / raw)
  To: Glauber Costa; +Cc: io-uring, Avi Kivity

On 2/11/20 6:01 AM, Glauber Costa wrote:
> This works.

Can you try this one as well?
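
Some context for the refcount fiddling: the manager seeds wq->refs with
one reference per possible node before creating workers, so if we now
skip offline nodes we have to drop the references those workers would
have consumed. Roughly (my recollection of the surrounding 5.5 code,
abbreviated):

static int io_wq_manager(void *data)
{
	struct io_wq *wq = data;
	int workers_to_create = num_possible_nodes();	/* not just online */
	int node;

	/* one ref per expected worker; offline nodes never take theirs */
	refcount_set(&wq->refs, workers_to_create);
	/* ... */
}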


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 182aa17dc2ca..2d741fb76098 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -699,11 +699,16 @@ static int io_wq_manager(void *data)
 	/* create fixed workers */
 	refcount_set(&wq->refs, workers_to_create);
 	for_each_node(node) {
+		if (!node_online(node))
+			continue;
 		if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
 			goto err;
 		workers_to_create--;
 	}
 
+	while (workers_to_create--)
+		refcount_dec(&wq->refs);
+
 	complete(&wq->done);
 
 	while (!kthread_should_stop()) {
@@ -711,6 +716,9 @@ static int io_wq_manager(void *data)
 			struct io_wqe *wqe = wq->wqes[node];
 			bool fork_worker[2] = { false, false };
 
+			if (!node_online(node))
+				continue;
+
 			spin_lock_irq(&wqe->lock);
 			if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
 				fork_worker[IO_WQ_ACCT_BOUND] = true;
@@ -849,6 +857,8 @@ void io_wq_cancel_all(struct io_wq *wq)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
+		if (!node_online(node))
+			continue;
 		io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
 	}
 	rcu_read_unlock();
@@ -929,6 +939,8 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
+		if (!node_online(node))
+			continue;
 		ret = io_wqe_cancel_cb_work(wqe, cancel, data);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1021,6 +1033,8 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
+		if (!node_online(node))
+			continue;
 		ret = io_wqe_cancel_work(wqe, &match);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1050,6 +1064,8 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
+		if (!node_online(node))
+			continue;
 		ret = io_wqe_cancel_work(wqe, &match);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1084,6 +1100,8 @@ void io_wq_flush(struct io_wq *wq)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
+		if (!node_online(node))
+			continue;
 		init_completion(&data.done);
 		INIT_IO_WORK(&data.work, io_wq_flush_func);
 		data.work.flags |= IO_WQ_WORK_INTERNAL;
@@ -1115,12 +1133,15 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 	for_each_node(node) {
 		struct io_wqe *wqe;
+		int alloc_node = node;
 
-		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
+		if (!node_online(alloc_node))
+			alloc_node = NUMA_NO_NODE;
+		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
 		if (!wqe)
 			goto err;
 		wq->wqes[node] = wqe;
-		wqe->node = node;
+		wqe->node = alloc_node;
 		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
 		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
 		if (wq->user) {
@@ -1128,7 +1149,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 					task_rlimit(current, RLIMIT_NPROC);
 		}
 		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
-		wqe->node = node;
 		wqe->wq = wq;
 		spin_lock_init(&wqe->lock);
 		INIT_WQ_LIST(&wqe->work_list);
@@ -1184,8 +1204,11 @@ static void __io_wq_destroy(struct io_wq *wq)
 		kthread_stop(wq->manager);
 
 	rcu_read_lock();
-	for_each_node(node)
+	for_each_node(node) {
+		if (!node_online(node))
+			continue;
 		io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
+	}
 	rcu_read_unlock();
 
 	wait_for_completion(&wq->done);

-- 
Jens Axboe



* Re: Kernel BUG when registering the ring
  2020-02-11 18:58         ` Jens Axboe
@ 2020-02-11 19:23           ` Glauber Costa
  2020-02-11 19:24             ` Jens Axboe
  2020-02-12 22:31           ` Jann Horn
  1 sibling, 1 reply; 10+ messages in thread
From: Glauber Costa @ 2020-02-11 19:23 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, Avi Kivity

Tested-by: Glauber Costa <glauber@scylladb.com>

On Tue, Feb 11, 2020 at 1:58 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 2/11/20 6:01 AM, Glauber Costa wrote:
> > This works.
>
> Can you try this one as well?
>
> [...]


* Re: Kernel BUG when registering the ring
  2020-02-11 19:23           ` Glauber Costa
@ 2020-02-11 19:24             ` Jens Axboe
  0 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2020-02-11 19:24 UTC (permalink / raw)
  To: Glauber Costa; +Cc: io-uring, Avi Kivity

On 2/11/20 12:23 PM, Glauber Costa wrote:
> Tested-by: Glauber Costa <glauber@scylladb.com>

Thanks!

-- 
Jens Axboe



* Re: Kernel BUG when registering the ring
  2020-02-11 18:58         ` Jens Axboe
  2020-02-11 19:23           ` Glauber Costa
@ 2020-02-12 22:31           ` Jann Horn
  2020-02-13 15:20             ` Jens Axboe
  1 sibling, 1 reply; 10+ messages in thread
From: Jann Horn @ 2020-02-12 22:31 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Glauber Costa, io-uring, Avi Kivity

On Tue, Feb 11, 2020 at 7:58 PM Jens Axboe <axboe@kernel.dk> wrote:
[...]
> @@ -849,6 +857,8 @@ void io_wq_cancel_all(struct io_wq *wq)
>         for_each_node(node) {
>                 struct io_wqe *wqe = wq->wqes[node];
>
> +               if (!node_online(node))
> +                       continue;
>                 io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
>         }
>         rcu_read_unlock();

What is this going to do if a NUMA node is marked as offline (through
a call to node_set_offline() from try_offline_node()) while it has a
worker running, and then afterwards, with the worker still running,
io_wq_cancel_all() is executed? Is that going to potentially hang
because some op is still executing on that node's worker? Or is there
a reason why that can't happen?

[...]
> @@ -1084,6 +1100,8 @@ void io_wq_flush(struct io_wq *wq)
>         for_each_node(node) {
>                 struct io_wqe *wqe = wq->wqes[node];
>
> +               if (!node_online(node))
> +                       continue;
>                 init_completion(&data.done);
>                 INIT_IO_WORK(&data.work, io_wq_flush_func);
>                 data.work.flags |= IO_WQ_WORK_INTERNAL;

(io_wq_flush() is dead code since 05f3fb3c5397, right? Are there plans
to use it again?)


* Re: Kernel BUG when registering the ring
  2020-02-12 22:31           ` Jann Horn
@ 2020-02-13 15:20             ` Jens Axboe
  0 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2020-02-13 15:20 UTC (permalink / raw)
  To: Jann Horn; +Cc: Glauber Costa, io-uring, Avi Kivity

On 2/12/20 3:31 PM, Jann Horn wrote:
> On Tue, Feb 11, 2020 at 7:58 PM Jens Axboe <axboe@kernel.dk> wrote:
> [...]
>> @@ -849,6 +857,8 @@ void io_wq_cancel_all(struct io_wq *wq)
>>         for_each_node(node) {
>>                 struct io_wqe *wqe = wq->wqes[node];
>>
>> +               if (!node_online(node))
>> +                       continue;
>>                 io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
>>         }
>>         rcu_read_unlock();
> 
> What is this going to do if a NUMA node is marked as offline (through
> a call to node_set_offline() from try_offline_node()) while it has a
> worker running, and then afterwards, with the worker still running,
> io_wq_cancel_all() is executed? Is that going to potentially hang
> because some op is still executing on that node's worker? Or is there
> a reason why that can't happen?

I folded in this incremental last night; I think it's a better
approach as well.

> [...]
>> @@ -1084,6 +1100,8 @@ void io_wq_flush(struct io_wq *wq)
>>         for_each_node(node) {
>>                 struct io_wqe *wqe = wq->wqes[node];
>>
>> +               if (!node_online(node))
>> +                       continue;
>>                 init_completion(&data.done);
>>                 INIT_IO_WORK(&data.work, io_wq_flush_func);
>>                 data.work.flags |= IO_WQ_WORK_INTERNAL;
> 
> (io_wq_flush() is dead code since 05f3fb3c5397, right? Are there plans
> to use it again?)

It should probably just be removed for now; I generally don't like
carrying dead code. It's easy enough to bring back if we need it,
though I suspect that if we do, we'll make it work like workqueue
flushing, where we guarantee existing work is done at that point.


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2d741fb76098..0a5ab1a8f69a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -837,7 +837,9 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
 
 	list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
 		if (io_worker_get(worker)) {
-			ret = func(worker, data);
+			/* no task if node is/was offline */
+			if (worker->task)
+				ret = func(worker, data);
 			io_worker_release(worker);
 			if (ret)
 				break;
@@ -857,8 +859,6 @@ void io_wq_cancel_all(struct io_wq *wq)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (!node_online(node))
-			continue;
 		io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
 	}
 	rcu_read_unlock();
@@ -939,8 +939,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (!node_online(node))
-			continue;
 		ret = io_wqe_cancel_cb_work(wqe, cancel, data);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1033,8 +1031,6 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (!node_online(node))
-			continue;
 		ret = io_wqe_cancel_work(wqe, &match);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1064,8 +1060,6 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (!node_online(node))
-			continue;
 		ret = io_wqe_cancel_work(wqe, &match);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1204,11 +1198,8 @@ static void __io_wq_destroy(struct io_wq *wq)
 		kthread_stop(wq->manager);
 
 	rcu_read_lock();
-	for_each_node(node) {
-		if (!node_online(node))
-			continue;
+	for_each_node(node)
 		io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
-	}
 	rcu_read_unlock();
 
 	wait_for_completion(&wq->done);

-- 
Jens Axboe



Thread overview: 10+ messages
2020-02-11  1:22 Kernel BUG when registering the ring Glauber Costa
2020-02-11  3:25 ` Jens Axboe
2020-02-11  3:45   ` Glauber Costa
2020-02-11  3:50     ` Jens Axboe
2020-02-11 13:01       ` Glauber Costa
2020-02-11 18:58         ` Jens Axboe
2020-02-11 19:23           ` Glauber Costa
2020-02-11 19:24             ` Jens Axboe
2020-02-12 22:31           ` Jann Horn
2020-02-13 15:20             ` Jens Axboe
