Hi Hans, Sorry to reply so late. Here is the reproducible C program that could trigger the following debug warning: ``` if ((wait_for_completion_killable_timeout(&adap->config_completion, msecs_to_jiffies(10000)) <= 0)) { dprintk(0, "wfc3: %px %d%d%d%d %x\n", adap->kthread_config, adap->is_configuring, adap->is_configured, adap->is_enabled, adap->must_reconfigure, adap->phys_addr); WARN_ON(1); } ``` Output: ``` [ 2147.996471][ T29] kauditd_printk_skb: 2 callbacks suppressed [ 2147.996480][ T29] audit: type=1804 audit(1707782435.859:14): pid=1281266 uid=0 auid=4290 [ 2148.025365][ T29] audit: type=1804 audit(1707782435.879:15): pid=1281266 uid=0 auid=4290 [54569.994174][T772636] cec-vivid-002-vid-cap0: wfc3: 0000000000000000 0110 0 [54569.995754][T772636] WARNING: CPU: 0 PID: 772636 at drivers/media/cec/core/cec-adap.c:1620 [54569.996578][T772636] Modules linked in: [54569.997066][T772636] CPU: 0 PID: 772636 Comm: exe Not tainted 6.8.0-rc1-00169-gecb1b8288d5 [54569.997804][T772636] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-14 [54569.998416][T772636] RIP: 0010:cec_claim_log_addrs+0x29b/0x7c0 [54569.998830][T772636] Code: 7c 08 84 d2 0f 85 50 04 00 00 44 8b 25 ee 33 9f 0b 31 ff 44 89f [54570.000124][T772636] RSP: 0018:ffffc90002117b30 EFLAGS: 00010293 [54570.000549][T772636] RAX: 0000000000000000 RBX: ffff88801d6a0000 RCX: ffffffff816a1959 [54570.001086][T772636] RDX: ffff888028e4b900 RSI: ffffffff8710b1ca RDI: 0000000000000005 [54570.001621][T772636] RBP: ffff88801d6a0638 R08: 0000000000000005 R09: 0000000000000000 [54570.002152][T772636] R10: 0000000080000000 R11: 0000000000000001 R12: 0000000000000000 [54570.002684][T772636] R13: ffff88801d6a07da R14: 0000000000000000 R15: 0000000000000001 [54570.004825][T772636] FS: 00007fee0aee1700(0000) GS:ffff88802ca00000(0000) knlGS:000000000 [54570.005445][T772636] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [54570.005893][T772636] CR2: 00007ffe793b19f0 CR3: 000000002130a000 CR4: 00000000000006f0 [54570.006430][T772636] Call Trace: [54570.006660][T772636] [54570.006871][T772636] ? show_regs+0x96/0xa0 [54570.007197][T772636] ? __warn+0xe6/0x390 [54570.007506][T772636] ? cec_claim_log_addrs+0x29b/0x7c0 [54570.007875][T772636] ? report_bug+0x2dd/0x500 [54570.008203][T772636] ? cec_claim_log_addrs+0x29b/0x7c0 [54570.008573][T772636] ? handle_bug+0x99/0x120 [54570.008891][T772636] ? exc_invalid_op+0x36/0x80 [54570.009220][T772636] ? asm_exc_invalid_op+0x1a/0x20 [54570.009584][T772636] ? __wake_up_klogd.part.0+0x99/0xf0 [54570.009962][T772636] ? cec_claim_log_addrs+0x29a/0x7c0 [54570.010344][T772636] ? cec_claim_log_addrs+0x29b/0x7c0 [54570.010718][T772636] ? cec_adap_enable+0x534/0xbd0 [54570.011065][T772636] __cec_s_log_addrs+0xdfc/0x16e0 [54570.011437][T772636] cec_ioctl+0x1e7c/0x2690 [54570.011778][T772636] ? cec_release+0xbb0/0xbb0 [54570.012107][T772636] ? tomoyo_execute_permission+0x4a0/0x4a0 [54570.012520][T772636] ? __sanitizer_cov_trace_switch+0x54/0x90 [54570.012938][T772636] ? do_vfs_ioctl+0x138/0x16c0 [54570.013493][T772636] ? vfs_fileattr_set+0xc40/0xc40 [54570.013954][T772636] ? lock_downgrade+0x6a0/0x6a0 [54570.014422][T772636] ? bpf_lsm_file_ioctl+0x9/0x10 [54570.014791][T772636] ? cec_release+0xbb0/0xbb0 [54570.015117][T772636] __x64_sys_ioctl+0x19d/0x210 [54570.015466][T772636] do_syscall_64+0xd2/0x250 [54570.015785][T772636] entry_SYSCALL_64_after_hwframe+0x63/0x6b [54570.016191][T772636] RIP: 0033:0x7fee0affaf29 [54570.016502][T772636] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 488 [54570.017779][T772636] RSP: 002b:00007fee0aee0e98 EFLAGS: 00000202 ORIG_RAX: 000000000000000 [54570.018343][T772636] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fee0affaf29 [54570.018872][T772636] RDX: 0000000020000680 RSI: 00000000c05c6104 RDI: 0000000000000004 [54570.019411][T772636] RBP: 00007fee0aee0ec0 R08: 0000000000000000 R09: 0000000000000000 [54570.019941][T772636] R10: 0000000000000000 R11: 0000000000000202 R12: 00007ffe792f9dce [54570.020475][T772636] R13: 00007ffe792f9dcf R14: 00007fee0aee0fc0 R15: 0000000000022000 [54570.021013][T772636] [54570.021234][T772636] Kernel panic - not syncing: kernel: panic_on_warn set ... [54570.021720][T772636] CPU: 0 PID: 772636 Comm: exe Not tainted 6.8.0-rc1-00169-gecb1b8288d5 [54570.022329][T772636] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-14 [54570.022956][T772636] Call Trace: [54570.023192][T772636] [54570.023412][T772636] dump_stack_lvl+0xd9/0x150 [54570.023741][T772636] panic+0x6b9/0x760 [54570.024022][T772636] ? panic_smp_self_stop+0xa0/0xa0 [54570.024388][T772636] ? check_panic_on_warn+0x1f/0xc0 [54570.024746][T772636] ? cec_claim_log_addrs+0x29b/0x7c0 [54570.025111][T772636] check_panic_on_warn+0xb1/0xc0 [54570.025459][T772636] __warn+0xf2/0x390 [54570.025740][T772636] ? cec_claim_log_addrs+0x29b/0x7c0 [54570.026109][T772636] report_bug+0x2dd/0x500 [54570.026418][T772636] ? cec_claim_log_addrs+0x29b/0x7c0 [54570.026788][T772636] handle_bug+0x99/0x120 [54570.027085][T772636] exc_invalid_op+0x36/0x80 [54570.027408][T772636] asm_exc_invalid_op+0x1a/0x20 [54570.027748][T772636] RIP: 0010:cec_claim_log_addrs+0x29b/0x7c0 [54570.028154][T772636] Code: 7c 08 84 d2 0f 85 50 04 00 00 44 8b 25 ee 33 9f 0b 31 ff 44 89f [54570.029431][T772636] RSP: 0018:ffffc90002117b30 EFLAGS: 00010293 [54570.029847][T772636] RAX: 0000000000000000 RBX: ffff88801d6a0000 RCX: ffffffff816a1959 [54570.030380][T772636] RDX: ffff888028e4b900 RSI: ffffffff8710b1ca RDI: 0000000000000005 [54570.030907][T772636] RBP: ffff88801d6a0638 R08: 0000000000000005 R09: 0000000000000000 [54570.031443][T772636] R10: 0000000080000000 R11: 0000000000000001 R12: 0000000000000000 [54570.031972][T772636] R13: ffff88801d6a07da R14: 0000000000000000 R15: 0000000000000001 [54570.032507][T772636] ? __wake_up_klogd.part.0+0x99/0xf0 [54570.032896][T772636] ? cec_claim_log_addrs+0x29a/0x7c0 [54570.033271][T772636] ? cec_adap_enable+0x534/0xbd0 [54570.033620][T772636] __cec_s_log_addrs+0xdfc/0x16e0 [54570.033978][T772636] cec_ioctl+0x1e7c/0x2690 [54570.034296][T772636] ? cec_release+0xbb0/0xbb0 [54570.034623][T772636] ? tomoyo_execute_permission+0x4a0/0x4a0 [54570.035030][T772636] ? __sanitizer_cov_trace_switch+0x54/0x90 [54570.035447][T772636] ? do_vfs_ioctl+0x138/0x16c0 [54570.035782][T772636] ? vfs_fileattr_set+0xc40/0xc40 [54570.036138][T772636] ? lock_downgrade+0x6a0/0x6a0 [54570.036487][T772636] ? bpf_lsm_file_ioctl+0x9/0x10 [54570.036835][T772636] ? cec_release+0xbb0/0xbb0 [54570.037159][T772636] __x64_sys_ioctl+0x19d/0x210 [54570.037497][T772636] do_syscall_64+0xd2/0x250 [54570.037817][T772636] entry_SYSCALL_64_after_hwframe+0x63/0x6b ``` I will collect the programs to trigger another 2 warnings and send you soon. Best, Chenyuan On 2/12/24, 8:42 AM, "Hans Verkuil" > wrote: Hi Chenyuan, On 30/01/2024 15:35, Hans Verkuil wrote: > On 29/01/2024 04:03, Yang, Chenyuan wrote: >> Hi Hans, >> >> Thanks a lot for this new patch! >> >> After applying this new patch in the latest kernel (hash: ecb1b8288dc7ccbdcb3b9df005fa1c0e0c0388a7) and fuzzing with Syzkaller, it seems that the hang still exists. >> To help you better debug it, I attached the covered lines for the fuzz testing and the output of `git diff`. Hope this could help you. >> >> By the way, the syscall descriptions for CEC have been merged into the Syzkaller mainstream: https://urldefense.com/v3/__https://github.com/google/syzkaller/blob/master/sys/linux/dev_cec.txt__;!!DZ3fjg!8zd76_aCk9k_5Rn-mW78tk9IyhZUYovPw2SF9v6Pd5Tof8hm8qocM_NFBwbt0oRdAIH5uE8Ql8ysaoTJrMsn-dbvM9Xwwoo$ . >> >> Let me know if you need further information. >> >> Best, >> Chenyuan > > Here is another patch. This now time outs on all wait_for_completion calls > and reports a WARN_ON and shows additional info. Hopefully this will give me > better insight into what is going on. > > Unfortunately I was unable to reproduce this issue on my VM, so I have to > rely on you to run the test. Did you have time to run the test with this patch? It would be very useful to see the results. Regards, Hans > > Regards, > > Hans > > [PATCH] Test > > Signed-off-by: Hans Verkuil > > --- > diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c > index 5741adf09a2e..b1951eb7f5bd 100644 > --- a/drivers/media/cec/core/cec-adap.c > +++ b/drivers/media/cec/core/cec-adap.c > @@ -935,9 +935,12 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg, > * Release the lock and wait, retake the lock afterwards. > */ > mutex_unlock(&adap->lock); > - wait_for_completion_killable(&data->c); > - if (!data->completed) > - cancel_delayed_work_sync(&data->work); > + if (WARN_ON(wait_for_completion_killable_timeout(&data->c, msecs_to_jiffies(adap->xfer_timeout_ms + 1000)) <= 0)) { > + dprintk(0, "wfc1: %px %d%d%d%d %x\n", adap->kthread_config, > + adap->is_configuring, adap->is_configured, > + adap->is_enabled, adap->must_reconfigure, adap->phys_addr); > + } > + cancel_delayed_work_sync(&data->work); > mutex_lock(&adap->lock); > > /* Cancel the transmit if it was interrupted */ > @@ -1563,10 +1566,12 @@ static int cec_config_thread_func(void *arg) > cec_transmit_msg_fh(adap, &msg, NULL, false); > } > } > + mutex_unlock(&adap->lock); > + call_void_op(adap, configured); > + mutex_lock(&adap->lock); > adap->kthread_config = NULL; > complete(&adap->config_completion); > mutex_unlock(&adap->lock); > - call_void_op(adap, configured); > return 0; > > unconfigure: > @@ -1592,6 +1597,17 @@ static void cec_claim_log_addrs(struct cec_adapter *adap, bool block) > if (WARN_ON(adap->is_configuring || adap->is_configured)) > return; > > + if (adap->kthread_config) { > + mutex_unlock(&adap->lock); > +// wait_for_completion(&adap->config_completion); > + if (WARN_ON(wait_for_completion_killable_timeout(&adap->config_completion, msecs_to_jiffies(10000)) <= 0)) { > + dprintk(0, "wfc2: %px %d%d%d%d %x\n", adap->kthread_config, > + adap->is_configuring, adap->is_configured, > + adap->is_enabled, adap->must_reconfigure, adap->phys_addr); > + } > + mutex_lock(&adap->lock); > + } > + > init_completion(&adap->config_completion); > > /* Ready to kick off the thread */ > @@ -1599,11 +1615,17 @@ static void cec_claim_log_addrs(struct cec_adapter *adap, bool block) > adap->kthread_config = kthread_run(cec_config_thread_func, adap, > "ceccfg-%s", adap->name); > if (IS_ERR(adap->kthread_config)) { > - adap->kthread_config = NULL; > adap->is_configuring = false; > + adap->kthread_config = NULL; > } else if (block) { > mutex_unlock(&adap->lock); > - wait_for_completion(&adap->config_completion); > + //wait_for_completion(&adap->config_completion); > + if (WARN_ON(wait_for_completion_killable_timeout(&adap->config_completion, msecs_to_jiffies(10000)) <= 0)) { > + dprintk(0, "wfc3: %px %d%d%d%d %x\n", adap->kthread_config, > + adap->is_configuring, adap->is_configured, > + adap->is_enabled, adap->must_reconfigure, adap->phys_addr); > + > + } > mutex_lock(&adap->lock); > } > } > diff --git a/drivers/media/cec/core/cec-api.c b/drivers/media/cec/core/cec-api.c > index 67dc79ef1705..d64bb716f9c6 100644 > --- a/drivers/media/cec/core/cec-api.c > +++ b/drivers/media/cec/core/cec-api.c > @@ -664,6 +664,8 @@ static int cec_release(struct inode *inode, struct file *filp) > list_del_init(&data->xfer_list); > } > mutex_unlock(&adap->lock); > + > + mutex_lock(&fh->lock); > while (!list_empty(&fh->msgs)) { > struct cec_msg_entry *entry = > list_first_entry(&fh->msgs, struct cec_msg_entry, list); > @@ -681,6 +683,7 @@ static int cec_release(struct inode *inode, struct file *filp) > kfree(entry); > } > } > + mutex_unlock(&fh->lock); > kfree(fh); > > cec_put_device(devnode); >