* net: hang in ip_finish_output
@ 2016-01-15 17:57 Dmitry Vyukov
  2016-01-16  0:20 ` Craig Gallek
  0 siblings, 1 reply; 9+ messages in thread
From: Dmitry Vyukov @ 2016-01-15 17:57 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, LKML
  Cc: syzkaller, Kostya Serebryany, Alexander Potapenko, Eric Dumazet,
	Sasha Levin

Hello,

The following program hangs the kernel in ip_finish_output (if run in a
parallel loop):

// autogenerated by syzkaller (http://github.com/google/syzkaller)
#include <unistd.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdint.h>
#include <pthread.h>

long r[21];

void *thr(void *arg)
{
        switch ((long)arg) {
        case 0:
                r[0] = syscall(SYS_mmap, 0x20000000ul, 0x1d000ul,
0x3ul, 0x32ul, 0xfffffffffffffffful, 0x0ul);
                break;
        case 1:
                r[1] = syscall(SYS_socket, 0x2ul, 0x2ul, 0x0ul, 0, 0, 0);
                break;
        case 2:
                *(uint32_t*)0x20016000 = (uint32_t)0xe4c;
                r[3] = syscall(SYS_setsockopt, r[1], 0x1ul, 0xful,
0x20016000ul, 0x4ul, 0);
                break;
        case 3:
                memcpy((void*)0x20009000,
"\x11\x8d\x57\x62\x7e\xc3\xd8\xb9\xe3\xd8\x63\x4d\x56\x77\x7b\xf1\x3c\x7e\x0c\x91\x62\xca\xc6\x26\xb1\xb9\x11\x34\x6a\xae",
30);
                r[5] = syscall(SYS_pwrite64, r[1], 0x20009000ul,
0x1eul, 0x0ul, 0, 0);
                break;
        case 4:
                memcpy((void*)0x20006f80,
"\x02\x00\x33\xd9\x7f\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
128);
                r[7] = syscall(SYS_bind, r[1], 0x20006f80ul, 0x80ul, 0, 0, 0);
                break;
        case 5:
                memcpy((void*)0x20016f80,
"\x02\x00\x33\xd9\x7f\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
128);
                r[9] = syscall(SYS_connect, r[1], 0x20016f80ul,
0x80ul, 0, 0, 0);
                break;
        case 6:
                *(uint32_t*)0x20011b08 = (uint32_t)0x18;
                *(uint32_t*)0x20011b0c = (uint32_t)0x2;
                *(uint64_t*)0x20011b10 = (uint64_t)0x0;
                *(uint32_t*)0x20011b18 = (uint32_t)0xfffffffffffffffc;
                r[14] = syscall(SYS_write, r[1], 0x20011b08ul, 0x18ul, 0, 0, 0);
                break;
        case 7:
                r[15] = syscall(SYS_getsockopt, r[1], 0x0ul, 0x3ul,
0x2000b000ul, 0x2000bffful, 0);
                break;
        case 8:
                memcpy((void*)0x20013c04,
"\xdb\x4c\xcc\xa8\x07\xbd\xaa\x58\x7c\x57\x37\x63\xa1\x4d\xdb\x5b\x85\x4e\x37\x3b\x20\xb3\x12\xef\x9b\x75\xf0\x88\x28\xa5\x43\x8e\x56\x59\x3c\x16\xfd\xa0\x01\x4f\x90\x83\x4c\x1b\x22\x3e\xd4\xea\x36\x6f\xb5\x43\x96\x02\x8e\x82\xa1\xc6\x47\xd7\xeb\x08\x56\x6f\x40\xb6\x00\x3f\x52\x38\x99\x2f\x57\x63\x9b\xe4\x0e\xb2\x59\xb2\x59\xbc\x9d\x46\xd0\x52\xd4\x91\xe8\xee\x7f\xcf\x81\xa0\xd5\x10\xc4\x77\xf6\xa1\xa1\x35\xb3\xeb\xb5\x46\xfe\xbc\x83\x74\x9f\x78\xa4\xf1\x0b\xf2\x3a\x41\xc3\x2d\x78\x32\x3b\x88\xe9\xb7\x9f\x56",
128);
                r[17] = syscall(SYS_write, r[1], 0x20013c04ul, 0x80ul, 0, 0, 0);
                break;
        case 9:
                r[18] = syscall(SYS_read, r[1], 0x2001afabul, 0x55ul, 0, 0, 0);
                break;
        case 10:
                memcpy((void*)0x200173b8,
"\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
128);
                r[20] = syscall(SYS_recvfrom, r[1], 0x20017000ul,
0xc7ul, 0x0ul, 0x200173b8ul, 0x80ul);
                break;
        }
        return 0;
}

int main()
{
        long i;
        pthread_t th[11];

        memset(r, -1, sizeof(r));
        for (i = 0; i < 11; i++) {
                pthread_create(&th[i], 0, thr, (void*)i);
                usleep(10000);
        }
        for (i = 0; i < 11; i++) {
                pthread_create(&th[i], 0, thr, (void*)i);
                if (i%2==0)
                        usleep(10000);
        }
        usleep(100000);
        return 0;
}


INFO: rcu_sched self-detected stall on CPU
1-...: (1 GPs behind) idle=a8b/140000000000001/0 softirq=83363/83364 fqs=6
(t=26000 jiffies g=29224 c=29223 q=50)
rcu_sched kthread starved for 25980 jiffies! g29224 c29223 f0x0
RCU_GP_WAIT_FQS(3) ->state=0x1
rcu_sched       S ffff88003d827a98 29272     8      2 0x00000000
 ffff88003d827a98 ffff88003ec16d40 ffff88003d80e718 00ffed0007b04f6f
 ffff88003ec20a70 ffff88003ec20a48 ffff88003ec200d8 ffff88003d80df08
 ffff88003ec200c0 ffff880034c517c0 ffff88003d80df00 ffff88003d820000
Call Trace:
 [<ffffffff8625cf27>] schedule+0x97/0x1c0 kernel/sched/core.c:3311
 [<ffffffff86268feb>] schedule_timeout+0x36b/0x920 kernel/time/timer.c:1531
 [<ffffffff814af122>] rcu_gp_kthread+0xad2/0x1b60 kernel/rcu/tree.c:2125
 [<ffffffff813b3b2f>] kthread+0x23f/0x2d0 drivers/block/aoe/aoecmd.c:1303
 [<ffffffff8626c7ef>] ret_from_fork+0x3f/0x70 arch/x86/entry/entry_64.S:468
Task dump for CPU 1:
syz-executor    R  running task    27912 26692  11656 0x0000000a
 ffff880036ebed30 ffff88003ed07230 ffffffff813e7289 0000000000000005
 ffff88003ed200c0 0000000000000001 1ffffffff0ebbab0 ffffffff875dd548
 dffffc0000000000 ffff88003ed07250 ffffffff813ef951 ffffffff875dd500
Call Trace:
 <IRQ>  [<ffffffff813e7289>] sched_show_task+0x269/0x3b0
kernel/sched/core.c:5036
 [<ffffffff813ef951>] dump_cpu_task+0x71/0x90 kernel/sched/core.c:8717
 [<ffffffff814a4c71>] rcu_dump_cpu_stacks+0x181/0x270 kernel/rcu/tree.c:1247
 [<     inline     >] print_cpu_stall kernel/rcu/tree.c:1354
 [<     inline     >] check_cpu_stall kernel/rcu/tree.c:1418
 [<     inline     >] __rcu_pending kernel/rcu/tree.c:3906
 [<     inline     >] rcu_pending kernel/rcu/tree.c:3970
 [<ffffffff814b1b90>] rcu_check_callbacks+0xd40/0x1e10 kernel/rcu/tree.c:2795
 [<ffffffff814c23aa>] update_process_times+0x3a/0x70 kernel/time/timer.c:1420
 [<ffffffff814eb8df>] tick_sched_handle.isra.20+0xaf/0xe0
kernel/time/tick-sched.c:151
 [<ffffffff814ec005>] tick_sched_timer+0x75/0x100 kernel/time/tick-sched.c:1086
 [<     inline     >] __run_hrtimer kernel/time/hrtimer.c:1229
 [<ffffffff814c4173>] __hrtimer_run_queues+0x363/0xc10
kernel/time/hrtimer.c:1293
 [<ffffffff814c6172>] hrtimer_interrupt+0x182/0x430 kernel/time/hrtimer.c:1327
 [<ffffffff8124f052>] local_apic_timer_interrupt+0x72/0xe0
arch/x86/kernel/apic/apic.c:907
 [<ffffffff81252579>] smp_apic_timer_interrupt+0x79/0xa0
arch/x86/kernel/apic/apic.c:931
 [<ffffffff8626d22c>] apic_timer_interrupt+0x8c/0xa0
arch/x86/entry/entry_64.S:520
 [<ffffffff814a1244>] rcu_read_lock_held+0xa4/0xc0 kernel/rcu/update.c:293
 [<ffffffff851bdc78>] reuseport_select_sock+0x4a8/0x10c0
net/core/sock_reuseport.c:211
 [<ffffffff85529032>] udp4_lib_lookup2+0x362/0x7a0 net/ipv4/udp.c:518
 [<ffffffff8552a09d>] __udp4_lib_lookup+0x26d/0xd20 net/ipv4/udp.c:576
 [<     inline     >] __udp4_lib_lookup_skb net/ipv4/udp.c:651
 [<ffffffff85530785>] __udp4_lib_rcv+0x1135/0x23e0 net/ipv4/udp.c:1891
 [<ffffffff85532f91>] udp_rcv+0x21/0x30 net/ipv4/udp.c:2108
 [<ffffffff8545dd03>] ip_local_deliver_finish+0x2b3/0xa50
net/ipv4/ip_input.c:216
 [<     inline     >] NF_HOOK_THRESH include/linux/netfilter.h:226
 [<     inline     >] NF_HOOK include/linux/netfilter.h:249
 [<ffffffff8545eba4>] ip_local_deliver+0x1c4/0x2f0 net/ipv4/ip_input.c:257
 [<     inline     >] dst_input include/net/dst.h:498
 [<ffffffff8545c90c>] ip_rcv_finish+0x5ec/0x1730 net/ipv4/ip_input.c:365
 [<     inline     >] NF_HOOK_THRESH include/linux/netfilter.h:226
 [<     inline     >] NF_HOOK include/linux/netfilter.h:249
 [<ffffffff8545f633>] ip_rcv+0x963/0x1080 net/ipv4/ip_input.c:455
 [<ffffffff85165850>] __netif_receive_skb_core+0x1620/0x2f80 net/core/dev.c:4149
 [<ffffffff851671da>] __netif_receive_skb+0x2a/0x160 net/core/dev.c:4184
 [<ffffffff851675be>] process_backlog+0x2ae/0x820 net/core/dev.c:4765
 [<     inline     >] napi_poll net/core/dev.c:5069
 [<ffffffff8516cc9b>] net_rx_action+0x7eb/0xdf0 net/core/dev.c:5134
 [<ffffffff813615ca>] __do_softirq+0x26a/0x920 kernel/softirq.c:273
 [<ffffffff8626e35c>] do_softirq_own_stack+0x1c/0x30
arch/x86/entry/entry_64.S:875
 <EOI>  [<ffffffff8135fbb4>] do_softirq.part.22+0x104/0x130 kernel/softirq.c:317
 [<     inline     >] do_softirq kernel/softirq.c:165
 [<ffffffff8135fe62>] __local_bh_enable_ip+0x142/0x190 kernel/softirq.c:170
 [<     inline     >] local_bh_enable include/linux/bottom_half.h:31
 [<     inline     >] rcu_read_unlock_bh include/linux/rcupdate.h:970
 [<ffffffff854702c3>] ip_finish_output2+0x703/0x11a0 net/ipv4/ip_output.c:211
 [<ffffffff8547405d>] ip_finish_output+0x7bd/0xd00 net/ipv4/ip_output.c:283
 [<     inline     >] NF_HOOK_COND include/linux/netfilter.h:240
 [<ffffffff85477778>] ip_output+0x218/0x460 net/ipv4/ip_output.c:357
 [<     inline     >] dst_output include/net/dst.h:492
 [<ffffffff85474a3b>] ip_local_out+0xdb/0x1c0 net/ipv4/ip_output.c:115
 [<ffffffff8547a21c>] ip_send_skb+0x3c/0xc0 net/ipv4/ip_output.c:1439
 [<ffffffff85521506>] udp_send_skb+0x326/0xbf0 net/ipv4/udp.c:914
 [<ffffffff8552b43e>] udp_sendmsg+0x87e/0x1e70 net/ipv4/udp.c:1139
 [<ffffffff855590f7>] inet_sendmsg+0x2f7/0x4c0 net/ipv4/af_inet.c:736
 [<     inline     >] sock_sendmsg_nosec net/socket.c:611
 [<ffffffff8510417a>] sock_sendmsg+0xca/0x110 net/socket.c:621
 [<ffffffff851043d6>] sock_write_iter+0x216/0x3a0 net/socket.c:820
 [<     inline     >] new_sync_write fs/read_write.c:517
 [<ffffffff8178d122>] __vfs_write+0x302/0x480 fs/read_write.c:530
 [<ffffffff8178e9c7>] vfs_write+0x167/0x4a0 fs/read_write.c:577
 [<     inline     >] SYSC_write fs/read_write.c:624
 [<ffffffff81791cb1>] SyS_write+0x111/0x220 fs/read_write.c:616
 [<ffffffff8626c436>] entry_SYSCALL_64_fastpath+0x16/0x7a
arch/x86/entry/entry_64.S:185

On commit 67990608c8b95d2b8ccc29932376ae73d5818727 (Jan 12).


* Re: net: hang in ip_finish_output
  2016-01-15 17:57 net: hang in ip_finish_output Dmitry Vyukov
@ 2016-01-16  0:20 ` Craig Gallek
  2016-01-16  7:29   ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Craig Gallek @ 2016-01-16  0:20 UTC (permalink / raw)
  To: Dmitry Vyukov; +Cc: David S. Miller, netdev, LKML

On Fri, Jan 15, 2016 at 12:57 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
> Hello,
>
> The following program hangs kernel in ip_finish_output (if run in a
> parallel loop):
>
> // autogenerated by syzkaller (http://github.com/google/syzkaller)
> #include <unistd.h>
> #include <sys/syscall.h>
> #include <string.h>
> #include <stdint.h>
> #include <pthread.h>
>
> long r[21];
>
> void *thr(void *arg)
> {
>         switch ((long)arg) {
>         case 0:
>                 r[0] = syscall(SYS_mmap, 0x20000000ul, 0x1d000ul,
> 0x3ul, 0x32ul, 0xfffffffffffffffful, 0x0ul);
>                 break;
>         case 1:
>                 r[1] = syscall(SYS_socket, 0x2ul, 0x2ul, 0x0ul, 0, 0, 0);
>                 break;
>         case 2:
>                 *(uint32_t*)0x20016000 = (uint32_t)0xe4c;
>                 r[3] = syscall(SYS_setsockopt, r[1], 0x1ul, 0xful,
> 0x20016000ul, 0x4ul, 0);
>                 break;
>         case 3:
>                 memcpy((void*)0x20009000,
> "\x11\x8d\x57\x62\x7e\xc3\xd8\xb9\xe3\xd8\x63\x4d\x56\x77\x7b\xf1\x3c\x7e\x0c\x91\x62\xca\xc6\x26\xb1\xb9\x11\x34\x6a\xae",
> 30);
>                 r[5] = syscall(SYS_pwrite64, r[1], 0x20009000ul,
> 0x1eul, 0x0ul, 0, 0);
>                 break;
>         case 4:
>                 memcpy((void*)0x20006f80,
> "\x02\x00\x33\xd9\x7f\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
> 128);
>                 r[7] = syscall(SYS_bind, r[1], 0x20006f80ul, 0x80ul, 0, 0, 0);
>                 break;
>         case 5:
>                 memcpy((void*)0x20016f80,
> "\x02\x00\x33\xd9\x7f\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
> 128);
>                 r[9] = syscall(SYS_connect, r[1], 0x20016f80ul,
> 0x80ul, 0, 0, 0);
>                 break;
>         case 6:
>                 *(uint32_t*)0x20011b08 = (uint32_t)0x18;
>                 *(uint32_t*)0x20011b0c = (uint32_t)0x2;
>                 *(uint64_t*)0x20011b10 = (uint64_t)0x0;
>                 *(uint32_t*)0x20011b18 = (uint32_t)0xfffffffffffffffc;
>                 r[14] = syscall(SYS_write, r[1], 0x20011b08ul, 0x18ul, 0, 0, 0);
>                 break;
>         case 7:
>                 r[15] = syscall(SYS_getsockopt, r[1], 0x0ul, 0x3ul,
> 0x2000b000ul, 0x2000bffful, 0);
>                 break;
>         case 8:
>                 memcpy((void*)0x20013c04,
> "\xdb\x4c\xcc\xa8\x07\xbd\xaa\x58\x7c\x57\x37\x63\xa1\x4d\xdb\x5b\x85\x4e\x37\x3b\x20\xb3\x12\xef\x9b\x75\xf0\x88\x28\xa5\x43\x8e\x56\x59\x3c\x16\xfd\xa0\x01\x4f\x90\x83\x4c\x1b\x22\x3e\xd4\xea\x36\x6f\xb5\x43\x96\x02\x8e\x82\xa1\xc6\x47\xd7\xeb\x08\x56\x6f\x40\xb6\x00\x3f\x52\x38\x99\x2f\x57\x63\x9b\xe4\x0e\xb2\x59\xb2\x59\xbc\x9d\x46\xd0\x52\xd4\x91\xe8\xee\x7f\xcf\x81\xa0\xd5\x10\xc4\x77\xf6\xa1\xa1\x35\xb3\xeb\xb5\x46\xfe\xbc\x83\x74\x9f\x78\xa4\xf1\x0b\xf2\x3a\x41\xc3\x2d\x78\x32\x3b\x88\xe9\xb7\x9f\x56",
> 128);
>                 r[17] = syscall(SYS_write, r[1], 0x20013c04ul, 0x80ul, 0, 0, 0);
>                 break;
>         case 9:
>                 r[18] = syscall(SYS_read, r[1], 0x2001afabul, 0x55ul, 0, 0, 0);
>                 break;
>         case 10:
>                 memcpy((void*)0x200173b8,
> "\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
> 128);
>                 r[20] = syscall(SYS_recvfrom, r[1], 0x20017000ul,
> 0xc7ul, 0x0ul, 0x200173b8ul, 0x80ul);
>                 break;
>         }
>         return 0;
> }
>
> int main()
> {
>         long i;
>         pthread_t th[11];
>
>         memset(r, -1, sizeof(r));
>         for (i = 0; i < 11; i++) {
>                 pthread_create(&th[i], 0, thr, (void*)i);
>                 usleep(10000);
>         }
>         for (i = 0; i < 11; i++) {
>                 pthread_create(&th[i], 0, thr, (void*)i);
>                 if (i%2==0)
>                         usleep(10000);
>         }
>         usleep(100000);
>         return 0;
> }
>
>
> INFO: rcu_sched self-detected stall on CPU
> 1-...: (1 GPs behind) idle=a8b/140000000000001/0 softirq=83363/83364 fqs=6
> (t=26000 jiffies g=29224 c=29223 q=50)
> rcu_sched kthread starved for 25980 jiffies! g29224 c29223 f0x0
> RCU_GP_WAIT_FQS(3) ->state=0x1
> rcu_sched       S ffff88003d827a98 29272     8      2 0x00000000
>  ffff88003d827a98 ffff88003ec16d40 ffff88003d80e718 00ffed0007b04f6f
>  ffff88003ec20a70 ffff88003ec20a48 ffff88003ec200d8 ffff88003d80df08
>  ffff88003ec200c0 ffff880034c517c0 ffff88003d80df00 ffff88003d820000
> Call Trace:
>  [<ffffffff8625cf27>] schedule+0x97/0x1c0 kernel/sched/core.c:3311
>  [<ffffffff86268feb>] schedule_timeout+0x36b/0x920 kernel/time/timer.c:1531
>  [<ffffffff814af122>] rcu_gp_kthread+0xad2/0x1b60 kernel/rcu/tree.c:2125
>  [<ffffffff813b3b2f>] kthread+0x23f/0x2d0 drivers/block/aoe/aoecmd.c:1303
>  [<ffffffff8626c7ef>] ret_from_fork+0x3f/0x70 arch/x86/entry/entry_64.S:468
> Task dump for CPU 1:
> syz-executor    R  running task    27912 26692  11656 0x0000000a
>  ffff880036ebed30 ffff88003ed07230 ffffffff813e7289 0000000000000005
>  ffff88003ed200c0 0000000000000001 1ffffffff0ebbab0 ffffffff875dd548
>  dffffc0000000000 ffff88003ed07250 ffffffff813ef951 ffffffff875dd500
> Call Trace:
>  <IRQ>  [<ffffffff813e7289>] sched_show_task+0x269/0x3b0
> kernel/sched/core.c:5036
>  [<ffffffff813ef951>] dump_cpu_task+0x71/0x90 kernel/sched/core.c:8717
>  [<ffffffff814a4c71>] rcu_dump_cpu_stacks+0x181/0x270 kernel/rcu/tree.c:1247
>  [<     inline     >] print_cpu_stall kernel/rcu/tree.c:1354
>  [<     inline     >] check_cpu_stall kernel/rcu/tree.c:1418
>  [<     inline     >] __rcu_pending kernel/rcu/tree.c:3906
>  [<     inline     >] rcu_pending kernel/rcu/tree.c:3970
>  [<ffffffff814b1b90>] rcu_check_callbacks+0xd40/0x1e10 kernel/rcu/tree.c:2795
>  [<ffffffff814c23aa>] update_process_times+0x3a/0x70 kernel/time/timer.c:1420
>  [<ffffffff814eb8df>] tick_sched_handle.isra.20+0xaf/0xe0
> kernel/time/tick-sched.c:151
>  [<ffffffff814ec005>] tick_sched_timer+0x75/0x100 kernel/time/tick-sched.c:1086
>  [<     inline     >] __run_hrtimer kernel/time/hrtimer.c:1229
>  [<ffffffff814c4173>] __hrtimer_run_queues+0x363/0xc10
> kernel/time/hrtimer.c:1293
>  [<ffffffff814c6172>] hrtimer_interrupt+0x182/0x430 kernel/time/hrtimer.c:1327
>  [<ffffffff8124f052>] local_apic_timer_interrupt+0x72/0xe0
> arch/x86/kernel/apic/apic.c:907
>  [<ffffffff81252579>] smp_apic_timer_interrupt+0x79/0xa0
> arch/x86/kernel/apic/apic.c:931
>  [<ffffffff8626d22c>] apic_timer_interrupt+0x8c/0xa0
> arch/x86/entry/entry_64.S:520
>  [<ffffffff814a1244>] rcu_read_lock_held+0xa4/0xc0 kernel/rcu/update.c:293
>  [<ffffffff851bdc78>] reuseport_select_sock+0x4a8/0x10c0
> net/core/sock_reuseport.c:211
>  [<ffffffff85529032>] udp4_lib_lookup2+0x362/0x7a0 net/ipv4/udp.c:518
>  [<ffffffff8552a09d>] __udp4_lib_lookup+0x26d/0xd20 net/ipv4/udp.c:576
>  [<     inline     >] __udp4_lib_lookup_skb net/ipv4/udp.c:651
>  [<ffffffff85530785>] __udp4_lib_rcv+0x1135/0x23e0 net/ipv4/udp.c:1891
>  [<ffffffff85532f91>] udp_rcv+0x21/0x30 net/ipv4/udp.c:2108
>  [<ffffffff8545dd03>] ip_local_deliver_finish+0x2b3/0xa50
> net/ipv4/ip_input.c:216
>  [<     inline     >] NF_HOOK_THRESH include/linux/netfilter.h:226
>  [<     inline     >] NF_HOOK include/linux/netfilter.h:249
>  [<ffffffff8545eba4>] ip_local_deliver+0x1c4/0x2f0 net/ipv4/ip_input.c:257
>  [<     inline     >] dst_input include/net/dst.h:498
>  [<ffffffff8545c90c>] ip_rcv_finish+0x5ec/0x1730 net/ipv4/ip_input.c:365
>  [<     inline     >] NF_HOOK_THRESH include/linux/netfilter.h:226
>  [<     inline     >] NF_HOOK include/linux/netfilter.h:249
>  [<ffffffff8545f633>] ip_rcv+0x963/0x1080 net/ipv4/ip_input.c:455
>  [<ffffffff85165850>] __netif_receive_skb_core+0x1620/0x2f80 net/core/dev.c:4149
>  [<ffffffff851671da>] __netif_receive_skb+0x2a/0x160 net/core/dev.c:4184
>  [<ffffffff851675be>] process_backlog+0x2ae/0x820 net/core/dev.c:4765
>  [<     inline     >] napi_poll net/core/dev.c:5069
>  [<ffffffff8516cc9b>] net_rx_action+0x7eb/0xdf0 net/core/dev.c:5134
>  [<ffffffff813615ca>] __do_softirq+0x26a/0x920 kernel/softirq.c:273
>  [<ffffffff8626e35c>] do_softirq_own_stack+0x1c/0x30
> arch/x86/entry/entry_64.S:875
>  <EOI>  [<ffffffff8135fbb4>] do_softirq.part.22+0x104/0x130 kernel/softirq.c:317
>  [<     inline     >] do_softirq kernel/softirq.c:165
>  [<ffffffff8135fe62>] __local_bh_enable_ip+0x142/0x190 kernel/softirq.c:170
>  [<     inline     >] local_bh_enable include/linux/bottom_half.h:31
>  [<     inline     >] rcu_read_unlock_bh include/linux/rcupdate.h:970
>  [<ffffffff854702c3>] ip_finish_output2+0x703/0x11a0 net/ipv4/ip_output.c:211
>  [<ffffffff8547405d>] ip_finish_output+0x7bd/0xd00 net/ipv4/ip_output.c:283
>  [<     inline     >] NF_HOOK_COND include/linux/netfilter.h:240
>  [<ffffffff85477778>] ip_output+0x218/0x460 net/ipv4/ip_output.c:357
>  [<     inline     >] dst_output include/net/dst.h:492
>  [<ffffffff85474a3b>] ip_local_out+0xdb/0x1c0 net/ipv4/ip_output.c:115
>  [<ffffffff8547a21c>] ip_send_skb+0x3c/0xc0 net/ipv4/ip_output.c:1439
>  [<ffffffff85521506>] udp_send_skb+0x326/0xbf0 net/ipv4/udp.c:914
>  [<ffffffff8552b43e>] udp_sendmsg+0x87e/0x1e70 net/ipv4/udp.c:1139
>  [<ffffffff855590f7>] inet_sendmsg+0x2f7/0x4c0 net/ipv4/af_inet.c:736
>  [<     inline     >] sock_sendmsg_nosec net/socket.c:611
>  [<ffffffff8510417a>] sock_sendmsg+0xca/0x110 net/socket.c:621
>  [<ffffffff851043d6>] sock_write_iter+0x216/0x3a0 net/socket.c:820
>  [<     inline     >] new_sync_write fs/read_write.c:517
>  [<ffffffff8178d122>] __vfs_write+0x302/0x480 fs/read_write.c:530
>  [<ffffffff8178e9c7>] vfs_write+0x167/0x4a0 fs/read_write.c:577
>  [<     inline     >] SYSC_write fs/read_write.c:624
>  [<ffffffff81791cb1>] SyS_write+0x111/0x220 fs/read_write.c:616
>  [<ffffffff8626c436>] entry_SYSCALL_64_fastpath+0x16/0x7a
> arch/x86/entry/entry_64.S:185
>
> On commit 67990608c8b95d2b8ccc29932376ae73d5818727 (Jan 12).

I wasn't able to reproduce this exact stack trace, but I was able to
cause soft lockup messages with a fork bomb of your test program.  It
is certainly related to my recent SO_REUSEPORT change (reverting it
seems to fix the problem).  I haven't completely figured out the exact
cause yet, though.  Could you please post your configuration and
exactly how you are running this 'parallel loop'?

Craig


* Re: net: hang in ip_finish_output
  2016-01-16  0:20 ` Craig Gallek
@ 2016-01-16  7:29   ` Eric Dumazet
  2016-01-18  3:12     ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2016-01-16  7:29 UTC (permalink / raw)
  To: Craig Gallek; +Cc: Dmitry Vyukov, David S. Miller, netdev, LKML

On Fri, 2016-01-15 at 19:20 -0500, Craig Gallek wrote:

> I wasn't able to reproduce this exact stack trace, but I was able to
> cause soft lockup messages with a fork bomb of your test program.  It
> is certainly related to my recent SO_REUSEPORT change (reverting it
> seems to fix the problem).  I haven't completely figured out the exact
> cause yet, though.  Could you please post your configuration and
> exactly how you are running this 'parallel loop'?

There is a problem in the lookup functions (udp4_lib_lookup2() &
__udp4_lib_lookup())

Because of RCU SLAB_DESTROY_BY_RCU semantics (see
Documentation/RCU/rculist_nulls.txt for some details), you should not
call reuseport_select_sock(sk, ...) without taking a stable reference on
the sk socket (and checking the lookup keys again).

This is because sk could be freed, re-used by a totally different UDP
socket on a different port, and the incoming frame(s) could be delivered
on the wrong socket/channel/application :(

Note that we discussed some time ago removing SLAB_DESTROY_BY_RCU for
UDP sockets (and freeing them after an RCU grace period instead) to make
the UDP rx path faster, as we would no longer need to increment/decrement
the socket refcount. This would also remove the added false sharing on
sk_refcnt for the case where the UDP socket serves as a tunnel (up->encap_rcv
being non-NULL).
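
For reference, the lookup pattern that rculist_nulls.txt describes is
roughly the following. This is a sketch only, not actual kernel source;
key_matches() is a hypothetical helper standing in for the real
(saddr, sport, daddr, hnum, dif) comparison:

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		if (!key_matches(sk))
			continue;
		if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
			continue;	/* sk is already on its way to being freed */
		if (unlikely(!key_matches(sk))) {
			sock_put(sk);	/* slab object was recycled for another socket */
			goto begin;
		}
		goto found;		/* stable reference held, keys re-checked */
	}
	/* a nulls value that does not match the slot means the chain moved */
	if (get_nulls_value(node) != slot)
		goto begin;
	sk = NULL;
found:
	rcu_read_unlock();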


* Re: net: hang in ip_finish_output
  2016-01-16  7:29   ` Eric Dumazet
@ 2016-01-18  3:12     ` Eric Dumazet
  2016-01-18 16:21       ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2016-01-18  3:12 UTC (permalink / raw)
  To: Craig Gallek; +Cc: Dmitry Vyukov, David S. Miller, netdev, LKML

On Fri, 2016-01-15 at 23:29 -0800, Eric Dumazet wrote:
> On Fri, 2016-01-15 at 19:20 -0500, Craig Gallek wrote:
> 
> > I wasn't able to reproduce this exact stack trace, but I was able to
> > cause soft lockup messages with a fork bomb of your test program.  It
> > is certainly related to my recent SO_REUSEPORT change (reverting it
> > seems to fix the problem).  I haven't completely figured out the exact
> > cause yet, though.  Could you please post your configuration and
> > exactly how you are running this 'parallel loop'?
> 
> There is a problem in the lookup functions (udp4_lib_lookup2() &
> __udp4_lib_lookup())
> 
> Because of RCU SLAB_DESTROY_BY_RCU semantics (check
> Documentation/RCU/rculist_nulls.txt for some details), you should not
> call reuseport_select_sock(sk, ...) without taking a stable reference on
> the sk socket. (and checking the lookup keys again)
> 
> This is because sk could be freed, re-used by a totally different UDP
> socket on a different port, and the incoming frame(s) could be delivered
> on the wrong socket/channel/application :(
> 
> Note that we discussed some time ago to remove SLAB_DESTROY_BY_RCU for
> UDP sockets (and freeing them after rcu grace period instead), so make
> UDP rx path faster, as we would no longer need to increment/decrement
> the socket refcount. This also would remove the added false sharing on
> sk_refcnt for the case the UDP socket serves as a tunnel (up->encap_rcv
> being non NULL)

Hmm... no, it looks like you do the lookup, refcnt change, re-lookup just
fine.

The problem here is that UDP connected sockets update the
sk->sk_incoming_cpu from __udp_queue_rcv_skb()

This means that we can find the first socket in the hash table with a
matching incoming cpu, and badness == high_score + 1.

Then, reuseport_select_sock() can select another socket from the
array (using BPF or the hash).

We do the atomic_inc_not_zero_hint() to update sk_refcnt on the new
socket, then compute_score2() returns high_score (< badness)

So we loop back to the beginning of udp4_lib_lookup2(), and we loop
forever (as long as the first socket in the hash table still has this
incoming-cpu match).
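
To make the livelock concrete, here is a stand-alone userspace model of
the mechanism described above (an illustration only, with made-up scores;
it is not kernel code):

#include <stdio.h>

struct sock { int score; int reuseport; };

int main(void)
{
	struct sock sk1 = { .score = 7, .reuseport = 1 };	/* incoming-cpu bonus */
	struct sock sk2 = { .score = 6, .reuseport = 1 };
	struct sock *chain[] = { &sk1, &sk2 };
	int restarts;

	for (restarts = 0; restarts < 5; restarts++) {	/* the kernel loop has no cap */
		struct sock *result = NULL;
		int badness = -1;
		int i;

		for (i = 0; i < 2; i++) {
			if (chain[i]->score > badness) {
				result = chain[i];
				badness = chain[i]->score;
				if (result->reuseport) {
					/* reuseport_select_sock() picks sk2 */
					result = &sk2;
					break;
				}
			}
		}
		/* re-check after taking a reference on the selected socket */
		if (result->score >= badness)
			break;		/* never happens with these scores */
		printf("score %d < badness %d, goto begin\n",
		       result->score, badness);
	}
	printf("gave up after %d restarts; the real lookup never gives up\n",
	       restarts);
	return 0;
}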

In short, the recent SO_REUSEPORT changes are not compatible with the
SO_INCOMING_CPU ones, if connected UDP sockets are used.

A fix could be to not check sk_incoming_cpu on connected sockets (it makes
very little sense there, as this option was meant to spread traffic on
UDP _servers_). It also collides with the SO_REUSEPORT notion of a group of
sockets having the same score.

Dmitry, could you test it ? I could not get the trace you reported.

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dc45b538e237..f76cf0ec82b1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -413,6 +413,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
 		if (inet->inet_daddr != saddr)
 			return -1;
 		score += 4;
+	} else if (sk->sk_incoming_cpu == raw_smp_processor_id()) {
+		score++;
 	}
 
 	if (inet->inet_dport) {
@@ -426,8 +428,6 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			return -1;
 		score += 4;
 	}
-	if (sk->sk_incoming_cpu == raw_smp_processor_id())
-		score++;
 	return score;
 }
 
@@ -457,6 +457,8 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 		if (inet->inet_daddr != saddr)
 			return -1;
 		score += 4;
+	} else if (sk->sk_incoming_cpu == raw_smp_processor_id()) {
+		score++;
 	}
 
 	if (inet->inet_dport) {
@@ -470,10 +472,6 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 			return -1;
 		score += 4;
 	}
-
-	if (sk->sk_incoming_cpu == raw_smp_processor_id())
-		score++;
-
 	return score;
 }
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5d2c2afffe7b..0d87b1ded070 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -190,6 +190,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
 		if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
 			return -1;
 		score++;
+	} else if (sk->sk_incoming_cpu == raw_smp_processor_id()) {
+		score++;
 	}
 
 	if (sk->sk_bound_dev_if) {
@@ -197,10 +199,6 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			return -1;
 		score++;
 	}
-
-	if (sk->sk_incoming_cpu == raw_smp_processor_id())
-		score++;
-
 	return score;
 }
 
@@ -233,6 +231,8 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 		if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
 			return -1;
 		score++;
+	} else if (sk->sk_incoming_cpu == raw_smp_processor_id()) {
+		score++;
 	}
 
 	if (sk->sk_bound_dev_if) {
@@ -240,10 +240,6 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 			return -1;
 		score++;
 	}
-
-	if (sk->sk_incoming_cpu == raw_smp_processor_id())
-		score++;
-
 	return score;
 }
 


* Re: net: hang in ip_finish_output
  2016-01-18  3:12     ` Eric Dumazet
@ 2016-01-18 16:21       ` Eric Dumazet
  0 siblings, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2016-01-18 16:21 UTC (permalink / raw)
  To: Craig Gallek; +Cc: Dmitry Vyukov, David S. Miller, netdev, LKML

On Sun, 2016-01-17 at 19:12 -0800, Eric Dumazet wrote:
> On Fri, 2016-01-15 at 23:29 -0800, Eric Dumazet wrote:
> > On Fri, 2016-01-15 at 19:20 -0500, Craig Gallek wrote:
> > 
> > > I wasn't able to reproduce this exact stack trace, but I was able to
> > > cause soft lockup messages with a fork bomb of your test program.  It
> > > is certainly related to my recent SO_REUSEPORT change (reverting it
> > > seems to fix the problem).  I haven't completely figured out the exact
> > > cause yet, though.  Could you please post your configuration and
> > > exactly how you are running this 'parallel loop'?
> > 
> > There is a problem in the lookup functions (udp4_lib_lookup2() &
> > __udp4_lib_lookup())
> > 
> > Because of RCU SLAB_DESTROY_BY_RCU semantics (check
> > Documentation/RCU/rculist_nulls.txt for some details), you should not
> > call reuseport_select_sock(sk, ...) without taking a stable reference on
> > the sk socket. (and checking the lookup keys again)
> > 
> > This is because sk could be freed, re-used by a totally different UDP
> > socket on a different port, and the incoming frame(s) could be delivered
> > on the wrong socket/channel/application :(
> > 
> > Note that we discussed some time ago to remove SLAB_DESTROY_BY_RCU for
> > UDP sockets (and freeing them after rcu grace period instead), so make
> > UDP rx path faster, as we would no longer need to increment/decrement
> > the socket refcount. This also would remove the added false sharing on
> > sk_refcnt for the case the UDP socket serves as a tunnel (up->encap_rcv
> > being non NULL)
> 
> Hmm... not it looks you do the lookup , refcnt change, re-lookup just
> fine.
> 
> The problem here is that UDP connected sockets update the
> sk->sk_incoming_cpu from __udp_queue_rcv_skb()
> 
> This means that we can find the first socket in hash table with a
> matching incoming cpu, and badness == high_score + 1
> 
> Then, the reuseport_select_sock() can selects another socket from the
> array (using bpf or the hash )
> 
> We do the atomic_inc_not_zero_hint() to update sk_refcnt on the new
> socket, then compute_score2() returns high_score (< badness)
> 
> So we loop back to the beginning of udp4_lib_lookup2(), and we loop
> forever (as long as the first socket in hash table has still this match
> about incoming cpu)
> 
> In short, the recent SO_REUSE_PORT changes are not compatible with the
> SO_INCOMING_CPU ones, if connected UDP sockets are used.
> 
> A fix could be to not check sk_incoming_cpu on connected sockets (this
> makes really little sense, as this option was meant to spread traffic on
> UDP _servers_ ). Also it collides with SO_REUSEPORT notion of a group of
> sockets having the same score.
> 
> Dmitry, could you test it ? I could not get the trace you reported.

BTW, it could be that the bug is hard to trigger because of IP early demux:

When connected UDP sockets are used, __udp4_lib_demux_lookup() returns the
first socket found in the hash chain, so all incoming messages should be
delivered on this socket. (The normal reuseport hash/BPF spread does not
happen.)

So, to trigger the bug more easily, we can disable early demux:

echo 0 >/proc/sys/net/ipv4/ip_early_demux

We should also disallow IP early demux on SO_REUSEPORT UDP sockets.

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dc45b538e237..55954094ab17 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2026,7 +2026,8 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 	result = NULL;
 	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
 		if (INET_MATCH(sk, net, acookie,
-			       rmt_addr, loc_addr, ports, dif))
+			       rmt_addr, loc_addr, ports, dif) &&
+		    !sk->sk_reuseport)
 			result = sk;
 		/* Only check first socket in chain */
 		break;


* Re: net: hang in ip_finish_output
  2016-01-19  2:49   ` Eric Dumazet
@ 2016-01-19 16:13     ` Craig Gallek
  0 siblings, 0 replies; 9+ messages in thread
From: Craig Gallek @ 2016-01-19 16:13 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Dmitry Vyukov, David S. Miller, netdev, LKML

On Mon, Jan 18, 2016 at 9:49 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2016-01-18 at 18:20 -0800, Eric Dumazet wrote:
>
>> Same reason really.
>>
>> Right after sk2=socket(), setsockopt(sk2,...,SO_REUSEPORT, on) and
>> bind(sk2, ...), but _before_ the connect(sk2) is done, sk2 is added into
>> the soreuseport array, with a score which is smaller than the score of
>> first socket sk1 found in hash table (I am speaking of the regular UDP
>> hash table), if sk1 had the connect() done, giving a +8 to its score.
>>
>> So the bug has nothing to do with rcu or rcu_bh, it is just an infinite
>> loop caused by different scores.
>>
>>
>> hash bucket [X] -> sk1 -> sk2 -> NULL
>>
>> sk1 score = 14  (because it did a connect())
>> sk2 score = 6
>>
>> I guess we should relax the test done after atomic_inc_not_zero_hint()
>> to only test the base keys :
>> (net, ipv6_only_sock, inet->inet_rcv_saddr & inet->inet_num)
>
> One way to fix the issue it to not call reuseport_select_sock() if loop
> was restarted, and fallback to the old mechanism : If the optimized
> version might have a problem, just fallback to the safe thing.

Ah, yes, this makes complete sense.  Thanks for the clarification.
It's obviously wrong to re-use this fast method in the case where the
loop in the lookup functions begins again.  I verified your patch
against Dmitry's test and it seems to work.  I think it makes sense to
move the 'select_ok = false' lines next to the 'goto begin' line
though.  It makes it more obvious that the fast lookup is incompatible
with the condition the goto handles.
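
In other words, the re-check failure path in udp4_lib_lookup2() (and
similarly in __udp4_lib_lookup()) would become something like this, a
rough sketch of the suggested placement, not the final patch:

	else if (unlikely(compute_score2(result, net, saddr, sport,
					 daddr, hnum, dif) < badness)) {
		select_ok = false;	/* never retry the fast path after a restart */
		sock_put(result);
		goto begin;
	}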

I'll prepare this and the v6 version for review.  Do you think the
change to the scoring function for SO_INCOMING_CPU is still necessary
as well?  Thanks again!


* Re: net: hang in ip_finish_output
  2016-01-19  2:20 ` Eric Dumazet
@ 2016-01-19  2:49   ` Eric Dumazet
  2016-01-19 16:13     ` Craig Gallek
  0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2016-01-19  2:49 UTC (permalink / raw)
  To: Craig Gallek; +Cc: Dmitry Vyukov, David S. Miller, netdev, LKML

On Mon, 2016-01-18 at 18:20 -0800, Eric Dumazet wrote:

> Same reason really.
> 
> Right after sk2=socket(), setsockopt(sk2,...,SO_REUSEPORT, on) and
> bind(sk2, ...), but _before_ the connect(sk2) is done, sk2 is added into
> the soreuseport array, with a score which is smaller than the score of
> first socket sk1 found in hash table (I am speaking of the regular UDP
> hash table), if sk1 had the connect() done, giving a +8 to its score.
> 
> So the bug has nothing to do with rcu or rcu_bh, it is just an infinite
> loop caused by different scores.
> 
> 
> hash bucket [X] -> sk1 -> sk2 -> NULL
> 
> sk1 score = 14  (because it did a connect())
> sk2 score = 6
> 
> I guess we should relax the test done after atomic_inc_not_zero_hint()
> to only test the base keys : 
> (net, ipv6_only_sock, inet->inet_rcv_saddr & inet->inet_num)

One way to fix the issue is to not call reuseport_select_sock() if the loop
was restarted, and fall back to the old mechanism: if the optimized
version might have a problem, just fall back to the safe thing.

IPv4-only patch:

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dc45b538e237..8f1b2a9c90f8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -500,6 +500,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 	struct hlist_nulls_node *node;
 	int score, badness, matches = 0, reuseport = 0;
 	u32 hash = 0;
+	bool select_ok = true;
 
 begin:
 	result = NULL;
@@ -512,14 +513,18 @@ begin:
 			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-				struct sock *sk2;
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
-				sk2 = reuseport_select_sock(sk, hash, skb,
+				if (select_ok) {
+					struct sock *sk2;
+
+					sk2 = reuseport_select_sock(sk, hash, skb,
 							    sizeof(struct udphdr));
-				if (sk2) {
-					result = sk2;
-					goto found;
+					if (sk2) {
+						result = sk2;
+						select_ok = false;
+						goto found;
+					}
 				}
 				matches = 1;
 			}
@@ -564,6 +569,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	int score, badness, matches = 0, reuseport = 0;
 	u32 hash = 0;
+	bool select_ok = true;
 
 	rcu_read_lock();
 	if (hslot->count > 10) {
@@ -601,14 +607,18 @@ begin:
 			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-				struct sock *sk2;
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
-				sk2 = reuseport_select_sock(sk, hash, skb,
+				if (select_ok) {
+					struct sock *sk2;
+
+					sk2 = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
-				if (sk2) {
-					result = sk2;
-					goto found;
+					if (sk2) {
+						select_ok = false;
+						result = sk2;
+						goto found;
+					}
 				}
 				matches = 1;
 			}


* Re: net: hang in ip_finish_output
  2016-01-18 18:33 Craig Gallek
@ 2016-01-19  2:20 ` Eric Dumazet
  2016-01-19  2:49   ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2016-01-19  2:20 UTC (permalink / raw)
  To: Craig Gallek; +Cc: Dmitry Vyukov, David S. Miller, netdev, LKML

On Mon, 2016-01-18 at 13:33 -0500, Craig Gallek wrote:

> Thanks Eric, I'm still scratching my head over this one.  Your patches
> make sense, but I don't think they solve this particular issue.  I was
> still able to trigger the soft lockup with them applied.
> 
> I thought it has something to do with relying on RCU to dereference
> the sk_reuseport_cb pointer from a soft interrupt.  As far as I can
> tell, though, the only difference between rcu_dereference and
> rcu_dereference_bh (and rcu_read_lock and rcu_read_lock_bh) is the
> lock analysis code that gets compiled in in debug mode (in which case
> we should almost certainly be using the bh versions of the rcu
> functions).  However, I can still trigger the soft lookup even when I
> completely remove the RCU functions and use the (racy) raw pointer.
> 
> Below is a stack with your patches applied and the RCU functions
> completely removed.  I'm able to trigger it using a bunch of parallel
> instances of Dmitry's test program running on separate CPU sockets (eg
> for i in `seq 100`; do taskset -c 10,40 /tmp/rcu_stall & done)

Same reason really.

Right after sk2 = socket(), setsockopt(sk2, ..., SO_REUSEPORT, on) and
bind(sk2, ...), but _before_ the connect(sk2) is done, sk2 is added into
the soreuseport array, with a score which is smaller than the score of the
first socket sk1 found in the hash table (I am speaking of the regular UDP
hash table), if sk1 has had its connect() done, giving a +8 to its score.

So the bug has nothing to do with rcu or rcu_bh, it is just an infinite
loop caused by different scores.


hash bucket [X] -> sk1 -> sk2 -> NULL

sk1 score = 14  (because it did a connect())
sk2 score = 6

I guess we should relax the test done after atomic_inc_not_zero_hint()
to only test the base keys : 
(net, ipv6_only_sock, inet->inet_rcv_saddr & inet->inet_num)
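
Roughly, that relaxed re-check would look like this (a sketch only, with a
hypothetical base_keys_match() helper, not a tested patch):

	/* after atomic_inc_not_zero_hint() succeeds on the selected socket */
	if (unlikely(!base_keys_match(result, net, daddr, hnum))) {
		/* only net, ipv6_only_sock(), inet_rcv_saddr and inet_num are
		 * re-checked; a merely lower compute_score2() no longer forces
		 * a restart of the lookup
		 */
		sock_put(result);
		goto begin;
	}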


* Re: net: hang in ip_finish_output
@ 2016-01-18 18:33 Craig Gallek
  2016-01-19  2:20 ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Craig Gallek @ 2016-01-18 18:33 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Dmitry Vyukov, David S. Miller, netdev, LKML

On Sun, Jan 17, 2016 at 10:12 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2016-01-15 at 23:29 -0800, Eric Dumazet wrote:
>> On Fri, 2016-01-15 at 19:20 -0500, Craig Gallek wrote:
>>
>> > I wasn't able to reproduce this exact stack trace, but I was able to
>> > cause soft lockup messages with a fork bomb of your test program.  It
>> > is certainly related to my recent SO_REUSEPORT change (reverting it
>> > seems to fix the problem).  I haven't completely figured out the exact
>> > cause yet, though.  Could you please post your configuration and
>> > exactly how you are running this 'parallel loop'?
>>
>> There is a problem in the lookup functions (udp4_lib_lookup2() &
>> __udp4_lib_lookup())
>>
>> Because of RCU SLAB_DESTROY_BY_RCU semantics (check
>> Documentation/RCU/rculist_nulls.txt for some details), you should not
>> call reuseport_select_sock(sk, ...) without taking a stable reference on
>> the sk socket. (and checking the lookup keys again)
>>
>> This is because sk could be freed, re-used by a totally different UDP
>> socket on a different port, and the incoming frame(s) could be delivered
>> on the wrong socket/channel/application :(
>>
>> Note that we discussed some time ago to remove SLAB_DESTROY_BY_RCU for
>> UDP sockets (and freeing them after rcu grace period instead), so make
>> UDP rx path faster, as we would no longer need to increment/decrement
>> the socket refcount. This also would remove the added false sharing on
>> sk_refcnt for the case the UDP socket serves as a tunnel (up->encap_rcv
>> being non NULL)
>
> Hmm... not it looks you do the lookup , refcnt change, re-lookup just
> fine.
>
> The problem here is that UDP connected sockets update the
> sk->sk_incoming_cpu from __udp_queue_rcv_skb()
>
> This means that we can find the first socket in hash table with a
> matching incoming cpu, and badness == high_score + 1
>
> Then, the reuseport_select_sock() can selects another socket from the
> array (using bpf or the hash )
>
> We do the atomic_inc_not_zero_hint() to update sk_refcnt on the new
> socket, then compute_score2() returns high_score (< badness)
>
> So we loop back to the beginning of udp4_lib_lookup2(), and we loop
> forever (as long as the first socket in hash table has still this match
> about incoming cpu)
>
> In short, the recent SO_REUSE_PORT changes are not compatible with the
> SO_INCOMING_CPU ones, if connected UDP sockets are used.
>
> A fix could be to not check sk_incoming_cpu on connected sockets (this
> makes really little sense, as this option was meant to spread traffic on
> UDP _servers_ ). Also it collides with SO_REUSEPORT notion of a group of
> sockets having the same score.
>
> Dmitry, could you test it ? I could not get the trace you reported.

Thanks Eric, I'm still scratching my head over this one.  Your patches
make sense, but I don't think they solve this particular issue.  I was
still able to trigger the soft lockup with them applied.

I thought it had something to do with relying on RCU to dereference
the sk_reuseport_cb pointer from a soft interrupt.  As far as I can
tell, though, the only difference between rcu_dereference and
rcu_dereference_bh (and rcu_read_lock and rcu_read_lock_bh) is the
lock-analysis code that gets compiled in in debug mode (in which case
we should almost certainly be using the bh versions of the rcu
functions).  However, I can still trigger the soft lockup even when I
completely remove the RCU functions and use the (racy) raw pointer.

Below is a stack with your patches applied and the RCU functions
completely removed.  I'm able to trigger it using a bunch of parallel
instances of Dmitry's test program running on separate CPU sockets (eg
for i in `seq 100`; do taskset -c 10,40 /tmp/rcu_stall & done)

[  508.633008] NMI watchdog: BUG: soft lockup - CPU#40 stuck for 22s! [rcu_stall:17992]
[  508.633010] NMI watchdog: BUG: soft lockup - CPU#10 stuck for 22s! [rcu_stall:14774]
[  508.633022] Modules linked in: w1_therm wire cdc_acm ehci_pci ehci_hcd i2c_i801 mlx4_en ib_uverbs mlx4_ib ib_sa ib_mad ib_core ib_addr mlx4_core xt_TCPMSS xt_tcpmss
[  508.633023] irq event stamp: 162291
[  508.633029] hardirqs last  enabled at (162290): [<ffffffff817263d2>] restore_regs_and_iret+0x0/0x1d
[  508.633031] hardirqs last disabled at (162291): [<ffffffff81726644>] apic_timer_interrupt+0x84/0x90
[  508.633034] softirqs last  enabled at (32): [<ffffffff8164a5e4>] ip_finish_output2+0x214/0x570
[  508.633037] softirqs last disabled at (33): [<ffffffff8172754c>] do_softirq_own_stack+0x1c/0x30
[  508.633039] CPU: 10 PID: 14774 Comm: rcu_stall Tainted: G             L  4.4.0-dbx-DEV #44
[  508.633040] Hardware name: Intel RML,PCH/Iota_QC_19, BIOS 2.32.0 07/01/2015
[  508.633041] task: ffff880f375b8100 ti: ffff880f375bc000 task.ti: ffff880f375bc000
[  508.633044] RIP: 0010:[<ffffffff81676c20>]  [<ffffffff81676c20>] sock_put+0x0/0x20
[  508.633045] RSP: 0018:ffff88103fb43aa0  EFLAGS: 00000217
[  508.633045] RAX: 00000000ffffffff RBX: 0000000000000002 RCX: ffff882023056b40
[  508.633046] RDX: 000000000000d933 RSI: 0000000000000003 RDI: ffff882023056ac0
[  508.633047] RBP: ffff88103fb43b18 R08: 000000000000caf3 R09: 0000000000000002
[  508.633048] R10: ffff88201bb8bac0 R11: 0000000000000002 R12: ffff88201bb8bad8
[  508.633048] R13: ffff882023056ac0 R14: ffffffff81ce7140 R15: 000000000100007f
[  508.633050] FS:  00007f79628c5700(0000) GS:ffff88103fb40000(0000) knlGS:0000000000000000
[  508.633051] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  508.633052] CR2: 0000000020011b08 CR3: 0000000f3ec9e000 CR4: 00000000001406e0
[  508.633052] Stack:
[  508.633054]  ffffffff81679996 0000000000000002 ffff88201bb8bac0 000000000000562f
[  508.633056]  000033d900000000 000000010000caf3 0100007fcaf30246 000033d900000001
[  508.633058]  21da457200000000 ffff88103fb43b08 03fffffffe0739c5 ffffffff81ce7140
[  508.633059] Call Trace:
[  508.633061]  <IRQ>
[  508.633062]  [<ffffffff81679996>] ? udp4_lib_lookup2+0x246/0x320
[  508.633063]  [<ffffffff81679fab>] __udp4_lib_lookup+0x14b/0x5e0
[  508.633065]  [<ffffffff8167b3dd>] __udp4_lib_rcv+0x44d/0x860
[  508.633066]  [<ffffffff81644c3e>] ? ip_local_deliver_finish+0x3e/0x240
[  508.633067]  [<ffffffff8167bf5a>] udp_rcv+0x1a/0x20
[  508.633068]  [<ffffffff81644ca7>] ip_local_deliver_finish+0xa7/0x240
[  508.633069]  [<ffffffff81644c3e>] ? ip_local_deliver_finish+0x3e/0x240
[  508.633071]  [<ffffffff816455d0>] ip_local_deliver+0x60/0xd0
[  508.633072]  [<ffffffff81644c00>] ? inet_add_protocol+0x50/0x50
[  508.633073]  [<ffffffff81644f1d>] ip_rcv_finish+0xdd/0x550
[  508.633074]  [<ffffffff816458b9>] ip_rcv+0x279/0x3d0
[  508.633075]  [<ffffffff81644e40>] ? ip_local_deliver_finish+0x240/0x240
[  508.633079]  [<ffffffff815efa0e>] __netif_receive_skb_core+0x23e/0xae0
[  508.633080]  [<ffffffff815f0400>] ? process_backlog+0xe0/0x230
[  508.633081]  [<ffffffff815f02d1>] __netif_receive_skb+0x21/0x70
[  508.633082]  [<ffffffff815f039b>] process_backlog+0x7b/0x230
[  508.633083]  [<ffffffff815f0400>] ? process_backlog+0xe0/0x230
[  508.633085]  [<ffffffff815f2382>] ? net_rx_action+0xa2/0x3f0
[  508.633086]  [<ffffffff815f2439>] net_rx_action+0x159/0x3f0
[  508.633089]  [<ffffffff810c9b66>] __do_softirq+0xd6/0x420
[  508.633090]  [<ffffffff8164a5e4>] ? ip_finish_output2+0x214/0x570
[  508.633092]  [<ffffffff8172754c>] do_softirq_own_stack+0x1c/0x30
[  508.633093]  <EOI>
[  508.633093]  [<ffffffff810c9f65>] do_softirq+0x55/0x60
[  508.633094]  [<ffffffff810ca018>] __local_bh_enable_ip+0xa8/0xb0

[  508.633095]  [<ffffffff8164a60d>] ip_finish_output2+0x23d/0x570
[  508.633096]  [<ffffffff8164aa79>] ? ip_finish_output+0x139/0x250
[  508.633098]  [<ffffffff8164aa79>] ip_finish_output+0x139/0x250
[  508.633100]  [<ffffffff81637c95>] ? nf_hook_slow+0x5/0x190
[  508.633101]  [<ffffffff8164b7b6>] ip_output+0x76/0x120
[  508.633102]  [<ffffffff8164a940>] ? ip_finish_output2+0x570/0x570
[  508.633104]  [<ffffffff8164acd4>] ip_local_out+0x44/0x90
[  508.633105]  [<ffffffff8164c24d>] ip_send_skb+0x1d/0x50
[  508.633106]  [<ffffffff816774f3>] udp_send_skb+0x183/0x280
[  508.633108]  [<ffffffff815d79fe>] ? sk_dst_check+0x15e/0x1e0
[  508.633108]  [<ffffffff815d78a5>] ? sk_dst_check+0x5/0x1e0
[  508.633110]  [<ffffffff81648c90>] ? ip_setup_cork+0x130/0x130
[  508.633110]  [<ffffffff81678350>] udp_sendmsg+0x2d0/0xac0
[  508.633113]  [<ffffffff816879e5>] ? inet_sendmsg+0xd5/0x1e0
[  508.633114]  [<ffffffff81687915>] ? inet_sendmsg+0x5/0x1e0
[  508.633116]  [<ffffffff81687a28>] inet_sendmsg+0x118/0x1e0
[  508.633117]  [<ffffffff81687915>] ? inet_sendmsg+0x5/0x1e0
[  508.633119]  [<ffffffff815d2e76>] sock_sendmsg+0x46/0x50
[  508.633120]  [<ffffffff815d2f38>] sock_write_iter+0x78/0xd0
[  508.633123]  [<ffffffff8124270a>] __vfs_write+0xaa/0xe0
[  508.633124]  [<ffffffff81243c6a>] vfs_write+0xaa/0x1a0
[  508.633125]  [<ffffffff81243f92>] SyS_write+0x52/0xa0
[  508.633127]  [<ffffffff817258d7>] entry_SYSCALL_64_fastpath+0x12/0x6f
[  508.633140] Code: 55 81 fe 88 00 00 00 48 89 e5 74 10 83 fe 11 74 0b e8 05 69 fd ff 5d 0f 1f 40 00 c3 e8 5a fe ff ff 5d c3 0f 1f 84 00 00 00 00 00 <f0> ff 8f 80 00 00 00 74 01 c3 55 48 89 e5 e8 6d 2d f6 ff 5d c3
[  508.633141] Kernel panic - not syncing: softlockup: hung tasks
[  508.633142] CPU: 10 PID: 14774 Comm: rcu_stall Tainted: G             L  4.4.0-dbx-DEV #44
[  508.633143] Hardware name: Intel RML,PCH/Iota_QC_19, BIOS 2.32.0 07/01/2015
[  508.633144]  00000000000001c3 ffff88103fb43810 ffffffff8171a684 0000000000000001
[  508.633145]  ffffffff81a46e80 ffff88103fb43890 ffffffff8171595c 0000000000000001
[  508.633146]  0000000000000008 ffff88103fb438a0 ffff88103fb43840 ffff88103fb43890
[  508.633146] Call Trace:
[  508.633149]  <IRQ>  [<ffffffff8171a684>] dump_stack+0x4c/0x65
[  508.633152]  [<ffffffff8171595c>] panic+0xca/0x215
[  508.633155]  [<ffffffff8117fdd0>] watchdog_timer_fn+0x1d0/0x1d0
[  508.633157]  [<ffffffff8117fc00>] ? proc_watchdog_common+0x110/0x110
[  508.633159]  [<ffffffff8113ae73>] __hrtimer_run_queues+0x113/0x480
[  508.633160]  [<ffffffff8113b7bb>] hrtimer_interrupt+0xab/0x1c0
[  508.633161]  [<ffffffff817263d2>] ? retint_kernel+0x10/0x10
[  508.633164]  [<ffffffff810a5659>] local_apic_timer_interrupt+0x39/0x60
[  508.633165]  [<ffffffff81728075>] smp_apic_timer_interrupt+0x45/0x60
[  508.633166]  [<ffffffff81726649>] apic_timer_interrupt+0x89/0x90
[  508.633167]  [<ffffffff81676c20>] ? udp_getsockopt+0x30/0x30
[  508.633168]  [<ffffffff81679996>] ? udp4_lib_lookup2+0x246/0x320
[  508.633170]  [<ffffffff81679fab>] __udp4_lib_lookup+0x14b/0x5e0
[  508.633171]  [<ffffffff8167b3dd>] __udp4_lib_rcv+0x44d/0x860
[  508.633172]  [<ffffffff81644c3e>] ? ip_local_deliver_finish+0x3e/0x240
[  508.633173]  [<ffffffff8167bf5a>] udp_rcv+0x1a/0x20
[  508.633174]  [<ffffffff81644ca7>] ip_local_deliver_finish+0xa7/0x240
[  508.633175]  [<ffffffff81644c3e>] ? ip_local_deliver_finish+0x3e/0x240
[  508.633176]  [<ffffffff816455d0>] ip_local_deliver+0x60/0xd0
[  508.633177]  [<ffffffff81644c00>] ? inet_add_protocol+0x50/0x50
[  508.633178]  [<ffffffff81644f1d>] ip_rcv_finish+0xdd/0x550
[  508.633179]  [<ffffffff816458b9>] ip_rcv+0x279/0x3d0
[  508.633180]  [<ffffffff81644e40>] ? ip_local_deliver_finish+0x240/0x240
[  508.633182]  [<ffffffff815efa0e>] __netif_receive_skb_core+0x23e/0xae0
[  508.633183]  [<ffffffff815f0400>] ? process_backlog+0xe0/0x230
[  508.633184]  [<ffffffff815f02d1>] __netif_receive_skb+0x21/0x70
[  508.633185]  [<ffffffff815f039b>] process_backlog+0x7b/0x230

