linux 2.4.18-5 nfs/rpc problem

* linux 2.4.18-5 nfs/rpc problem
@ 2003-10-08 22:42 Guzovsky, Eduard
  2003-10-10 17:12 ` David Jeffery
  0 siblings, 1 reply; 7+ messages in thread
From: Guzovsky, Eduard @ 2003-10-08 22:42 UTC (permalink / raw)
  To: 'nfs@lists.sourceforge.net'; +Cc: Pariseau, Luc, A.K. Srikanth

[-- Attachment #1: Type: text/plain, Size: 9567 bytes --]

We ran into a deadlock problem running Red Hat linux 2.4.18-5
on a diskless machine. Root file system is mounted over 
udp nfs v3. The problem is created by an application reading,
writing, renaming, deleting files. This machine has 256MB
of memory.

Bellow is some relevant info from the deadlocked machine.

It looks like "rpciod" (see the stack of thread ID 8 in the 
gdb listing) hangs in __wait_on_page(page=0xc1241d7c) trying 
to do nfs_commit_done() on a stale file handle. At the same 
time "kupdated" (see the stack of thread ID 6) is stuck in 
nfs_wait_on_request() while trying to do nfs_writepage()
on the same page rpciod hangs. The problem goes away if the 
file system is mounted with "sync" option. 

Is this a known problem ? Is a fix available ?

Thanks,

-Ed

rpc_show_tasks() output:
------------------------

-pid- proc flgs runs status -client- -prog- --rqstp- -timeout -rpcwait
-action- --exit--
33180 0004 0000 0002 000120 cff38aa0 100003 cff32160 00000000  <NULL>
c02090a0        0
32943 0021 0001 0002 -00116 cff38aa0 100003 cff32b58 00000000  <NULL>
0 c0176d10
32942 0021 0001 0002 000128 cff38aa0 100003 cff32078 00000000   schedq
c02090a0 c0176d10
32941 0021 0001 0002 000128 cff38aa0 100003 cff32418 00000000   schedq
c02090a0 c0176d10
00156 0000 0081 0002 -00110 cff38b20 100003        0 00000000   schedq
c016f890 c016f990
00006 0000 0081 0002 -00110 cff38aa0 100003        0 00000000   schedq
c016f890 c016f990

kernel gdb session:
-------------------

(gdb) info threads
  26 Thread 1064  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  25 Thread 1055  sys_wait4 (pid=0xffffffff, stat_addr=0x0, options=0x0, 
    ru=0x0) at exit.c:649
  24 Thread 855  schedule_timeout (timeout=0xcc1d5f00) at timer.c:879
  23 Thread 825  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  22 Thread 782  schedule_timeout (timeout=0xcc2cbf00) at timer.c:879
  21 Thread 781  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  20 Thread 763  sys_msgrcv (msqid=0x0, msgp=0xbffffc40, msgsz=0x100, 
    msgtyp=0x1, msgflg=0x0) at msg.c:816
  19 Thread 745  schedule_timeout (timeout=0xcc3b3f74) at timer.c:879
  18 Thread 711  schedule_timeout (timeout=0xcc3d1f74) at timer.c:879
  17 Thread 691  schedule_timeout (timeout=0xcc3dff00) at timer.c:879
  16 Thread 673  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  15 Thread 655  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  14 Thread 616  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  13 Thread 582  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  12 Thread 498  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  11 Thread 463  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  10 Thread 445  do_syslog (type=0x2, buf=0x804df20 "u\b^[-bèyS",
len=0xfff)^[-A
    at printk.c:191
  9 Thread 440  schedule_timeout (timeout=0x7fffffff) at timer.c:851
  8 Thread 8  ___wait_on_page (page=0xc1241d7c)
    at /usr/src/linux_2.4.18.5/include/asm/bitops.h:243
  7 Thread 7  interruptible_sleep_on (q=0xc1393784) at sched.c:926
  6 Thread 6  nfs_wait_on_request (req=0xcff27ae0) at pagelist.c:223
  5 Thread 5  interruptible_sleep_on (q=0xc02c6d88) at sched.c:926
  4 Thread 4  schedule_timeout (timeout=0xc13a5f80) at timer.c:879
  3 Thread 3  ksoftirqd (__bind_cpu=0x0) at softirq.c:386
  2 Thread 2  context_thread (startup=0xc02f008c) at context.c:101
  1 Thread 1  schedule_timeout (timeout=0xc1357f00) at timer.c:879

[kupdated] info
---------------
(gdb) thread 6
[Switching to thread 6 (Thread 6)]#0  nfs_wait_on_request (req=0xcff27ae0)
    at pagelist.c:223
(gdb) bt
#0  nfs_wait_on_request (req=0xcff27ae0) at pagelist.c:223
#1  0xc0175636 in nfs_wait_on_requests (inode=0xcb931080, file=0x0, 
    idx_start=0x0, npages=0x1) at write.c:479
#2  0xc0177268 in nfs_sync_file (inode=0xcb931080, file=0x0, idx_start=0x0, 
    npages=0x1, how=0x6) at write.c:1307
#3  0xc0175436 in nfs_writepage (page=0xc1241d7c)
    at /usr/src/linux_2.4.18.5/include/linux/nfs_fs.h:257
#4  0xc0127d14 in filemap_fdatasync (mapping=0xcb931134) at filemap.c:594
#5  0xc0149be1 in sync_unlocked_inodes () at inode.c:224
#6  0xc013b778 in sync_old_buffers () at buffer.c:2801
#7  0xc013ba72 in kupdate (startup=0xc02f0100) at buffer.c:2996
#8  0xc0107306 in kernel_thread (fn=0x202a2, arg=0xc1356000, flags=0x0)
    at process.c:499

(gdb) p *(struct nfs_page *)0xcff27ae0
$3 = {wb_hash = {next = 0xcff27ae0, prev = 0xcff27ae0}, wb_lru = {
    next = 0xcff27848, prev = 0xcff27788}, wb_list = {next = 0xcff27af0, 
    prev = 0xcff27af0}, wb_list_head = 0x0, wb_file = 0x0, 
  wb_inode = 0xcb931080, wb_cred = 0xcff38ba0, wb_page = 0xc1241d7c, 
  wb_wait = {lock = {gcc_is_buggy = 0x0}, task_list = {next = 0xc13a1eb0, 
      prev = 0xc13a1eb0}}, wb_timeout = 0xa785, wb_offset = 0x0, 
  wb_bytes = 0x400, wb_count = 0x3, wb_flags = 0x1, wb_verf = {
    committed = NFS_UNSTABLE, verifier = {0xbf766b3f, 0x87af0400}}}

deadlocking page info
---------------------
(gdb) p *(struct page *)(0xc1241d7c)
$6 = {list = {next = 0xcb931144, prev = 0xcb931144}, mapping = 0xcb931134, 
  index = 0x0, next_hash = 0x0, count = {counter = 0x4}, flags = 0x100004d, 
  lru = {next = 0xc125a468, prev = 0xc123b9dc}, age = 0x5, pte_chain = 0x0, 
  pprev_hash = 0xcffdee60, buffers = 0x0}
(gdb) p *((struct page *)(0xc1241d7c))->mapping
$7 = {clean_pages = {next = 0xcb931134, prev = 0xcb931134}, dirty_pages = {
    next = 0xcb93113c, prev = 0xcb93113c}, locked_pages = {next =
0xc1241d7c, 
    prev = 0xc1241d7c}, nrpages = 0x1, a_ops = 0xc02c8820, host =
0xcb931080, 
  i_mmap = 0x0, i_mmap_shared = 0x0, i_shared_lock = {gcc_is_buggy = 0x0}, 
  gfp_mask = 0x1d2}
(gdb) p *((struct page *)(0xc1241d7c))->mapping->a_ops
$8 = {writepage = 0xc0175400 <nfs_writepage>, 
  readpage = 0xc0174a50 <nfs_readpage>, 
  sync_page = 0xc016f400 <nfs_sync_page>, 
  prepare_write = 0xc016f3e0 <nfs_prepare_write>, 
  commit_write = 0xc016f3f0 <nfs_commit_write>, bmap = 0, flushpage = 0, 
  releasepage = 0, direct_IO = 0, removepage = 0}

[rpciod] info
-------------
(gdb) thread 8
[Switching to thread 8 (Thread 8)]#0  ___wait_on_page (page=0xc1241d7c)
    at /usr/src/linux_2.4.18.5/include/asm/bitops.h:243
(gdb) bt
#0  ___wait_on_page (page=0xc1241d7c)
    at /usr/src/linux_2.4.18.5/include/asm/bitops.h:243
#1  0xc01278d5 in truncate_list_pages (head=0xcb931144, start=0x0, 
    partial=0xcff2df1c)
    at /usr/src/linux_2.4.18.5/include/linux/pagemap.h:97
#2  0xc012796e in truncate_inode_pages (mapping=0xcb931134, lstart=0x0)
    at filemap.c:349
#3  0xc014acb2 in iput (inode=0xcb931080) at inode.c:1086
#4  0xc0176f45 in nfs_commit_done (task=0xcfe284c0) at write.c:353
#5  0xc020c71a in __rpc_execute (task=0xcfe284c0) at sched.c:648
#6  0xc020c909 in __rpc_schedule () at sched.c:733
#7  0xc020d168 in rpciod (ptr=0xc02d954c) at sched.c:1076
#8  0xc0107306 in kernel_thread (fn=0xc0316660 <tcp_hashinfo>,
arg=0xc139fc00, 
    flags=0x0) at process.c:499
#9  0x00000008 in ?? ()

rpc_task (tk_pid 32943) info (note: tk_status = -116 ESTALE)
------------------------------------------------------------
(gdb) p *(struct rpc_task *)0xcfe284c0
$9 = {tk_list = {next = 0xc02d9514, prev = 0xc02d9514}, tk_magic = 0xf00baa,

  tk_task = {next = 0xcfe28c4c, prev = 0xcfe2804c}, tk_client = 0xcff38aa0, 
  tk_rqstp = 0xcff32b58, tk_status = 0xffffff8c, tk_rpcwait = 0x0, tk_msg =
{
    rpc_proc = 0x15, rpc_argp = 0xcfe28550, rpc_resp = 0xcfe285a8, 
    rpc_cred = 0xcff38ba0}, tk_buffer = 0xcff78000, tk_garb_retry = 0x2, 
  tk_cred_retry = 0x2, tk_suid_retry = 0x1, tk_timeout_fn = 0, 
  tk_callback = 0, tk_action = 0, tk_exit = 0xc0176d10 <nfs_commit_done>, 
  tk_release = 0xc0175070 <nfs_writedata_release>, tk_calldata = 0xcfe284c0,

  tk_timer = {list = {next = 0x0, prev = 0x0}, expires = 0xa7c8, 
    data = 0xcfe284c0, function = 0xc020bab0 <rpc_run_timer>}, tk_wait = {
    lock = {gcc_is_buggy = 0x0}, task_list = {next = 0xcfe2852c, 
      prev = 0xcfe2852c}}, tk_timeout = 0x0, tk_flags = 0x1, tk_lock = 0x0, 
  tk_active = 0x1, tk_wakeup = 0x0, tk_runstate = 0x2, tk_pid = 0x80af}
(gdb) p/d 0x80af
$10 = 32943

(gdb) p *((struct rpc_task *)0xcfe284c0)->tk_rqstp
$11 = {rq_xprt = 0xcff32000, rq_timeout = {to_current = 0x46, 
    to_initval = 0x46, to_maxval = 0x1770, to_increment = 0x0, 
    to_resrvval = 0x2ee0, to_retries = 0x3, to_exponential = 0x1}, 
  rq_snd_buf = {io_vec = {{iov_base = 0xcff78000, iov_len = 0x64}, {
        iov_base = 0x0, iov_len = 0x0}, {iov_base = 0x0, iov_len = 0x0}, {
        iov_base = 0x0, iov_len = 0x0}, {iov_base = 0x0, iov_len = 0x0}, {
        iov_base = 0x0, iov_len = 0x0}, {iov_base = 0x0, iov_len = 0x0}, {
        iov_base = 0x0, iov_len = 0x0}, {iov_base = 0x0, iov_len = 0x0}, {
        iov_base = 0x0, iov_len = 0x0}}, io_nr = 0x1, io_len = 0x64}, 
  rq_rcv_buf = {io_vec = {{iov_base = 0xcff78280, iov_len = 0x280}, {
        iov_base = 0x0, iov_len = 0x0}, {iov_base = 0x0, iov_len = 0x0}, {
        iov_base = 0x0, iov_len = 0x0}, {iov_base = 0x0, iov_len = 0x0}, {
        iov_base = 0x0, iov_len = 0x0}, {iov_base = 0x0, iov_len = 0x0}, {
        iov_base = 0x0, iov_len = 0x0}, {iov_base = 0x0, iov_len = 0x0}, {
        iov_base = 0x0, iov_len = 0x0}}, io_nr = 0x1, io_len = 0x280}, 
  rq_task = 0xcfe284c0, rq_xid = 0x424070ad, rq_next = 0x0, rq_cong = 0x0, 
  rq_received = 0x1, rq_creddata = {0x0, 0x0}, rq_bytes_sent = 0x0}

[-- Attachment #2: Type: text/html, Size: 15454 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread