From mboxrd@z Thu Jan 1 00:00:00 1970 From: Ian Kent Subject: Re: clients suddenly start hanging (was: (no subject)) Date: Fri, 23 May 2008 10:35:00 +0800 Message-ID: <1211510100.15090.27.camel@raven.themaw.net> References: <20080423185018.122C53C3B1@xena.cft.ca.us> <1211083674.3118.5.camel@raven.themaw.net> <20080522214233.7F5D4211250@simba.math.ucla.edu> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <20080522214233.7F5D4211250@simba.math.ucla.edu> List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: autofs-bounces@linux.kernel.org Errors-To: autofs-bounces@linux.kernel.org To: Jim Carter Cc: autofs@linux.kernel.org On Thu, 2008-05-22 at 14:42 -0700, Jim Carter wrote: > Sorry for the delay -- we have a crisis with KDE menus disappearing :-( > Anyway: the new patch did not stop the hanging, unfortunately. Again > upon the first hang gdb was not able to print out the individual thread > tracebacks, but the second time around, it succeeded. But strangely, > the hung daemon thread from the first hang is easily identified, but > strings pertaining to the second hang are not evident. > > I'm calling the patch "take-submount-lock". OK. I've named the patch autofs-5.0.3-lock-submount-before-state-change.patch but it doesn't really matter because it will be folded into the patch tittled "autofs-5.0.3 - fix submount shutdown handling" which currently has the name autofs-5.0.3-submount-shutdown-recovery-5.patch and will likely end up with a 6 when I do this. I'm going to retain this submount locking even though it may not be the source of the problem as I think it's an improvement. However, I'll keep the patch below separate as it (possibly) addresses a different issue. I've named it autofs-5.0.3-expire-thread-create-cond-handling.patch. I'm sorry for all this hassle and I appreciate your patience and efforts to help out very much. But onto the issue. I've looked at this again, on a broader front. I think there are two possible causes. One is that umount(8) is faulting and the expire thread is disappearing because of it. I've seen that happen before so it is certainly possible. But I don't want to add code to work around that yet because it would mask a possible bug in the daemon. The other possibility is my poor pthreads condition handling at expire thread (the thread that does the actual umount) creation. In the early development I changed the code, for both mount and umount thread creation, time and again and ended up with something that appeared to work for around 18 months. But then, recently, I suddenly started seeing a hang with heavy mount activity. So I changed the code again, and although I can understand why it started working with this change, I'm still a little puzzled because I think it really should have continued to function. Anyway, I've made this same alteration to the expire thread creation in case the same thing is happening. The code is very similar in both cases so it's a good candidate. autofs-5.0.3 - fix incorrect pthreads condition handling for expire requests. From: Ian Kent --- daemon/direct.c | 40 +++++++++++++++++++++------------------- daemon/indirect.c | 28 +++++++++++++++------------- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/daemon/direct.c b/daemon/direct.c index 86c817c..4b35aba 100644 --- a/daemon/direct.c +++ b/daemon/direct.c @@ -1052,55 +1052,53 @@ static void expire_mutex_unlock(void *arg) static void *do_expire_direct(void *arg) { - struct pending_args *mt; + struct pending_args *args, mt; struct autofs_point *ap; size_t len; int status, state; - mt = (struct pending_args *) arg; + args = (struct pending_args *) arg; status = pthread_mutex_lock(&ea_mutex); if (status) fatal(status); - ap = mt->ap; + memcpy(&mt, args, sizeof(struct pending_args)); - mt->signaled = 1; - status = pthread_cond_signal(&mt->cond); + ap = mt.ap; + + args->signaled = 1; + status = pthread_cond_signal(&args->cond); if (status) fatal(status); expire_mutex_unlock(NULL); - pthread_cleanup_push(free_pending_args, mt); - pthread_cleanup_push(pending_cond_destroy, mt); - pthread_cleanup_push(expire_send_fail, mt); + pthread_cleanup_push(expire_send_fail, &mt); - len = _strlen(mt->name, KEY_MAX_LEN); + len = _strlen(mt.name, KEY_MAX_LEN); if (!len) { - warn(ap->logopt, "direct key path too long %s", mt->name); + warn(ap->logopt, "direct key path too long %s", mt.name); /* TODO: force umount ?? */ pthread_exit(NULL); } - status = do_expire(ap, mt->name, len); + status = do_expire(ap, mt.name, len); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &state); if (status) - send_fail(ap->logopt, mt->ioctlfd, mt->wait_queue_token); + send_fail(ap->logopt, mt.ioctlfd, mt.wait_queue_token); else { struct mapent *me; - cache_readlock(mt->mc); - me = cache_lookup_distinct(mt->mc, mt->name); + cache_readlock(mt.mc); + me = cache_lookup_distinct(mt.mc, mt.name); me->ioctlfd = -1; - cache_unlock(mt->mc); - send_ready(ap->logopt, mt->ioctlfd, mt->wait_queue_token); - close(mt->ioctlfd); + cache_unlock(mt.mc); + send_ready(ap->logopt, mt.ioctlfd, mt.wait_queue_token); + close(mt.ioctlfd); } pthread_setcancelstate(state, NULL); pthread_cleanup_pop(0); - pthread_cleanup_pop(1); - pthread_cleanup_pop(1); return NULL; } @@ -1196,6 +1194,8 @@ int handle_packet_expire_direct(struct autofs_point *ap, autofs_packet_expire_di cache_unlock(mc); + pthread_cleanup_push(free_pending_args, mt); + pthread_cleanup_push(pending_cond_destroy, mt); pthread_cleanup_push(expire_mutex_unlock, NULL); pthread_setcancelstate(state, NULL); @@ -1207,6 +1207,8 @@ int handle_packet_expire_direct(struct autofs_point *ap, autofs_packet_expire_di } pthread_cleanup_pop(1); + pthread_cleanup_pop(1); + pthread_cleanup_pop(1); return 0; } diff --git a/daemon/indirect.c b/daemon/indirect.c index c1bd3f2..81c7c84 100644 --- a/daemon/indirect.c +++ b/daemon/indirect.c @@ -595,40 +595,38 @@ static void expire_mutex_unlock(void *arg) static void *do_expire_indirect(void *arg) { - struct pending_args *mt; + struct pending_args *args, mt; struct autofs_point *ap; int status, state; - mt = (struct pending_args *) arg; + args = (struct pending_args *) arg; status = pthread_mutex_lock(&ea_mutex); if (status) fatal(status); - ap = mt->ap; + memcpy(&mt, args, sizeof(struct pending_args)); + + ap = mt.ap; - mt->signaled = 1; - status = pthread_cond_signal(&mt->cond); + args->signaled = 1; + status = pthread_cond_signal(&args->cond); if (status) fatal(status); expire_mutex_unlock(NULL); - pthread_cleanup_push(free_pending_args, mt); - pthread_cleanup_push(pending_cond_destroy, mt); - pthread_cleanup_push(expire_send_fail, mt); + pthread_cleanup_push(expire_send_fail, &mt); - status = do_expire(mt->ap, mt->name, mt->len); + status = do_expire(mt.ap, mt.name, mt.len); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &state); if (status) - send_fail(ap->logopt, ap->ioctlfd, mt->wait_queue_token); + send_fail(ap->logopt, ap->ioctlfd, mt.wait_queue_token); else - send_ready(ap->logopt, ap->ioctlfd, mt->wait_queue_token); + send_ready(ap->logopt, ap->ioctlfd, mt.wait_queue_token); pthread_setcancelstate(state, NULL); pthread_cleanup_pop(0); - pthread_cleanup_pop(1); - pthread_cleanup_pop(1); return NULL; } @@ -679,6 +677,8 @@ int handle_packet_expire_indirect(struct autofs_point *ap, autofs_packet_expire_ return 1; } + pthread_cleanup_push(free_pending_args, mt); + pthread_cleanup_push(pending_cond_destroy, mt); pthread_cleanup_push(expire_mutex_unlock, NULL); pthread_setcancelstate(state, NULL); @@ -690,6 +690,8 @@ int handle_packet_expire_indirect(struct autofs_point *ap, autofs_packet_expire_ } pthread_cleanup_pop(1); + pthread_cleanup_pop(1); + pthread_cleanup_pop(1); return 0; }