All of lore.kernel.org
 help / color / mirror / Atom feed
* glibc mutex deadlock in signal handler
@ 2015-09-03 11:00 Takashi Iwai
  2015-09-03 18:12 ` Junio C Hamano
  0 siblings, 1 reply; 11+ messages in thread
From: Takashi Iwai @ 2015-09-03 11:00 UTC (permalink / raw)
  To: git

Hi,

we've got a bug report of git-log stall in below:
  https://bugzilla.opensuse.org/show_bug.cgi?id=942297

In short, git-log is left unterminated sometimes when pager is
aborted.  When this happens, git process can't be killed by SIGTERM,
but only by SIGKILL.  And, further investigation revealed the possible
mutex deadlock used in glibc *alloc()/free().

I tried to reproduce this by adding a fault injection in glibc code,
and actually got the similar stack trace as reported.  The problem is
that glibc malloc (in this case realloc() and free()) takes a private
mutex.  Thus calling any function does *alloc() or free() in a signal
handler may deadlock.  In the case of git, it was free() calls in
various cleanup codes and the printf() / strerror() invocations.

Also, basically it's not safe to call fflush() or raise(), either.
But they seem to work practically on the current systems.

Below is a band-aid patch I tested and confirmed to work in the
fault-injection case.  But, some unsafe calls mentioned in the above
are left.  If we want to be in a safer side, we should really limit
the things to do in a signal handler, e.g. only calling close() and
doing waitpid(), I suppose.

Any better ideas?


thanks,

Takashi

---
diff --git a/pager.c b/pager.c
index 27d4c8a17aa1..57dda0142fa9 100644
--- a/pager.c
+++ b/pager.c
@@ -14,19 +14,25 @@
 static const char *pager_argv[] = { NULL, NULL };
 static struct child_process pager_process = CHILD_PROCESS_INIT;
 
-static void wait_for_pager(void)
+static void flush_pager(void)
 {
 	fflush(stdout);
 	fflush(stderr);
 	/* signal EOF to pager */
 	close(1);
 	close(2);
+}
+
+static void wait_for_pager(void)
+{
+	flush_pager();
 	finish_command(&pager_process);
 }
 
 static void wait_for_pager_signal(int signo)
 {
-	wait_for_pager();
+	flush_pager();
+	finish_command_in_signal(&pager_process);
 	sigchain_pop(signo);
 	raise(signo);
 }
diff --git a/run-command.c b/run-command.c
index 3277cf797ed4..a8cdc8f32944 100644
--- a/run-command.c
+++ b/run-command.c
@@ -18,26 +18,27 @@ struct child_to_clean {
 static struct child_to_clean *children_to_clean;
 static int installed_child_cleanup_handler;
 
-static void cleanup_children(int sig)
+static void cleanup_children(int sig, int in_signal)
 {
 	while (children_to_clean) {
 		struct child_to_clean *p = children_to_clean;
 		children_to_clean = p->next;
 		kill(p->pid, sig);
-		free(p);
+		if (!in_signal)
+			free(p);
 	}
 }
 
 static void cleanup_children_on_signal(int sig)
 {
-	cleanup_children(sig);
+	cleanup_children(sig, 1);
 	sigchain_pop(sig);
 	raise(sig);
 }
 
 static void cleanup_children_on_exit(void)
 {
-	cleanup_children(SIGTERM);
+	cleanup_children(SIGTERM, 0);
 }
 
 static void mark_child_for_cleanup(pid_t pid)
@@ -220,7 +221,7 @@ static inline void set_cloexec(int fd)
 		fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
 }
 
-static int wait_or_whine(pid_t pid, const char *argv0)
+static int wait_or_whine(pid_t pid, const char *argv0, int in_signal)
 {
 	int status, code = -1;
 	pid_t waiting;
@@ -231,13 +232,17 @@ static int wait_or_whine(pid_t pid, const char *argv0)
 
 	if (waiting < 0) {
 		failed_errno = errno;
-		error("waitpid for %s failed: %s", argv0, strerror(errno));
+		if (!in_signal)
+			error("waitpid for %s failed: %s", argv0,
+			      strerror(errno));
 	} else if (waiting != pid) {
-		error("waitpid is confused (%s)", argv0);
+		if (!in_signal)
+			error("waitpid is confused (%s)", argv0);
 	} else if (WIFSIGNALED(status)) {
 		code = WTERMSIG(status);
 		if (code != SIGINT && code != SIGQUIT)
-			error("%s died of signal %d", argv0, code);
+			if (!in_signal)
+				error("%s died of signal %d", argv0, code);
 		/*
 		 * This return value is chosen so that code & 0xff
 		 * mimics the exit code that a POSIX shell would report for
@@ -254,10 +259,12 @@ static int wait_or_whine(pid_t pid, const char *argv0)
 			failed_errno = ENOENT;
 		}
 	} else {
-		error("waitpid is confused (%s)", argv0);
+		if (!in_signal)
+			error("waitpid is confused (%s)", argv0);
 	}
 
-	clear_child_for_cleanup(pid);
+	if (!in_signal)
+		clear_child_for_cleanup(pid);
 
 	errno = failed_errno;
 	return code;
@@ -437,7 +444,7 @@ fail_pipe:
 		 * At this point we know that fork() succeeded, but execvp()
 		 * failed. Errors have been reported to our stderr.
 		 */
-		wait_or_whine(cmd->pid, cmd->argv[0]);
+		wait_or_whine(cmd->pid, cmd->argv[0], 0);
 		failed_errno = errno;
 		cmd->pid = -1;
 	}
@@ -536,12 +543,18 @@ fail_pipe:
 
 int finish_command(struct child_process *cmd)
 {
-	int ret = wait_or_whine(cmd->pid, cmd->argv[0]);
+	int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0);
 	argv_array_clear(&cmd->args);
 	argv_array_clear(&cmd->env_array);
 	return ret;
 }
 
+int finish_command_in_signal(struct child_process *cmd)
+{
+	return wait_or_whine(cmd->pid, cmd->argv[0], 1);
+}
+
+
 int run_command(struct child_process *cmd)
 {
 	int code;
@@ -772,7 +785,7 @@ error:
 int finish_async(struct async *async)
 {
 #ifdef NO_PTHREADS
-	return wait_or_whine(async->pid, "child process");
+	return wait_or_whine(async->pid, "child process", 0);
 #else
 	void *ret = (void *)(intptr_t)(-1);
 
diff --git a/run-command.h b/run-command.h
index 5b4425a3cbe1..275d35c442ac 100644
--- a/run-command.h
+++ b/run-command.h
@@ -50,6 +50,7 @@ void child_process_init(struct child_process *);
 
 int start_command(struct child_process *);
 int finish_command(struct child_process *);
+int finish_command_in_signal(struct child_process *);
 int run_command(struct child_process *);
 
 /*

^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2015-09-05  9:00 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-03 11:00 glibc mutex deadlock in signal handler Takashi Iwai
2015-09-03 18:12 ` Junio C Hamano
2015-09-03 19:34   ` Takashi Iwai
2015-09-03 20:55     ` Andreas Schwab
2015-09-04  5:52       ` Takashi Iwai
2015-09-04  9:23         ` Jeff King
2015-09-04  9:35           ` Takashi Iwai
2015-09-04 13:04             ` Jeff King
2015-09-04 13:40               ` Takashi Iwai
2015-09-04 21:56           ` Junio C Hamano
2015-09-05  8:59             ` Jeff King

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.