All of lore.kernel.org
 help / color / mirror / Atom feed
* glibc mutex deadlock in signal handler
@ 2015-09-03 11:00 Takashi Iwai
  2015-09-03 18:12 ` Junio C Hamano
  0 siblings, 1 reply; 11+ messages in thread
From: Takashi Iwai @ 2015-09-03 11:00 UTC (permalink / raw)
  To: git

Hi,

we've got a bug report of git-log stall in below:
  https://bugzilla.opensuse.org/show_bug.cgi?id=942297

In short, git-log is left unterminated sometimes when pager is
aborted.  When this happens, git process can't be killed by SIGTERM,
but only by SIGKILL.  And, further investigation revealed the possible
mutex deadlock used in glibc *alloc()/free().

I tried to reproduce this by adding a fault injection in glibc code,
and actually got the similar stack trace as reported.  The problem is
that glibc malloc (in this case realloc() and free()) takes a private
mutex.  Thus calling any function does *alloc() or free() in a signal
handler may deadlock.  In the case of git, it was free() calls in
various cleanup codes and the printf() / strerror() invocations.

Also, basically it's not safe to call fflush() or raise(), either.
But they seem to work practically on the current systems.

Below is a band-aid patch I tested and confirmed to work in the
fault-injection case.  But, some unsafe calls mentioned in the above
are left.  If we want to be in a safer side, we should really limit
the things to do in a signal handler, e.g. only calling close() and
doing waitpid(), I suppose.

Any better ideas?


thanks,

Takashi

---
diff --git a/pager.c b/pager.c
index 27d4c8a17aa1..57dda0142fa9 100644
--- a/pager.c
+++ b/pager.c
@@ -14,19 +14,25 @@
 static const char *pager_argv[] = { NULL, NULL };
 static struct child_process pager_process = CHILD_PROCESS_INIT;
 
-static void wait_for_pager(void)
+static void flush_pager(void)
 {
 	fflush(stdout);
 	fflush(stderr);
 	/* signal EOF to pager */
 	close(1);
 	close(2);
+}
+
+static void wait_for_pager(void)
+{
+	flush_pager();
 	finish_command(&pager_process);
 }
 
 static void wait_for_pager_signal(int signo)
 {
-	wait_for_pager();
+	flush_pager();
+	finish_command_in_signal(&pager_process);
 	sigchain_pop(signo);
 	raise(signo);
 }
diff --git a/run-command.c b/run-command.c
index 3277cf797ed4..a8cdc8f32944 100644
--- a/run-command.c
+++ b/run-command.c
@@ -18,26 +18,27 @@ struct child_to_clean {
 static struct child_to_clean *children_to_clean;
 static int installed_child_cleanup_handler;
 
-static void cleanup_children(int sig)
+static void cleanup_children(int sig, int in_signal)
 {
 	while (children_to_clean) {
 		struct child_to_clean *p = children_to_clean;
 		children_to_clean = p->next;
 		kill(p->pid, sig);
-		free(p);
+		if (!in_signal)
+			free(p);
 	}
 }
 
 static void cleanup_children_on_signal(int sig)
 {
-	cleanup_children(sig);
+	cleanup_children(sig, 1);
 	sigchain_pop(sig);
 	raise(sig);
 }
 
 static void cleanup_children_on_exit(void)
 {
-	cleanup_children(SIGTERM);
+	cleanup_children(SIGTERM, 0);
 }
 
 static void mark_child_for_cleanup(pid_t pid)
@@ -220,7 +221,7 @@ static inline void set_cloexec(int fd)
 		fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
 }
 
-static int wait_or_whine(pid_t pid, const char *argv0)
+static int wait_or_whine(pid_t pid, const char *argv0, int in_signal)
 {
 	int status, code = -1;
 	pid_t waiting;
@@ -231,13 +232,17 @@ static int wait_or_whine(pid_t pid, const char *argv0)
 
 	if (waiting < 0) {
 		failed_errno = errno;
-		error("waitpid for %s failed: %s", argv0, strerror(errno));
+		if (!in_signal)
+			error("waitpid for %s failed: %s", argv0,
+			      strerror(errno));
 	} else if (waiting != pid) {
-		error("waitpid is confused (%s)", argv0);
+		if (!in_signal)
+			error("waitpid is confused (%s)", argv0);
 	} else if (WIFSIGNALED(status)) {
 		code = WTERMSIG(status);
 		if (code != SIGINT && code != SIGQUIT)
-			error("%s died of signal %d", argv0, code);
+			if (!in_signal)
+				error("%s died of signal %d", argv0, code);
 		/*
 		 * This return value is chosen so that code & 0xff
 		 * mimics the exit code that a POSIX shell would report for
@@ -254,10 +259,12 @@ static int wait_or_whine(pid_t pid, const char *argv0)
 			failed_errno = ENOENT;
 		}
 	} else {
-		error("waitpid is confused (%s)", argv0);
+		if (!in_signal)
+			error("waitpid is confused (%s)", argv0);
 	}
 
-	clear_child_for_cleanup(pid);
+	if (!in_signal)
+		clear_child_for_cleanup(pid);
 
 	errno = failed_errno;
 	return code;
@@ -437,7 +444,7 @@ fail_pipe:
 		 * At this point we know that fork() succeeded, but execvp()
 		 * failed. Errors have been reported to our stderr.
 		 */
-		wait_or_whine(cmd->pid, cmd->argv[0]);
+		wait_or_whine(cmd->pid, cmd->argv[0], 0);
 		failed_errno = errno;
 		cmd->pid = -1;
 	}
@@ -536,12 +543,18 @@ fail_pipe:
 
 int finish_command(struct child_process *cmd)
 {
-	int ret = wait_or_whine(cmd->pid, cmd->argv[0]);
+	int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0);
 	argv_array_clear(&cmd->args);
 	argv_array_clear(&cmd->env_array);
 	return ret;
 }
 
+int finish_command_in_signal(struct child_process *cmd)
+{
+	return wait_or_whine(cmd->pid, cmd->argv[0], 1);
+}
+
+
 int run_command(struct child_process *cmd)
 {
 	int code;
@@ -772,7 +785,7 @@ error:
 int finish_async(struct async *async)
 {
 #ifdef NO_PTHREADS
-	return wait_or_whine(async->pid, "child process");
+	return wait_or_whine(async->pid, "child process", 0);
 #else
 	void *ret = (void *)(intptr_t)(-1);
 
diff --git a/run-command.h b/run-command.h
index 5b4425a3cbe1..275d35c442ac 100644
--- a/run-command.h
+++ b/run-command.h
@@ -50,6 +50,7 @@ void child_process_init(struct child_process *);
 
 int start_command(struct child_process *);
 int finish_command(struct child_process *);
+int finish_command_in_signal(struct child_process *);
 int run_command(struct child_process *);
 
 /*

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-03 11:00 glibc mutex deadlock in signal handler Takashi Iwai
@ 2015-09-03 18:12 ` Junio C Hamano
  2015-09-03 19:34   ` Takashi Iwai
  0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2015-09-03 18:12 UTC (permalink / raw)
  To: Takashi Iwai; +Cc: git, Jeff King

Takashi Iwai <tiwai@suse.de> writes:

> we've got a bug report of git-log stall in below:
>   https://bugzilla.opensuse.org/show_bug.cgi?id=942297
>
> In short, git-log is left unterminated sometimes when pager is
> aborted.  When this happens, git process can't be killed by SIGTERM,
> but only by SIGKILL.  And, further investigation revealed the possible
> mutex deadlock used in glibc *alloc()/free().
>
> I tried to reproduce this by adding a fault injection in glibc code,
> and actually got the similar stack trace as reported.  The problem is
> that glibc malloc (in this case realloc() and free()) takes a private
> mutex.  Thus calling any function does *alloc() or free() in a signal
> handler may deadlock.  In the case of git, it was free() calls in
> various cleanup codes and the printf() / strerror() invocations.
>
> Also, basically it's not safe to call fflush() or raise(), either.
> But they seem to work practically on the current systems.
>
> Below is a band-aid patch I tested and confirmed to work in the
> fault-injection case.  But, some unsafe calls mentioned in the above
> are left.  If we want to be in a safer side, we should really limit
> the things to do in a signal handler, e.g. only calling close() and
> doing waitpid(), I suppose.
>
> Any better ideas?

Sorry, but I am not with better ideas (at least offhand right now).

Essentially the idea seems to be to avoid calling allocation-related
functions in the signal handler codepath that is used for cleaning
things up.  Not calling free() in the codepaths is perfectly fine as
we know we will be soon exiting anyway.  Not being able to call
error() and strerror() to report funnies (other than the fact that
we were interrupted) is somewhat sad, though.




> diff --git a/pager.c b/pager.c
> index 27d4c8a17aa1..57dda0142fa9 100644
> --- a/pager.c
> +++ b/pager.c
> @@ -14,19 +14,25 @@
>  static const char *pager_argv[] = { NULL, NULL };
>  static struct child_process pager_process = CHILD_PROCESS_INIT;
>  
> -static void wait_for_pager(void)
> +static void flush_pager(void)
>  {
>  	fflush(stdout);
>  	fflush(stderr);
>  	/* signal EOF to pager */
>  	close(1);
>  	close(2);
> +}
> +
> +static void wait_for_pager(void)
> +{
> +	flush_pager();
>  	finish_command(&pager_process);
>  }
>  
>  static void wait_for_pager_signal(int signo)
>  {
> -	wait_for_pager();
> +	flush_pager();
> +	finish_command_in_signal(&pager_process);
>  	sigchain_pop(signo);
>  	raise(signo);
>  }
> diff --git a/run-command.c b/run-command.c
> index 3277cf797ed4..a8cdc8f32944 100644
> --- a/run-command.c
> +++ b/run-command.c
> @@ -18,26 +18,27 @@ struct child_to_clean {
>  static struct child_to_clean *children_to_clean;
>  static int installed_child_cleanup_handler;
>  
> -static void cleanup_children(int sig)
> +static void cleanup_children(int sig, int in_signal)
>  {
>  	while (children_to_clean) {
>  		struct child_to_clean *p = children_to_clean;
>  		children_to_clean = p->next;
>  		kill(p->pid, sig);
> -		free(p);
> +		if (!in_signal)
> +			free(p);
>  	}
>  }
>  
>  static void cleanup_children_on_signal(int sig)
>  {
> -	cleanup_children(sig);
> +	cleanup_children(sig, 1);
>  	sigchain_pop(sig);
>  	raise(sig);
>  }
>  
>  static void cleanup_children_on_exit(void)
>  {
> -	cleanup_children(SIGTERM);
> +	cleanup_children(SIGTERM, 0);
>  }
>  
>  static void mark_child_for_cleanup(pid_t pid)
> @@ -220,7 +221,7 @@ static inline void set_cloexec(int fd)
>  		fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
>  }
>  
> -static int wait_or_whine(pid_t pid, const char *argv0)
> +static int wait_or_whine(pid_t pid, const char *argv0, int in_signal)
>  {
>  	int status, code = -1;
>  	pid_t waiting;
> @@ -231,13 +232,17 @@ static int wait_or_whine(pid_t pid, const char *argv0)
>  
>  	if (waiting < 0) {
>  		failed_errno = errno;
> -		error("waitpid for %s failed: %s", argv0, strerror(errno));
> +		if (!in_signal)
> +			error("waitpid for %s failed: %s", argv0,
> +			      strerror(errno));
>  	} else if (waiting != pid) {
> -		error("waitpid is confused (%s)", argv0);
> +		if (!in_signal)
> +			error("waitpid is confused (%s)", argv0);
>  	} else if (WIFSIGNALED(status)) {
>  		code = WTERMSIG(status);
>  		if (code != SIGINT && code != SIGQUIT)
> -			error("%s died of signal %d", argv0, code);
> +			if (!in_signal)
> +				error("%s died of signal %d", argv0, code);
>  		/*
>  		 * This return value is chosen so that code & 0xff
>  		 * mimics the exit code that a POSIX shell would report for
> @@ -254,10 +259,12 @@ static int wait_or_whine(pid_t pid, const char *argv0)
>  			failed_errno = ENOENT;
>  		}
>  	} else {
> -		error("waitpid is confused (%s)", argv0);
> +		if (!in_signal)
> +			error("waitpid is confused (%s)", argv0);
>  	}
>  
> -	clear_child_for_cleanup(pid);
> +	if (!in_signal)
> +		clear_child_for_cleanup(pid);
>  
>  	errno = failed_errno;
>  	return code;
> @@ -437,7 +444,7 @@ fail_pipe:
>  		 * At this point we know that fork() succeeded, but execvp()
>  		 * failed. Errors have been reported to our stderr.
>  		 */
> -		wait_or_whine(cmd->pid, cmd->argv[0]);
> +		wait_or_whine(cmd->pid, cmd->argv[0], 0);
>  		failed_errno = errno;
>  		cmd->pid = -1;
>  	}
> @@ -536,12 +543,18 @@ fail_pipe:
>  
>  int finish_command(struct child_process *cmd)
>  {
> -	int ret = wait_or_whine(cmd->pid, cmd->argv[0]);
> +	int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0);
>  	argv_array_clear(&cmd->args);
>  	argv_array_clear(&cmd->env_array);
>  	return ret;
>  }
>  
> +int finish_command_in_signal(struct child_process *cmd)
> +{
> +	return wait_or_whine(cmd->pid, cmd->argv[0], 1);
> +}
> +
> +
>  int run_command(struct child_process *cmd)
>  {
>  	int code;
> @@ -772,7 +785,7 @@ error:
>  int finish_async(struct async *async)
>  {
>  #ifdef NO_PTHREADS
> -	return wait_or_whine(async->pid, "child process");
> +	return wait_or_whine(async->pid, "child process", 0);
>  #else
>  	void *ret = (void *)(intptr_t)(-1);
>  
> diff --git a/run-command.h b/run-command.h
> index 5b4425a3cbe1..275d35c442ac 100644
> --- a/run-command.h
> +++ b/run-command.h
> @@ -50,6 +50,7 @@ void child_process_init(struct child_process *);
>  
>  int start_command(struct child_process *);
>  int finish_command(struct child_process *);
> +int finish_command_in_signal(struct child_process *);
>  int run_command(struct child_process *);
>  
>  /*

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-03 18:12 ` Junio C Hamano
@ 2015-09-03 19:34   ` Takashi Iwai
  2015-09-03 20:55     ` Andreas Schwab
  0 siblings, 1 reply; 11+ messages in thread
From: Takashi Iwai @ 2015-09-03 19:34 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Jeff King

On Thu, 03 Sep 2015 20:12:38 +0200,
Junio C Hamano wrote:
> 
> Takashi Iwai <tiwai@suse.de> writes:
> 
> > we've got a bug report of git-log stall in below:
> >   https://bugzilla.opensuse.org/show_bug.cgi?id=942297
> >
> > In short, git-log is left unterminated sometimes when pager is
> > aborted.  When this happens, git process can't be killed by SIGTERM,
> > but only by SIGKILL.  And, further investigation revealed the possible
> > mutex deadlock used in glibc *alloc()/free().
> >
> > I tried to reproduce this by adding a fault injection in glibc code,
> > and actually got the similar stack trace as reported.  The problem is
> > that glibc malloc (in this case realloc() and free()) takes a private
> > mutex.  Thus calling any function does *alloc() or free() in a signal
> > handler may deadlock.  In the case of git, it was free() calls in
> > various cleanup codes and the printf() / strerror() invocations.
> >
> > Also, basically it's not safe to call fflush() or raise(), either.
> > But they seem to work practically on the current systems.
> >
> > Below is a band-aid patch I tested and confirmed to work in the
> > fault-injection case.  But, some unsafe calls mentioned in the above
> > are left.  If we want to be in a safer side, we should really limit
> > the things to do in a signal handler, e.g. only calling close() and
> > doing waitpid(), I suppose.
> >
> > Any better ideas?
> 
> Sorry, but I am not with better ideas (at least offhand right now).
> 
> Essentially the idea seems to be to avoid calling allocation-related
> functions in the signal handler codepath that is used for cleaning
> things up.  Not calling free() in the codepaths is perfectly fine as
> we know we will be soon exiting anyway.  Not being able to call
> error() and strerror() to report funnies (other than the fact that
> we were interrupted) is somewhat sad, though.

Reading signal(7) again, I correct some of my statement: raise() is
safe to use.  But fflush() still isn't.  Maybe fflush() should be
avoided but just call only close().

Regarding the error message: write() is still safe to use, so it is
possible in some level.  strerror() seems invoking malloc(), judging
from the stack trace in bugzilla, so this needs to be avoided.
printf() is a blackbox, so it's hard to tell.  That is, in the worst
case, we can just call write() directly for serious errors, if any.


Takashi

> 
> 
> 
> 
> > diff --git a/pager.c b/pager.c
> > index 27d4c8a17aa1..57dda0142fa9 100644
> > --- a/pager.c
> > +++ b/pager.c
> > @@ -14,19 +14,25 @@
> >  static const char *pager_argv[] = { NULL, NULL };
> >  static struct child_process pager_process = CHILD_PROCESS_INIT;
> >  
> > -static void wait_for_pager(void)
> > +static void flush_pager(void)
> >  {
> >  	fflush(stdout);
> >  	fflush(stderr);
> >  	/* signal EOF to pager */
> >  	close(1);
> >  	close(2);
> > +}
> > +
> > +static void wait_for_pager(void)
> > +{
> > +	flush_pager();
> >  	finish_command(&pager_process);
> >  }
> >  
> >  static void wait_for_pager_signal(int signo)
> >  {
> > -	wait_for_pager();
> > +	flush_pager();
> > +	finish_command_in_signal(&pager_process);
> >  	sigchain_pop(signo);
> >  	raise(signo);
> >  }
> > diff --git a/run-command.c b/run-command.c
> > index 3277cf797ed4..a8cdc8f32944 100644
> > --- a/run-command.c
> > +++ b/run-command.c
> > @@ -18,26 +18,27 @@ struct child_to_clean {
> >  static struct child_to_clean *children_to_clean;
> >  static int installed_child_cleanup_handler;
> >  
> > -static void cleanup_children(int sig)
> > +static void cleanup_children(int sig, int in_signal)
> >  {
> >  	while (children_to_clean) {
> >  		struct child_to_clean *p = children_to_clean;
> >  		children_to_clean = p->next;
> >  		kill(p->pid, sig);
> > -		free(p);
> > +		if (!in_signal)
> > +			free(p);
> >  	}
> >  }
> >  
> >  static void cleanup_children_on_signal(int sig)
> >  {
> > -	cleanup_children(sig);
> > +	cleanup_children(sig, 1);
> >  	sigchain_pop(sig);
> >  	raise(sig);
> >  }
> >  
> >  static void cleanup_children_on_exit(void)
> >  {
> > -	cleanup_children(SIGTERM);
> > +	cleanup_children(SIGTERM, 0);
> >  }
> >  
> >  static void mark_child_for_cleanup(pid_t pid)
> > @@ -220,7 +221,7 @@ static inline void set_cloexec(int fd)
> >  		fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
> >  }
> >  
> > -static int wait_or_whine(pid_t pid, const char *argv0)
> > +static int wait_or_whine(pid_t pid, const char *argv0, int in_signal)
> >  {
> >  	int status, code = -1;
> >  	pid_t waiting;
> > @@ -231,13 +232,17 @@ static int wait_or_whine(pid_t pid, const char *argv0)
> >  
> >  	if (waiting < 0) {
> >  		failed_errno = errno;
> > -		error("waitpid for %s failed: %s", argv0, strerror(errno));
> > +		if (!in_signal)
> > +			error("waitpid for %s failed: %s", argv0,
> > +			      strerror(errno));
> >  	} else if (waiting != pid) {
> > -		error("waitpid is confused (%s)", argv0);
> > +		if (!in_signal)
> > +			error("waitpid is confused (%s)", argv0);
> >  	} else if (WIFSIGNALED(status)) {
> >  		code = WTERMSIG(status);
> >  		if (code != SIGINT && code != SIGQUIT)
> > -			error("%s died of signal %d", argv0, code);
> > +			if (!in_signal)
> > +				error("%s died of signal %d", argv0, code);
> >  		/*
> >  		 * This return value is chosen so that code & 0xff
> >  		 * mimics the exit code that a POSIX shell would report for
> > @@ -254,10 +259,12 @@ static int wait_or_whine(pid_t pid, const char *argv0)
> >  			failed_errno = ENOENT;
> >  		}
> >  	} else {
> > -		error("waitpid is confused (%s)", argv0);
> > +		if (!in_signal)
> > +			error("waitpid is confused (%s)", argv0);
> >  	}
> >  
> > -	clear_child_for_cleanup(pid);
> > +	if (!in_signal)
> > +		clear_child_for_cleanup(pid);
> >  
> >  	errno = failed_errno;
> >  	return code;
> > @@ -437,7 +444,7 @@ fail_pipe:
> >  		 * At this point we know that fork() succeeded, but execvp()
> >  		 * failed. Errors have been reported to our stderr.
> >  		 */
> > -		wait_or_whine(cmd->pid, cmd->argv[0]);
> > +		wait_or_whine(cmd->pid, cmd->argv[0], 0);
> >  		failed_errno = errno;
> >  		cmd->pid = -1;
> >  	}
> > @@ -536,12 +543,18 @@ fail_pipe:
> >  
> >  int finish_command(struct child_process *cmd)
> >  {
> > -	int ret = wait_or_whine(cmd->pid, cmd->argv[0]);
> > +	int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0);
> >  	argv_array_clear(&cmd->args);
> >  	argv_array_clear(&cmd->env_array);
> >  	return ret;
> >  }
> >  
> > +int finish_command_in_signal(struct child_process *cmd)
> > +{
> > +	return wait_or_whine(cmd->pid, cmd->argv[0], 1);
> > +}
> > +
> > +
> >  int run_command(struct child_process *cmd)
> >  {
> >  	int code;
> > @@ -772,7 +785,7 @@ error:
> >  int finish_async(struct async *async)
> >  {
> >  #ifdef NO_PTHREADS
> > -	return wait_or_whine(async->pid, "child process");
> > +	return wait_or_whine(async->pid, "child process", 0);
> >  #else
> >  	void *ret = (void *)(intptr_t)(-1);
> >  
> > diff --git a/run-command.h b/run-command.h
> > index 5b4425a3cbe1..275d35c442ac 100644
> > --- a/run-command.h
> > +++ b/run-command.h
> > @@ -50,6 +50,7 @@ void child_process_init(struct child_process *);
> >  
> >  int start_command(struct child_process *);
> >  int finish_command(struct child_process *);
> > +int finish_command_in_signal(struct child_process *);
> >  int run_command(struct child_process *);
> >  
> >  /*
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-03 19:34   ` Takashi Iwai
@ 2015-09-03 20:55     ` Andreas Schwab
  2015-09-04  5:52       ` Takashi Iwai
  0 siblings, 1 reply; 11+ messages in thread
From: Andreas Schwab @ 2015-09-03 20:55 UTC (permalink / raw)
  To: Takashi Iwai; +Cc: Junio C Hamano, git, Jeff King

See
<http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_03>
for the complete list of functions you may safely call from a signal
handler.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-03 20:55     ` Andreas Schwab
@ 2015-09-04  5:52       ` Takashi Iwai
  2015-09-04  9:23         ` Jeff King
  0 siblings, 1 reply; 11+ messages in thread
From: Takashi Iwai @ 2015-09-04  5:52 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: Junio C Hamano, git, Jeff King

On Thu, 03 Sep 2015 22:55:52 +0200,
Andreas Schwab wrote:
> 
> See
> <http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_03>
> for the complete list of functions you may safely call from a signal
> handler.

Thanks, this looks more comprehensive.
FWIW, below is the updated patch with a proper change log.


Takashi

-- 8< --
From: Takashi Iwai <tiwai@suse.de>
Subject: [PATCH] pager: don't use unsafe functions in signal handlers

Since the commit [a3da8821208d: pager: do wait_for_pager on signal
death], we call wait_for_pager() in the pager's signal handler.  The
recent bug report revealed that this causes a deadlock in glibc at
aborting "git log" [*1].  When this happens, git process is left
unterminated, and it can't be killed by SIGTERM but only by SIGKILL.

The problem is that wait_for_pager() function does more than waiting
for pager process's termination, but it does cleanups and printing
errors.  Unfortunately, the functions that may be used in a signal
handler are very limited [*2].  Particularly, malloc(), free() and the
variants can't be used in a signal handler because they take a mutex
internally in glibc.  This was the cause of the deadlock above.  Other
than the direct calls of malloc/free, many functions calling
malloc/free can't be used.  strerror() is such one, either.

Also the usage of fflush() and printf() in a signal handler is bad,
although it seems working so far.  In a safer side, we should avoid
them, too.

This patch tries to reduce the calls of such functions in signal
handlers.  wait_for_pager_signal() is now open-coded not to call
unsafe functions, and finish_command_in_signal() is introduced for the
same reason.  There the free() calls are removed, and only waits for
the children without whining at errors.

[*1] https://bugzilla.opensuse.org/show_bug.cgi?id=942297
[*2] http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_03

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 pager.c       |  5 ++++-
 run-command.c | 25 +++++++++++++++++--------
 run-command.h |  1 +
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/pager.c b/pager.c
index 27d4c8a17aa1..12d17af73745 100644
--- a/pager.c
+++ b/pager.c
@@ -26,7 +26,10 @@ static void wait_for_pager(void)
 
 static void wait_for_pager_signal(int signo)
 {
-	wait_for_pager();
+	/* signal EOF to pager */
+	close(1);
+	close(2);
+	finish_command_in_signal(&pager_process);
 	sigchain_pop(signo);
 	raise(signo);
 }
diff --git a/run-command.c b/run-command.c
index 3277cf797ed4..e09275bd9e36 100644
--- a/run-command.c
+++ b/run-command.c
@@ -18,26 +18,27 @@ struct child_to_clean {
 static struct child_to_clean *children_to_clean;
 static int installed_child_cleanup_handler;
 
-static void cleanup_children(int sig)
+static void cleanup_children(int sig, int in_signal)
 {
 	while (children_to_clean) {
 		struct child_to_clean *p = children_to_clean;
 		children_to_clean = p->next;
 		kill(p->pid, sig);
-		free(p);
+		if (!in_signal)
+			free(p);
 	}
 }
 
 static void cleanup_children_on_signal(int sig)
 {
-	cleanup_children(sig);
+	cleanup_children(sig, 1);
 	sigchain_pop(sig);
 	raise(sig);
 }
 
 static void cleanup_children_on_exit(void)
 {
-	cleanup_children(SIGTERM);
+	cleanup_children(SIGTERM, 0);
 }
 
 static void mark_child_for_cleanup(pid_t pid)
@@ -220,7 +221,7 @@ static inline void set_cloexec(int fd)
 		fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
 }
 
-static int wait_or_whine(pid_t pid, const char *argv0)
+static int wait_or_whine(pid_t pid, const char *argv0, int in_signal)
 {
 	int status, code = -1;
 	pid_t waiting;
@@ -228,6 +229,8 @@ static int wait_or_whine(pid_t pid, const char *argv0)
 
 	while ((waiting = waitpid(pid, &status, 0)) < 0 && errno == EINTR)
 		;	/* nothing */
+	if (in_signal)
+		return 0;
 
 	if (waiting < 0) {
 		failed_errno = errno;
@@ -437,7 +440,7 @@ fail_pipe:
 		 * At this point we know that fork() succeeded, but execvp()
 		 * failed. Errors have been reported to our stderr.
 		 */
-		wait_or_whine(cmd->pid, cmd->argv[0]);
+		wait_or_whine(cmd->pid, cmd->argv[0], 0);
 		failed_errno = errno;
 		cmd->pid = -1;
 	}
@@ -536,12 +539,18 @@ fail_pipe:
 
 int finish_command(struct child_process *cmd)
 {
-	int ret = wait_or_whine(cmd->pid, cmd->argv[0]);
+	int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0);
 	argv_array_clear(&cmd->args);
 	argv_array_clear(&cmd->env_array);
 	return ret;
 }
 
+int finish_command_in_signal(struct child_process *cmd)
+{
+	return wait_or_whine(cmd->pid, cmd->argv[0], 1);
+}
+
+
 int run_command(struct child_process *cmd)
 {
 	int code;
@@ -772,7 +781,7 @@ error:
 int finish_async(struct async *async)
 {
 #ifdef NO_PTHREADS
-	return wait_or_whine(async->pid, "child process");
+	return wait_or_whine(async->pid, "child process", 0);
 #else
 	void *ret = (void *)(intptr_t)(-1);
 
diff --git a/run-command.h b/run-command.h
index 5b4425a3cbe1..275d35c442ac 100644
--- a/run-command.h
+++ b/run-command.h
@@ -50,6 +50,7 @@ void child_process_init(struct child_process *);
 
 int start_command(struct child_process *);
 int finish_command(struct child_process *);
+int finish_command_in_signal(struct child_process *);
 int run_command(struct child_process *);
 
 /*
-- 
2.5.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-04  5:52       ` Takashi Iwai
@ 2015-09-04  9:23         ` Jeff King
  2015-09-04  9:35           ` Takashi Iwai
  2015-09-04 21:56           ` Junio C Hamano
  0 siblings, 2 replies; 11+ messages in thread
From: Jeff King @ 2015-09-04  9:23 UTC (permalink / raw)
  To: Takashi Iwai; +Cc: Andreas Schwab, Junio C Hamano, git

On Fri, Sep 04, 2015 at 07:52:21AM +0200, Takashi Iwai wrote:

> -- 8< --
> From: Takashi Iwai <tiwai@suse.de>
> Subject: [PATCH] pager: don't use unsafe functions in signal handlers
> 
> Since the commit [a3da8821208d: pager: do wait_for_pager on signal
> death], we call wait_for_pager() in the pager's signal handler.  The
> recent bug report revealed that this causes a deadlock in glibc at
> aborting "git log" [*1].  When this happens, git process is left
> unterminated, and it can't be killed by SIGTERM but only by SIGKILL.
> 
> The problem is that wait_for_pager() function does more than waiting
> for pager process's termination, but it does cleanups and printing
> errors.  Unfortunately, the functions that may be used in a signal
> handler are very limited [*2].  Particularly, malloc(), free() and the
> variants can't be used in a signal handler because they take a mutex
> internally in glibc.  This was the cause of the deadlock above.  Other
> than the direct calls of malloc/free, many functions calling
> malloc/free can't be used.  strerror() is such one, either.

I think this approach is the only real solution here (and I agree it is
a real-world problem). Unfortunately, it is the tip of the iceberg.
Looking at other signal handlers, there are lots of other potential
problems. For example, here are the first few I found by grepping:

  - clone.c:remove_junk uses strbufs; these are doing useful work, and
    can't just be skipped if we are in a signal handler

  - fetch calls transport_unlock_pack, which has a free (which can be
    skipped)

  - repack uses remove_temporary_files, which uses a strbuf

and so on.

> Also the usage of fflush() and printf() in a signal handler is bad,
> although it seems working so far.  In a safer side, we should avoid
> them, too.

I'd be surprised if they are safe; stdio definitely involves locking.

Perhaps we should reconsider whether f4c3edc (vreportf: avoid
intermediate buffer, 2015-08-11) is a good idea.  Note that snprintf is
not on the list of safe functions, but I imagine that in practice it is
fine. Though just avoiding error()/warning() in signal handlers might be
a more practical solution anyway.

> diff --git a/pager.c b/pager.c
> index 27d4c8a17aa1..12d17af73745 100644
> --- a/pager.c
> +++ b/pager.c
> @@ -26,7 +26,10 @@ static void wait_for_pager(void)
>  
>  static void wait_for_pager_signal(int signo)
>  {
> -	wait_for_pager();
> +	/* signal EOF to pager */
> +	close(1);
> +	close(2);
> +	finish_command_in_signal(&pager_process);
>  	sigchain_pop(signo);
>  	raise(signo);
>  }

Hmm, is there is any reason to just pass an "in_signal" flag to
wait_for_pager(), to avoid duplicating the logic?

The rest of the patch looks pretty straightforward.

-Peff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-04  9:23         ` Jeff King
@ 2015-09-04  9:35           ` Takashi Iwai
  2015-09-04 13:04             ` Jeff King
  2015-09-04 21:56           ` Junio C Hamano
  1 sibling, 1 reply; 11+ messages in thread
From: Takashi Iwai @ 2015-09-04  9:35 UTC (permalink / raw)
  To: Jeff King; +Cc: Andreas Schwab, Junio C Hamano, git

On Fri, 04 Sep 2015 11:23:55 +0200,
Jeff King wrote:
> 
> On Fri, Sep 04, 2015 at 07:52:21AM +0200, Takashi Iwai wrote:
> 
> > -- 8< --
> > From: Takashi Iwai <tiwai@suse.de>
> > Subject: [PATCH] pager: don't use unsafe functions in signal handlers
> > 
> > Since the commit [a3da8821208d: pager: do wait_for_pager on signal
> > death], we call wait_for_pager() in the pager's signal handler.  The
> > recent bug report revealed that this causes a deadlock in glibc at
> > aborting "git log" [*1].  When this happens, git process is left
> > unterminated, and it can't be killed by SIGTERM but only by SIGKILL.
> > 
> > The problem is that wait_for_pager() function does more than waiting
> > for pager process's termination, but it does cleanups and printing
> > errors.  Unfortunately, the functions that may be used in a signal
> > handler are very limited [*2].  Particularly, malloc(), free() and the
> > variants can't be used in a signal handler because they take a mutex
> > internally in glibc.  This was the cause of the deadlock above.  Other
> > than the direct calls of malloc/free, many functions calling
> > malloc/free can't be used.  strerror() is such one, either.
> 
> I think this approach is the only real solution here (and I agree it is
> a real-world problem). Unfortunately, it is the tip of the iceberg.
> Looking at other signal handlers, there are lots of other potential
> problems. For example, here are the first few I found by grepping:
> 
>   - clone.c:remove_junk uses strbufs; these are doing useful work, and
>     can't just be skipped if we are in a signal handler
> 
>   - fetch calls transport_unlock_pack, which has a free (which can be
>     skipped)
> 
>   - repack uses remove_temporary_files, which uses a strbuf
> 
> and so on.

Yes, we need to revise all signal handlers...

> > Also the usage of fflush() and printf() in a signal handler is bad,
> > although it seems working so far.  In a safer side, we should avoid
> > them, too.
> 
> I'd be surprised if they are safe; stdio definitely involves locking.
> 
> Perhaps we should reconsider whether f4c3edc (vreportf: avoid
> intermediate buffer, 2015-08-11) is a good idea.  Note that snprintf is
> not on the list of safe functions, but I imagine that in practice it is
> fine. Though just avoiding error()/warning() in signal handlers might be
> a more practical solution anyway.
> 
> > diff --git a/pager.c b/pager.c
> > index 27d4c8a17aa1..12d17af73745 100644
> > --- a/pager.c
> > +++ b/pager.c
> > @@ -26,7 +26,10 @@ static void wait_for_pager(void)
> >  
> >  static void wait_for_pager_signal(int signo)
> >  {
> > -	wait_for_pager();
> > +	/* signal EOF to pager */
> > +	close(1);
> > +	close(2);
> > +	finish_command_in_signal(&pager_process);
> >  	sigchain_pop(signo);
> >  	raise(signo);
> >  }
> 
> Hmm, is there is any reason to just pass an "in_signal" flag to
> wait_for_pager(), to avoid duplicating the logic?

Just because wait_for_pager() itself is an atexit hook that can't take
an argument, so we'd need to split to a new function.  I don't mind
either way.  The revised patch is below.


thanks,

Takashi

-- 8< --
From: Takashi Iwai <tiwai@suse.de>
Subject: [PATCH v2] pager: don't use unsafe functions in signal handlers

Since the commit [a3da8821208d: pager: do wait_for_pager on signal
death], we call wait_for_pager() in the pager's signal handler.  The
recent bug report revealed that this causes a deadlock in glibc at
aborting "git log" [*1].  When this happens, git process is left
unterminated, and it can't be killed by SIGTERM but only by SIGKILL.

The problem is that wait_for_pager() function does more than waiting
for pager process's termination, but it does cleanups and printing
errors.  Unfortunately, the functions that may be used in a signal
handler are very limited [*2].  Particularly, malloc(), free() and the
variants can't be used in a signal handler because they take a mutex
internally in glibc.  This was the cause of the deadlock above.  Other
than the direct calls of malloc/free, many functions calling
malloc/free can't be used.  strerror() is such one, either.

Also the usage of fflush() and printf() in a signal handler is bad,
although it seems working so far.  In a safer side, we should avoid
them, too.

This patch tries to reduce the calls of such functions in signal
handlers.  wait_for_signal() takes a flag and avoids the unsafe
calls.   Also, finish_command_in_signal() is introduced for the
same reason.  There the free() calls are removed, and only waits for
the children without whining at errors.

[*1] https://bugzilla.opensuse.org/show_bug.cgi?id=942297
[*2] http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_03

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 pager.c       | 22 ++++++++++++++++------
 run-command.c | 25 +++++++++++++++++--------
 run-command.h |  1 +
 3 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/pager.c b/pager.c
index 27d4c8a17aa1..e425070528f4 100644
--- a/pager.c
+++ b/pager.c
@@ -14,19 +14,29 @@
 static const char *pager_argv[] = { NULL, NULL };
 static struct child_process pager_process = CHILD_PROCESS_INIT;
 
-static void wait_for_pager(void)
+static void wait_for_pager(int in_signal)
 {
-	fflush(stdout);
-	fflush(stderr);
+	if (!in_signal) {
+		fflush(stdout);
+		fflush(stderr);
+	}
 	/* signal EOF to pager */
 	close(1);
 	close(2);
-	finish_command(&pager_process);
+	if (in_signal)
+		finish_command_in_signal(&pager_process);
+	else
+		finish_command(&pager_process);
+}
+
+static void wait_for_pager_atexit(void)
+{
+	wait_for_pager(0);
 }
 
 static void wait_for_pager_signal(int signo)
 {
-	wait_for_pager();
+	wait_for_pager(1);
 	sigchain_pop(signo);
 	raise(signo);
 }
@@ -90,7 +100,7 @@ void setup_pager(void)
 
 	/* this makes sure that the parent terminates after the pager */
 	sigchain_push_common(wait_for_pager_signal);
-	atexit(wait_for_pager);
+	atexit(wait_for_pager_atexit);
 }
 
 int pager_in_use(void)
diff --git a/run-command.c b/run-command.c
index 3277cf797ed4..e09275bd9e36 100644
--- a/run-command.c
+++ b/run-command.c
@@ -18,26 +18,27 @@ struct child_to_clean {
 static struct child_to_clean *children_to_clean;
 static int installed_child_cleanup_handler;
 
-static void cleanup_children(int sig)
+static void cleanup_children(int sig, int in_signal)
 {
 	while (children_to_clean) {
 		struct child_to_clean *p = children_to_clean;
 		children_to_clean = p->next;
 		kill(p->pid, sig);
-		free(p);
+		if (!in_signal)
+			free(p);
 	}
 }
 
 static void cleanup_children_on_signal(int sig)
 {
-	cleanup_children(sig);
+	cleanup_children(sig, 1);
 	sigchain_pop(sig);
 	raise(sig);
 }
 
 static void cleanup_children_on_exit(void)
 {
-	cleanup_children(SIGTERM);
+	cleanup_children(SIGTERM, 0);
 }
 
 static void mark_child_for_cleanup(pid_t pid)
@@ -220,7 +221,7 @@ static inline void set_cloexec(int fd)
 		fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
 }
 
-static int wait_or_whine(pid_t pid, const char *argv0)
+static int wait_or_whine(pid_t pid, const char *argv0, int in_signal)
 {
 	int status, code = -1;
 	pid_t waiting;
@@ -228,6 +229,8 @@ static int wait_or_whine(pid_t pid, const char *argv0)
 
 	while ((waiting = waitpid(pid, &status, 0)) < 0 && errno == EINTR)
 		;	/* nothing */
+	if (in_signal)
+		return 0;
 
 	if (waiting < 0) {
 		failed_errno = errno;
@@ -437,7 +440,7 @@ fail_pipe:
 		 * At this point we know that fork() succeeded, but execvp()
 		 * failed. Errors have been reported to our stderr.
 		 */
-		wait_or_whine(cmd->pid, cmd->argv[0]);
+		wait_or_whine(cmd->pid, cmd->argv[0], 0);
 		failed_errno = errno;
 		cmd->pid = -1;
 	}
@@ -536,12 +539,18 @@ fail_pipe:
 
 int finish_command(struct child_process *cmd)
 {
-	int ret = wait_or_whine(cmd->pid, cmd->argv[0]);
+	int ret = wait_or_whine(cmd->pid, cmd->argv[0], 0);
 	argv_array_clear(&cmd->args);
 	argv_array_clear(&cmd->env_array);
 	return ret;
 }
 
+int finish_command_in_signal(struct child_process *cmd)
+{
+	return wait_or_whine(cmd->pid, cmd->argv[0], 1);
+}
+
+
 int run_command(struct child_process *cmd)
 {
 	int code;
@@ -772,7 +781,7 @@ error:
 int finish_async(struct async *async)
 {
 #ifdef NO_PTHREADS
-	return wait_or_whine(async->pid, "child process");
+	return wait_or_whine(async->pid, "child process", 0);
 #else
 	void *ret = (void *)(intptr_t)(-1);
 
diff --git a/run-command.h b/run-command.h
index 5b4425a3cbe1..275d35c442ac 100644
--- a/run-command.h
+++ b/run-command.h
@@ -50,6 +50,7 @@ void child_process_init(struct child_process *);
 
 int start_command(struct child_process *);
 int finish_command(struct child_process *);
+int finish_command_in_signal(struct child_process *);
 int run_command(struct child_process *);
 
 /*
-- 
2.5.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-04  9:35           ` Takashi Iwai
@ 2015-09-04 13:04             ` Jeff King
  2015-09-04 13:40               ` Takashi Iwai
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff King @ 2015-09-04 13:04 UTC (permalink / raw)
  To: Takashi Iwai; +Cc: Andreas Schwab, Junio C Hamano, git

On Fri, Sep 04, 2015 at 11:35:57AM +0200, Takashi Iwai wrote:

> > Hmm, is there is any reason to just pass an "in_signal" flag to
> > wait_for_pager(), to avoid duplicating the logic?
> 
> Just because wait_for_pager() itself is an atexit hook that can't take
> an argument, so we'd need to split to a new function.  I don't mind
> either way.  The revised patch is below.

Ah, right. That's unfortunate, but I think I prefer adding the extra
wrapper to duplicating the contents of the function.

> -- 8< --
> From: Takashi Iwai <tiwai@suse.de>
> Subject: [PATCH v2] pager: don't use unsafe functions in signal handlers
> [...]

This looks good to me. Do you plan on fixing any of the other handlers
(you don't have to; I just want to know if somebody is planning to work
on it).

The pattern of atexit and signal handlers is repeated in several places,
and it seems like we will have to add the same in_signal boilerplate in
each instance. I wonder if we should provide a global "register_cleanup"
that takes a "void (*func)(int in_signal))" function pointer, and:

  1. Adds it to a list (ideally in a way that is atomic if we get
     interrupted while adding to the list).

  2. If not already run, registers an atexit() handler and
     sigchain_push_common for a meta-handler which runs through the list
     and runs each handler.

It's not a _huge_ amount of boilerplate code we'd be saving, but at
least conforming to the "in_signal" function template would make people
think twice about what they're doing inside the cleanup function. :)

-Peff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-04 13:04             ` Jeff King
@ 2015-09-04 13:40               ` Takashi Iwai
  0 siblings, 0 replies; 11+ messages in thread
From: Takashi Iwai @ 2015-09-04 13:40 UTC (permalink / raw)
  To: Jeff King; +Cc: Andreas Schwab, Junio C Hamano, git

On Fri, 04 Sep 2015 15:04:48 +0200,
Jeff King wrote:
> 
> On Fri, Sep 04, 2015 at 11:35:57AM +0200, Takashi Iwai wrote:
> 
> > > Hmm, is there is any reason to just pass an "in_signal" flag to
> > > wait_for_pager(), to avoid duplicating the logic?
> > 
> > Just because wait_for_pager() itself is an atexit hook that can't take
> > an argument, so we'd need to split to a new function.  I don't mind
> > either way.  The revised patch is below.
> 
> Ah, right. That's unfortunate, but I think I prefer adding the extra
> wrapper to duplicating the contents of the function.
> 
> > -- 8< --
> > From: Takashi Iwai <tiwai@suse.de>
> > Subject: [PATCH v2] pager: don't use unsafe functions in signal handlers
> > [...]
> 
> This looks good to me. Do you plan on fixing any of the other handlers
> (you don't have to; I just want to know if somebody is planning to work
> on it).

Heh, I'd like to leave the rest for professionals :)

> The pattern of atexit and signal handlers is repeated in several places,
> and it seems like we will have to add the same in_signal boilerplate in
> each instance. I wonder if we should provide a global "register_cleanup"
> that takes a "void (*func)(int in_signal))" function pointer, and:
> 
>   1. Adds it to a list (ideally in a way that is atomic if we get
>      interrupted while adding to the list).
> 
>   2. If not already run, registers an atexit() handler and
>      sigchain_push_common for a meta-handler which runs through the list
>      and runs each handler.
> 
> It's not a _huge_ amount of boilerplate code we'd be saving, but at
> least conforming to the "in_signal" function template would make people
> think twice about what they're doing inside the cleanup function. :)

Or, we may have a global in_signal flag.  Of course, this is bad if
git itself is multi-threaded, though.


thanks,

Takashi

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-04  9:23         ` Jeff King
  2015-09-04  9:35           ` Takashi Iwai
@ 2015-09-04 21:56           ` Junio C Hamano
  2015-09-05  8:59             ` Jeff King
  1 sibling, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2015-09-04 21:56 UTC (permalink / raw)
  To: Jeff King; +Cc: Takashi Iwai, Andreas Schwab, git

Jeff King <peff@peff.net> writes:

> Perhaps we should reconsider whether f4c3edc (vreportf: avoid
> intermediate buffer, 2015-08-11) is a good idea.  Note that snprintf is
> not on the list of safe functions, but I imagine that in practice it is
> fine. Though just avoiding error()/warning() in signal handlers might be
> a more practical solution anyway.

I had exactly the same thought when I read the initial report here.

I wish we can just do "if (in_signal) return;" at the beginning of
vreportf(), but we would not want a global variable there, so... ;-)

Further, I briefly hoped that avoiding error/warning in the signal
handler codepath would allow us to be more lax around allocations,
but I suspect that it unfortunately would not help us that much, as
we may be calling these functions in low memory situations.

So let's queue Takashi's patch as-is for now and look at other
signal codepaths.

Thanks.

	
	

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: glibc mutex deadlock in signal handler
  2015-09-04 21:56           ` Junio C Hamano
@ 2015-09-05  8:59             ` Jeff King
  0 siblings, 0 replies; 11+ messages in thread
From: Jeff King @ 2015-09-05  8:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Takashi Iwai, Andreas Schwab, git

On Fri, Sep 04, 2015 at 02:56:58PM -0700, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > Perhaps we should reconsider whether f4c3edc (vreportf: avoid
> > intermediate buffer, 2015-08-11) is a good idea.  Note that snprintf is
> > not on the list of safe functions, but I imagine that in practice it is
> > fine. Though just avoiding error()/warning() in signal handlers might be
> > a more practical solution anyway.
> 
> I had exactly the same thought when I read the initial report here.
> 
> I wish we can just do "if (in_signal) return;" at the beginning of
> vreportf(), but we would not want a global variable there, so... ;-)

Why not? I mean, sure it's gross. But it actually seems like a pretty
simple fix that doesn't have to hurt other callers (or involve passing
an "in_signal" through the stack). We could even fallback to snprintf()
into a fixed-sized buffer, or some other degraded mode.

> Further, I briefly hoped that avoiding error/warning in the signal
> handler codepath would allow us to be more lax around allocations,
> but I suspect that it unfortunately would not help us that much, as
> we may be calling these functions in low memory situations.

I'm not sure the low-memory thing isn't a red herring. Sure, we call
die() when malloc fails. But only with a tiny string. Something like the
robust_buf patch I posted would handle that just fine.

The real danger of signal handlers is that you don't get to say "oh,
malloc failed, so let's fallback to some degraded mode". You just get
deadlocked in a futex and never return. :)

> So let's queue Takashi's patch as-is for now and look at other
> signal codepaths.

Sounds like a good first step, unless we are going to do refactoring
that Takashi's patch could take advantage of (either a global in_signal,
or some register_cleanup() infrastructure).

-Peff

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2015-09-05  9:00 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-03 11:00 glibc mutex deadlock in signal handler Takashi Iwai
2015-09-03 18:12 ` Junio C Hamano
2015-09-03 19:34   ` Takashi Iwai
2015-09-03 20:55     ` Andreas Schwab
2015-09-04  5:52       ` Takashi Iwai
2015-09-04  9:23         ` Jeff King
2015-09-04  9:35           ` Takashi Iwai
2015-09-04 13:04             ` Jeff King
2015-09-04 13:40               ` Takashi Iwai
2015-09-04 21:56           ` Junio C Hamano
2015-09-05  8:59             ` Jeff King

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.