All of lore.kernel.org
 help / color / mirror / Atom feed
From: Johannes Schindelin <johannes.schindelin@gmx.de>
To: git@vger.kernel.org
Cc: Junio C Hamano <gitster@pobox.com>, Jeff King <peff@peff.net>
Subject: [PATCH v2 0/3] Fix a segfault caused by regexec() being called on mmap()ed data
Date: Thu, 8 Sep 2016 09:31:02 +0200 (CEST)	[thread overview]
Message-ID: <cover.1473319844.git.johannes.schindelin@gmx.de> (raw)
In-Reply-To: <cover.1473090278.git.johannes.schindelin@gmx.de>

This patch series addresses a problem where `git diff` is called using
`-G` or `-S --pickaxe-regex` on new-born files that are configured
without user diff drivers, and that hence get mmap()ed into memory.

The problem with that: mmap()ed memory is *not* NUL-terminated, yet the
pickaxe code calls regexec() on it just the same.

This problem has been reported by my colleague Chris Sidi.

We solve this by introducing a helper, regexec_buf(), that takes a
pointer and a length instead of a NUL-terminated string.

This helper then uses REG_STARTEND where available, and falls back to
allocating and constructing a NUL-terminated string. Given the
wide-spread support for REG_STARTEND (Linux has it, MacOSX has it, Git
for Windows has it because it uses compat/regex/ that has it), I think
this is a fair trade-off.

Changes since v1:

- completely revamped the approach: now we no longer force-append a NUL
  whenever mmap()ing buffers for use by the diff machinery, but we fix
  things at the regexec() level.


Johannes Schindelin (3):
  Demonstrate a problem: our pickaxe code assumes NUL-terminated buffers
  Introduce a function to run regexec() on non-NUL-terminated buffers
  Use the newly-introduced regexec_buf() function

 diff.c                  |  3 ++-
 diffcore-pickaxe.c      | 18 ++++++++----------
 git-compat-util.h       | 21 +++++++++++++++++++++
 t/t4061-diff-pickaxe.sh | 22 ++++++++++++++++++++++
 xdiff-interface.c       | 13 ++++---------
 5 files changed, 57 insertions(+), 20 deletions(-)
 create mode 100755 t/t4061-diff-pickaxe.sh

Published-As: https://github.com/dscho/git/releases/tag/mmap-regexec-v2
Fetch-It-Via: git fetch https://github.com/dscho/git mmap-regexec-v2

Interdiff vs v1:

 diff --git a/diff.c b/diff.c
 index 32f7f46..526775a 100644
 --- a/diff.c
 +++ b/diff.c
 @@ -951,7 +951,8 @@ static int find_word_boundaries(mmfile_t *buffer, regex_t *word_regex,
  {
  	if (word_regex && *begin < buffer->size) {
  		regmatch_t match[1];
 -		if (!regexec(word_regex, buffer->ptr + *begin, 1, match, 0)) {
 +		if (!regexec_buf(word_regex, buffer->ptr + *begin,
 +				 buffer->size - *begin, 1, match, 0)) {
  			char *p = memchr(buffer->ptr + *begin + match[0].rm_so,
  					'\n', match[0].rm_eo - match[0].rm_so);
  			*end = p ? p - buffer->ptr : match[0].rm_eo + *begin;
 @@ -2826,15 +2827,6 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
  			s->data = strbuf_detach(&buf, &size);
  			s->size = size;
  			s->should_free = 1;
 -		} else {
 -			/* data must be NUL-terminated so e.g. for regexec() */
 -			char *data = xmalloc(s->size + 1);
 -			memcpy(data, s->data, s->size);
 -			data[s->size] = '\0';
 -			munmap(s->data, s->size);
 -			s->should_munmap = 0;
 -			s->data = data;
 -			s->should_free = 1;
  		}
  	}
  	else {
 diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
 index 88820b6..9795ca1 100644
 --- a/diffcore-pickaxe.c
 +++ b/diffcore-pickaxe.c
 @@ -23,7 +23,6 @@ static void diffgrep_consume(void *priv, char *line, unsigned long len)
  {
  	struct diffgrep_cb *data = priv;
  	regmatch_t regmatch;
 -	int hold;
  
  	if (line[0] != '+' && line[0] != '-')
  		return;
 @@ -33,11 +32,8 @@ static void diffgrep_consume(void *priv, char *line, unsigned long len)
  		 * caller early.
  		 */
  		return;
 -	/* Yuck -- line ought to be "const char *"! */
 -	hold = line[len];
 -	line[len] = '\0';
 -	data->hit = !regexec(data->regexp, line + 1, 1, &regmatch, 0);
 -	line[len] = hold;
 +	data->hit = !regexec_buf(data->regexp, line + 1, len - 1, 1,
 +				 &regmatch, 0);
  }
  
  static int diff_grep(mmfile_t *one, mmfile_t *two,
 @@ -49,12 +45,12 @@ static int diff_grep(mmfile_t *one, mmfile_t *two,
  	xpparam_t xpp;
  	xdemitconf_t xecfg;
  
 -	assert(!one || one->ptr[one->size] == '\0');
 -	assert(!two || two->ptr[two->size] == '\0');
  	if (!one)
 -		return !regexec(regexp, two->ptr, 1, &regmatch, 0);
 +		return !regexec_buf(regexp, two->ptr, two->size,
 +				    1, &regmatch, 0);
  	if (!two)
 -		return !regexec(regexp, one->ptr, 1, &regmatch, 0);
 +		return !regexec_buf(regexp, one->ptr, one->size,
 +				    1, &regmatch, 0);
  
  	/*
  	 * We have both sides; need to run textual diff and see if
 @@ -85,8 +81,8 @@ static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws)
  		regmatch_t regmatch;
  		int flags = 0;
  
 -		assert(data[sz] == '\0');
 -		while (*data && !regexec(regexp, data, 1, &regmatch, flags)) {
 +		while (*data &&
 +		       !regexec_buf(regexp, data, sz, 1, &regmatch, flags)) {
  			flags |= REG_NOTBOL;
  			data += regmatch.rm_eo;
  			if (*data && regmatch.rm_so == regmatch.rm_eo)
 diff --git a/git-compat-util.h b/git-compat-util.h
 index db89ba7..19128b3 100644
 --- a/git-compat-util.h
 +++ b/git-compat-util.h
 @@ -965,6 +965,27 @@ void git_qsort(void *base, size_t nmemb, size_t size,
  #define qsort git_qsort
  #endif
  
 +static inline int regexec_buf(const regex_t *preg, const char *buf, size_t size,
 +			      size_t nmatch, regmatch_t pmatch[], int eflags)
 +{
 +#ifdef REG_STARTEND
 +	assert(nmatch > 0 && pmatch);
 +	pmatch[0].rm_so = 0;
 +	pmatch[0].rm_eo = size;
 +	return regexec(preg, buf, nmatch, pmatch, eflags | REG_STARTEND);
 +#else
 +	char *buf2 = xmalloc(size + 1);
 +	int ret;
 +
 +	memcpy(buf2, buf, size);
 +	buf2[size] = '\0';
 +	ret = regexec(preg, buf2, nmatch, pmatch, eflags);
 +	free(buf2);
 +
 +	return ret;
 +#endif
 +}
 +
  #ifndef DIR_HAS_BSD_GROUP_SEMANTICS
  # define FORCE_DIR_SET_GID S_ISGID
  #else
 diff --git a/t/t4059-diff-pickaxe.sh b/t/t4061-diff-pickaxe.sh
 similarity index 91%
 rename from t/t4059-diff-pickaxe.sh
 rename to t/t4061-diff-pickaxe.sh
 index f0bf50b..5929f2e 100755
 --- a/t/t4059-diff-pickaxe.sh
 +++ b/t/t4061-diff-pickaxe.sh
 @@ -14,7 +14,7 @@ test_expect_success setup '
  	test_tick &&
  	git commit -m "A 4k file"
  '
 -test_expect_success '-G matches' '
 +test_expect_failure '-G matches' '
  	git diff --name-only -G "^0{4096}$" HEAD^ >out &&
  	test 4096-zeroes.txt = "$(cat out)"
  '
 diff --git a/xdiff-interface.c b/xdiff-interface.c
 index f34ea76..50702a2 100644
 --- a/xdiff-interface.c
 +++ b/xdiff-interface.c
 @@ -214,11 +214,10 @@ struct ff_regs {
  static long ff_regexp(const char *line, long len,
  		char *buffer, long buffer_size, void *priv)
  {
 -	char *line_buffer;
  	struct ff_regs *regs = priv;
  	regmatch_t pmatch[2];
  	int i;
 -	int result = -1;
 +	int result;
  
  	/* Exclude terminating newline (and cr) from matching */
  	if (len > 0 && line[len-1] == '\n') {
 @@ -228,18 +227,16 @@ static long ff_regexp(const char *line, long len,
  			len--;
  	}
  
 -	line_buffer = xstrndup(line, len); /* make NUL terminated */
 -
  	for (i = 0; i < regs->nr; i++) {
  		struct ff_reg *reg = regs->array + i;
 -		if (!regexec(&reg->re, line_buffer, 2, pmatch, 0)) {
 +		if (!regexec_buf(&reg->re, line, len, 2, pmatch, 0)) {
  			if (reg->negate)
 -				goto fail;
 +				return -1;
  			break;
  		}
  	}
  	if (regs->nr <= i)
 -		goto fail;
 +		return -1;
  	i = pmatch[1].rm_so >= 0 ? 1 : 0;
  	line += pmatch[i].rm_so;
  	result = pmatch[i].rm_eo - pmatch[i].rm_so;
 @@ -248,8 +245,6 @@ static long ff_regexp(const char *line, long len,
  	while (result > 0 && (isspace(line[result - 1])))
  		result--;
  	memcpy(buffer, line, result);
 - fail:
 -	free(line_buffer);
  	return result;
  }
  

-- 
2.10.0.windows.1.10.g803177d

base-commit: 6ebdac1bab966b720d776aa43ca188fe378b1f4b

  parent reply	other threads:[~2016-09-08  7:31 UTC|newest]

Thread overview: 66+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-09-05 15:44 [PATCH 0/3] Fix a segfault caused by regexec() being called on mmap()ed data Johannes Schindelin
2016-09-05 15:45 ` [PATCH 1/3] Demonstrate a problem: our pickaxe code assumes NUL-terminated buffers Johannes Schindelin
2016-09-06 18:43   ` Jeff King
2016-09-08  7:53     ` Johannes Schindelin
2016-09-05 15:45 ` [PATCH 2/3] diff_populate_filespec: NUL-terminate buffers Johannes Schindelin
2016-09-06  7:06   ` Jeff King
2016-09-06 16:02     ` Johannes Schindelin
2016-09-06 18:41       ` Jeff King
2016-09-07 18:31         ` Junio C Hamano
2016-09-08  7:52           ` Johannes Schindelin
2016-09-08  7:49         ` Johannes Schindelin
2016-09-08  8:22           ` Jeff King
2016-09-08 16:57             ` Junio C Hamano
2016-09-08 18:22               ` Johannes Schindelin
2016-09-08 18:48               ` Jeff King
2016-09-05 15:45 ` [PATCH 3/3] diff_grep: add assertions verifying that the buffers are NUL-terminated Johannes Schindelin
2016-09-06  7:08   ` Jeff King
2016-09-06 16:04     ` Johannes Schindelin
2016-09-05 19:10 ` [PATCH 0/3] Fix a segfault caused by regexec() being called on mmap()ed data Junio C Hamano
2016-09-06  7:12   ` Jeff King
2016-09-06 14:06     ` Johannes Schindelin
2016-09-06 18:29       ` Jeff King
2016-09-08  7:29         ` Johannes Schindelin
2016-09-08  8:00           ` Jeff King
2016-09-09 10:09             ` Johannes Schindelin
2016-09-09 17:46               ` Junio C Hamano
2016-09-06 13:21   ` Johannes Schindelin
2016-09-06  6:58 ` Jeff King
2016-09-06 14:13   ` Johannes Schindelin
2016-09-08  7:31 ` Johannes Schindelin [this message]
2016-09-08  7:31   ` [PATCH v2 2/3] Introduce a function to run regexec() on non-NUL-terminated buffers Johannes Schindelin
2016-09-08  8:04     ` Jeff King
2016-09-09  9:45       ` Johannes Schindelin
2016-09-09  9:59         ` Jeff King
2016-09-08  7:31   ` [PATCH v2 1/3] Demonstrate a problem: our pickaxe code assumes NUL-terminated buffers Johannes Schindelin
2016-09-08  7:31   ` [PATCH v2 3/3] Use the newly-introduced regexec_buf() function Johannes Schindelin
2016-09-08  7:54     ` Johannes Schindelin
2016-09-08  8:10       ` Jeff King
2016-09-08  8:14         ` Jeff King
2016-09-08  8:35           ` Jeff King
2016-09-08 19:06             ` Ramsay Jones
2016-09-08 19:53               ` Jeff King
2016-09-08 21:30                 ` Junio C Hamano
2016-09-08  7:33   ` [PATCH v2 0/3] Fix a segfault caused by regexec() being called on mmap()ed data Johannes Schindelin
2016-09-08  8:13     ` Jeff King
2016-09-08  7:57   ` [PATCH v3 " Johannes Schindelin
2016-09-08  7:57     ` [PATCH v3 1/3] Demonstrate a problem: our pickaxe code assumes NUL-terminated buffers Johannes Schindelin
2016-09-08  7:58     ` [PATCH v3 2/3] Introduce a function to run regexec() on non-NUL-terminated buffers Johannes Schindelin
2016-09-08 17:03       ` Junio C Hamano
2016-09-08  7:59     ` [PATCH v3 3/3] Use the newly-introduced regexec_buf() function Johannes Schindelin
2016-09-08 17:09       ` Junio C Hamano
2016-09-09  9:52         ` Johannes Schindelin
2016-09-09  9:57           ` Jeff King
2016-09-09 10:41             ` Johannes Schindelin
2016-09-09 17:49           ` Junio C Hamano
2016-09-21 18:23     ` [PATCH v4 0/3] Fix a segfault caused by regexec() being called on mmap()ed data Johannes Schindelin
2016-09-21 18:23       ` [PATCH v4 1/3] regex: -G<pattern> feeds a non NUL-terminated string to regexec() and fails Johannes Schindelin
2016-09-21 18:24       ` [PATCH v4 2/3] regex: add regexec_buf() that can work on a non NUL-terminated string Johannes Schindelin
2016-09-21 19:17         ` Junio C Hamano
2016-09-22 18:38           ` Johannes Schindelin
2016-09-21 18:24       ` [PATCH v4 3/3] regex: use regexec_buf() Johannes Schindelin
2016-09-21 19:18         ` Junio C Hamano
2016-09-21 20:09           ` Junio C Hamano
2016-09-21 22:03         ` Jeff King
2016-09-25 14:01           ` Johannes Schindelin
2016-09-21 22:04       ` [PATCH v4 0/3] Fix a segfault caused by regexec() being called on mmap()ed data Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=cover.1473319844.git.johannes.schindelin@gmx.de \
    --to=johannes.schindelin@gmx.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.