All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use
@ 2021-10-15 16:13 Hamza Mahfooz
  2021-10-15 16:13 ` [PATCH v13 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
                   ` (2 more replies)
  0 siblings, 3 replies; 47+ messages in thread
From: Hamza Mahfooz @ 2021-10-15 16:13 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Hamza Mahfooz

These changes are made in preparation of, the colorization support for the
"git log" subcommands that, rely on regex functionality (i.e. "--author",
"--committer" and "--grep"). These changes are necessary primarily because
match_one_pattern() expects header lines to be prefixed, however, in
pretty, the prefixes are stripped from the lines because the name-email
pairs need to go through additional parsing, before they can be printed and
because next_match() doesn't handle the case of
"ctx == GREP_CONTEXT_HEAD" at all. So, teach next_match() how to handle the
new case and move match_one_pattern()'s core logic to
headerless_match_one_pattern() while preserving match_one_pattern()'s uses
that depend on the additional processing.

Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v5: separate grep changes from pretty changes.

v6: rescope some variables.

v7: export header_field[] and allow for subsequent matches on header lines
    in match_one_pattern().

v8: add headerless_match_one_pattern() and move header_field[] back.

v9: get rid of the new check in headerless_match_one_pattern(), move the
    header pattern filtering logic in grep_next_match() and document
    grep_next_match() in grep.h.

v10: add a "magic" comment in grep_next_match() to signify a fall through
     in the switch statement.
---
 grep.c | 79 ++++++++++++++++++++++++++++++++++++----------------------
 grep.h |  9 +++++++
 2 files changed, 58 insertions(+), 30 deletions(-)

diff --git a/grep.c b/grep.c
index 14fe8a0fd2..fe847a0111 100644
--- a/grep.c
+++ b/grep.c
@@ -944,10 +944,10 @@ static struct {
 	{ "reflog ", 7 },
 };
 
-static int match_one_pattern(struct grep_pat *p,
-			     const char *bol, const char *eol,
-			     enum grep_context ctx,
-			     regmatch_t *pmatch, int eflags)
+static int headerless_match_one_pattern(struct grep_pat *p,
+					const char *bol, const char *eol,
+					enum grep_context ctx,
+					regmatch_t *pmatch, int eflags)
 {
 	int hit = 0;
 	const char *start = bol;
@@ -956,25 +956,6 @@ static int match_one_pattern(struct grep_pat *p,
 	    ((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
 		return 0;
 
-	if (p->token == GREP_PATTERN_HEAD) {
-		const char *field;
-		size_t len;
-		assert(p->field < ARRAY_SIZE(header_field));
-		field = header_field[p->field].field;
-		len = header_field[p->field].len;
-		if (strncmp(bol, field, len))
-			return 0;
-		bol += len;
-		switch (p->field) {
-		case GREP_HEADER_AUTHOR:
-		case GREP_HEADER_COMMITTER:
-			strip_timestamp(bol, &eol);
-			break;
-		default:
-			break;
-		}
-	}
-
  again:
 	hit = patmatch(p, bol, eol, pmatch, eflags);
 
@@ -1025,6 +1006,36 @@ static int match_one_pattern(struct grep_pat *p,
 	return hit;
 }
 
+static int match_one_pattern(struct grep_pat *p,
+			     const char *bol, const char *eol,
+			     enum grep_context ctx, regmatch_t *pmatch,
+			     int eflags)
+{
+	const char *field;
+	size_t len;
+
+	if (p->token == GREP_PATTERN_HEAD) {
+		assert(p->field < ARRAY_SIZE(header_field));
+		field = header_field[p->field].field;
+		len = header_field[p->field].len;
+		if (strncmp(bol, field, len))
+			return 0;
+		bol += len;
+
+		switch (p->field) {
+		case GREP_HEADER_AUTHOR:
+		case GREP_HEADER_COMMITTER:
+			strip_timestamp(bol, &eol);
+			break;
+		default:
+			break;
+		}
+	}
+
+	return headerless_match_one_pattern(p, bol, eol, ctx, pmatch, eflags);
+}
+
+
 static int match_expr_eval(struct grep_opt *opt, struct grep_expr *x,
 			   const char *bol, const char *eol,
 			   enum grep_context ctx, ssize_t *col,
@@ -1143,7 +1154,7 @@ static int match_next_pattern(struct grep_pat *p,
 {
 	regmatch_t match;
 
-	if (!match_one_pattern(p, bol, eol, ctx, &match, eflags))
+	if (!headerless_match_one_pattern(p, bol, eol, ctx, &match, eflags))
 		return 0;
 	if (match.rm_so < 0 || match.rm_eo < 0)
 		return 0;
@@ -1158,19 +1169,26 @@ static int match_next_pattern(struct grep_pat *p,
 	return 1;
 }
 
-static int next_match(struct grep_opt *opt,
-		      const char *bol, const char *eol,
-		      enum grep_context ctx, regmatch_t *pmatch, int eflags)
+int grep_next_match(struct grep_opt *opt,
+		    const char *bol, const char *eol,
+		    enum grep_context ctx, regmatch_t *pmatch,
+		    enum grep_header_field field, int eflags)
 {
 	struct grep_pat *p;
 	int hit = 0;
 
 	pmatch->rm_so = pmatch->rm_eo = -1;
 	if (bol < eol) {
-		for (p = opt->pattern_list; p; p = p->next) {
+		for (p = ((ctx == GREP_CONTEXT_HEAD)
+			   ? opt->header_list : opt->pattern_list);
+			  p; p = p->next) {
 			switch (p->token) {
-			case GREP_PATTERN: /* atom */
 			case GREP_PATTERN_HEAD:
+				if ((field != GREP_HEADER_FIELD_MAX) &&
+				    (p->field != field))
+					continue;
+				/* fall thru */
+			case GREP_PATTERN: /* atom */
 			case GREP_PATTERN_BODY:
 				hit |= match_next_pattern(p, bol, eol, ctx,
 							  pmatch, eflags);
@@ -1261,7 +1279,8 @@ static void show_line(struct grep_opt *opt,
 			else if (sign == '=')
 				line_color = opt->colors[GREP_COLOR_FUNCTION];
 		}
-		while (next_match(opt, bol, eol, ctx, &match, eflags)) {
+		while (grep_next_match(opt, bol, eol, ctx, &match,
+				       GREP_HEADER_FIELD_MAX, eflags)) {
 			if (match.rm_so == match.rm_eo)
 				break;
 
diff --git a/grep.h b/grep.h
index 3cb8a83ae8..808ad76f0c 100644
--- a/grep.h
+++ b/grep.h
@@ -191,6 +191,15 @@ void compile_grep_patterns(struct grep_opt *opt);
 void free_grep_patterns(struct grep_opt *opt);
 int grep_buffer(struct grep_opt *opt, const char *buf, unsigned long size);
 
+/* The field parameter is only used to filter header patterns
+ * (where appropriate). If filtering isn't desirable
+ * GREP_HEADER_FIELD_MAX should be supplied.
+ */
+int grep_next_match(struct grep_opt *opt,
+		    const char *bol, const char *eol,
+		    enum grep_context ctx, regmatch_t *pmatch,
+		    enum grep_header_field field, int eflags);
+
 struct grep_source {
 	char *name;
 
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v13 2/3] pretty: colorize pattern matches in commit messages
  2021-10-15 16:13 [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
@ 2021-10-15 16:13 ` Hamza Mahfooz
  2021-10-15 16:13 ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
  2021-10-15 18:05 ` [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use Junio C Hamano
  2 siblings, 0 replies; 47+ messages in thread
From: Hamza Mahfooz @ 2021-10-15 16:13 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Hamza Mahfooz

The "git log" command limits its output to the commits that contain strings
matched by a pattern when the "--grep=<pattern>" option is used, but unlike
output from "git grep -e <pattern>", the matches are not highlighted,
making them harder to spot.

Teach the pretty-printer code to highlight matches from the
"--grep=<pattern>", "--author=<pattern>" and "--committer=<pattern>"
options (to view the last one, you may have to ask for --pretty=fuller).

Also, it must be noted that we are effectively greping the content twice
(because it would be a hassle to rework the existing matching code to do
a /g match and then pass it all down to the coloring code), however it only
slows down "git log --author=^H" on this repository by around 1-2%
(compared to v2.33.0), so it should be a small enough slow down to justify
the addition of the feature.

Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: make the commit message whole (add the missing ingredients), rename
    append_matched_line() to append_line_with_color(), use
    colors[GREP_COLOR_MATCH_SELECTED] instead of
    colors[GREP_COLOR_MATCH_CONTEXT], allow the background color to be
    customized, don't copy strings to a buffer when not coloring in
    append_line_with_color(), rename next_match() to grep_next_match(),
    repurpose grep_next_match()/match_one_pattern() for use in
    append_line_with_color() (allowing us to remove duplicated matching
    code in append_line_with_color()), document how to customize the
    feature and modify some of the tests to fit the feature better.

v3: fix a formatting issue with the added documentation.

v4: add strbuf_add_with_color(), use the correct color code scheme in the
    unit tests and add more unit tests.

v5: separate grep changes from pretty changes and add some performance
    analysis in the commit message.

v6: put the documentation in the correct place, cleanup pretty.c and
    format the unit tests according to the current convention.

v7: get rid of all manual strbuf management, constify where now appropriate
    and fix the header line prefix issue properly.

v8: remove code that relies on grep_header_fields[].

v11: clarify why we are grepping twice.
---
 Documentation/config/color.txt |   7 ++-
 pretty.c                       | 101 +++++++++++++++++++++++++++++----
 t/t4202-log.sh                 |  51 +++++++++++++++++
 3 files changed, 145 insertions(+), 14 deletions(-)

diff --git a/Documentation/config/color.txt b/Documentation/config/color.txt
index e05d520a86..91d9a9da32 100644
--- a/Documentation/config/color.txt
+++ b/Documentation/config/color.txt
@@ -104,9 +104,12 @@ color.grep.<slot>::
 `matchContext`;;
 	matching text in context lines
 `matchSelected`;;
-	matching text in selected lines
+	matching text in selected lines. Also, used to customize the following
+	linkgit:git-log[1] subcommands: `--grep`, `--author` and `--committer`.
 `selected`;;
-	non-matching text in selected lines
+	non-matching text in selected lines. Also, used to customize the
+	following linkgit:git-log[1] subcommands: `--grep`, `--author` and
+	`--committer`.
 `separator`;;
 	separators between fields on a line (`:`, `-`, and `=`)
 	and between hunks (`--`)
diff --git a/pretty.c b/pretty.c
index 73b5ead509..2dd94af886 100644
--- a/pretty.c
+++ b/pretty.c
@@ -431,6 +431,52 @@ const char *show_ident_date(const struct ident_split *ident,
 	return show_date(date, tz, mode);
 }
 
+static inline void strbuf_add_with_color(struct strbuf *sb, const char *color,
+					 const char *buf, size_t buflen)
+{
+	strbuf_addstr(sb, color);
+	strbuf_add(sb, buf, buflen);
+	if (*color)
+		strbuf_addstr(sb, GIT_COLOR_RESET);
+}
+
+static void append_line_with_color(struct strbuf *sb, struct grep_opt *opt,
+				   const char *line, size_t linelen,
+				   int color, enum grep_context ctx,
+				   enum grep_header_field field)
+{
+	const char *buf, *eol, *line_color, *match_color;
+	regmatch_t match;
+	int eflags = 0;
+
+	buf = line;
+	eol = buf + linelen;
+
+	if (!opt || !want_color(color) || opt->invert)
+		goto end;
+
+	line_color = opt->colors[GREP_COLOR_SELECTED];
+	match_color = opt->colors[GREP_COLOR_MATCH_SELECTED];
+
+	while (grep_next_match(opt, buf, eol, ctx, &match, field, eflags)) {
+		if (match.rm_so == match.rm_eo)
+			break;
+
+		strbuf_add_with_color(sb, line_color, buf, match.rm_so);
+		strbuf_add_with_color(sb, match_color, buf + match.rm_so,
+				      match.rm_eo - match.rm_so);
+		buf += match.rm_eo;
+		eflags = REG_NOTBOL;
+	}
+
+	if (eflags)
+		strbuf_add_with_color(sb, line_color, buf, eol - buf);
+	else {
+end:
+		strbuf_add(sb, buf, eol - buf);
+	}
+}
+
 void pp_user_info(struct pretty_print_context *pp,
 		  const char *what, struct strbuf *sb,
 		  const char *line, const char *encoding)
@@ -496,9 +542,26 @@ void pp_user_info(struct pretty_print_context *pp,
 			strbuf_addch(sb, '\n');
 		strbuf_addf(sb, " <%.*s>\n", (int)maillen, mailbuf);
 	} else {
-		strbuf_addf(sb, "%s: %.*s%.*s <%.*s>\n", what,
-			    (pp->fmt == CMIT_FMT_FULLER) ? 4 : 0, "    ",
-			    (int)namelen, namebuf, (int)maillen, mailbuf);
+		struct strbuf id = STRBUF_INIT;
+		enum grep_header_field field = GREP_HEADER_FIELD_MAX;
+		struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
+
+		if (!strcmp(what, "Author"))
+			field = GREP_HEADER_AUTHOR;
+		else if (!strcmp(what, "Commit"))
+			field = GREP_HEADER_COMMITTER;
+
+		strbuf_addf(sb, "%s: ", what);
+		if (pp->fmt == CMIT_FMT_FULLER)
+			strbuf_addchars(sb, ' ', 4);
+
+		strbuf_addf(&id, "%.*s <%.*s>", (int)namelen, namebuf,
+			    (int)maillen, mailbuf);
+
+		append_line_with_color(sb, opt, id.buf, id.len, pp->color,
+				       GREP_CONTEXT_HEAD, field);
+		strbuf_addch(sb, '\n');
+		strbuf_release(&id);
 	}
 
 	switch (pp->fmt) {
@@ -1939,8 +2002,9 @@ static int pp_utf8_width(const char *start, const char *end)
 	return width;
 }
 
-static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
-				 const char *line, int linelen)
+static void strbuf_add_tabexpand(struct strbuf *sb, struct grep_opt *opt,
+				 int color, int tabwidth, const char *line,
+				 int linelen)
 {
 	const char *tab;
 
@@ -1957,7 +2021,9 @@ static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
 			break;
 
 		/* Output the data .. */
-		strbuf_add(sb, line, tab - line);
+		append_line_with_color(sb, opt, line, tab - line, color,
+				       GREP_CONTEXT_BODY,
+				       GREP_HEADER_FIELD_MAX);
 
 		/* .. and the de-tabified tab */
 		strbuf_addchars(sb, ' ', tabwidth - (width % tabwidth));
@@ -1972,7 +2038,8 @@ static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
 	 * worrying about width - there's nothing more to
 	 * align.
 	 */
-	strbuf_add(sb, line, linelen);
+	append_line_with_color(sb, opt, line, linelen, color, GREP_CONTEXT_BODY,
+			       GREP_HEADER_FIELD_MAX);
 }
 
 /*
@@ -1984,11 +2051,16 @@ static void pp_handle_indent(struct pretty_print_context *pp,
 			     struct strbuf *sb, int indent,
 			     const char *line, int linelen)
 {
+	struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
+
 	strbuf_addchars(sb, ' ', indent);
 	if (pp->expand_tabs_in_log)
-		strbuf_add_tabexpand(sb, pp->expand_tabs_in_log, line, linelen);
+		strbuf_add_tabexpand(sb, opt, pp->color, pp->expand_tabs_in_log,
+				     line, linelen);
 	else
-		strbuf_add(sb, line, linelen);
+		append_line_with_color(sb, opt, line, linelen, pp->color,
+				       GREP_CONTEXT_BODY,
+				       GREP_HEADER_FIELD_MAX);
 }
 
 static int is_mboxrd_from(const char *line, int len)
@@ -2006,7 +2078,9 @@ void pp_remainder(struct pretty_print_context *pp,
 		  struct strbuf *sb,
 		  int indent)
 {
+	struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
 	int first = 1;
+
 	for (;;) {
 		const char *line = *msg_p;
 		int linelen = get_one_line(line);
@@ -2027,14 +2101,17 @@ void pp_remainder(struct pretty_print_context *pp,
 		if (indent)
 			pp_handle_indent(pp, sb, indent, line, linelen);
 		else if (pp->expand_tabs_in_log)
-			strbuf_add_tabexpand(sb, pp->expand_tabs_in_log,
-					     line, linelen);
+			strbuf_add_tabexpand(sb, opt, pp->color,
+					     pp->expand_tabs_in_log, line,
+					     linelen);
 		else {
 			if (pp->fmt == CMIT_FMT_MBOXRD &&
 					is_mboxrd_from(line, linelen))
 				strbuf_addch(sb, '>');
 
-			strbuf_add(sb, line, linelen);
+			append_line_with_color(sb, opt, line, linelen,
+					       pp->color, GREP_CONTEXT_BODY,
+					       GREP_HEADER_FIELD_MAX);
 		}
 		strbuf_addch(sb, '\n');
 	}
diff --git a/t/t4202-log.sh b/t/t4202-log.sh
index 9dfead936b..3d240bba57 100755
--- a/t/t4202-log.sh
+++ b/t/t4202-log.sh
@@ -449,6 +449,57 @@ test_expect_success !FAIL_PREREQS 'log with various grep.patternType configurati
 	)
 '
 
+test_expect_success 'log --author' '
+	cat >expect <<-\EOF &&
+	Author: <BOLD;RED>A U<RESET> Thor <author@example.com>
+	EOF
+	git log -1 --color=always --author="A U" >log &&
+	grep Author log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'log --committer' '
+	cat >expect <<-\EOF &&
+	Commit:     C O Mitter <committer@<BOLD;RED>example<RESET>.com>
+	EOF
+	git log -1 --color=always --pretty=fuller --committer="example" >log &&
+	grep "Commit:" log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'log -i --grep with color' '
+	cat >expect <<-\EOF &&
+	    <BOLD;RED>Sec<RESET>ond
+	    <BOLD;RED>sec<RESET>ond
+	EOF
+	git log --color=always -i --grep=^sec >log &&
+	grep -i sec log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success '-c color.grep.selected log --grep' '
+	cat >expect <<-\EOF &&
+	    <GREEN>th<RESET><BOLD;RED>ir<RESET><GREEN>d<RESET>
+	EOF
+	git -c color.grep.selected="green" log --color=always --grep=ir >log &&
+	grep ir log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success '-c color.grep.matchSelected log --grep' '
+	cat >expect <<-\EOF &&
+	    <BLUE>i<RESET>n<BLUE>i<RESET>t<BLUE>i<RESET>al
+	EOF
+	git -c color.grep.matchSelected="blue" log --color=always --grep=i >log &&
+	grep al log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
 cat > expect <<EOF
 * Second
 * sixth
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-15 16:13 [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
  2021-10-15 16:13 ` [PATCH v13 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
@ 2021-10-15 16:13 ` Hamza Mahfooz
  2021-10-15 20:03   ` Junio C Hamano
  2021-11-15 20:43   ` Andreas Schwab
  2021-10-15 18:05 ` [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use Junio C Hamano
  2 siblings, 2 replies; 47+ messages in thread
From: Hamza Mahfooz @ 2021-10-15 16:13 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Hamza Mahfooz, Ævar Arnfjörð Bjarmason

If we attempt to grep non-ascii log message text with an ascii pattern, we
run into the following issue:

    $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
    grep: (standard input): binary file matches

So, to fix this teach the grep code to use PCRE2_UTF, as long as the log
output is encoded in UTF-8.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v12: get rid of utf8_all_the_things and fix an issue with one of the unit
     tests.

v13: make some clarifications in the commit message.
---
 grep.c                          |  6 +++--
 t/t7812-grep-icase-non-ascii.sh | 48 +++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/grep.c b/grep.c
index fe847a0111..f6e113e9f0 100644
--- a/grep.c
+++ b/grep.c
@@ -382,8 +382,10 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 		}
 		options |= PCRE2_CASELESS;
 	}
-	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
-	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
+	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
+	    (!opt->ignore_locale && is_utf8_locale() &&
+	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
+					    (p->fixed || p->is_fixed))))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
 
 #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index e5d1e4ea68..22487d90fd 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -53,6 +53,54 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
 	test_cmp expected actual
 '
 
+test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
+	cat >expected <<-\EOF &&
+	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
+	EOF
+	test_write_lines "forth" >file4 &&
+	git add file4 &&
+	git commit --author="À Ú Thor <author@example.com>" -m sécond &&
+	git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
+	grep Author log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' '
+	cat >expected <<-\EOF &&
+	Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
+	EOF
+	test_write_lines "fifth" >file5 &&
+	git add file5 &&
+	GIT_COMMITTER_NAME="Ç O Mîtter" &&
+	GIT_COMMITTER_EMAIL="committer@example.com" &&
+	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
+	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
+	grep Commit: log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' '
+	cat >expected <<-\EOF &&
+	    sé<BOLD;RED>con<RESET>d
+	EOF
+	git log -1 --color=always --perl-regexp --grep="con" >log &&
+	grep con log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' '
+	cat >expected <<-\EOF &&
+	    <BOLD;RED>thïrd<RESET>
+	EOF
+	git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log &&
+	grep "th.*rd" log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
 test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' '
 	printf "\\200\\n" >invalid-0x80 &&
 	echo "ævar" >expected &&
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use
  2021-10-15 16:13 [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
  2021-10-15 16:13 ` [PATCH v13 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
  2021-10-15 16:13 ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
@ 2021-10-15 18:05 ` Junio C Hamano
  2021-10-15 18:24   ` Hamza Mahfooz
  2 siblings, 1 reply; 47+ messages in thread
From: Junio C Hamano @ 2021-10-15 18:05 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: git

Hamza Mahfooz <someguy@effective-light.com> writes:

> These changes are made in preparation of, the colorization support for the
> "git log" subcommands that, rely on regex functionality (i.e. "--author",
> "--committer" and "--grep"). These changes are necessary primarily because
> match_one_pattern() expects header lines to be prefixed, however, in
> pretty, the prefixes are stripped from the lines because the name-email
> pairs need to go through additional parsing, before they can be printed and
> because next_match() doesn't handle the case of
> "ctx == GREP_CONTEXT_HEAD" at all. So, teach next_match() how to handle the
> new case and move match_one_pattern()'s core logic to
> headerless_match_one_pattern() while preserving match_one_pattern()'s uses
> that depend on the additional processing.
>
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v5: separate grep changes from pretty changes.
>
> v6: rescope some variables.
>
> v7: export header_field[] and allow for subsequent matches on header lines
>     in match_one_pattern().
>
> v8: add headerless_match_one_pattern() and move header_field[] back.
>
> v9: get rid of the new check in headerless_match_one_pattern(), move the
>     header pattern filtering logic in grep_next_match() and document
>     grep_next_match() in grep.h.
>
> v10: add a "magic" comment in grep_next_match() to signify a fall through
>      in the switch statement.

Makes readers curious what happend in v11 and later...

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use
  2021-10-15 18:05 ` [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use Junio C Hamano
@ 2021-10-15 18:24   ` Hamza Mahfooz
  2021-10-15 19:28     ` Junio C Hamano
  0 siblings, 1 reply; 47+ messages in thread
From: Hamza Mahfooz @ 2021-10-15 18:24 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git


On Fri, Oct 15 2021 at 11:05:24 AM -0700, Junio C Hamano 
<gitster@pobox.com> wrote:
> Makes readers curious what happend in v11 and later...

It was my understanding that no comment means nothing changed since 
then, or is something to the effect of "no changes" preferable?



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use
  2021-10-15 18:24   ` Hamza Mahfooz
@ 2021-10-15 19:28     ` Junio C Hamano
  2021-10-15 19:40       ` Hamza Mahfooz
  2021-10-15 19:49       ` Junio C Hamano
  0 siblings, 2 replies; 47+ messages in thread
From: Junio C Hamano @ 2021-10-15 19:28 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: git

Hamza Mahfooz <someguy@effective-light.com> writes:

> On Fri, Oct 15 2021 at 11:05:24 AM -0700, Junio C Hamano
> <gitster@pobox.com> wrote:
>> Makes readers curious what happend in v11 and later...
>
> It was my understanding that no comment means nothing changed since
> then, or is something to the effect of "no changes" preferable?

Usually people do not want to spam the list with a multi-patch
series when they have nothing new to show, unless there are other
reasons to do so, like "there is no change, but I am sending it
again because nobody seemed to have time reviewing the series the
last time", in which case that would make a good explanation to put
there.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use
  2021-10-15 19:28     ` Junio C Hamano
@ 2021-10-15 19:40       ` Hamza Mahfooz
  2021-10-15 19:49       ` Junio C Hamano
  1 sibling, 0 replies; 47+ messages in thread
From: Hamza Mahfooz @ 2021-10-15 19:40 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git


On Fri, Oct 15 2021 at 12:28:42 PM -0700, Junio C Hamano 
<gitster@pobox.com> wrote:
> 
> Usually people do not want to spam the list with a multi-patch
> series when they have nothing new to show, unless there are other
> reasons to do so, like "there is no change, but I am sending it
> again because nobody seemed to have time reviewing the series the
> last time", in which case that would make a good explanation to put
> there.

I made changes to other patches in the series however, (I only document 
the changes if that patch in particular has changed.)



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use
  2021-10-15 19:28     ` Junio C Hamano
  2021-10-15 19:40       ` Hamza Mahfooz
@ 2021-10-15 19:49       ` Junio C Hamano
  1 sibling, 0 replies; 47+ messages in thread
From: Junio C Hamano @ 2021-10-15 19:49 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: git

Junio C Hamano <gitster@pobox.com> writes:

> Hamza Mahfooz <someguy@effective-light.com> writes:
>
>> On Fri, Oct 15 2021 at 11:05:24 AM -0700, Junio C Hamano
>> <gitster@pobox.com> wrote:
>>> Makes readers curious what happend in v11 and later...
>>
>> It was my understanding that no comment means nothing changed since
>> then, or is something to the effect of "no changes" preferable?
>
> Usually people do not want to spam the list with a multi-patch
> series when they have nothing new to show, unless there are other
> reasons to do so, like "there is no change, but I am sending it
> again because nobody seemed to have time reviewing the series the
> last time", in which case that would make a good explanation to put
> there.

Ah, I thought I was responding to a cover letter.  OK, so you were
saying that there was no change in 1/3, but other steps may be
different.

In that case, that makes sense (even though it is more helpful to
just say "v13: same as v12" or something).

Thanks.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-15 16:13 ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
@ 2021-10-15 20:03   ` Junio C Hamano
  2021-10-16 16:25     ` René Scharfe
  2021-11-15 20:43   ` Andreas Schwab
  1 sibling, 1 reply; 47+ messages in thread
From: Junio C Hamano @ 2021-10-15 20:03 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: git, Ævar Arnfjörð Bjarmason

Hamza Mahfooz <someguy@effective-light.com> writes:

> If we attempt to grep non-ascii log message text with an ascii pattern, we

"with an ascii pattern, when Git is built with and told to use pcre2, we"

> run into the following issue:
>
>     $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
>     grep: (standard input): binary file matches
>
> So, to fix this teach the grep code to use PCRE2_UTF, as long as the log
> output is encoded in UTF-8.

> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> -	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
> +	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
> +	    (!opt->ignore_locale && is_utf8_locale() &&
> +	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
> +					    (p->fixed || p->is_fixed))))

That's a mouthful.  It is not obvious what new condition is being
added.  I had to flip the order to see the only difference is, that

> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> -	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
> +	if ((!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> +	    !(!opt->ignore_case && (p->fixed || p->is_fixed))) ||
> +	    (!opt->ignore_locale && !has_non_ascii(p->pattern)))

... in addition to the case where the original condition holds, if
we do not say "ignore locale" and the pattern is ascii-only, we
apply these two option flags.  And that matches what the proposed
log message explained as the condition the problem appears.

So,... looks good, I guess.

Thanks, will queue.


Addendum.

If we were reordering pieces in the condition, I wonder if there is
a better way to reorganize it, though.  The original is already
barely explainable with words, and with this new condition added, I
am not sure if anybody can phrase the condition in simple words to
others after staring it for a few minutes.  I can't.

But straightening it out is best left as a future clean-up patch,
separate from this series.


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-15 20:03   ` Junio C Hamano
@ 2021-10-16 16:25     ` René Scharfe
  2021-10-16 17:12       ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 47+ messages in thread
From: René Scharfe @ 2021-10-16 16:25 UTC (permalink / raw)
  To: Junio C Hamano, Hamza Mahfooz; +Cc: git, Ævar Arnfjörð Bjarmason

Am 15.10.21 um 22:03 schrieb Junio C Hamano:
> Hamza Mahfooz <someguy@effective-light.com> writes:
>
>> If we attempt to grep non-ascii log message text with an ascii pattern, we
>
> "with an ascii pattern, when Git is built with and told to use pcre2, we"
>
>> run into the following issue:
>>
>>     $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
>>     grep: (standard input): binary file matches

I get no error message on macOS 11.6, but this result, with the underlined
part in red:

   Author: ??var Arnfjörð Bjarmason <avarab@gmail.com>
            ^^^^^^^^^^^^^^^^^^

So the pattern matches the second byte of a two-byte character, inserts a
color code in the middle and thus produces invalid output in this case.

>>
>> So, to fix this teach the grep code to use PCRE2_UTF, as long as the log
>> output is encoded in UTF-8.
>
>> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>> -	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>> +	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
>> +	    (!opt->ignore_locale && is_utf8_locale() &&
>> +	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
>> +					    (p->fixed || p->is_fixed))))
>
> That's a mouthful.  It is not obvious what new condition is being
> added.  I had to flip the order to see the only difference is, that
>
>> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>> -	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>> +	if ((!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>> +	    !(!opt->ignore_case && (p->fixed || p->is_fixed))) ||
>> +	    (!opt->ignore_locale && !has_non_ascii(p->pattern)))
>
> ... in addition to the case where the original condition holds, if
> we do not say "ignore locale" and the pattern is ascii-only, we
> apply these two option flags.  And that matches what the proposed
> log message explained as the condition the problem appears.
>
> So,... looks good, I guess.
>
> Thanks, will queue.
>
>
> Addendum.
>
> If we were reordering pieces in the condition, I wonder if there is
> a better way to reorganize it, though.  The original is already
> barely explainable with words, and with this new condition added, I
> am not sure if anybody can phrase the condition in simple words to
> others after staring it for a few minutes.  I can't.
>
> But straightening it out is best left as a future clean-up patch,
> separate from this series.
>

It can be written as:

	literal = !opt->ignore_case && (p->fixed || p->is_fixed);
	if (!opt->ignore_locale) {
		if (!has_non_ascii(p->pattern) ||
		    (is_utf8_locale() && !literal))
			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
	}

Literal patterns are those that don't use any wildcards or case-folding.
If the text is encoded in UTF-8 then we enable PCRE2_UTF either if the
pattern only consists of ASCII characters, or if the pattern is encoded
in UTF-8 and is not just a literal pattern.

Hmm.  Why enable PCRE2_UTF for literal patterns that consist of only
ASCII chars?

The old condition was (reformatted to better match the new one):

	if (!opt->ignore_locale) {
		if (is_utf8_locale() && has_non_ascii(p->pattern) && !literal)
			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
	}

Intuitively I'd say the condition should be:

	if (!opt->ignore_locale && is_utf8_locale()) {
		if (has_non_ascii(p->pattern) || !literal)
			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
	}

If both input and pattern are encoded in UTF-8, enable PCRE2_UTF if we
have to match non-ASCII characters or do more than just literal
matching.

For literal patterns that consist only of ASCII characters we don't need
the cost and complication of PCRE2_UTF.

Makes sense?

René

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-16 16:25     ` René Scharfe
@ 2021-10-16 17:12       ` Ævar Arnfjörð Bjarmason
  2021-10-16 19:44         ` René Scharfe
  0 siblings, 1 reply; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-10-16 17:12 UTC (permalink / raw)
  To: René Scharfe; +Cc: Junio C Hamano, Hamza Mahfooz, git


On Sat, Oct 16 2021, René Scharfe wrote:

> Am 15.10.21 um 22:03 schrieb Junio C Hamano:
>> Hamza Mahfooz <someguy@effective-light.com> writes:
>>
>>> If we attempt to grep non-ascii log message text with an ascii pattern, we
>>
>> "with an ascii pattern, when Git is built with and told to use pcre2, we"
>>
>>> run into the following issue:
>>>
>>>     $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
>>>     grep: (standard input): binary file matches
>
> I get no error message on macOS 11.6, but this result, with the underlined
> part in red:
>
>    Author: ??var Arnfjörð Bjarmason <avarab@gmail.com>
>             ^^^^^^^^^^^^^^^^^^
>
> So the pattern matches the second byte of a two-byte character, inserts a
> color code in the middle and thus produces invalid output in this case.

Thanks for digging into these edge cases...

>>>
>>> So, to fix this teach the grep code to use PCRE2_UTF, as long as the log
>>> output is encoded in UTF-8.
>>
>>> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>>> -	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>>> +	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
>>> +	    (!opt->ignore_locale && is_utf8_locale() &&
>>> +	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
>>> +					    (p->fixed || p->is_fixed))))
>>
>> That's a mouthful.  It is not obvious what new condition is being
>> added.  I had to flip the order to see the only difference is, that
>>
>>> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>>> -	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>>> +	if ((!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>>> +	    !(!opt->ignore_case && (p->fixed || p->is_fixed))) ||
>>> +	    (!opt->ignore_locale && !has_non_ascii(p->pattern)))
>>
>> ... in addition to the case where the original condition holds, if
>> we do not say "ignore locale" and the pattern is ascii-only, we
>> apply these two option flags.  And that matches what the proposed
>> log message explained as the condition the problem appears.
>>
>> So,... looks good, I guess.
>>
>> Thanks, will queue.
>>
>>
>> Addendum.
>>
>> If we were reordering pieces in the condition, I wonder if there is
>> a better way to reorganize it, though.  The original is already
>> barely explainable with words, and with this new condition added, I
>> am not sure if anybody can phrase the condition in simple words to
>> others after staring it for a few minutes.  I can't.
>>
>> But straightening it out is best left as a future clean-up patch,
>> separate from this series.
>>
>
> It can be written as:
>
> 	literal = !opt->ignore_case && (p->fixed || p->is_fixed);
> 	if (!opt->ignore_locale) {
> 		if (!has_non_ascii(p->pattern) ||
> 		    (is_utf8_locale() && !literal))
> 			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
> 	}

Whatever we go from here I'm very much for untangling that condition,
but I guess it can be done as a follow-up too, I'll defer to Hamza
there...

> Literal patterns are those that don't use any wildcards or case-folding.
> If the text is encoded in UTF-8 then we enable PCRE2_UTF either if the
> pattern only consists of ASCII characters, or if the pattern is encoded
> in UTF-8 and is not just a literal pattern.
>
> Hmm.  Why enable PCRE2_UTF for literal patterns that consist of only
> ASCII chars?
>
> The old condition was (reformatted to better match the new one):
>
> 	if (!opt->ignore_locale) {
> 		if (is_utf8_locale() && has_non_ascii(p->pattern) && !literal)
> 			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
> 	}
>
> Intuitively I'd say the condition should be:
>
> 	if (!opt->ignore_locale && is_utf8_locale()) {
> 		if (has_non_ascii(p->pattern) || !literal)
> 			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
> 	}
>
> If both input and pattern are encoded in UTF-8, enable PCRE2_UTF if we
> have to match non-ASCII characters or do more than just literal
> matching.
>
> For literal patterns that consist only of ASCII characters we don't need
> the cost and complication of PCRE2_UTF.
>
> Makes sense?

    echo 'René Scharfe' >f &&
    $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?Ren. S' f; echo $?
    1
    $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?R[eé]n. S' f; echo $?
    f:René Scharfe
    0

So it's a choose-your-own adventure where you can pick if you're
right. I.e. do you want the "." metacharacter to match your "é" or not?

These sorts of patterns demonstrate nicely that the relationship between your
pattern being ASCII and wanting or not wanting UTF-8 matching semantics
isn't what you might imagine it to be.

If you look at:

    git log --reverse -p -Gis_utf8_locale -- grep.c

And my earlier replies in this thread-at-large (not digging up a
reference now, sorry), you'll see that the current behavior in grep.c is
really a compromise based on some intersection of user patterns, us
potentially grepping arbitrary binary data and/or "valid" encodings, and
what PCRE does and doesn't puke on.

It's not "right" by any sense, but we sort of limp along with it, mostly
because unlike say Perl's regex engine which really is used for serious
production work where Unicode-correctness matters I don't think anyone
is using "git grep" for anything like that, it's mostly for people's
ad-hoc greps.

Right now I can't remember if the only reason we can't "fix this" is
because we'd be too aggressive with the PCRE version dependency, see
95ca1f987ed (grep/pcre2: better support invalid UTF-8 haystacks,
2021-01-24).







^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-16 17:12       ` Ævar Arnfjörð Bjarmason
@ 2021-10-16 19:44         ` René Scharfe
  2021-10-17  6:00           ` Junio C Hamano
  0 siblings, 1 reply; 47+ messages in thread
From: René Scharfe @ 2021-10-16 19:44 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: Junio C Hamano, Hamza Mahfooz, git

Am 16.10.21 um 19:12 schrieb Ævar Arnfjörð Bjarmason:
>
> On Sat, Oct 16 2021, René Scharfe wrote:
>
>> Am 15.10.21 um 22:03 schrieb Junio C Hamano:
>>> Hamza Mahfooz <someguy@effective-light.com> writes:
>>>
>>>> If we attempt to grep non-ascii log message text with an ascii pattern, we
>>>
>>> "with an ascii pattern, when Git is built with and told to use pcre2, we"
>>>
>>>> run into the following issue:
>>>>
>>>>     $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
>>>>     grep: (standard input): binary file matches
>>
>> I get no error message on macOS 11.6, but this result, with the underlined
>> part in red:
>>
>>    Author: ??var Arnfjörð Bjarmason <avarab@gmail.com>
>>             ^^^^^^^^^^^^^^^^^^
>>
>> So the pattern matches the second byte of a two-byte character, inserts a
>> color code in the middle and thus produces invalid output in this case.
>
> Thanks for digging into these edge cases...
>
>>>>
>>>> So, to fix this teach the grep code to use PCRE2_UTF, as long as the log
>>>> output is encoded in UTF-8.
>>>
>>>> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>>>> -	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>>>> +	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
>>>> +	    (!opt->ignore_locale && is_utf8_locale() &&
>>>> +	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
>>>> +					    (p->fixed || p->is_fixed))))
>>>
>>> That's a mouthful.  It is not obvious what new condition is being
>>> added.  I had to flip the order to see the only difference is, that
>>>
>>>> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>>>> -	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>>>> +	if ((!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
>>>> +	    !(!opt->ignore_case && (p->fixed || p->is_fixed))) ||
>>>> +	    (!opt->ignore_locale && !has_non_ascii(p->pattern)))
>>>
>>> ... in addition to the case where the original condition holds, if
>>> we do not say "ignore locale" and the pattern is ascii-only, we
>>> apply these two option flags.  And that matches what the proposed
>>> log message explained as the condition the problem appears.
>>>
>>> So,... looks good, I guess.
>>>
>>> Thanks, will queue.
>>>
>>>
>>> Addendum.
>>>
>>> If we were reordering pieces in the condition, I wonder if there is
>>> a better way to reorganize it, though.  The original is already
>>> barely explainable with words, and with this new condition added, I
>>> am not sure if anybody can phrase the condition in simple words to
>>> others after staring it for a few minutes.  I can't.
>>>
>>> But straightening it out is best left as a future clean-up patch,
>>> separate from this series.
>>>
>>
>> It can be written as:
>>
>> 	literal = !opt->ignore_case && (p->fixed || p->is_fixed);
>> 	if (!opt->ignore_locale) {
>> 		if (!has_non_ascii(p->pattern) ||
>> 		    (is_utf8_locale() && !literal))
>> 			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>> 	}
>
> Whatever we go from here I'm very much for untangling that condition,
> but I guess it can be done as a follow-up too, I'll defer to Hamza
> there...
>
>> Literal patterns are those that don't use any wildcards or case-folding.
>> If the text is encoded in UTF-8 then we enable PCRE2_UTF either if the
>> pattern only consists of ASCII characters, or if the pattern is encoded
>> in UTF-8 and is not just a literal pattern.
>>
>> Hmm.  Why enable PCRE2_UTF for literal patterns that consist of only
>> ASCII chars?
>>
>> The old condition was (reformatted to better match the new one):
>>
>> 	if (!opt->ignore_locale) {
>> 		if (is_utf8_locale() && has_non_ascii(p->pattern) && !literal)
>> 			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>> 	}
>>
>> Intuitively I'd say the condition should be:
>>
>> 	if (!opt->ignore_locale && is_utf8_locale()) {
>> 		if (has_non_ascii(p->pattern) || !literal)
>> 			options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>> 	}
>>
>> If both input and pattern are encoded in UTF-8, enable PCRE2_UTF if we
>> have to match non-ASCII characters or do more than just literal
>> matching.
>>
>> For literal patterns that consist only of ASCII characters we don't need
>> the cost and complication of PCRE2_UTF.
>>
>> Makes sense?
>
>     echo 'René Scharfe' >f &&
>     $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?Ren. S' f; echo $?
>     1
>     $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?R[eé]n. S' f; echo $?
>     f:René Scharfe
>     0
>
> So it's a choose-your-own adventure where you can pick if you're
> right. I.e. do you want the "." metacharacter to match your "é" or not?

Yes, I do, and it's what Hamza's patch is fixing.

> These sorts of patterns demonstrate nicely that the relationship between your
> pattern being ASCII and wanting or not wanting UTF-8 matching semantics
> isn't what you might imagine it to be.

Differences are:

o: opt->ignore_locale
h: has_non_ascii(p->pattern)
i: is_utf8_locale()
l: literal

o h i l master hamza rene
0 0 0 0      0     1    0
0 0 0 1      0     1    0
0 0 1 0      0     1    1   <== your first example
0 0 1 1      0     1    0
0 1 1 1      0     0    1

Turning on PCRE2_UTF when is_utf8_locale() == 0 seems wrong (first two
lines).

Turning on PCRE2_UTF for literal matching (fourth line) goes against
870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching,
2019-07-26).

Turning on PCRE2_UTF for literal matching of non-ASCII characters (fifth
line) also goes against that, so my intuition betrayed me.  When I
adjust it, I get:

	if (!opt->ignore_locale && is_utf8_locale() && !literal)
		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);

That looks deceptively simple -- just drop has_non_ascii(p->pattern)
from the original condition.

Your second example is handle the same by all versions btw.:

o h i l master hamza rene
0 1 1 0      1     1    1

René

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-16 19:44         ` René Scharfe
@ 2021-10-17  6:00           ` Junio C Hamano
  2021-10-17  6:55             ` René Scharfe
  0 siblings, 1 reply; 47+ messages in thread
From: Junio C Hamano @ 2021-10-17  6:00 UTC (permalink / raw)
  To: René Scharfe
  Cc: Ævar Arnfjörð Bjarmason, Hamza Mahfooz, git

René Scharfe <l.s.r@web.de> writes:

>>> Literal patterns are those that don't use any wildcards or case-folding.
>>> If the text is encoded in UTF-8 then we enable PCRE2_UTF either if the
>>> pattern only consists of ASCII characters, or if the pattern is encoded
>>> in UTF-8 and is not just a literal pattern.
>>>
>>> Hmm.  Why enable PCRE2_UTF for literal patterns that consist of only
>>> ASCII chars?
>>> ...
>>     echo 'René Scharfe' >f &&
>>     $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?Ren. S' f; echo $?
>>     1
>>     $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?R[eé]n. S' f; echo $?
>>     f:René Scharfe
>>     0
>>
>> So it's a choose-your-own adventure where you can pick if you're
>> right. I.e. do you want the "." metacharacter to match your "é" or not?
>
> Yes, I do, and it's what Hamza's patch is fixing.

That may be correct but is this discussion still about "Why enable
... for literal patterns that consist of only ASCII"?  Calling "." a
"metacharacter" and wanting it to match anything other than a single
dot would mean the pattern we are discussing is no longer "literal",
isn't it?  I am puzzled.

>> These sorts of patterns demonstrate nicely that the relationship between your
>> pattern being ASCII and wanting or not wanting UTF-8 matching semantics
>> isn't what you might imagine it to be.
>
> Differences are:
>
> o: opt->ignore_locale
> h: has_non_ascii(p->pattern)
> i: is_utf8_locale()
> l: literal
>
> o h i l master hamza rene
> 0 0 0 0      0     1    0
> 0 0 0 1      0     1    0
> 0 0 1 0      0     1    1   <== your first example
> 0 0 1 1      0     1    0
> 0 1 1 1      0     0    1
>
> Turning on PCRE2_UTF when is_utf8_locale() == 0 seems wrong (first two
> lines).
>
> Turning on PCRE2_UTF for literal matching (fourth line) goes against
> 870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching,
> 2019-07-26).
>
> Turning on PCRE2_UTF for literal matching of non-ASCII characters (fifth
> line) also goes against that, so my intuition betrayed me.  When I
> adjust it, I get:
>
> 	if (!opt->ignore_locale && is_utf8_locale() && !literal)
> 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>
> That looks deceptively simple -- just drop has_non_ascii(p->pattern)
> from the original condition.
>
> Your second example is handle the same by all versions btw.:
>
> o h i l master hamza rene
> 0 1 1 0      1     1    1
>
> René

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-17  6:00           ` Junio C Hamano
@ 2021-10-17  6:55             ` René Scharfe
  2021-10-17  9:44               ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 47+ messages in thread
From: René Scharfe @ 2021-10-17  6:55 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ævar Arnfjörð Bjarmason, Hamza Mahfooz, git

Am 17.10.21 um 08:00 schrieb Junio C Hamano:
> René Scharfe <l.s.r@web.de> writes:
>
>>>> Literal patterns are those that don't use any wildcards or case-folding.
>>>> If the text is encoded in UTF-8 then we enable PCRE2_UTF either if the
>>>> pattern only consists of ASCII characters, or if the pattern is encoded
>>>> in UTF-8 and is not just a literal pattern.
>>>>
>>>> Hmm.  Why enable PCRE2_UTF for literal patterns that consist of only
>>>> ASCII chars?
>>>> ...
>>>     echo 'René Scharfe' >f &&
>>>     $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?Ren. S' f; echo $?
>>>     1
>>>     $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?R[eé]n. S' f; echo $?
>>>     f:René Scharfe
>>>     0
>>>
>>> So it's a choose-your-own adventure where you can pick if you're
>>> right. I.e. do you want the "." metacharacter to match your "é" or not?
>>
>> Yes, I do, and it's what Hamza's patch is fixing.
>
> That may be correct but is this discussion still about "Why enable
> ... for literal patterns that consist of only ASCII"?  Calling "." a
> "metacharacter" and wanting it to match anything other than a single
> dot would mean the pattern we are discussing is no longer "literal",
> isn't it?  I am puzzled.

Right, Ævar's comment is not about my question, but highlights an
inconsistency in master that is fixed by Hamza's patch.

I'll repeat and extend my question: Hamza's patch enables PCRE2_UTF for
non-ASCII patterns even if they are literal or our locale is not UTF-8.
The following change would fix the edge case mentioned in its commit
message without these side-effects.  Am I correct?

diff --git a/grep.c b/grep.c
index fe847a0111..5badb6d851 100644
--- a/grep.c
+++ b/grep.c
@@ -382,7 +382,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 		}
 		options |= PCRE2_CASELESS;
 	}
-	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
+	if (!opt->ignore_locale && is_utf8_locale() &&
 	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);


^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-17  6:55             ` René Scharfe
@ 2021-10-17  9:44               ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-10-17  9:44 UTC (permalink / raw)
  To: René Scharfe; +Cc: Junio C Hamano, Hamza Mahfooz, git


On Sun, Oct 17 2021, René Scharfe wrote:

> Am 17.10.21 um 08:00 schrieb Junio C Hamano:
>> René Scharfe <l.s.r@web.de> writes:
>>
>>>>> Literal patterns are those that don't use any wildcards or case-folding.
>>>>> If the text is encoded in UTF-8 then we enable PCRE2_UTF either if the
>>>>> pattern only consists of ASCII characters, or if the pattern is encoded
>>>>> in UTF-8 and is not just a literal pattern.
>>>>>
>>>>> Hmm.  Why enable PCRE2_UTF for literal patterns that consist of only
>>>>> ASCII chars?
>>>>> ...
>>>>     echo 'René Scharfe' >f &&
>>>>     $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?Ren. S' f; echo $?
>>>>     1
>>>>     $ git -P grep --no-index -P '^(?:You are (?:wrong|correct), )?R[eé]n. S' f; echo $?
>>>>     f:René Scharfe
>>>>     0
>>>>
>>>> So it's a choose-your-own adventure where you can pick if you're
>>>> right. I.e. do you want the "." metacharacter to match your "é" or not?
>>>
>>> Yes, I do, and it's what Hamza's patch is fixing.
>>
>> That may be correct but is this discussion still about "Why enable
>> ... for literal patterns that consist of only ASCII"?  Calling "." a
>> "metacharacter" and wanting it to match anything other than a single
>> dot would mean the pattern we are discussing is no longer "literal",
>> isn't it?  I am puzzled.
>
> Right, Ævar's comment is not about my question, but highlights an
> inconsistency in master that is fixed by Hamza's patch.

Yes, sorry about that. Just generally about the messy semantics.

> I'll repeat and extend my question: Hamza's patch enables PCRE2_UTF for
> non-ASCII patterns even if they are literal or our locale is not UTF-8.
> The following change would fix the edge case mentioned in its commit
> message without these side-effects.  Am I correct?
>
> diff --git a/grep.c b/grep.c
> index fe847a0111..5badb6d851 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -382,7 +382,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>  		}
>  		options |= PCRE2_CASELESS;
>  	}
> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> +	if (!opt->ignore_locale && is_utf8_locale() &&
>  	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);

I haven't had time to carefully look into this, but one caveat to check
out is if it works with older PCRE and whether you need e.g. the
GIT_PCRE2_VERSION_10_36_OR_HIGHER.

I tried your suggestion on top of Hamza's series, compiled PCRE v2
10.23, tested, and also tried manually removing the
PCRE2_MATCH_INVALID_UTF flag and tested again.

We pass all tests with both, so maybe this is safe to do (or maybe we're
missing some test we haven't thought of yet...).

One thing that makes me nervous is that we had breakages in the past
once the patches escaped into the wild, particularly because the code
being modified here has is_utf8_locale(), and our tests run under
LANG=c LC_ALL=C.

I tried running all the tests with a non-C locale, there's a lot of
failures, but none new with this change. As an aside the below patch
makes all but one shortlog test pass for me. I wonder if we shouldn't do
this for real to smoke out any $LANG or $LC_ALL dependencies.

I.e. almost all of the failures were due to relying on the sort order of
sort(1), and in one case comm(1), the first hunk here is also redundant
to defining our own ls(1) wrapper....

diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh
index bd17f308b38..738ca6ef587 100755
--- a/t/t5319-multi-pack-index.sh
+++ b/t/t5319-multi-pack-index.sh
@@ -693,12 +693,12 @@ test_expect_success 'expire removes unreferenced packs' '
 		^refs/heads/C
 		EOF
 		git multi-pack-index write &&
-		ls .git/objects/pack | grep -v -e pack-[AB] >expect &&
+		ls .git/objects/pack | sort | grep -v -e pack-[AB] >expect &&
 		git multi-pack-index expire &&
-		ls .git/objects/pack >actual &&
+		ls .git/objects/pack | sort >actual &&
 		test_cmp expect actual &&
-		ls .git/objects/pack/ | grep idx >expect-idx &&
-		test-tool read-midx .git/objects | grep idx >actual-midx &&
+		ls .git/objects/pack/ | sort | grep idx >expect-idx &&
+		test-tool read-midx .git/objects | sort | grep idx >actual-midx &&
 		test_cmp expect-idx actual-midx &&
 		git multi-pack-index verify &&
 		git fsck
@@ -802,7 +802,7 @@ test_expect_success 'expire works when adding new packs' '
 		refs/heads/E
 		EOF
 		git multi-pack-index expire &&
-		ls .git/objects/pack/ | grep idx >expect &&
+		ls .git/objects/pack/ | sort | grep idx >expect &&
 		test-tool read-midx .git/objects | grep idx >actual &&
 		test_cmp expect actual &&
 		git multi-pack-index verify
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 8361b5c1c57..f4f9d231f28 100644
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -417,14 +417,22 @@ test -n "$BASH_VERSION" && shopt -u checkwinsize 2>/dev/null
 
 # For repeatability, reset the environment to known value.
 # TERM is sanitized below, after saving color control sequences.
-LANG=C
-LC_ALL=C
+LANG=en_US.UTF-8
+LC_ALL=en_US.UTF-8
 PAGER=cat
 TZ=UTC
 COLUMNS=80
 export LANG LC_ALL PAGER TZ COLUMNS
 EDITOR=:
 
+sort () {
+	LC_ALL=C LANG=C command sort "$@"
+}
+
+comm () {
+	LC_ALL=C LANG=C command comm "$@"
+}
+
 # A call to "unset" with no arguments causes at least Solaris 10
 # /usr/xpg4/bin/sh and /bin/ksh to bail out.  So keep the unsets
 # deriving from the command substitution clustered with the other

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-15 16:13 ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
  2021-10-15 20:03   ` Junio C Hamano
@ 2021-11-15 20:43   ` Andreas Schwab
  2021-11-15 22:41     ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 47+ messages in thread
From: Andreas Schwab @ 2021-11-15 20:43 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: git, Junio C Hamano, Ævar Arnfjörð Bjarmason

This breaks 'PCRE v2: grep ASCII from invalid UTF-8 data'.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-15 20:43   ` Andreas Schwab
@ 2021-11-15 22:41     ` Ævar Arnfjörð Bjarmason
  2021-11-16  2:12       ` Carlo Arenas
  2021-11-16  8:41       ` Andreas Schwab
  0 siblings, 2 replies; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-11-15 22:41 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: Hamza Mahfooz, git, Junio C Hamano


On Mon, Nov 15 2021, Andreas Schwab wrote:

> This breaks 'PCRE v2: grep ASCII from invalid UTF-8 data'.

What PCREv2 version are you using? I wonder if it's got to do with the
10.36 boundary, as noted in 95ca1f987ed (grep/pcre2: better support
invalid UTF-8 haystacks, 2021-01-24). One of the two tests matching your
description tests for that bug.

I can't remember much of the details of the bug, and it seems PCREv2's
bug tracker went private, or at least I can't view
https://bugs.exim.org/show_bug.cgi?id=2642 anymore.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-15 22:41     ` Ævar Arnfjörð Bjarmason
@ 2021-11-16  2:12       ` Carlo Arenas
  2021-11-16  8:41       ` Andreas Schwab
  1 sibling, 0 replies; 47+ messages in thread
From: Carlo Arenas @ 2021-11-16  2:12 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Andreas Schwab, Hamza Mahfooz, git, Junio C Hamano

> I can't remember much of the details of the bug, and it seems PCREv2's
> bug tracker went private, or at least I can't view
> https://bugs.exim.org/show_bug.cgi?id=2642 anymore.

It was shutdown; all new development is now done in github[1], old
closed bugs were not migrated over though but the fix is pretty
straight forward if it needs backporting[2]

I would have expected the less risky change from René[3] to be
released though, but might be a good opportunity to take a deeper look
at the underlying problem and also have a likely solution to both the
regression and the original bug.

Carlo

[1] https://github.com/PhilipHazel/pcre2
[2] https://github.com/PhilipHazel/pcre2/commit/f8cbb1f58df05f175a6898f35dc18e26be6362d0
[3] https://lore.kernel.org/git/0ea73e7a-6d43-e223-ab2e-24c684102856@web.de/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-15 22:41     ` Ævar Arnfjörð Bjarmason
  2021-11-16  2:12       ` Carlo Arenas
@ 2021-11-16  8:41       ` Andreas Schwab
  2021-11-16  9:06         ` Carlo Arenas
  1 sibling, 1 reply; 47+ messages in thread
From: Andreas Schwab @ 2021-11-16  8:41 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: Hamza Mahfooz, git, Junio C Hamano

On Nov 15 2021, Ævar Arnfjörð Bjarmason wrote:

> What PCREv2 version are you using?

10.31

https://build.opensuse.org/package/show/openSUSE:Leap:15.3:Update/pcre2

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-16  8:41       ` Andreas Schwab
@ 2021-11-16  9:06         ` Carlo Arenas
  2021-11-16  9:18           ` Andreas Schwab
  2021-11-16  9:29           ` Andreas Schwab
  0 siblings, 2 replies; 47+ messages in thread
From: Carlo Arenas @ 2021-11-16  9:06 UTC (permalink / raw)
  To: Andreas Schwab
  Cc: Ævar Arnfjörð Bjarmason, Hamza Mahfooz, git,
	Junio C Hamano

On Tue, Nov 16, 2021 at 12:42 AM Andreas Schwab <schwab@linux-m68k.org> wrote:
>
> On Nov 15 2021, Ævar Arnfjörð Bjarmason wrote:
>
> > What PCREv2 version are you using?
>
> 10.31
>
> https://build.opensuse.org/package/show/openSUSE:Leap:15.3:Update/pcre2

interesting; that version doesn't need the workaround because it
doesn't have MATCH_INVALID_UTF that was released with 10.34, but it
has a recently backported fix for a similar issue that might be
confusing git and it is somehow related to it.

a full log (run it with -x -i, and get the last lines) would be useful.

curious also if you see other problems using `git log --grep` and
which locale do you have configured?

Carlo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-16  9:06         ` Carlo Arenas
@ 2021-11-16  9:18           ` Andreas Schwab
  2021-11-16  9:29           ` Andreas Schwab
  1 sibling, 0 replies; 47+ messages in thread
From: Andreas Schwab @ 2021-11-16  9:18 UTC (permalink / raw)
  To: Carlo Arenas
  Cc: Ævar Arnfjörð Bjarmason, Hamza Mahfooz, git,
	Junio C Hamano

On Nov 16 2021, Carlo Arenas wrote:

> curious also if you see other problems using `git log --grep` and

Not sure what you are asking for.

> which locale do you have configured?

There is no locale configured during build.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-16  9:06         ` Carlo Arenas
  2021-11-16  9:18           ` Andreas Schwab
@ 2021-11-16  9:29           ` Andreas Schwab
  2021-11-16  9:38             ` Carlo Arenas
  1 sibling, 1 reply; 47+ messages in thread
From: Andreas Schwab @ 2021-11-16  9:29 UTC (permalink / raw)
  To: Carlo Arenas
  Cc: Ævar Arnfjörð Bjarmason, Hamza Mahfooz, git,
	Junio C Hamano

expecting success of 7812.13 'PCRE v2: grep ASCII from invalid UTF-8 data': 
        git grep -h "var" invalid-0x80 >actual &&
        test_cmp expected actual &&
        git grep -h "(*NO_JIT)var" invalid-0x80 >actual &&
        test_cmp expected actual

++ git grep -h var invalid-0x80
++ test_cmp expected actual
++ test 2 -ne 2
++ eval 'diff -u' '"$@"'
+++ diff -u expected actual
++ git grep -h '(*NO_JIT)var' invalid-0x80
fatal: pcre2_match failed with error code -22: UTF-8 error: isolated byte with 0x80 bit set
error: last command exited with $?=128
not ok 13 - PCRE v2: grep ASCII from invalid UTF-8 data
#
#               git grep -h "var" invalid-0x80 >actual &&
#               test_cmp expected actual &&
#               git grep -h "(*NO_JIT)var" invalid-0x80 >actual &&
#               test_cmp expected actual
#

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-16  9:29           ` Andreas Schwab
@ 2021-11-16  9:38             ` Carlo Arenas
  2021-11-16  9:55               ` Andreas Schwab
  2021-11-17 18:46               ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data René Scharfe
  0 siblings, 2 replies; 47+ messages in thread
From: Carlo Arenas @ 2021-11-16  9:38 UTC (permalink / raw)
  To: Andreas Schwab
  Cc: Ævar Arnfjörð Bjarmason, Hamza Mahfooz, git,
	Junio C Hamano

On Tue, Nov 16, 2021 at 1:30 AM Andreas Schwab <schwab@linux-m68k.org> wrote:
>
> expecting success of 7812.13 'PCRE v2: grep ASCII from invalid UTF-8 data':
>         git grep -h "var" invalid-0x80 >actual &&
>         test_cmp expected actual &&
>         git grep -h "(*NO_JIT)var" invalid-0x80 >actual &&
>         test_cmp expected actual
>
> ++ git grep -h var invalid-0x80
> ++ test_cmp expected actual
> ++ test 2 -ne 2
> ++ eval 'diff -u' '"$@"'
> +++ diff -u expected actual
> ++ git grep -h '(*NO_JIT)var' invalid-0x80
> fatal: pcre2_match failed with error code -22: UTF-8 error: isolated byte with 0x80 bit set

That is exactly what I was worried about, this is not failing one
test, but making `git grep` unusable in any repository that has any
binary files that might be reachable by it, and it is likely affecting
anyone using PCRE older than 10.34

Reverting this specific commit might fix it and is unlikely to
introduce other issues, did you try it?

Carlo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-16  9:38             ` Carlo Arenas
@ 2021-11-16  9:55               ` Andreas Schwab
  2021-11-16 11:00                 ` [PATCH] grep: avoid setting UTF mode when not needed Carlo Marcelo Arenas Belón
  2021-11-17 18:46               ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data René Scharfe
  1 sibling, 1 reply; 47+ messages in thread
From: Andreas Schwab @ 2021-11-16  9:55 UTC (permalink / raw)
  To: Carlo Arenas
  Cc: Ævar Arnfjörð Bjarmason, Hamza Mahfooz, git,
	Junio C Hamano

On Nov 16 2021, Carlo Arenas wrote:

> Reverting this specific commit might fix it and is unlikely to
> introduce other issues, did you try it?

Yes, of course.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH] grep: avoid setting UTF mode when not needed
  2021-11-16  9:55               ` Andreas Schwab
@ 2021-11-16 11:00                 ` Carlo Marcelo Arenas Belón
  2021-11-16 12:32                   ` Ævar Arnfjörð Bjarmason
  2021-11-17 10:23                   ` [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE Carlo Marcelo Arenas Belón
  0 siblings, 2 replies; 47+ messages in thread
From: Carlo Marcelo Arenas Belón @ 2021-11-16 11:00 UTC (permalink / raw)
  To: git
  Cc: avarab, someguy, gitster, Carlo Marcelo Arenas Belón,
	Andreas Schwab

Since ae39ba431a (grep/pcre2: fix an edge case concerning ascii patterns
and UTF-8 data, 2021-10-15), PCRE_UTF mode is enabled in cases where it
will fail because of UTF-8 validation, which is needed for versions of
PCRE2 older than 10.34.

Revert the change on logic to avoid failures that were reported from the
test cases, but that should also reflect in normal use when JIT is enabled
and could result in crashes (or worse), as UTF-8 validation is skipped.

Keeping the tests, as they pass even without the fix as replicated locally
in Debian 10 and the CI.

Reported-by: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
---
 grep.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/grep.c b/grep.c
index f6e113e9f0..fe847a0111 100644
--- a/grep.c
+++ b/grep.c
@@ -382,10 +382,8 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 		}
 		options |= PCRE2_CASELESS;
 	}
-	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
-	    (!opt->ignore_locale && is_utf8_locale() &&
-	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
-					    (p->fixed || p->is_fixed))))
+	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
+	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
 
 #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
-- 
2.34.0.352.g07dee3c5e1


^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH] grep: avoid setting UTF mode when not needed
  2021-11-16 11:00                 ` [PATCH] grep: avoid setting UTF mode when not needed Carlo Marcelo Arenas Belón
@ 2021-11-16 12:32                   ` Ævar Arnfjörð Bjarmason
  2021-11-16 13:35                     ` Hamza Mahfooz
                                       ` (2 more replies)
  2021-11-17 10:23                   ` [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE Carlo Marcelo Arenas Belón
  1 sibling, 3 replies; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-11-16 12:32 UTC (permalink / raw)
  To: Carlo Marcelo Arenas Belón; +Cc: git, someguy, gitster, Andreas Schwab


On Tue, Nov 16 2021, Carlo Marcelo Arenas Belón wrote:

> Since ae39ba431a (grep/pcre2: fix an edge case concerning ascii patterns
> and UTF-8 data, 2021-10-15), PCRE_UTF mode is enabled in cases where it
> will fail because of UTF-8 validation, which is needed for versions of
> PCRE2 older than 10.34.
>
> Revert the change on logic to avoid failures that were reported from the
> test cases, but that should also reflect in normal use when JIT is enabled
> and could result in crashes (or worse), as UTF-8 validation is skipped.
>
> Keeping the tests, as they pass even without the fix as replicated locally
> in Debian 10 and the CI.
>
> Reported-by: Andreas Schwab <schwab@linux-m68k.org>
> Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
> ---
>  grep.c | 6 ++----
>  1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/grep.c b/grep.c
> index f6e113e9f0..fe847a0111 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -382,10 +382,8 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>  		}
>  		options |= PCRE2_CASELESS;
>  	}
> -	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
> -	    (!opt->ignore_locale && is_utf8_locale() &&
> -	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
> -					    (p->fixed || p->is_fixed))))
> +	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> +	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>  
>  #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER

Hrm.

A few things:

First, if we've got a post-PCREv2 version whatever fix let's guard that
with an ifdef, see thep GIT_PCRE2_VERSION_*_HIGHER at the top of grep.h.

It really helps to have those, both to know to test on those older
versions, and also so that we can at some point in the future bump the
required version and drop these workarounds entirely (as we do with
git-curl-compat.h).

Secondly, whatever do here let's first fix the test added in ae39ba431a,
so we're not groping around in the dark even more.

I didn't spot this at the time but the test that Hamza added in that
based on my initial report[1] is broken & doesn't test anything
meaningful. It needs to have this applied:
	
	diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
	index 22487d90fdc..1da6b07a579 100755
	--- a/t/t7812-grep-icase-non-ascii.sh
	+++ b/t/t7812-grep-icase-non-ascii.sh
	@@ -60,7 +60,7 @@ test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on U
	        test_write_lines "forth" >file4 &&
	        git add file4 &&
	        git commit --author="À Ú Thor <author@example.com>" -m sécond &&
	-       git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
	+       git log -1 --color=always --perl-regexp --author=". . Thor" >log &&
	        grep Author log >actual.raw &&
	        test_decode_color <actual.raw >actual &&
	        test_cmp expected actual

I.e. the whole point of using the color output to test this is to
discover where PCRE2 is going to consider a character boundary to be,
using .* means that it won't be tested at add, since .* will happily eat
up whatever arbitrary data it finds with or without UTF-8 mode.

Other tests added in that & adjacent (if any?) commits may have the same
issue, I haven't dug into it.

If we lead with that patch we'll get the test passing on master as
before, but with your patch above it'll break. I.e. the "when not
needed' in the $subject isn't true, it's just that the test is
completely broken.

In the context of this being a pretty urgent post-release fix (but I
don't know if Junio would consider a point-release, so perhaphs it's
not) I'd be OK with either of:

 A. Let's back out this new log grep color thing entirely while we
    reconsider this. The gitster/hm/paint-hits-in-log-grep topic
    currently reverts cleanly.

 B. Don't break the new log grep color thing, and also fix the 'grepping
    binary' regression (which is much more important than having A)

But let's not go for some in-between where we break the new feature to
the point of it being worse than the state of not having it at all in
v2.33.0.

I.e. without the that log grep color feature we wouldn't screw up the
display of non-ASCII characters in log output (yay), in v2.34.0 we
don't, but also color the match (yay), but we broke grepping binary
*files* (boo!).

I think the approach I suggested in [2] is a much more viable way
forward, i.e. let's stop fiddling with this giant nested if statement
that's mainly meant for the grep-a-file-case, revert to the
pre-log-grep-color state, and have the log-grep-color mode pass in a
"yes, I'd like the UTF-8 mode, please".

Having said that that's probably also broken, just in ways we're less
likely to spot (or maybe some of the log encoding settings/reencoding
saves us there?), we may need to have that ifdef'd to 10.34 and higher,
or have some opt-in setting for this.

1. https://lore.kernel.org/git/87v92bju64.fsf@evledraar.gmail.com/
2. https://lore.kernel.org/git/87v92bju64.fsf@evledraar.gmail.com/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] grep: avoid setting UTF mode when not needed
  2021-11-16 12:32                   ` Ævar Arnfjörð Bjarmason
@ 2021-11-16 13:35                     ` Hamza Mahfooz
  2021-11-17 14:31                       ` Ævar Arnfjörð Bjarmason
  2021-11-16 18:22                     ` Carlo Arenas
  2021-11-16 18:48                     ` Junio C Hamano
  2 siblings, 1 reply; 47+ messages in thread
From: Hamza Mahfooz @ 2021-11-16 13:35 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Carlo Marcelo Arenas Belón, git, gitster, Andreas Schwab

Does anyone know if René's patch causes older PCRE versions to break? 
If it doesn't I think René's patch plus Ævar's fix is the way to go.

On Tue, Nov 16 2021 at 01:32:17 PM +0100, Ævar Arnfjörð Bjarmason 
<avarab@gmail.com> wrote:
> 
> On Tue, Nov 16 2021, Carlo Marcelo Arenas Belón wrote:
> 
>>  Since ae39ba431a (grep/pcre2: fix an edge case concerning ascii 
>> patterns
>>  and UTF-8 data, 2021-10-15), PCRE_UTF mode is enabled in cases 
>> where it
>>  will fail because of UTF-8 validation, which is needed for versions 
>> of
>>  PCRE2 older than 10.34.
>> 
>>  Revert the change on logic to avoid failures that were reported 
>> from the
>>  test cases, but that should also reflect in normal use when JIT is 
>> enabled
>>  and could result in crashes (or worse), as UTF-8 validation is 
>> skipped.
>> 
>>  Keeping the tests, as they pass even without the fix as replicated 
>> locally
>>  in Debian 10 and the CI.
>> 
>>  Reported-by: Andreas Schwab <schwab@linux-m68k.org>
>>  Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
>>  ---
>>   grep.c | 6 ++----
>>   1 file changed, 2 insertions(+), 4 deletions(-)
>> 
>>  diff --git a/grep.c b/grep.c
>>  index f6e113e9f0..fe847a0111 100644
>>  --- a/grep.c
>>  +++ b/grep.c
>>  @@ -382,10 +382,8 @@ static void compile_pcre2_pattern(struct 
>> grep_pat *p, const struct grep_opt *opt
>>   		}
>>   		options |= PCRE2_CASELESS;
>>   	}
>>  -	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
>>  -	    (!opt->ignore_locale && is_utf8_locale() &&
>>  -	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
>>  -					    (p->fixed || p->is_fixed))))
>>  +	if (!opt->ignore_locale && is_utf8_locale() && 
>> has_non_ascii(p->pattern) &&
>>  +	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>>   		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>> 
>>   #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
> 
> Hrm.
> 
> A few things:
> 
> First, if we've got a post-PCREv2 version whatever fix let's guard 
> that
> with an ifdef, see thep GIT_PCRE2_VERSION_*_HIGHER at the top of 
> grep.h.
> 
> It really helps to have those, both to know to test on those older
> versions, and also so that we can at some point in the future bump the
> required version and drop these workarounds entirely (as we do with
> git-curl-compat.h).
> 
> Secondly, whatever do here let's first fix the test added in 
> ae39ba431a,
> so we're not groping around in the dark even more.
> 
> I didn't spot this at the time but the test that Hamza added in that
> based on my initial report[1] is broken & doesn't test anything
> meaningful. It needs to have this applied:
> 
> 	diff --git a/t/t7812-grep-icase-non-ascii.sh 
> b/t/t7812-grep-icase-non-ascii.sh
> 	index 22487d90fdc..1da6b07a579 100755
> 	--- a/t/t7812-grep-icase-non-ascii.sh
> 	+++ b/t/t7812-grep-icase-non-ascii.sh
> 	@@ -60,7 +60,7 @@ test_expect_success GETTEXT_LOCALE,PCRE 'log 
> --author with an ascii pattern on U
> 	        test_write_lines "forth" >file4 &&
> 	        git add file4 &&
> 	        git commit --author="À Ú Thor <author@example.com>" -m 
> sécond &&
> 	-       git log -1 --color=always --perl-regexp --author=".*Thor" 
> >log &&
> 	+       git log -1 --color=always --perl-regexp --author=". . Thor" 
> >log &&
> 	        grep Author log >actual.raw &&
> 	        test_decode_color <actual.raw >actual &&
> 	        test_cmp expected actual
> 
> I.e. the whole point of using the color output to test this is to
> discover where PCRE2 is going to consider a character boundary to be,
> using .* means that it won't be tested at add, since .* will happily 
> eat
> up whatever arbitrary data it finds with or without UTF-8 mode.
> 
> Other tests added in that & adjacent (if any?) commits may have the 
> same
> issue, I haven't dug into it.
> 
> If we lead with that patch we'll get the test passing on master as
> before, but with your patch above it'll break. I.e. the "when not
> needed' in the $subject isn't true, it's just that the test is
> completely broken.
> 
> In the context of this being a pretty urgent post-release fix (but I
> don't know if Junio would consider a point-release, so perhaphs it's
> not) I'd be OK with either of:
> 
>  A. Let's back out this new log grep color thing entirely while we
>     reconsider this. The gitster/hm/paint-hits-in-log-grep topic
>     currently reverts cleanly.
> 
>  B. Don't break the new log grep color thing, and also fix the 
> 'grepping
>     binary' regression (which is much more important than having A)
> 
> But let's not go for some in-between where we break the new feature to
> the point of it being worse than the state of not having it at all in
> v2.33.0.
> 
> I.e. without the that log grep color feature we wouldn't screw up the
> display of non-ASCII characters in log output (yay), in v2.34.0 we
> don't, but also color the match (yay), but we broke grepping binary
> *files* (boo!).
> 
> I think the approach I suggested in [2] is a much more viable way
> forward, i.e. let's stop fiddling with this giant nested if statement
> that's mainly meant for the grep-a-file-case, revert to the
> pre-log-grep-color state, and have the log-grep-color mode pass in a
> "yes, I'd like the UTF-8 mode, please".
> 
> Having said that that's probably also broken, just in ways we're less
> likely to spot (or maybe some of the log encoding settings/reencoding
> saves us there?), we may need to have that ifdef'd to 10.34 and 
> higher,
> or have some opt-in setting for this.
> 
> 1. https://lore.kernel.org/git/87v92bju64.fsf@evledraar.gmail.com/
> 2. https://lore.kernel.org/git/87v92bju64.fsf@evledraar.gmail.com/



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] grep: avoid setting UTF mode when not needed
  2021-11-16 12:32                   ` Ævar Arnfjörð Bjarmason
  2021-11-16 13:35                     ` Hamza Mahfooz
@ 2021-11-16 18:22                     ` Carlo Arenas
  2021-11-16 18:48                     ` Junio C Hamano
  2 siblings, 0 replies; 47+ messages in thread
From: Carlo Arenas @ 2021-11-16 18:22 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, someguy, gitster, Andreas Schwab

On Tue, Nov 16, 2021 at 4:50 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
> First, if we've got a post-PCREv2 version whatever fix let's guard that
> with an ifdef, see thep GIT_PCRE2_VERSION_*_HIGHER at the top of grep.h.

The availability of PCRE2_MATCH_INVALID_UTF does that implicitly, but
adding one makes sense to make it explicit; specially if it leads to
better testing.

> In the context of this being a pretty urgent post-release fix (but I
> don't know if Junio would consider a point-release, so perhaphs it's
> not) I'd be OK with either of:
>
>  A. Let's back out this new log grep color thing entirely while we
>     reconsider this. The gitster/hm/paint-hits-in-log-grep topic
>     currently reverts cleanly.

Agree that reverting the whole feature makes more sense, but was
aiming to the minimum change required to allow for a "brown bag"
release with this.

The way the PCRE2 integration works, uses the fast path that presumes
valid UTF-8 and is documented[1] to have "Undefined Behaviour" when
the subject that is being searched on is not.

That has been shown before to lead to crashes or infinite loops

Carlo

[1] https://www.pcre.org/current/doc/html/pcre2jit.html#SEC4

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] grep: avoid setting UTF mode when not needed
  2021-11-16 12:32                   ` Ævar Arnfjörð Bjarmason
  2021-11-16 13:35                     ` Hamza Mahfooz
  2021-11-16 18:22                     ` Carlo Arenas
@ 2021-11-16 18:48                     ` Junio C Hamano
  2 siblings, 0 replies; 47+ messages in thread
From: Junio C Hamano @ 2021-11-16 18:48 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Carlo Marcelo Arenas Belón, git, someguy, Andreas Schwab

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> In the context of this being a pretty urgent post-release fix (but I
> don't know if Junio would consider a point-release, so perhaphs it's
> not) I'd be OK with either of:
>
>  A. Let's back out this new log grep color thing entirely while we
>     reconsider this. The gitster/hm/paint-hits-in-log-grep topic
>     currently reverts cleanly.
>
>  B. Don't break the new log grep color thing, and also fix the 'grepping
>     binary' regression (which is much more important than having A)
>
> But let's not go for some in-between where we break the new feature to
> the point of it being worse than the state of not having it at all in
> v2.33.0.
>
> I.e. without the that log grep color feature we wouldn't screw up the
> display of non-ASCII characters in log output (yay), in v2.34.0 we
> don't, but also color the match (yay), but we broke grepping binary
> *files* (boo!).

Sorry, but with too many new patches on the list that are not
urgent, while we wanted to see us work solely on post-release
regression fixes, I do not seem to be able to locate the reports of
this "breakage" and the other binary-file breakage.

But in any case, yes, since the 2.33.0 cycle was run deliberately
loosely to take undercooked topics to 'next' without much reviews
(no, "I looked at some part and they looked OK" is not a review),
and because any topic in 'next', by default, graduates to 'master'
solely on time basis, I am fully expecting that we'd have to issue
2.33.1 (and possibly .2) during the first two or three weeks of this
cycle.  So let's make sure we fix any iffy ones.

Downthread Carlo seem to agree with you that it would be better to
revert the paint-hits-in-log-grep topic wholesale; I have not yet
formed an opinion, as I haven't seen the breakage reports as I said.

As I will be offline most of the day and perhaps tomorrow, nothing
may happen in the meantime on my end, but hopefully we'll see a
reviewed and ready-to-be-applied patchset that people are happy with
by the time I look at the list again ;-)?

Thanks.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE
  2021-11-16 11:00                 ` [PATCH] grep: avoid setting UTF mode when not needed Carlo Marcelo Arenas Belón
  2021-11-16 12:32                   ` Ævar Arnfjörð Bjarmason
@ 2021-11-17 10:23                   ` Carlo Marcelo Arenas Belón
  2021-11-18  7:29                     ` Junio C Hamano
  2021-11-18 21:21                     ` Hamza Mahfooz
  1 sibling, 2 replies; 47+ messages in thread
From: Carlo Marcelo Arenas Belón @ 2021-11-17 10:23 UTC (permalink / raw)
  To: git
  Cc: avarab, someguy, gitster, Carlo Marcelo Arenas Belón,
	René Scharfe, Andreas Schwab

Since ae39ba431a (grep/pcre2: fix an edge case concerning ascii patterns
and UTF-8 data, 2021-10-15), PCRE2_UTF mode is enabled for cases where it
will trigger UTF-8 validation errors (as reported) or can result in
undefined behaviour.

Our use of PCRE2 only allows searching through non UTF-8 validated data
safely through the use of the PCRE2_MATCH_INVALID_UTF flag, that is only
available after 10.34, so restrict the change to newer versions of PCRE
and revert to the old logic for older releases, which will still allow
for matching not using UTF-8 for likely most usecases (as shown in the
tests).

Fix one test that was using an expression that wouldn't fail without the
new code so it can be forced to fail if it is missing and restrict it to
run only for newer PCRE releases; while at it do some minor refactoring
to cleanup the fallout for when that test might be skipped or might
succeed under the new conditions.

Keeping the overly complex and unnecessary logic for now, to reduce risk
but with the hope that it will be cleaned up later.

Helped-by: René Scharfe <l.s.r@web.de>
Reported-by: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
---
v2:
* restrict the code at compile time instead of reverting
* "fix" test to document  the behaviour under both PCRE code versions
* update commit message to better explain the issue

 grep.c                          |  7 ++++++-
 t/t7812-grep-icase-non-ascii.sh | 32 ++++++++++++++++++--------------
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/grep.c b/grep.c
index f6e113e9f0..0126aa3db4 100644
--- a/grep.c
+++ b/grep.c
@@ -382,12 +382,17 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 		}
 		options |= PCRE2_CASELESS;
 	}
+#ifdef GIT_PCRE2_VERSION_10_34_OR_HIGHER
 	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
 	    (!opt->ignore_locale && is_utf8_locale() &&
 	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
 					    (p->fixed || p->is_fixed))))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
-
+#else
+	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
+	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
+		options |= PCRE2_UTF;
+#endif
 #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
 	/* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 10.36 */
 	if (PCRE2_MATCH_INVALID_UTF && options & (PCRE2_UTF | PCRE2_CASELESS))
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index 22487d90fd..3bfe1ee728 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -53,14 +53,27 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
 	test_cmp expected actual
 '
 
-test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
-	cat >expected <<-\EOF &&
-	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
-	EOF
+test_expect_success GETTEXT_LOCALE,PCRE 'setup ascii pattern on UTF-8 data' '
 	test_write_lines "forth" >file4 &&
 	git add file4 &&
 	git commit --author="À Ú Thor <author@example.com>" -m sécond &&
-	git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
+	test_write_lines "fifth" >file5 &&
+	git add file5 &&
+	GIT_COMMITTER_NAME="Ç O Mîtter" &&
+	GIT_COMMITTER_EMAIL="committer@example.com" &&
+	git -c i18n.commitEncoding=latin1 commit -m thïrd
+'
+
+test_lazy_prereq PCRE2_MATCH_INVALID_UTF '
+	test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE,PCRE2_MATCH_INVALID_UTF 'log --author with an ascii pattern on UTF-8 data' '
+	cat >expected <<-\EOF &&
+	Author: <BOLD;RED>A U Thor<RESET> <author@example.com>
+	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
+	EOF
+	git log --color=always --perl-regexp --author=". . Thor" >log &&
 	grep Author log >actual.raw &&
 	test_decode_color <actual.raw >actual &&
 	test_cmp expected actual
@@ -70,11 +83,6 @@ test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern o
 	cat >expected <<-\EOF &&
 	Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
 	EOF
-	test_write_lines "fifth" >file5 &&
-	git add file5 &&
-	GIT_COMMITTER_NAME="Ç O Mîtter" &&
-	GIT_COMMITTER_EMAIL="committer@example.com" &&
-	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
 	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
 	grep Commit: log >actual.raw &&
 	test_decode_color <actual.raw >actual &&
@@ -141,10 +149,6 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invali
 	test_cmp invalid-0xe5 actual
 '
 
-test_lazy_prereq PCRE2_MATCH_INVALID_UTF '
-	test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF
-'
-
 test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' '
 	test_might_fail git grep -hi "Æ" invalid-0x80 >actual &&
 	test_might_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual
-- 
2.34.0.352.g07dee3c5e1


^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH] grep: avoid setting UTF mode when not needed
  2021-11-16 13:35                     ` Hamza Mahfooz
@ 2021-11-17 14:31                       ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-11-17 14:31 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Carlo Marcelo Arenas Belón, git, gitster, Andreas Schwab


On Tue, Nov 16 2021, Hamza Mahfooz wrote:

> Does anyone know if René's patch causes older PCRE versions to break?
> If it doesn't I think René's patch plus Ævar's fix is the way to go.

I haven't tested, but I think a very good thing to do/test (hint hint!)
is to add a CI job where we build/test git in combination with various
PCRE versions.

PCRE2 used to be in SVN, but nowadays it's in a git repository on
github: https://github.com/PhilipHazel/pcre2

It should be a small matter of copying the existing template to set up a
"matrix" where we test various known older pcre versions, they're
available as tags in the git repository.

"--enable-jit" and "--disable-jit" should be part of that matrix too,
and "--enable-utf" and "--disable-utf".

See the "microsoft/vcpkg" and "regular:" parts of
.github/workflows/main.git for relevant examples, i.e. this would
e.g. clone pcre2.git into compat/pcre2-repo, check out a given version
from the matrix. Then:

    cd compat/pcre2-repo &&
    ./autogen.sh &&
    ./configure --prefix="$PWD/inst" &&
    make install

Then at the top-level:

    make USE_LIBPCRE=Y CFLAGS=-O3 LIBPCREDIR="$PWD/compat/pcre2-repo/inst"

Then run "make test", for saving some CPU & speeding up the runs it
should be more than sufficient to only run those tests that match
*{grep,log,pickaxe}*
    

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-16  9:38             ` Carlo Arenas
  2021-11-16  9:55               ` Andreas Schwab
@ 2021-11-17 18:46               ` René Scharfe
  2021-11-17 19:56                 ` Carlo Arenas
  2021-11-17 20:59                 ` Ævar Arnfjörð Bjarmason
  1 sibling, 2 replies; 47+ messages in thread
From: René Scharfe @ 2021-11-17 18:46 UTC (permalink / raw)
  To: Carlo Arenas, Andreas Schwab
  Cc: Ævar Arnfjörð Bjarmason, Hamza Mahfooz, git,
	Junio C Hamano

Am 16.11.21 um 10:38 schrieb Carlo Arenas:
> On Tue, Nov 16, 2021 at 1:30 AM Andreas Schwab <schwab@linux-m68k.org> wrote:
>>
>> expecting success of 7812.13 'PCRE v2: grep ASCII from invalid UTF-8 data':
>>         git grep -h "var" invalid-0x80 >actual &&
>>         test_cmp expected actual &&
>>         git grep -h "(*NO_JIT)var" invalid-0x80 >actual &&
>>         test_cmp expected actual
>>
>> ++ git grep -h var invalid-0x80
>> ++ test_cmp expected actual
>> ++ test 2 -ne 2
>> ++ eval 'diff -u' '"$@"'
>> +++ diff -u expected actual
>> ++ git grep -h '(*NO_JIT)var' invalid-0x80
>> fatal: pcre2_match failed with error code -22: UTF-8 error: isolated byte with 0x80 bit set
>
> That is exactly what I was worried about, this is not failing one
> test, but making `git grep` unusable in any repository that has any
> binary files that might be reachable by it, and it is likely affecting
> anyone using PCRE older than 10.34

Let's have a look at the map.  Here are the differences between the
versions regarding use of PCRE2_UTF:

o: opt->ignore_locale
h: has_non_ascii(p->pattern)
i: is_utf8_locale()
l: !opt->ignore_case && (p->fixed || p->is_fixed)

o h i l master hamza rene2
0 0 0 0      0     1     0
0 0 0 1      0     1     0
0 0 1 0      0     1     1
0 0 1 1      0     1     0  <== 7812.13, confirmed using fprint() debugging

So http://public-inbox.org/git/0ea73e7a-6d43-e223-ab2e-24c684102856@web.de/
should not have this breakage, because it doesn't enable PCRE2_UTF for
literal patterns.

René

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-17 18:46               ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data René Scharfe
@ 2021-11-17 19:56                 ` Carlo Arenas
  2021-11-17 20:59                 ` Ævar Arnfjörð Bjarmason
  1 sibling, 0 replies; 47+ messages in thread
From: Carlo Arenas @ 2021-11-17 19:56 UTC (permalink / raw)
  To: René Scharfe
  Cc: Andreas Schwab, Ævar Arnfjörð Bjarmason,
	Hamza Mahfooz, git, Junio C Hamano

On Wed, Nov 17, 2021 at 10:46 AM René Scharfe <l.s.r@web.de> wrote:
>
> Am 16.11.21 um 10:38 schrieb Carlo Arenas:
> > On Tue, Nov 16, 2021 at 1:30 AM Andreas Schwab <schwab@linux-m68k.org> wrote:
> >>
> >> expecting success of 7812.13 'PCRE v2: grep ASCII from invalid UTF-8 data':
> >>         git grep -h "var" invalid-0x80 >actual &&
> >>         test_cmp expected actual &&
> >>         git grep -h "(*NO_JIT)var" invalid-0x80 >actual &&
> >>         test_cmp expected actual
> >>
> >> ++ git grep -h var invalid-0x80
> >> ++ test_cmp expected actual
> >> ++ test 2 -ne 2
> >> ++ eval 'diff -u' '"$@"'
> >> +++ diff -u expected actual
> >> ++ git grep -h '(*NO_JIT)var' invalid-0x80
> >> fatal: pcre2_match failed with error code -22: UTF-8 error: isolated byte with 0x80 bit set
> >
> > That is exactly what I was worried about, this is not failing one
> > test, but making `git grep` unusable in any repository that has any
> > binary files that might be reachable by it, and it is likely affecting
> > anyone using PCRE older than 10.34
>
> Let's have a look at the map.  Here are the differences between the
> versions regarding use of PCRE2_UTF:
>
> o: opt->ignore_locale
> h: has_non_ascii(p->pattern)
> i: is_utf8_locale()
> l: !opt->ignore_case && (p->fixed || p->is_fixed)
>
> o h i l master hamza rene2
> 0 0 0 0      0     1     0
> 0 0 0 1      0     1     0
> 0 0 1 0      0     1     1
> 0 0 1 1      0     1     0  <== 7812.13, confirmed using fprint() debugging
>
> So http://public-inbox.org/git/0ea73e7a-6d43-e223-ab2e-24c684102856@web.de/
> should not have this breakage, because it doesn't enable PCRE2_UTF for
> literal patterns.

Correct, and indeed I had your expression instead of the ugly one in
my draft for v2[1] but then I found the tests were even more broken
than reported originally and got worried it might introduce yet
another issue because of the brittleness of the rest of the code
around it.  I am hoping it will be introduced in a follow up series
though, and unless this code gets thrown away in the promised
refactoring by Ævar which I haven't seen yet and once this is in
maint.

IMHO and in line with the previous warnings you raised[2] during the
development of this feature, it might be worth looking at it more
deeply, but at least the proposed patch keeps the feature working in
practice (the only case that won't work as expected will be when the
expression includes "." with the intention of matching a UTF-8
character and while using PCRE2 older than 10.34) and more importantly
fixes the regression that it introduced.

Carlo

[1] https://lore.kernel.org/git/20211117102329.95456-1-carenas@gmail.com/
[2] https://lore.kernel.org/git/fc7eb9fc-9521-5484-b05f-9c20086fd485@web.de/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-17 18:46               ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data René Scharfe
  2021-11-17 19:56                 ` Carlo Arenas
@ 2021-11-17 20:59                 ` Ævar Arnfjörð Bjarmason
  2021-11-17 21:53                   ` Carlo Arenas
  2021-11-18 18:17                   ` Junio C Hamano
  1 sibling, 2 replies; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-11-17 20:59 UTC (permalink / raw)
  To: René Scharfe
  Cc: Carlo Arenas, Andreas Schwab, Hamza Mahfooz, git, Junio C Hamano


On Wed, Nov 17 2021, René Scharfe wrote:

> Am 16.11.21 um 10:38 schrieb Carlo Arenas:
>> On Tue, Nov 16, 2021 at 1:30 AM Andreas Schwab <schwab@linux-m68k.org> wrote:
>>>
>>> expecting success of 7812.13 'PCRE v2: grep ASCII from invalid UTF-8 data':
>>>         git grep -h "var" invalid-0x80 >actual &&
>>>         test_cmp expected actual &&
>>>         git grep -h "(*NO_JIT)var" invalid-0x80 >actual &&
>>>         test_cmp expected actual
>>>
>>> ++ git grep -h var invalid-0x80
>>> ++ test_cmp expected actual
>>> ++ test 2 -ne 2
>>> ++ eval 'diff -u' '"$@"'
>>> +++ diff -u expected actual
>>> ++ git grep -h '(*NO_JIT)var' invalid-0x80
>>> fatal: pcre2_match failed with error code -22: UTF-8 error: isolated byte with 0x80 bit set
>>
>> That is exactly what I was worried about, this is not failing one
>> test, but making `git grep` unusable in any repository that has any
>> binary files that might be reachable by it, and it is likely affecting
>> anyone using PCRE older than 10.34
>
> Let's have a look at the map.  Here are the differences between the
> versions regarding use of PCRE2_UTF:
>
> o: opt->ignore_locale
> h: has_non_ascii(p->pattern)
> i: is_utf8_locale()
> l: !opt->ignore_case && (p->fixed || p->is_fixed)
>
> o h i l master hamza rene2
> 0 0 0 0      0     1     0
> 0 0 0 1      0     1     0
> 0 0 1 0      0     1     1
> 0 0 1 1      0     1     0  <== 7812.13, confirmed using fprint() debugging
>
> So http://public-inbox.org/git/0ea73e7a-6d43-e223-ab2e-24c684102856@web.de/
> should not have this breakage, because it doesn't enable PCRE2_UTF for
> literal patterns.

PCRE2_UTF will also matter for literal patterns. Try to peel apart the
two bytes in "é" and match them under -i with/without PCRE_UTF.

I don't know what the right behavior should be here (haven't had time to
dig), but it matters for matching.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-17 20:59                 ` Ævar Arnfjörð Bjarmason
@ 2021-11-17 21:53                   ` Carlo Arenas
  2021-11-18  0:00                     ` Ævar Arnfjörð Bjarmason
  2021-11-18 18:17                   ` Junio C Hamano
  1 sibling, 1 reply; 47+ messages in thread
From: Carlo Arenas @ 2021-11-17 21:53 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: René Scharfe, Andreas Schwab, Hamza Mahfooz, git, Junio C Hamano

On Wed, Nov 17, 2021 at 1:01 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> PCRE2_UTF will also matter for literal patterns. Try to peel apart the
> two bytes in "é" and match them under -i with/without PCRE_UTF.

Is there a real use case for why someone would do that? and how is
that "literal" valid UTF to warrant setting PCRE2_UTF?
I would expect that someone including random bytes in the expression
is really more interested in binary matching anyway and the use of -i
with it probably should be an error.

Indeed I suspect the fact that pcre2_compile lets it through might be
a bug in PCRE2

$ pcre2test
PCRE2 version 10.39 2021-10-29
  re> /\303/utf,caseless
data> \303
 0: \x{c3}
data> é
No match

Carlo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-17 21:53                   ` Carlo Arenas
@ 2021-11-18  0:00                     ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-11-18  0:00 UTC (permalink / raw)
  To: Carlo Arenas
  Cc: René Scharfe, Andreas Schwab, Hamza Mahfooz, git, Junio C Hamano


On Wed, Nov 17 2021, Carlo Arenas wrote:

> On Wed, Nov 17, 2021 at 1:01 PM Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>
>> PCRE2_UTF will also matter for literal patterns. Try to peel apart the
>> two bytes in "é" and match them under -i with/without PCRE_UTF.
>
> Is there a real use case for why someone would do that? and how is
> that "literal" valid UTF to warrant setting PCRE2_UTF?

We don't check the validity of the input pattern as correct UTF-8 before
feeding it to PCRE. We don't even check if you're under a UTF-8 locale,
a call to is_utf8_locale() is one of the conditions we'll try (but check
how loose we are under NO_GETTEXT=Y even then), but it's not a necessary
one.

So in practice we'll likely have users pasting say Big5, UTF-32, JIS
encoding or whatever into "git grep" and then grepping arbitrary binary
data.

I don't really have a specific use-case in mind. I'm just trying to
counter the apparent recurring misconception that's popped up more than
once in these recent threads that the UTF-8 *mode* is only something we
need to worry about if the pattern contains non-ASCII.

The implications of UTF-8 as a matching mode go much deeper than that in
Perl's and PCRE's regex engines, e.g. in this case what's considered a
character boundary.

> I would expect that someone including random bytes in the expression
> is really more interested in binary matching anyway and the use of -i
> with it probably should be an error.
>
> Indeed I suspect the fact that pcre2_compile lets it through might be
> a bug in PCRE2
>
> $ pcre2test
> PCRE2 version 10.39 2021-10-29
>   re> /\303/utf,caseless
> data> \303
>  0: \x{c3}
> data> é
> No match

It's really not "random bytes".

We're not talking about someone doing a git grep where the argument is
fed from a "cat" of /dev/urandom. Just plain old boring natural language
encodings in common use can and will conflict with what's considered a
character boundary in UTF-8,

Anyway, I haven't had time to re-page this topic at large into my
wetware and really don't know what we should be doing exactly at this
point to move forward, except my previous suggestion to either revert
the recent changes, or at least to narrow them down so that we get the
old behavior for everything except the revision.c "git log" caller.

I think a really good next step aside from dealing with that immediate
problem would be to harden some of our test data in a way similar to
what we've got in t7816-grep-binary-pattern.sh, but for partial matches,
mixtures of valid/invalid/partial character patterns and subjects etc.

We might be able to lift some tests from existing test suites,
perl.git's t/re/re_tests and pcre2.git's RunGrepTest (And testdata/
directory) are probably good places to start looking.

We don't need to test e.g. PCRE itself independently, but it is worth
testing how whatever special-sauce we add on top (if and when to turn on
its flags based on heuristics) impacts things.

We've also got the problem that whatever code we write will target
multiple PCREv2 versions, which isn't something PCREv2 itself needs to
deal with.


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE
  2021-11-17 10:23                   ` [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE Carlo Marcelo Arenas Belón
@ 2021-11-18  7:29                     ` Junio C Hamano
  2021-11-18 10:15                       ` Carlo Arenas
  2021-11-18 21:21                     ` Hamza Mahfooz
  1 sibling, 1 reply; 47+ messages in thread
From: Junio C Hamano @ 2021-11-18  7:29 UTC (permalink / raw)
  To: Carlo Marcelo Arenas Belón
  Cc: git, avarab, someguy, René Scharfe, Andreas Schwab

Carlo Marcelo Arenas Belón  <carenas@gmail.com> writes:

> Since ae39ba431a (grep/pcre2: fix an edge case concerning ascii patterns
> and UTF-8 data, 2021-10-15), PCRE2_UTF mode is enabled for cases where it
> will trigger UTF-8 validation errors (as reported) or can result in
> undefined behaviour.
>
> Our use of PCRE2 only allows searching through non UTF-8 validated data
> safely through the use of the PCRE2_MATCH_INVALID_UTF flag, that is only
> available after 10.34, so restrict the change to newer versions of PCRE
> and revert to the old logic for older releases, which will still allow
> for matching not using UTF-8 for likely most usecases (as shown in the
> tests).
>
> Fix one test that was using an expression that wouldn't fail without the
> new code so it can be forced to fail if it is missing and restrict it to
> run only for newer PCRE releases; while at it do some minor refactoring
> to cleanup the fallout for when that test might be skipped or might
> succeed under the new conditions.
>
> Keeping the overly complex and unnecessary logic for now, to reduce risk
> but with the hope that it will be cleaned up later.
>
> Helped-by: René Scharfe <l.s.r@web.de>
> Reported-by: Andreas Schwab <schwab@linux-m68k.org>
> Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
> ---
> v2:
> * restrict the code at compile time instead of reverting
> * "fix" test to document  the behaviour under both PCRE code versions
> * update commit message to better explain the issue

So, is this the final verrsion everybody is happy with, instead of
just reverting the whole "paint hits in log --grep" topic, or did I
miss more discussion in the original thread?

With too many patches on the list, it is a bit hard to keep track of
the more urgent regresison fixes X-<.

Thanks.

>  grep.c                          |  7 ++++++-
>  t/t7812-grep-icase-non-ascii.sh | 32 ++++++++++++++++++--------------
>  2 files changed, 24 insertions(+), 15 deletions(-)
>
> diff --git a/grep.c b/grep.c
> index f6e113e9f0..0126aa3db4 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -382,12 +382,17 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
>  		}
>  		options |= PCRE2_CASELESS;
>  	}
> +#ifdef GIT_PCRE2_VERSION_10_34_OR_HIGHER
>  	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
>  	    (!opt->ignore_locale && is_utf8_locale() &&
>  	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
>  					    (p->fixed || p->is_fixed))))
>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
> -
> +#else
> +	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> +	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
> +		options |= PCRE2_UTF;
> +#endif
>  #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
>  	/* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 10.36 */
>  	if (PCRE2_MATCH_INVALID_UTF && options & (PCRE2_UTF | PCRE2_CASELESS))
> diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
> index 22487d90fd..3bfe1ee728 100755
> --- a/t/t7812-grep-icase-non-ascii.sh
> +++ b/t/t7812-grep-icase-non-ascii.sh
> @@ -53,14 +53,27 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
>  	test_cmp expected actual
>  '
>  
> -test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
> -	cat >expected <<-\EOF &&
> -	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
> -	EOF
> +test_expect_success GETTEXT_LOCALE,PCRE 'setup ascii pattern on UTF-8 data' '
>  	test_write_lines "forth" >file4 &&
>  	git add file4 &&
>  	git commit --author="À Ú Thor <author@example.com>" -m sécond &&
> -	git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
> +	test_write_lines "fifth" >file5 &&
> +	git add file5 &&
> +	GIT_COMMITTER_NAME="Ç O Mîtter" &&
> +	GIT_COMMITTER_EMAIL="committer@example.com" &&
> +	git -c i18n.commitEncoding=latin1 commit -m thïrd
> +'
> +
> +test_lazy_prereq PCRE2_MATCH_INVALID_UTF '
> +	test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF
> +'
> +
> +test_expect_success GETTEXT_LOCALE,PCRE,PCRE2_MATCH_INVALID_UTF 'log --author with an ascii pattern on UTF-8 data' '
> +	cat >expected <<-\EOF &&
> +	Author: <BOLD;RED>A U Thor<RESET> <author@example.com>
> +	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
> +	EOF
> +	git log --color=always --perl-regexp --author=". . Thor" >log &&
>  	grep Author log >actual.raw &&
>  	test_decode_color <actual.raw >actual &&
>  	test_cmp expected actual
> @@ -70,11 +83,6 @@ test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern o
>  	cat >expected <<-\EOF &&
>  	Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
>  	EOF
> -	test_write_lines "fifth" >file5 &&
> -	git add file5 &&
> -	GIT_COMMITTER_NAME="Ç O Mîtter" &&
> -	GIT_COMMITTER_EMAIL="committer@example.com" &&
> -	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
>  	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
>  	grep Commit: log >actual.raw &&
>  	test_decode_color <actual.raw >actual &&
> @@ -141,10 +149,6 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invali
>  	test_cmp invalid-0xe5 actual
>  '
>  
> -test_lazy_prereq PCRE2_MATCH_INVALID_UTF '
> -	test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF
> -'
> -
>  test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' '
>  	test_might_fail git grep -hi "Æ" invalid-0x80 >actual &&
>  	test_might_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE
  2021-11-18  7:29                     ` Junio C Hamano
@ 2021-11-18 10:15                       ` Carlo Arenas
  2021-11-18 12:49                         ` Hamza Mahfooz
  0 siblings, 1 reply; 47+ messages in thread
From: Carlo Arenas @ 2021-11-18 10:15 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, avarab, someguy, René Scharfe, Andreas Schwab

On Wed, Nov 17, 2021 at 11:29 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> So, is this the final verrsion everybody is happy with, instead of
> just reverting the whole "paint hits in log --grep" topic, or did I
> miss more discussion in the original thread?

This will likely need at least some ACKs.

Apologies for the noise.

Carlo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE
  2021-11-18 10:15                       ` Carlo Arenas
@ 2021-11-18 12:49                         ` Hamza Mahfooz
  2021-11-19  6:58                           ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 47+ messages in thread
From: Hamza Mahfooz @ 2021-11-18 12:49 UTC (permalink / raw)
  To: Carlo Arenas
  Cc: Junio C Hamano, git, avarab, René Scharfe, Andreas Schwab

If none of the patches that are trying to fix our usage of PCRE2 are 
ACKed.
It would be preferable (from my perspective) to revert only ae39ba431a
(grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data,
2021-10-15) and disable the feature (for the time being) if PCRE2 is 
selected, like so:

diff --git a/pretty.c b/pretty.c
index 1af5b093ae..a53d008d1e 100644
--- a/pretty.c
+++ b/pretty.c
@@ -452,7 +452,7 @@ static void append_line_with_color(struct strbuf 
*sb, struct grep_opt *opt,
  buf = line;
  eol = buf + linelen;

- if (!opt || !want_color(color) || opt->invert)
+ if (!opt || !want_color(color) || opt->invert || opt->pcre2)
   goto end;

  line_color = opt->colors[GREP_COLOR_SELECTED];


On Thu, Nov 18 2021 at 02:15:49 AM -0800, Carlo Arenas 
<carenas@gmail.com> wrote:
> 
> This will likely need at least some ACKs.
> 
> Apologies for the noise.
> 
> Carlo



^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-17 20:59                 ` Ævar Arnfjörð Bjarmason
  2021-11-17 21:53                   ` Carlo Arenas
@ 2021-11-18 18:17                   ` Junio C Hamano
  2021-11-18 20:57                     ` René Scharfe
  1 sibling, 1 reply; 47+ messages in thread
From: Junio C Hamano @ 2021-11-18 18:17 UTC (permalink / raw)
  To: Carlo Arenas, Ævar Arnfjörð Bjarmason
  Cc: René Scharfe, Andreas Schwab, Hamza Mahfooz, git

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

>> Let's have a look at the map.  Here are the differences between the
>> versions regarding use of PCRE2_UTF:
>>
>> o: opt->ignore_locale
>> h: has_non_ascii(p->pattern)
>> i: is_utf8_locale()
>> l: !opt->ignore_case && (p->fixed || p->is_fixed)
>>
>> o h i l master hamza rene2
>> 0 0 0 0      0     1     0
>> 0 0 0 1      0     1     0
>> 0 0 1 0      0     1     1
>> 0 0 1 1      0     1     0  <== 7812.13, confirmed using fprint() debugging
>>
>> So http://public-inbox.org/git/0ea73e7a-6d43-e223-ab2e-24c684102856@web.de/
>> should not have this breakage, because it doesn't enable PCRE2_UTF for
>> literal patterns.
>
> PCRE2_UTF will also matter for literal patterns. Try to peel apart the
> two bytes in "é" and match them under -i with/without PCRE_UTF.

Sorry for being late to the party, but doesn't "literal" in the
context of this thread mean the column "l" above, i.e. we are not
ignoring case and fixed or is_fixed member is set?  So "under -i"
disqualifies as an example for "will also matter for literal",
doesn't it?

In hindsight, I guess we could have pushed a bit harder when René's

-	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
+	if (!opt->ignore_locale && is_utf8_locale() &&
 	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);

in https://public-inbox.org/git/0ea73e7a-6d43-e223-ab2e-24c684102856@web.de/
(is that what is called 'rene2' above?) was raised on Oct 17th to
amend/fix Hamza's [v13 3/3]; that would have prevented 'master' from
having this breakage?

Carlo, in your [PATCH v2] in <20211117102329.95456-1-carenas@gmail.com>,
I see that the #else side for older PCREv2 users essentially reverts
what Hamza's [PATCH v13 3/3] did to this area.

+#else
+	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
+	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
+		options |= PCRE2_UTF;
+#endif

I guess this is a lot of change in the amount of text involved but
the least amount of actual change in the behaviour.  For those with
newer PCREv2, the behaviour would be the same as v2.34.0, and for
others, the behaviour would be the same as v2.33.0.

Having said all that, because the consensus seems to be that the
whole "when we should match in UTF mode" may need to be rethought, I
think reverting Hamza's [v13 3/3] would be the simplest way to clean
up the mess for v2.34.1 that will give us a cleaner slate to later
build on, than applying this patch.

So, I dunno.  Comments from those involved in the discussion?

Thanks.


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-18 18:17                   ` Junio C Hamano
@ 2021-11-18 20:57                     ` René Scharfe
  2021-11-19  7:00                       ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 47+ messages in thread
From: René Scharfe @ 2021-11-18 20:57 UTC (permalink / raw)
  To: Junio C Hamano, Carlo Arenas, Ævar Arnfjörð Bjarmason
  Cc: Andreas Schwab, Hamza Mahfooz, git

Am 18.11.21 um 19:17 schrieb Junio C Hamano:
> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>>> Let's have a look at the map.  Here are the differences between the
>>> versions regarding use of PCRE2_UTF:
>>>
>>> o: opt->ignore_locale
>>> h: has_non_ascii(p->pattern)
>>> i: is_utf8_locale()
>>> l: !opt->ignore_case && (p->fixed || p->is_fixed)
>>>
>>> o h i l master hamza rene2
>>> 0 0 0 0      0     1     0
>>> 0 0 0 1      0     1     0
>>> 0 0 1 0      0     1     1
>>> 0 0 1 1      0     1     0  <== 7812.13, confirmed using fprint() debugging
>>>
>>> So http://public-inbox.org/git/0ea73e7a-6d43-e223-ab2e-24c684102856@web.de/
>>> should not have this breakage, because it doesn't enable PCRE2_UTF for
>>> literal patterns.
>>
>> PCRE2_UTF will also matter for literal patterns. Try to peel apart the
>> two bytes in "é" and match them under -i with/without PCRE_UTF.
>
> Sorry for being late to the party, but doesn't "literal" in the
> context of this thread mean the column "l" above, i.e. we are not
> ignoring case and fixed or is_fixed member is set?  So "under -i"
> disqualifies as an example for "will also matter for literal",
> doesn't it?

Correct.

> In hindsight, I guess we could have pushed a bit harder when René's
>
> -	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> +	if (!opt->ignore_locale && is_utf8_locale() &&
>  	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
>
> in https://public-inbox.org/git/0ea73e7a-6d43-e223-ab2e-24c684102856@web.de/
> (is that what is called 'rene2' above?) was raised on Oct 17th to
> amend/fix Hamza's [v13 3/3]; that would have prevented 'master' from
> having this breakage?

Yes, that the change I meant with "rene2".

> Carlo, in your [PATCH v2] in <20211117102329.95456-1-carenas@gmail.com>,
> I see that the #else side for older PCREv2 users essentially reverts
> what Hamza's [PATCH v13 3/3] did to this area.
>
> +#else
> +	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> +	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
> +		options |= PCRE2_UTF;
> +#endif
>
> I guess this is a lot of change in the amount of text involved but
> the least amount of actual change in the behaviour.  For those with
> newer PCREv2, the behaviour would be the same as v2.34.0, and for
> others, the behaviour would be the same as v2.33.0.
>
> Having said all that, because the consensus seems to be that the
> whole "when we should match in UTF mode" may need to be rethought, I
> think reverting Hamza's [v13 3/3] would be the simplest way to clean
> up the mess for v2.34.1 that will give us a cleaner slate to later
> build on, than applying this patch.

Makes sense to me.  It gives a better starting point to solve the issue
afresh without getting entangled in mind-melting boolean expressions.

René

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE
  2021-11-17 10:23                   ` [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE Carlo Marcelo Arenas Belón
  2021-11-18  7:29                     ` Junio C Hamano
@ 2021-11-18 21:21                     ` Hamza Mahfooz
  1 sibling, 0 replies; 47+ messages in thread
From: Hamza Mahfooz @ 2021-11-18 21:21 UTC (permalink / raw)
  To: Carlo Marcelo Arenas Belón
  Cc: git, avarab, gitster, René Scharfe, Andreas Schwab

Building on to your patch we can disable the feature if an older 
version of PCRE2 is used, like so:

diff --git a/pretty.c b/pretty.c
index 1af5b093ae..9bd3f61b30 100644
--- a/pretty.c
+++ b/pretty.c
@@ -451,8 +451,11 @@ static void append_line_with_color(struct strbuf 
*sb, struct grep_opt *opt,

  buf = line;
  eol = buf + linelen;
-
+#ifdef GIT_PCRE2_VERSION_10_34_OR_HIGHER
  if (!opt || !want_color(color) || opt->invert)
+#else
+ if (!opt || !want_color(color) || opt->invert || opt->pcre2)
+#endif
   goto end;

  line_color = opt->colors[GREP_COLOR_SELECTED];



On Wed, Nov 17 2021 at 02:23:29 AM -0800, Carlo Marcelo Arenas Belón 
<carenas@gmail.com> wrote:
> Since ae39ba431a (grep/pcre2: fix an edge case concerning ascii 
> patterns
> and UTF-8 data, 2021-10-15), PCRE2_UTF mode is enabled for cases 
> where it
> will trigger UTF-8 validation errors (as reported) or can result in
> undefined behaviour.
> 
> Our use of PCRE2 only allows searching through non UTF-8 validated 
> data
> safely through the use of the PCRE2_MATCH_INVALID_UTF flag, that is 
> only
> available after 10.34, so restrict the change to newer versions of 
> PCRE
> and revert to the old logic for older releases, which will still allow
> for matching not using UTF-8 for likely most usecases (as shown in the
> tests).
> 
> Fix one test that was using an expression that wouldn't fail without 
> the
> new code so it can be forced to fail if it is missing and restrict it 
> to
> run only for newer PCRE releases; while at it do some minor 
> refactoring
> to cleanup the fallout for when that test might be skipped or might
> succeed under the new conditions.
> 
> Keeping the overly complex and unnecessary logic for now, to reduce 
> risk
> but with the hope that it will be cleaned up later.
> 
> Helped-by: René Scharfe <l.s.r@web.de>
> Reported-by: Andreas Schwab <schwab@linux-m68k.org>
> Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
> ---
> v2:
> * restrict the code at compile time instead of reverting
> * "fix" test to document  the behaviour under both PCRE code versions
> * update commit message to better explain the issue
> 
>  grep.c                          |  7 ++++++-
>  t/t7812-grep-icase-non-ascii.sh | 32 ++++++++++++++++++--------------
>  2 files changed, 24 insertions(+), 15 deletions(-)
> 
> diff --git a/grep.c b/grep.c
> index f6e113e9f0..0126aa3db4 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -382,12 +382,17 @@ static void compile_pcre2_pattern(struct 
> grep_pat *p, const struct grep_opt *opt
>  		}
>  		options |= PCRE2_CASELESS;
>  	}
> +#ifdef GIT_PCRE2_VERSION_10_34_OR_HIGHER
>  	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
>  	    (!opt->ignore_locale && is_utf8_locale() &&
>  	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
>  					    (p->fixed || p->is_fixed))))
>  		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
> -
> +#else
> +	if (!opt->ignore_locale && is_utf8_locale() && 
> has_non_ascii(p->pattern) &&
> +	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
> +		options |= PCRE2_UTF;
> +#endif
>  #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
>  	/* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 
> 10.36 */
>  	if (PCRE2_MATCH_INVALID_UTF && options & (PCRE2_UTF | 
> PCRE2_CASELESS))
> diff --git a/t/t7812-grep-icase-non-ascii.sh 
> b/t/t7812-grep-icase-non-ascii.sh
> index 22487d90fd..3bfe1ee728 100755
> --- a/t/t7812-grep-icase-non-ascii.sh
> +++ b/t/t7812-grep-icase-non-ascii.sh
> @@ -53,14 +53,27 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on 
> non-ascii' '
>  	test_cmp expected actual
>  '
> 
> -test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii 
> pattern on UTF-8 data' '
> -	cat >expected <<-\EOF &&
> -	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
> -	EOF
> +test_expect_success GETTEXT_LOCALE,PCRE 'setup ascii pattern on 
> UTF-8 data' '
>  	test_write_lines "forth" >file4 &&
>  	git add file4 &&
>  	git commit --author="À Ú Thor <author@example.com>" -m sécond &&
> -	git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
> +	test_write_lines "fifth" >file5 &&
> +	git add file5 &&
> +	GIT_COMMITTER_NAME="Ç O Mîtter" &&
> +	GIT_COMMITTER_EMAIL="committer@example.com" &&
> +	git -c i18n.commitEncoding=latin1 commit -m thïrd
> +'
> +
> +test_lazy_prereq PCRE2_MATCH_INVALID_UTF '
> +	test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF
> +'
> +
> +test_expect_success GETTEXT_LOCALE,PCRE,PCRE2_MATCH_INVALID_UTF 'log 
> --author with an ascii pattern on UTF-8 data' '
> +	cat >expected <<-\EOF &&
> +	Author: <BOLD;RED>A U Thor<RESET> <author@example.com>
> +	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
> +	EOF
> +	git log --color=always --perl-regexp --author=". . Thor" >log &&
>  	grep Author log >actual.raw &&
>  	test_decode_color <actual.raw >actual &&
>  	test_cmp expected actual
> @@ -70,11 +83,6 @@ test_expect_success GETTEXT_LOCALE,PCRE 'log 
> --committer with an ascii pattern o
>  	cat >expected <<-\EOF &&
>  	Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
>  	EOF
> -	test_write_lines "fifth" >file5 &&
> -	git add file5 &&
> -	GIT_COMMITTER_NAME="Ç O Mîtter" &&
> -	GIT_COMMITTER_EMAIL="committer@example.com" &&
> -	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
>  	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller 
> --color=always --perl-regexp --committer=" O.*" >log &&
>  	grep Commit: log >actual.raw &&
>  	test_decode_color <actual.raw >actual &&
> @@ -141,10 +149,6 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2 
> 'PCRE v2: grep non-ASCII from invali
>  	test_cmp invalid-0xe5 actual
>  '
> 
> -test_lazy_prereq PCRE2_MATCH_INVALID_UTF '
> -	test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF
> -'
> -
>  test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII 
> from invalid UTF-8 data with -i' '
>  	test_might_fail git grep -hi "Æ" invalid-0x80 >actual &&
>  	test_might_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual
> --
> 2.34.0.352.g07dee3c5e1
> 



^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE
  2021-11-18 12:49                         ` Hamza Mahfooz
@ 2021-11-19  6:58                           ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-11-19  6:58 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Carlo Arenas, Junio C Hamano, git, René Scharfe, Andreas Schwab


On Thu, Nov 18 2021, Hamza Mahfooz wrote:

> If none of the patches that are trying to fix our usage of PCRE2 are
> ACKed.
> It would be preferable (from my perspective) to revert only ae39ba431a
> (grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data,
> 2021-10-15) and disable the feature (for the time being) if PCRE2 is
> selected, like so:
>
> diff --git a/pretty.c b/pretty.c
> index 1af5b093ae..a53d008d1e 100644
> --- a/pretty.c
> +++ b/pretty.c
> @@ -452,7 +452,7 @@ static void append_line_with_color(struct strbuf
> *sb, struct grep_opt *opt,
>  buf = line;
>  eol = buf + linelen;
>
> - if (!opt || !want_color(color) || opt->invert)
> + if (!opt || !want_color(color) || opt->invert || opt->pcre2)
>   goto end;

I see that depending on LC_ALL the behavior of grep.patternType=extended
can exhibit some of the same issues with character splitting, so perhaps
what we'd do with the C library serves as a useful guide?

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-18 20:57                     ` René Scharfe
@ 2021-11-19  7:00                       ` Ævar Arnfjörð Bjarmason
  2021-11-19 16:08                         ` René Scharfe
  2021-11-19 17:11                         ` Junio C Hamano
  0 siblings, 2 replies; 47+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-11-19  7:00 UTC (permalink / raw)
  To: René Scharfe
  Cc: Junio C Hamano, Carlo Arenas, Andreas Schwab, Hamza Mahfooz, git


On Thu, Nov 18 2021, René Scharfe wrote:

> Am 18.11.21 um 19:17 schrieb Junio C Hamano:
>> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
> [...]
>> I guess this is a lot of change in the amount of text involved but
>> the least amount of actual change in the behaviour.  For those with
>> newer PCREv2, the behaviour would be the same as v2.34.0, and for
>> others, the behaviour would be the same as v2.33.0.
>>
>> Having said all that, because the consensus seems to be that the
>> whole "when we should match in UTF mode" may need to be rethought, I
>> think reverting Hamza's [v13 3/3] would be the simplest way to clean
>> up the mess for v2.34.1 that will give us a cleaner slate to later
>> build on, than applying this patch.
>
> Makes sense to me.  It gives a better starting point to solve the issue
> afresh without getting entangled in mind-melting boolean expressions.

Yes, agreed. As noted I haven't had time to dig deeply into this, but
from what I've seen so far there doesn't seem to be any obvious way
forward in terms of a quick fix.

I thought perhaps your patch would be that (but I haven't looked into it
carefully enough), but since you're on-board with reverting & retrying.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-19  7:00                       ` Ævar Arnfjörð Bjarmason
@ 2021-11-19 16:08                         ` René Scharfe
  2021-11-19 17:33                           ` Carlo Arenas
  2021-11-19 17:11                         ` Junio C Hamano
  1 sibling, 1 reply; 47+ messages in thread
From: René Scharfe @ 2021-11-19 16:08 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Carlo Arenas, Andreas Schwab, Hamza Mahfooz, git

Am 19.11.21 um 08:00 schrieb Ævar Arnfjörð Bjarmason:
>
> On Thu, Nov 18 2021, René Scharfe wrote:
>
>> Am 18.11.21 um 19:17 schrieb Junio C Hamano:
>>> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>> [...]
>>> I guess this is a lot of change in the amount of text involved but
>>> the least amount of actual change in the behaviour.  For those with
>>> newer PCREv2, the behaviour would be the same as v2.34.0, and for
>>> others, the behaviour would be the same as v2.33.0.
>>>
>>> Having said all that, because the consensus seems to be that the
>>> whole "when we should match in UTF mode" may need to be rethought, I
>>> think reverting Hamza's [v13 3/3] would be the simplest way to clean
>>> up the mess for v2.34.1 that will give us a cleaner slate to later
>>> build on, than applying this patch.
>>
>> Makes sense to me.  It gives a better starting point to solve the issue
>> afresh without getting entangled in mind-melting boolean expressions.
>
> Yes, agreed. As noted I haven't had time to dig deeply into this, but
> from what I've seen so far there doesn't seem to be any obvious way
> forward in terms of a quick fix.
>
> I thought perhaps your patch would be that (but I haven't looked into it
> carefully enough), but since you're on-board with reverting & retrying.

That patch should fix the edge case without any side-effects -- at least
I haven't seen any reports of ill effects that would apply to it.  It's
easier to understand and reason about when applied after reverting, I
think.  But it's only for grep.c and I don't know the situation in t/.

René

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-19  7:00                       ` Ævar Arnfjörð Bjarmason
  2021-11-19 16:08                         ` René Scharfe
@ 2021-11-19 17:11                         ` Junio C Hamano
  1 sibling, 0 replies; 47+ messages in thread
From: Junio C Hamano @ 2021-11-19 17:11 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: René Scharfe, Carlo Arenas, Andreas Schwab, Hamza Mahfooz, git

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> On Thu, Nov 18 2021, René Scharfe wrote:
>>
>> Makes sense to me.  It gives a better starting point to solve the issue
>> afresh without getting entangled in mind-melting boolean expressions.
>
> Yes, agreed. As noted I haven't had time to dig deeply into this, but
> from what I've seen so far there doesn't seem to be any obvious way
> forward in terms of a quick fix.
>
> I thought perhaps your patch would be that (but I haven't looked into it
> carefully enough), but since you're on-board with reverting & retrying.

OK.  Here is what I'll queue 'master', in the hope of later merging
down and be part of v2.34.1.

Thanks, all.

----- >8 --------- >8 --------- >8 --------- >8 -----
Subject: [PATCH] Revert "grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data"

This reverts commit ae39ba431ab861548eb60b4bd2e1d8b8813db76f, as it
breaks "grep" when looking for a string in non UTF-8 haystack, when
linked with certain versions of PCREv2 library.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 grep.c                          |  6 ++---
 t/t7812-grep-icase-non-ascii.sh | 48 ---------------------------------
 2 files changed, 2 insertions(+), 52 deletions(-)

diff --git a/grep.c b/grep.c
index f6e113e9f0..fe847a0111 100644
--- a/grep.c
+++ b/grep.c
@@ -382,10 +382,8 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 		}
 		options |= PCRE2_CASELESS;
 	}
-	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
-	    (!opt->ignore_locale && is_utf8_locale() &&
-	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
-					    (p->fixed || p->is_fixed))))
+	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
+	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
 
 #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index 22487d90fd..e5d1e4ea68 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -53,54 +53,6 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
 	test_cmp expected actual
 '
 
-test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
-	cat >expected <<-\EOF &&
-	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
-	EOF
-	test_write_lines "forth" >file4 &&
-	git add file4 &&
-	git commit --author="À Ú Thor <author@example.com>" -m sécond &&
-	git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
-	grep Author log >actual.raw &&
-	test_decode_color <actual.raw >actual &&
-	test_cmp expected actual
-'
-
-test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' '
-	cat >expected <<-\EOF &&
-	Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
-	EOF
-	test_write_lines "fifth" >file5 &&
-	git add file5 &&
-	GIT_COMMITTER_NAME="Ç O Mîtter" &&
-	GIT_COMMITTER_EMAIL="committer@example.com" &&
-	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
-	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
-	grep Commit: log >actual.raw &&
-	test_decode_color <actual.raw >actual &&
-	test_cmp expected actual
-'
-
-test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' '
-	cat >expected <<-\EOF &&
-	    sé<BOLD;RED>con<RESET>d
-	EOF
-	git log -1 --color=always --perl-regexp --grep="con" >log &&
-	grep con log >actual.raw &&
-	test_decode_color <actual.raw >actual &&
-	test_cmp expected actual
-'
-
-test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' '
-	cat >expected <<-\EOF &&
-	    <BOLD;RED>thïrd<RESET>
-	EOF
-	git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log &&
-	grep "th.*rd" log >actual.raw &&
-	test_decode_color <actual.raw >actual &&
-	test_cmp expected actual
-'
-
 test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' '
 	printf "\\200\\n" >invalid-0x80 &&
 	echo "ævar" >expected &&
-- 
2.34.0-202-gd9146917d7


^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
  2021-11-19 16:08                         ` René Scharfe
@ 2021-11-19 17:33                           ` Carlo Arenas
  0 siblings, 0 replies; 47+ messages in thread
From: Carlo Arenas @ 2021-11-19 17:33 UTC (permalink / raw)
  To: René Scharfe
  Cc: Ævar Arnfjörð Bjarmason, Junio C Hamano,
	Andreas Schwab, Hamza Mahfooz, git

On Fri, Nov 19, 2021 at 8:08 AM René Scharfe <l.s.r@web.de> wrote:
>
> Am 19.11.21 um 08:00 schrieb Ævar Arnfjörð Bjarmason:
> >
> > On Thu, Nov 18 2021, René Scharfe wrote:
> >
> >> Am 18.11.21 um 19:17 schrieb Junio C Hamano:
> >>> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
> >> [...]
> >>> I guess this is a lot of change in the amount of text involved but
> >>> the least amount of actual change in the behaviour.  For those with
> >>> newer PCREv2, the behaviour would be the same as v2.34.0, and for
> >>> others, the behaviour would be the same as v2.33.0.
> >>>
> >>> Having said all that, because the consensus seems to be that the
> >>> whole "when we should match in UTF mode" may need to be rethought, I
> >>> think reverting Hamza's [v13 3/3] would be the simplest way to clean
> >>> up the mess for v2.34.1 that will give us a cleaner slate to later
> >>> build on, than applying this patch.
> >>
> >> Makes sense to me.  It gives a better starting point to solve the issue
> >> afresh without getting entangled in mind-melting boolean expressions.
> >
> > Yes, agreed. As noted I haven't had time to dig deeply into this, but
> > from what I've seen so far there doesn't seem to be any obvious way
> > forward in terms of a quick fix.
> >
> > I thought perhaps your patch would be that (but I haven't looked into it
> > carefully enough), but since you're on-board with reverting & retrying.
>
> That patch should fix the edge case without any side-effects -- at least
> I haven't seen any reports of ill effects that would apply to it.

Since it isn't restricted to log, it will still cause a regression to
the `git grep` case with binary data for versions of PCRE2 older than
10.34 and unlike the previous one it might not trigger an error in the
testsuite just because we are missing a test for it.

> It's
> easier to understand and reason about when applied after reverting, I
> think.  But it's only for grep.c and I don't know the situation in t/.

We had been focusing in PCRE in this discussion, but I see the strict
behaviour of older PCRE2 as just a "coal mine canary" to point to the
bigger problem that we are expecting git's regex to handle safely and
correctly from the point of UTF, what is technically a binary match
with --color making the mismatch obvious.

The issue is not unique to PCRE, and seems Ævar also acknowledges[1]
that by seeing the same bug this was attempting to fix with probably
some version of glibc's ERE.  I suspect FreeBSD's (and derivatives) is
also broken and might be throwing REGILLSEQ errors as well, so I think
that it is better to revert the whole thing.

Carlo

[1] https://lore.kernel.org/git/211119.86r1bc4om5.gmgdl@evledraar.gmail.com/

^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2021-11-19 17:33 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-15 16:13 [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
2021-10-15 16:13 ` [PATCH v13 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
2021-10-15 16:13 ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
2021-10-15 20:03   ` Junio C Hamano
2021-10-16 16:25     ` René Scharfe
2021-10-16 17:12       ` Ævar Arnfjörð Bjarmason
2021-10-16 19:44         ` René Scharfe
2021-10-17  6:00           ` Junio C Hamano
2021-10-17  6:55             ` René Scharfe
2021-10-17  9:44               ` Ævar Arnfjörð Bjarmason
2021-11-15 20:43   ` Andreas Schwab
2021-11-15 22:41     ` Ævar Arnfjörð Bjarmason
2021-11-16  2:12       ` Carlo Arenas
2021-11-16  8:41       ` Andreas Schwab
2021-11-16  9:06         ` Carlo Arenas
2021-11-16  9:18           ` Andreas Schwab
2021-11-16  9:29           ` Andreas Schwab
2021-11-16  9:38             ` Carlo Arenas
2021-11-16  9:55               ` Andreas Schwab
2021-11-16 11:00                 ` [PATCH] grep: avoid setting UTF mode when not needed Carlo Marcelo Arenas Belón
2021-11-16 12:32                   ` Ævar Arnfjörð Bjarmason
2021-11-16 13:35                     ` Hamza Mahfooz
2021-11-17 14:31                       ` Ævar Arnfjörð Bjarmason
2021-11-16 18:22                     ` Carlo Arenas
2021-11-16 18:48                     ` Junio C Hamano
2021-11-17 10:23                   ` [PATCH v2] grep: avoid setting UTF mode when dangerous with PCRE Carlo Marcelo Arenas Belón
2021-11-18  7:29                     ` Junio C Hamano
2021-11-18 10:15                       ` Carlo Arenas
2021-11-18 12:49                         ` Hamza Mahfooz
2021-11-19  6:58                           ` Ævar Arnfjörð Bjarmason
2021-11-18 21:21                     ` Hamza Mahfooz
2021-11-17 18:46               ` [PATCH v13 3/3] grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data René Scharfe
2021-11-17 19:56                 ` Carlo Arenas
2021-11-17 20:59                 ` Ævar Arnfjörð Bjarmason
2021-11-17 21:53                   ` Carlo Arenas
2021-11-18  0:00                     ` Ævar Arnfjörð Bjarmason
2021-11-18 18:17                   ` Junio C Hamano
2021-11-18 20:57                     ` René Scharfe
2021-11-19  7:00                       ` Ævar Arnfjörð Bjarmason
2021-11-19 16:08                         ` René Scharfe
2021-11-19 17:33                           ` Carlo Arenas
2021-11-19 17:11                         ` Junio C Hamano
2021-10-15 18:05 ` [PATCH v13 1/3] grep: refactor next_match() and match_one_pattern() for external use Junio C Hamano
2021-10-15 18:24   ` Hamza Mahfooz
2021-10-15 19:28     ` Junio C Hamano
2021-10-15 19:40       ` Hamza Mahfooz
2021-10-15 19:49       ` Junio C Hamano

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.