All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v11 1/3] grep: refactor next_match() and match_one_pattern() for external use
@ 2021-10-07 20:31 Hamza Mahfooz
  2021-10-07 20:31 ` [PATCH v11 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
  2021-10-07 20:31 ` [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
  0 siblings, 2 replies; 6+ messages in thread
From: Hamza Mahfooz @ 2021-10-07 20:31 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Ævar Arnfjörð Bjarmason, Hamza Mahfooz

These changes are made in preparation of, the colorization support for the
"git log" subcommands that, rely on regex functionality (i.e. "--author",
"--committer" and "--grep"). These changes are necessary primarily because
match_one_pattern() expects header lines to be prefixed, however, in
pretty, the prefixes are stripped from the lines because the name-email
pairs need to go through additional parsing, before they can be printed and
because next_match() doesn't handle the case of
"ctx == GREP_CONTEXT_HEAD" at all. So, teach next_match() how to handle the
new case and move match_one_pattern()'s core logic to
headerless_match_one_pattern() while preserving match_one_pattern()'s uses
that depend on the additional processing.

Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v5: separate grep changes from pretty changes.

v6: rescope some variables.

v7: export header_field[] and allow for subsequent matches on header lines
    in match_one_pattern().

v8: add headerless_match_one_pattern() and move header_field[] back.

v9: get rid of the new check in headerless_match_one_pattern(), move the
    header pattern filtering logic in grep_next_match() and document
    grep_next_match() in grep.h.

v10: add a "magic" comment in grep_next_match() to signify a fall through
     in the switch statement.
---
 grep.c | 79 ++++++++++++++++++++++++++++++++++++----------------------
 grep.h |  9 +++++++
 2 files changed, 58 insertions(+), 30 deletions(-)

diff --git a/grep.c b/grep.c
index 14fe8a0fd2..fe847a0111 100644
--- a/grep.c
+++ b/grep.c
@@ -944,10 +944,10 @@ static struct {
 	{ "reflog ", 7 },
 };
 
-static int match_one_pattern(struct grep_pat *p,
-			     const char *bol, const char *eol,
-			     enum grep_context ctx,
-			     regmatch_t *pmatch, int eflags)
+static int headerless_match_one_pattern(struct grep_pat *p,
+					const char *bol, const char *eol,
+					enum grep_context ctx,
+					regmatch_t *pmatch, int eflags)
 {
 	int hit = 0;
 	const char *start = bol;
@@ -956,25 +956,6 @@ static int match_one_pattern(struct grep_pat *p,
 	    ((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
 		return 0;
 
-	if (p->token == GREP_PATTERN_HEAD) {
-		const char *field;
-		size_t len;
-		assert(p->field < ARRAY_SIZE(header_field));
-		field = header_field[p->field].field;
-		len = header_field[p->field].len;
-		if (strncmp(bol, field, len))
-			return 0;
-		bol += len;
-		switch (p->field) {
-		case GREP_HEADER_AUTHOR:
-		case GREP_HEADER_COMMITTER:
-			strip_timestamp(bol, &eol);
-			break;
-		default:
-			break;
-		}
-	}
-
  again:
 	hit = patmatch(p, bol, eol, pmatch, eflags);
 
@@ -1025,6 +1006,36 @@ static int match_one_pattern(struct grep_pat *p,
 	return hit;
 }
 
+static int match_one_pattern(struct grep_pat *p,
+			     const char *bol, const char *eol,
+			     enum grep_context ctx, regmatch_t *pmatch,
+			     int eflags)
+{
+	const char *field;
+	size_t len;
+
+	if (p->token == GREP_PATTERN_HEAD) {
+		assert(p->field < ARRAY_SIZE(header_field));
+		field = header_field[p->field].field;
+		len = header_field[p->field].len;
+		if (strncmp(bol, field, len))
+			return 0;
+		bol += len;
+
+		switch (p->field) {
+		case GREP_HEADER_AUTHOR:
+		case GREP_HEADER_COMMITTER:
+			strip_timestamp(bol, &eol);
+			break;
+		default:
+			break;
+		}
+	}
+
+	return headerless_match_one_pattern(p, bol, eol, ctx, pmatch, eflags);
+}
+
+
 static int match_expr_eval(struct grep_opt *opt, struct grep_expr *x,
 			   const char *bol, const char *eol,
 			   enum grep_context ctx, ssize_t *col,
@@ -1143,7 +1154,7 @@ static int match_next_pattern(struct grep_pat *p,
 {
 	regmatch_t match;
 
-	if (!match_one_pattern(p, bol, eol, ctx, &match, eflags))
+	if (!headerless_match_one_pattern(p, bol, eol, ctx, &match, eflags))
 		return 0;
 	if (match.rm_so < 0 || match.rm_eo < 0)
 		return 0;
@@ -1158,19 +1169,26 @@ static int match_next_pattern(struct grep_pat *p,
 	return 1;
 }
 
-static int next_match(struct grep_opt *opt,
-		      const char *bol, const char *eol,
-		      enum grep_context ctx, regmatch_t *pmatch, int eflags)
+int grep_next_match(struct grep_opt *opt,
+		    const char *bol, const char *eol,
+		    enum grep_context ctx, regmatch_t *pmatch,
+		    enum grep_header_field field, int eflags)
 {
 	struct grep_pat *p;
 	int hit = 0;
 
 	pmatch->rm_so = pmatch->rm_eo = -1;
 	if (bol < eol) {
-		for (p = opt->pattern_list; p; p = p->next) {
+		for (p = ((ctx == GREP_CONTEXT_HEAD)
+			   ? opt->header_list : opt->pattern_list);
+			  p; p = p->next) {
 			switch (p->token) {
-			case GREP_PATTERN: /* atom */
 			case GREP_PATTERN_HEAD:
+				if ((field != GREP_HEADER_FIELD_MAX) &&
+				    (p->field != field))
+					continue;
+				/* fall thru */
+			case GREP_PATTERN: /* atom */
 			case GREP_PATTERN_BODY:
 				hit |= match_next_pattern(p, bol, eol, ctx,
 							  pmatch, eflags);
@@ -1261,7 +1279,8 @@ static void show_line(struct grep_opt *opt,
 			else if (sign == '=')
 				line_color = opt->colors[GREP_COLOR_FUNCTION];
 		}
-		while (next_match(opt, bol, eol, ctx, &match, eflags)) {
+		while (grep_next_match(opt, bol, eol, ctx, &match,
+				       GREP_HEADER_FIELD_MAX, eflags)) {
 			if (match.rm_so == match.rm_eo)
 				break;
 
diff --git a/grep.h b/grep.h
index 3cb8a83ae8..808ad76f0c 100644
--- a/grep.h
+++ b/grep.h
@@ -191,6 +191,15 @@ void compile_grep_patterns(struct grep_opt *opt);
 void free_grep_patterns(struct grep_opt *opt);
 int grep_buffer(struct grep_opt *opt, const char *buf, unsigned long size);
 
+/* The field parameter is only used to filter header patterns
+ * (where appropriate). If filtering isn't desirable
+ * GREP_HEADER_FIELD_MAX should be supplied.
+ */
+int grep_next_match(struct grep_opt *opt,
+		    const char *bol, const char *eol,
+		    enum grep_context ctx, regmatch_t *pmatch,
+		    enum grep_header_field field, int eflags);
+
 struct grep_source {
 	char *name;
 
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v11 2/3] pretty: colorize pattern matches in commit messages
  2021-10-07 20:31 [PATCH v11 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
@ 2021-10-07 20:31 ` Hamza Mahfooz
  2021-10-07 20:31 ` [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
  1 sibling, 0 replies; 6+ messages in thread
From: Hamza Mahfooz @ 2021-10-07 20:31 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Ævar Arnfjörð Bjarmason, Hamza Mahfooz

The "git log" command limits its output to the commits that contain strings
matched by a pattern when the "--grep=<pattern>" option is used, but unlike
output from "git grep -e <pattern>", the matches are not highlighted,
making them harder to spot.

Teach the pretty-printer code to highlight matches from the
"--grep=<pattern>", "--author=<pattern>" and "--committer=<pattern>"
options (to view the last one, you may have to ask for --pretty=fuller).

Also, it must be noted that we are effectively greping the content twice
(because it would be a hassle to rework the existing matching code to do
a /g match and then pass it all down to the coloring code), however it only
slows down "git log --author=^H" on this repository by around 1-2%
(compared to v2.33.0), so it should be a small enough slow down to justify
the addition of the feature.

Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: make the commit message whole (add the missing ingredients), rename
    append_matched_line() to append_line_with_color(), use
    colors[GREP_COLOR_MATCH_SELECTED] instead of
    colors[GREP_COLOR_MATCH_CONTEXT], allow the background color to be
    customized, don't copy strings to a buffer when not coloring in
    append_line_with_color(), rename next_match() to grep_next_match(),
    repurpose grep_next_match()/match_one_pattern() for use in
    append_line_with_color() (allowing us to remove duplicated matching
    code in append_line_with_color()), document how to customize the
    feature and modify some of the tests to fit the feature better.

v3: fix a formatting issue with the added documentation.

v4: add strbuf_add_with_color(), use the correct color code scheme in the
    unit tests and add more unit tests.

v5: separate grep changes from pretty changes and add some performance
    analysis in the commit message.

v6: put the documentation in the correct place, cleanup pretty.c and
    format the unit tests according to the current convention.

v7: get rid of all manual strbuf management, constify where now appropriate
    and fix the header line prefix issue properly.

v8: remove code that relies on grep_header_fields[].

v11: clarify why we are grepping twice.
---
 Documentation/config/color.txt |   7 ++-
 pretty.c                       | 101 +++++++++++++++++++++++++++++----
 t/t4202-log.sh                 |  51 +++++++++++++++++
 3 files changed, 145 insertions(+), 14 deletions(-)

diff --git a/Documentation/config/color.txt b/Documentation/config/color.txt
index e05d520a86..91d9a9da32 100644
--- a/Documentation/config/color.txt
+++ b/Documentation/config/color.txt
@@ -104,9 +104,12 @@ color.grep.<slot>::
 `matchContext`;;
 	matching text in context lines
 `matchSelected`;;
-	matching text in selected lines
+	matching text in selected lines. Also, used to customize the following
+	linkgit:git-log[1] subcommands: `--grep`, `--author` and `--committer`.
 `selected`;;
-	non-matching text in selected lines
+	non-matching text in selected lines. Also, used to customize the
+	following linkgit:git-log[1] subcommands: `--grep`, `--author` and
+	`--committer`.
 `separator`;;
 	separators between fields on a line (`:`, `-`, and `=`)
 	and between hunks (`--`)
diff --git a/pretty.c b/pretty.c
index 73b5ead509..2dd94af886 100644
--- a/pretty.c
+++ b/pretty.c
@@ -431,6 +431,52 @@ const char *show_ident_date(const struct ident_split *ident,
 	return show_date(date, tz, mode);
 }
 
+static inline void strbuf_add_with_color(struct strbuf *sb, const char *color,
+					 const char *buf, size_t buflen)
+{
+	strbuf_addstr(sb, color);
+	strbuf_add(sb, buf, buflen);
+	if (*color)
+		strbuf_addstr(sb, GIT_COLOR_RESET);
+}
+
+static void append_line_with_color(struct strbuf *sb, struct grep_opt *opt,
+				   const char *line, size_t linelen,
+				   int color, enum grep_context ctx,
+				   enum grep_header_field field)
+{
+	const char *buf, *eol, *line_color, *match_color;
+	regmatch_t match;
+	int eflags = 0;
+
+	buf = line;
+	eol = buf + linelen;
+
+	if (!opt || !want_color(color) || opt->invert)
+		goto end;
+
+	line_color = opt->colors[GREP_COLOR_SELECTED];
+	match_color = opt->colors[GREP_COLOR_MATCH_SELECTED];
+
+	while (grep_next_match(opt, buf, eol, ctx, &match, field, eflags)) {
+		if (match.rm_so == match.rm_eo)
+			break;
+
+		strbuf_add_with_color(sb, line_color, buf, match.rm_so);
+		strbuf_add_with_color(sb, match_color, buf + match.rm_so,
+				      match.rm_eo - match.rm_so);
+		buf += match.rm_eo;
+		eflags = REG_NOTBOL;
+	}
+
+	if (eflags)
+		strbuf_add_with_color(sb, line_color, buf, eol - buf);
+	else {
+end:
+		strbuf_add(sb, buf, eol - buf);
+	}
+}
+
 void pp_user_info(struct pretty_print_context *pp,
 		  const char *what, struct strbuf *sb,
 		  const char *line, const char *encoding)
@@ -496,9 +542,26 @@ void pp_user_info(struct pretty_print_context *pp,
 			strbuf_addch(sb, '\n');
 		strbuf_addf(sb, " <%.*s>\n", (int)maillen, mailbuf);
 	} else {
-		strbuf_addf(sb, "%s: %.*s%.*s <%.*s>\n", what,
-			    (pp->fmt == CMIT_FMT_FULLER) ? 4 : 0, "    ",
-			    (int)namelen, namebuf, (int)maillen, mailbuf);
+		struct strbuf id = STRBUF_INIT;
+		enum grep_header_field field = GREP_HEADER_FIELD_MAX;
+		struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
+
+		if (!strcmp(what, "Author"))
+			field = GREP_HEADER_AUTHOR;
+		else if (!strcmp(what, "Commit"))
+			field = GREP_HEADER_COMMITTER;
+
+		strbuf_addf(sb, "%s: ", what);
+		if (pp->fmt == CMIT_FMT_FULLER)
+			strbuf_addchars(sb, ' ', 4);
+
+		strbuf_addf(&id, "%.*s <%.*s>", (int)namelen, namebuf,
+			    (int)maillen, mailbuf);
+
+		append_line_with_color(sb, opt, id.buf, id.len, pp->color,
+				       GREP_CONTEXT_HEAD, field);
+		strbuf_addch(sb, '\n');
+		strbuf_release(&id);
 	}
 
 	switch (pp->fmt) {
@@ -1939,8 +2002,9 @@ static int pp_utf8_width(const char *start, const char *end)
 	return width;
 }
 
-static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
-				 const char *line, int linelen)
+static void strbuf_add_tabexpand(struct strbuf *sb, struct grep_opt *opt,
+				 int color, int tabwidth, const char *line,
+				 int linelen)
 {
 	const char *tab;
 
@@ -1957,7 +2021,9 @@ static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
 			break;
 
 		/* Output the data .. */
-		strbuf_add(sb, line, tab - line);
+		append_line_with_color(sb, opt, line, tab - line, color,
+				       GREP_CONTEXT_BODY,
+				       GREP_HEADER_FIELD_MAX);
 
 		/* .. and the de-tabified tab */
 		strbuf_addchars(sb, ' ', tabwidth - (width % tabwidth));
@@ -1972,7 +2038,8 @@ static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
 	 * worrying about width - there's nothing more to
 	 * align.
 	 */
-	strbuf_add(sb, line, linelen);
+	append_line_with_color(sb, opt, line, linelen, color, GREP_CONTEXT_BODY,
+			       GREP_HEADER_FIELD_MAX);
 }
 
 /*
@@ -1984,11 +2051,16 @@ static void pp_handle_indent(struct pretty_print_context *pp,
 			     struct strbuf *sb, int indent,
 			     const char *line, int linelen)
 {
+	struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
+
 	strbuf_addchars(sb, ' ', indent);
 	if (pp->expand_tabs_in_log)
-		strbuf_add_tabexpand(sb, pp->expand_tabs_in_log, line, linelen);
+		strbuf_add_tabexpand(sb, opt, pp->color, pp->expand_tabs_in_log,
+				     line, linelen);
 	else
-		strbuf_add(sb, line, linelen);
+		append_line_with_color(sb, opt, line, linelen, pp->color,
+				       GREP_CONTEXT_BODY,
+				       GREP_HEADER_FIELD_MAX);
 }
 
 static int is_mboxrd_from(const char *line, int len)
@@ -2006,7 +2078,9 @@ void pp_remainder(struct pretty_print_context *pp,
 		  struct strbuf *sb,
 		  int indent)
 {
+	struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
 	int first = 1;
+
 	for (;;) {
 		const char *line = *msg_p;
 		int linelen = get_one_line(line);
@@ -2027,14 +2101,17 @@ void pp_remainder(struct pretty_print_context *pp,
 		if (indent)
 			pp_handle_indent(pp, sb, indent, line, linelen);
 		else if (pp->expand_tabs_in_log)
-			strbuf_add_tabexpand(sb, pp->expand_tabs_in_log,
-					     line, linelen);
+			strbuf_add_tabexpand(sb, opt, pp->color,
+					     pp->expand_tabs_in_log, line,
+					     linelen);
 		else {
 			if (pp->fmt == CMIT_FMT_MBOXRD &&
 					is_mboxrd_from(line, linelen))
 				strbuf_addch(sb, '>');
 
-			strbuf_add(sb, line, linelen);
+			append_line_with_color(sb, opt, line, linelen,
+					       pp->color, GREP_CONTEXT_BODY,
+					       GREP_HEADER_FIELD_MAX);
 		}
 		strbuf_addch(sb, '\n');
 	}
diff --git a/t/t4202-log.sh b/t/t4202-log.sh
index 9dfead936b..3d240bba57 100755
--- a/t/t4202-log.sh
+++ b/t/t4202-log.sh
@@ -449,6 +449,57 @@ test_expect_success !FAIL_PREREQS 'log with various grep.patternType configurati
 	)
 '
 
+test_expect_success 'log --author' '
+	cat >expect <<-\EOF &&
+	Author: <BOLD;RED>A U<RESET> Thor <author@example.com>
+	EOF
+	git log -1 --color=always --author="A U" >log &&
+	grep Author log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'log --committer' '
+	cat >expect <<-\EOF &&
+	Commit:     C O Mitter <committer@<BOLD;RED>example<RESET>.com>
+	EOF
+	git log -1 --color=always --pretty=fuller --committer="example" >log &&
+	grep "Commit:" log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'log -i --grep with color' '
+	cat >expect <<-\EOF &&
+	    <BOLD;RED>Sec<RESET>ond
+	    <BOLD;RED>sec<RESET>ond
+	EOF
+	git log --color=always -i --grep=^sec >log &&
+	grep -i sec log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success '-c color.grep.selected log --grep' '
+	cat >expect <<-\EOF &&
+	    <GREEN>th<RESET><BOLD;RED>ir<RESET><GREEN>d<RESET>
+	EOF
+	git -c color.grep.selected="green" log --color=always --grep=ir >log &&
+	grep ir log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success '-c color.grep.matchSelected log --grep' '
+	cat >expect <<-\EOF &&
+	    <BLUE>i<RESET>n<BLUE>i<RESET>t<BLUE>i<RESET>al
+	EOF
+	git -c color.grep.matchSelected="blue" log --color=always --grep=i >log &&
+	grep al log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expect actual
+'
+
 cat > expect <<EOF
 * Second
 * sixth
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-07 20:31 [PATCH v11 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
  2021-10-07 20:31 ` [PATCH v11 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
@ 2021-10-07 20:31 ` Hamza Mahfooz
  2021-10-08 21:26   ` Junio C Hamano
  2021-10-09  6:44   ` Junio C Hamano
  1 sibling, 2 replies; 6+ messages in thread
From: Hamza Mahfooz @ 2021-10-07 20:31 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Ævar Arnfjörð Bjarmason, Hamza Mahfooz

If we attempt to grep non-ascii log message text with an ascii pattern, we
run into the following issue:

    $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
    grep: (standard input): binary file matches

So, to fix this teach the grep code to mark the pattern as UTF-8 (even if
the pattern is composed of only ascii characters), so long as the log
output is encoded using UTF-8.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
 grep.c                          |  6 +++--
 grep.h                          |  1 +
 revision.c                      |  2 ++
 t/t7812-grep-icase-non-ascii.sh | 48 +++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/grep.c b/grep.c
index fe847a0111..978d30f053 100644
--- a/grep.c
+++ b/grep.c
@@ -382,8 +382,10 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 		}
 		options |= PCRE2_CASELESS;
 	}
-	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
-	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
+	if ((opt->utf8_all_the_things && !has_non_ascii(p->pattern)) ||
+	    (!opt->ignore_locale && is_utf8_locale() &&
+	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
+					    (p->fixed || p->is_fixed))))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
 
 #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
diff --git a/grep.h b/grep.h
index 808ad76f0c..c9ddd637d1 100644
--- a/grep.h
+++ b/grep.h
@@ -167,6 +167,7 @@ struct grep_opt {
 	int extended_regexp_option;
 	int pattern_type_option;
 	int ignore_locale;
+	int utf8_all_the_things;
 	char colors[NR_GREP_COLORS][COLOR_MAXLEN];
 	unsigned pre_context;
 	unsigned post_context;
diff --git a/revision.c b/revision.c
index 0dabb5a0bc..0d751dceb7 100644
--- a/revision.c
+++ b/revision.c
@@ -2874,6 +2874,8 @@ int setup_revisions(int argc, const char **argv, struct rev_info *revs, struct s
 				 &revs->grep_filter);
 	if (!is_encoding_utf8(get_log_output_encoding()))
 		revs->grep_filter.ignore_locale = 1;
+	else
+		revs->grep_filter.utf8_all_the_things = 1;
 	compile_grep_patterns(&revs->grep_filter);
 
 	if (revs->reverse && revs->reflog_info)
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index e5d1e4ea68..42323b31c0 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -53,6 +53,54 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
 	test_cmp expected actual
 '
 
+test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
+	cat >expected <<-\EOF &&
+	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
+	EOF
+	test_write_lines "forth" >file4 &&
+	git add file4 &&
+	git commit --author="À Ú Thor <author@example.com>" -m sécond &&
+	git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
+	grep Author log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' '
+	cat >expected <<-\EOF &&
+	Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
+	EOF
+	test_write_lines "fifth" >file5 &&
+	git add file5 &&
+	export GIT_COMMITTER_NAME="Ç O Mîtter" &&
+	export GIT_COMMITTER_EMAIL="committer@example.com" &&
+	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
+	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
+	grep Commit: log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' '
+	cat >expected <<-\EOF &&
+	    sé<BOLD;RED>con<RESET>d
+	EOF
+	git log -1 --color=always --perl-regexp --grep="con" >log &&
+	grep con log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' '
+	cat >expected <<-\EOF &&
+	    <BOLD;RED>thïrd<RESET>
+	EOF
+	git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log &&
+	grep "th.*rd" log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
 test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' '
 	printf "\\200\\n" >invalid-0x80 &&
 	echo "ævar" >expected &&
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-07 20:31 ` [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
@ 2021-10-08 21:26   ` Junio C Hamano
  2021-10-09  6:44   ` Junio C Hamano
  1 sibling, 0 replies; 6+ messages in thread
From: Junio C Hamano @ 2021-10-08 21:26 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: git, Ævar Arnfjörð Bjarmason

Hamza Mahfooz <someguy@effective-light.com> writes:

> If we attempt to grep non-ascii log message text with an ascii pattern, we
> run into the following issue:
>
>     $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
>     grep: (standard input): binary file matches
>
> So, to fix this teach the grep code to mark the pattern as UTF-8 (even if
> the pattern is composed of only ascii characters), so long as the log
> output is encoded using UTF-8.

We'd need this only if we are using pcre2 backend, no?  If that is
the case, that fact needs to be recorded in the proposed log message
to help later developers, when they wonder why this "all-the-things"
knob exists.

And if it is the case that this bit is needed only to work around a
glitch while using pcre2 backend, I'd rather want to see a solution
that does not need to contaminate the more generic "struct grep_opt"
data and "setup_revisions()" codepath.

In other words, can't the function compile_pcre2_pattern() make the
"is log output encoding utf8?" decision locally and act accordingly?

Thanks.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-07 20:31 ` [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
  2021-10-08 21:26   ` Junio C Hamano
@ 2021-10-09  6:44   ` Junio C Hamano
  2021-10-09 15:52     ` Hamza Mahfooz
  1 sibling, 1 reply; 6+ messages in thread
From: Junio C Hamano @ 2021-10-09  6:44 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: git, Ævar Arnfjörð Bjarmason

Hamza Mahfooz <someguy@effective-light.com> writes:

> diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
> index e5d1e4ea68..42323b31c0 100755
> --- a/t/t7812-grep-icase-non-ascii.sh
> +++ b/t/t7812-grep-icase-non-ascii.sh
> @@ -53,6 +53,54 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
> ...
> +	git add file5 &&
> +	export GIT_COMMITTER_NAME="Ç O Mîtter" &&
> +	export GIT_COMMITTER_EMAIL="committer@example.com" &&

These lines are flagged as an error by test-lint; I've queued this
on top to make today's integration result to pass our tests.

Thanks.


 t/t7812-grep-icase-non-ascii.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index 42323b31c0..313d4894ad 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -72,8 +72,8 @@ test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern o
 	EOF
 	test_write_lines "fifth" >file5 &&
 	git add file5 &&
-	export GIT_COMMITTER_NAME="Ç O Mîtter" &&
-	export GIT_COMMITTER_EMAIL="committer@example.com" &&
+	GIT_COMMITTER_NAME="Ç O Mîtter" GIT_COMMITTER_EMAIL="committer@example.com" &&
+	export GIT_COMMITTER_NAME GIT_COMMITTER_EMAIL &&
 	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
 	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
 	grep Commit: log >actual.raw &&
-- 
2.33.0-960-g8d902839aa


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data
  2021-10-09  6:44   ` Junio C Hamano
@ 2021-10-09 15:52     ` Hamza Mahfooz
  0 siblings, 0 replies; 6+ messages in thread
From: Hamza Mahfooz @ 2021-10-09 15:52 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Ævar Arnfjörð Bjarmason


On Fri, Oct 8 2021 at 11:44:14 PM -0700, Junio C Hamano 
<gitster@pobox.com> wrote:
> These lines are flagged as an error by test-lint; I've queued this
> on top to make today's integration result to pass our tests.

I fixed this issue in v12 [1] btw.

[1] 
https://lkml.kernel.org/r/20211008224918.603392-3-someguy@effective-light.com



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-10-09 15:52 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-07 20:31 [PATCH v11 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
2021-10-07 20:31 ` [PATCH v11 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
2021-10-07 20:31 ` [PATCH v11 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
2021-10-08 21:26   ` Junio C Hamano
2021-10-09  6:44   ` Junio C Hamano
2021-10-09 15:52     ` Hamza Mahfooz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.