git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] grep: fix multibyte regex handling under macOS
@ 2022-08-23 19:08 Diomidis Spinellis via GitGitGadget
  2022-08-23 20:40 ` Eric Sunshine
  2022-08-23 21:26 ` [PATCH v2] " Diomidis Spinellis via GitGitGadget
  0 siblings, 2 replies; 6+ messages in thread
From: Diomidis Spinellis via GitGitGadget @ 2022-08-23 19:08 UTC (permalink / raw)
  To: git; +Cc: Diomidis Spinellis, Diomidis Spinellis

From: Diomidis Spinellis <dds@aueb.gr>

The 2013 commit 29de20504e9 fixed t0070-fundamental.sh under Darwin
(macOS) by adopting Git's regex library.  However, this library is
compiled with NO_MBSUPPORT, which causes git-grep to work incorrectly
on multibyte (e.g. UTF-8) files.  Current macOS versions pass
t0070-fundamental.sh with the native macOS regex library, which
also supports multibyte characters.

Adjust the Makefile to use the native regex library, and call
setlocale(3) to set CTYPE according to the user's preference.  The
setlocale(3) call is required on all platforms, but in platforms
supporting gettext(3), setlocale(3) is called as a side-effect of
initializing gettext(3).

To avoid running the new tests on platforms still using the
compatibility library, which is compiled without multibyte
support, store the corresponding NO_REGEX setting in the
GIT-BUILD-OPTIONS file.  This makes it available to the test
scripts.  In addition, adjust the test-tool regex command to
work with multibyte regexes to further test a platform's
support for them.

Signed-off-by: Diomidis Spinellis <dds@aueb.gr>
---
    grep: fix multibyte regex handling under macOS
    
    The 2013 commit 29de20504e9 fixed t0070-fundamental.sh under Darwin
    (macOS) by adopting Git's regex library. However, this library is
    compiled with NO_MBSUPPORT, which causes git-grep to work incorrectly on
    multibyte (e.g. UTF-8) files. Current macOS versions pass
    t0070-fundamental.sh with the native macOS regex library, which also
    supports multibyte characters.
    
    Adjust the Makefile to use the native regex library, and call
    setlocale(3) to set CTYPE according to the user's preference. The
    setlocale(3) call is required on all platforms, but in platforms
    supporting gettext(3), setlocale(3) is called as a side-effect of
    initializing gettext(3).
    
    To avoid running the new tests on platforms still using the
    compatibility library, which is compiled without multibyte support,
    store the corresponding NO_REGEX setting in the GIT-BUILD-OPTIONS file.
    This makes it available to the test scripts. In addition, adjust the
    test-tool regex command to work with multibyte regexes to further test a
    platform's support for them.
    
    Signed-off-by: Diomidis Spinellis dds@aueb.gr

Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-git-1313%2Fdspinellis%2Ffix-macos-mb-grep-v1
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-git-1313/dspinellis/fix-macos-mb-grep-v1
Pull-Request: https://github.com/git/git/pull/1313

 Makefile                  |  2 +-
 grep.c                    |  6 +++++-
 t/helper/test-regex.c     |  2 ++
 t/t7818-grep-multibyte.sh | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100755 t/t7818-grep-multibyte.sh

diff --git a/Makefile b/Makefile
index 04d0fd1fe60..d1a98257150 100644
--- a/Makefile
+++ b/Makefile
@@ -1427,7 +1427,6 @@ ifeq ($(uname_S),Darwin)
 		APPLE_COMMON_CRYPTO = YesPlease
 		COMPAT_CFLAGS += -DAPPLE_COMMON_CRYPTO
 	endif
-	NO_REGEX = YesPlease
 	PTHREAD_LIBS =
 endif
 
@@ -2970,6 +2969,7 @@ GIT-BUILD-OPTIONS: FORCE
 	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@+
 	@echo NO_PTHREADS=\''$(subst ','\'',$(subst ','\'',$(NO_PTHREADS)))'\' >>$@+
 	@echo NO_PYTHON=\''$(subst ','\'',$(subst ','\'',$(NO_PYTHON)))'\' >>$@+
+	@echo NO_REGEX=\''$(subst ','\'',$(subst ','\'',$(NO_REGEX)))'\' >>$@+
 	@echo NO_UNIX_SOCKETS=\''$(subst ','\'',$(subst ','\'',$(NO_UNIX_SOCKETS)))'\' >>$@+
 	@echo PAGER_ENV=\''$(subst ','\'',$(subst ','\'',$(PAGER_ENV)))'\' >>$@+
 	@echo DC_SHA1=\''$(subst ','\'',$(subst ','\'',$(DC_SHA1)))'\' >>$@+
diff --git a/grep.c b/grep.c
index 82eb7da1022..c31657c3da3 100644
--- a/grep.c
+++ b/grep.c
@@ -10,6 +10,8 @@
 #include "quote.h"
 #include "help.h"
 
+#include <locale.h>
+
 static int grep_source_load(struct grep_source *gs);
 static int grep_source_is_binary(struct grep_source *gs,
 				 struct index_state *istate);
@@ -707,8 +709,10 @@ static struct grep_expr *grep_splice_or(struct grep_expr *x, struct grep_expr *y
 void compile_grep_patterns(struct grep_opt *opt)
 {
 	struct grep_pat *p;
-	struct grep_expr *header_expr = prep_header_patterns(opt);
+	struct grep_expr *header_expr;
 
+	setlocale(LC_CTYPE, "");
+	header_expr = prep_header_patterns(opt);
 	for (p = opt->pattern_list; p; p = p->next) {
 		switch (p->token) {
 		case GREP_PATTERN: /* atom */
diff --git a/t/helper/test-regex.c b/t/helper/test-regex.c
index d6f28ca8d14..ae4d7854abd 100644
--- a/t/helper/test-regex.c
+++ b/t/helper/test-regex.c
@@ -1,5 +1,6 @@
 #include "test-tool.h"
 #include "gettext.h"
+#include <locale.h>
 
 struct reg_flag {
 	const char *name;
@@ -85,6 +86,7 @@ int cmd__regex(int argc, const char **argv)
 	}
 	git_setup_gettext();
 
+	setlocale(LC_CTYPE, "");
 	ret = regcomp(&r, pat, flags);
 	if (ret) {
 		if (silent)
diff --git a/t/t7818-grep-multibyte.sh b/t/t7818-grep-multibyte.sh
new file mode 100755
index 00000000000..72129880fac
--- /dev/null
+++ b/t/t7818-grep-multibyte.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+
+test_description='grep multibyte characters'
+
+. ./test-lib.sh
+
+# Multibyte regex search is only supported with a native regex library
+# that supports them.
+# (The included library is compiled with NO_MBSUPPORT) and only if it
+test -z "$NO_REGEX" &&
+  LC_ALL=en_US.UTF-8 test-tool regex '^.$' '¿' &&
+  test_set_prereq MB_REGEX
+
+if ! test_have_prereq MB_REGEX
+then
+  skip_all='multibyte grep tests; Git compiled with NO_REGEX, NO_MBSUPPORT'
+  test_done
+fi
+
+test_expect_success 'setup' '
+	test_write_lines "¿" >file &&
+	git add file &&
+	LC_ALL="en_US.UTF-8" &&
+	export LC_ALL
+'
+test_expect_success 'grep exactly one char in single-char multibyte file' '
+	git grep "^.$"
+'
+
+test_expect_code 1 'grep two chars in single-char multibyte file' '
+	git grep ".."
+'
+
+test_done

base-commit: ad60dddad72dfb8367bd695028b5b8dc6c33661b
-- 
gitgitgadget

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-08-25  8:23 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-23 19:08 [PATCH] grep: fix multibyte regex handling under macOS Diomidis Spinellis via GitGitGadget
2022-08-23 20:40 ` Eric Sunshine
2022-08-24  5:13   ` Diomidis Spinellis
2022-08-23 21:26 ` [PATCH v2] " Diomidis Spinellis via GitGitGadget
2022-08-24 18:56   ` Junio C Hamano
2022-08-25  8:23     ` Diomidis Spinellis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).