All of lore.kernel.org
 help / color / mirror / Atom feed
From: lars.schneider@autodesk.com
To: git@vger.kernel.org
Cc: gitster@pobox.com, tboegi@web.de, j6t@kdbg.org,
	sunshine@sunshineco.com, peff@peff.net,
	ramsay@ramsayjones.plus.com, Johannes.Schindelin@gmx.de,
	pclouds@gmail.com, Lars Schneider <larsxschneider@gmail.com>
Subject: [PATCH v11 07/10] convert: check for detectable errors in UTF encodings
Date: Fri,  9 Mar 2018 18:35:33 +0100	[thread overview]
Message-ID: <20180309173536.62012-8-lars.schneider@autodesk.com> (raw)
In-Reply-To: <20180309173536.62012-1-lars.schneider@autodesk.com>

From: Lars Schneider <larsxschneider@gmail.com>

Check that new content is valid with respect to the user defined
'working-tree-encoding' attribute.

Signed-off-by: Lars Schneider <larsxschneider@gmail.com>
---
 convert.c                        | 48 ++++++++++++++++++++++++++++++++++
 t/t0028-working-tree-encoding.sh | 56 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/convert.c b/convert.c
index aa59ecfe49..b80d666a6b 100644
--- a/convert.c
+++ b/convert.c
@@ -266,6 +266,51 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
 
 }
 
+static int validate_encoding(const char *path, const char *enc,
+		      const char *data, size_t len, int die_on_error)
+{
+	/* We only check for UTF here as UTF?? can be an alias for UTF-?? */
+	if (istarts_with(enc, "UTF")) {
+		/*
+		 * Check for detectable errors in UTF encodings
+		 */
+		if (has_prohibited_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is prohibited in '%s' if encoded as %s");
+			/*
+			 * This advice is shown for UTF-??BE and UTF-??LE encodings.
+			 */
+			const char *advise_msg = _(
+				"The file '%s' contains a byte order "
+				"mark (BOM). Please use %.6s as "
+				"working-tree-encoding.");
+			advise(advise_msg, path, enc);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+
+		} else if (is_missing_required_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is required in '%s' if encoded as %s");
+			const char *advise_msg = _(
+				"The file '%s' is missing a byte order "
+				"mark (BOM). Please use %sBE or %sLE "
+				"(depending on the byte order) as "
+				"working-tree-encoding.");
+			advise(advise_msg, path, enc, enc);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+		}
+
+	}
+	return 0;
+}
+
 static const char *default_encoding = "UTF-8";
 
 static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -291,6 +336,9 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
 	if (!buf && !src)
 		return 1;
 
+	if (validate_encoding(path, enc, src, src_len, die_on_error))
+		return 0;
+
 	dst = reencode_string_len(src, src_len, default_encoding, enc,
 				  &dst_len);
 	if (!dst) {
diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh
index e492945a01..e8408dfe5c 100755
--- a/t/t0028-working-tree-encoding.sh
+++ b/t/t0028-working-tree-encoding.sh
@@ -62,6 +62,46 @@ test_expect_success 'check $GIT_DIR/info/attributes support' '
 
 for i in 16 32
 do
+	test_expect_success "check prohibited UTF-${i} BOM" '
+		test_when_finished "git reset --hard HEAD" &&
+
+		echo "*.utf${i}be text working-tree-encoding=utf-${i}be" >>.gitattributes &&
+		echo "*.utf${i}le text working-tree-encoding=utf-${i}LE" >>.gitattributes &&
+
+		# Here we add a UTF-16 (resp. UTF-32) files with BOM (big/little-endian)
+		# but we tell Git to treat it as UTF-16BE/UTF-16LE (resp. UTF-32).
+		# In these cases the BOM is prohibited.
+		cp bebom.utf${i}be.raw bebom.utf${i}be &&
+		test_must_fail git add bebom.utf${i}be 2>err.out &&
+		test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out &&
+
+		cp lebom.utf${i}le.raw lebom.utf${i}be &&
+		test_must_fail git add lebom.utf${i}be 2>err.out &&
+		test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out &&
+
+		cp bebom.utf${i}be.raw bebom.utf${i}le &&
+		test_must_fail git add bebom.utf${i}le 2>err.out &&
+		test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out &&
+
+		cp lebom.utf${i}le.raw lebom.utf${i}le &&
+		test_must_fail git add lebom.utf${i}le 2>err.out &&
+		test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out
+	'
+
+	test_expect_success "check required UTF-${i} BOM" '
+		test_when_finished "git reset --hard HEAD" &&
+
+		echo "*.utf${i} text working-tree-encoding=utf-${i}" >>.gitattributes &&
+
+		cp nobom.utf${i}be.raw nobom.utf${i} &&
+		test_must_fail git add nobom.utf${i} 2>err.out &&
+		test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out &&
+
+		cp nobom.utf${i}le.raw nobom.utf${i} &&
+		test_must_fail git add nobom.utf${i} 2>err.out &&
+		test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out
+	'
+
 	test_expect_success "eol conversion for UTF-${i} encoded files on checkout" '
 		test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
 		test_when_finished "git reset --hard HEAD^" &&
@@ -141,4 +181,20 @@ test_expect_success 'error if encoding round trip is not the same during refresh
 	test_i18ngrep "error: .* overwritten by checkout:" err.out
 '
 
+test_expect_success 'error if encoding garbage is already in Git' '
+	BEFORE_STATE=$(git rev-parse HEAD) &&
+	test_when_finished "git reset --hard $BEFORE_STATE" &&
+
+	# Skip the UTF-16 filter for the added file
+	# This simulates a Git version that has no checkoutEncoding support
+	cp nobom.utf16be.raw nonsense.utf16 &&
+	TEST_HASH=$(git hash-object --no-filters -w nonsense.utf16) &&
+	git update-index --add --cacheinfo 100644 $TEST_HASH nonsense.utf16 &&
+	COMMIT=$(git commit-tree -p $(git rev-parse HEAD) -m "plain commit" $(git write-tree)) &&
+	git update-ref refs/heads/master $COMMIT &&
+
+	git diff 2>err.out &&
+	test_i18ngrep "error: BOM is required" err.out
+'
+
 test_done
-- 
2.16.2


  parent reply	other threads:[~2018-03-09 17:37 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-09 17:35 [PATCH v11 00/10] convert: add support for different encodings lars.schneider
2018-03-09 17:35 ` [PATCH v11 01/10] strbuf: remove unnecessary NUL assignment in xstrdup_tolower() lars.schneider
2018-03-09 17:35 ` [PATCH v11 02/10] strbuf: add xstrdup_toupper() lars.schneider
2018-03-09 17:35 ` [PATCH v11 03/10] strbuf: add a case insensitive starts_with() lars.schneider
2018-03-09 17:35 ` [PATCH v11 04/10] utf8: add function to detect prohibited UTF-16/32 BOM lars.schneider
2018-03-09 17:35 ` [PATCH v11 05/10] utf8: add function to detect a missing " lars.schneider
2018-03-09 17:35 ` [PATCH v11 06/10] convert: add 'working-tree-encoding' attribute lars.schneider
2018-03-09 19:10   ` Junio C Hamano
2018-03-15 21:23     ` Lars Schneider
2018-03-18  7:24   ` Torsten Bögershausen
2018-04-01 13:24     ` Lars Schneider
2018-04-05 16:41       ` Torsten Bögershausen
2018-04-15 16:54         ` Lars Schneider
2018-03-09 17:35 ` lars.schneider [this message]
2018-03-09 19:00   ` [PATCH v11 07/10] convert: check for detectable errors in UTF encodings Junio C Hamano
2018-03-09 19:04     ` Lars Schneider
2018-03-09 19:10   ` Junio C Hamano
2018-03-09 17:35 ` [PATCH v11 08/10] convert: advise canonical UTF encoding names lars.schneider
2018-03-09 19:11   ` Junio C Hamano
2018-03-15 22:42     ` Lars Schneider
2018-03-09 17:35 ` [PATCH v11 09/10] convert: add tracing for 'working-tree-encoding' attribute lars.schneider
2018-03-09 17:35 ` [PATCH v11 10/10] convert: add round trip check based on 'core.checkRoundtripEncoding' lars.schneider
2018-03-09 20:18   ` Eric Sunshine
2018-03-09 20:22     ` Junio C Hamano
2018-03-09 20:27       ` Eric Sunshine

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180309173536.62012-8-lars.schneider@autodesk.com \
    --to=lars.schneider@autodesk.com \
    --cc=Johannes.Schindelin@gmx.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=j6t@kdbg.org \
    --cc=larsxschneider@gmail.com \
    --cc=pclouds@gmail.com \
    --cc=peff@peff.net \
    --cc=ramsay@ramsayjones.plus.com \
    --cc=sunshine@sunshineco.com \
    --cc=tboegi@web.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.