All of lore.kernel.org
 help / color / mirror / Atom feed
From: tboegi@web.de
To: peff@peff.net, j6t@kdbg.org, lars.schneider@autodesk.com,
	git@vger.kernel.org, gitster@pobox.com, patrick@luehne.de,
	larsxschneider@gmail.com
Cc: "Torsten Bögershausen" <tboegi@web.de>
Subject: [PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8
Date: Mon, 26 Feb 2018 18:27:06 +0100	[thread overview]
Message-ID: <20180226172706.7822-1-tboegi@web.de> (raw)
In-Reply-To: <20171218131249.GB4665@sigill.intra.peff.net>

From: Torsten Bögershausen <tboegi@web.de>

When an UTF-16 file is commited and later changed, `git diff` shows
"Binary files XX and YY differ".

When the user wants a diff in UTF-8, a textconv needs to be specified
in .gitattributes and the textconv must be configured.

A more user-friendly diff can be produced for UTF-16 if
- the user did not use `git diff --binary`
- the blob is identified as binary
- the blob has an UTF-16 BOM
- the blob can be converted into UTF-8

Enhance the diff machinery to auto-detect UTF-16 blobs and show them
as UTF-8, unless the user specifies `git diff --binary` which creates
a binary diff.

Signed-off-by: Torsten Bögershausen <tboegi@web.de>
---
 diff.c                   | 43 ++++++++++++++++++++-
 diffcore.h               |  3 ++
 t/t4066-diff-encoding.sh | 98 ++++++++++++++++++++++++++++++++++++++++++++++++
 utf8.h                   | 11 ++++++
 4 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100755 t/t4066-diff-encoding.sh

diff --git a/diff.c b/diff.c
index fb22b19f09..51831ee94d 100644
--- a/diff.c
+++ b/diff.c
@@ -3192,6 +3192,10 @@ static void builtin_diff(const char *name_a,
 			strbuf_reset(&header);
 		}
 
+		if (one && one->reencoded_from_utf16)
+			strbuf_addf(&header, "a is converted to UTF-8 from UTF-16\n");
+		if (two && two->reencoded_from_utf16)
+			strbuf_addf(&header, "b is converted to UTF-8 from UTF-16\n");
 		mf1.size = fill_textconv(textconv_one, one, &mf1.ptr);
 		mf2.size = fill_textconv(textconv_two, two, &mf2.ptr);
 
@@ -3611,8 +3615,25 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 			s->size = size;
 			s->should_free = 1;
 		}
-	}
-	else {
+		if (!s->binary && buffer_is_binary(s->data, s->size) &&
+		    buffer_has_utf16_bom(s->data, s->size)) {
+			int outsz = 0;
+			char *outbuf;
+			outbuf = reencode_string_len(s->data, (int)s->size,
+						     "UTF-8", "UTF-16", &outsz);
+			if (outbuf) {
+				if (s->should_free)
+					free(s->data);
+				if (s->should_munmap)
+					munmap(s->data, s->size);
+				s->should_munmap = 0;
+				s->data = outbuf;
+				s->size = outsz;
+				s->reencoded_from_utf16 = 1;
+				s->should_free = 1;
+			}
+		}
+	} else {
 		enum object_type type;
 		if (size_only || (flags & CHECK_BINARY)) {
 			type = sha1_object_info(s->oid.hash, &s->size);
@@ -3629,6 +3650,19 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 		s->data = read_sha1_file(s->oid.hash, &type, &s->size);
 		if (!s->data)
 			die("unable to read %s", oid_to_hex(&s->oid));
+		if (!s->binary && buffer_is_binary(s->data, s->size) &&
+		    buffer_has_utf16_bom(s->data, s->size)) {
+			int outsz = 0;
+			char *buf;
+			buf = reencode_string_len(s->data, (int)s->size,
+						  "UTF-8", "UTF-16", &outsz);
+			if (buf) {
+				free(s->data);
+				s->data = buf;
+				s->size = outsz;
+				s->reencoded_from_utf16 = 1;
+			}
+		}
 		s->should_free = 1;
 	}
 	return 0;
@@ -5695,6 +5729,10 @@ static int diff_filespec_is_identical(struct diff_filespec *one,
 
 static int diff_filespec_check_stat_unmatch(struct diff_filepair *p)
 {
+	if (p->binary) {
+		p->one->binary = 1;
+		p->two->binary = 1;
+	}
 	if (p->done_skip_stat_unmatch)
 		return p->skip_stat_unmatch_result;
 
@@ -5735,6 +5773,7 @@ static void diffcore_skip_stat_unmatch(struct diff_options *diffopt)
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
 
+		p->binary = diffopt->flags.binary;
 		if (diff_filespec_check_stat_unmatch(p))
 			diff_q(&outq, p);
 		else {
diff --git a/diffcore.h b/diffcore.h
index a30da161da..3cd97bb93b 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -47,6 +47,8 @@ struct diff_filespec {
 	unsigned has_more_entries : 1; /* only appear in combined diff */
 	/* data should be considered "binary"; -1 means "don't know yet" */
 	signed int is_binary : 2;
+	unsigned binary : 1;
+	unsigned reencoded_from_utf16 : 1;
 	struct userdiff_driver *driver;
 };
 
@@ -72,6 +74,7 @@ struct diff_filepair {
 	unsigned is_unmerged : 1;
 	unsigned done_skip_stat_unmatch : 1;
 	unsigned skip_stat_unmatch_result : 1;
+	unsigned binary : 1;
 };
 #define DIFF_PAIR_UNMERGED(p) ((p)->is_unmerged)
 
diff --git a/t/t4066-diff-encoding.sh b/t/t4066-diff-encoding.sh
new file mode 100755
index 0000000000..9bb3c70ada
--- /dev/null
+++ b/t/t4066-diff-encoding.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+
+test_description='git diff with encoding attribute'
+
+. ./test-lib.sh
+
+printf '\303\244rger\n\303\266se\n\303\274bel\n' |
+	iconv -f UTF-8 -t UTF-16 >UTF-16
+printf '\303\266se\n\303\274bel\n\303\245st\n' |
+	iconv -f UTF-8 -t UTF-16 >file2
+
+test_expect_success 'setup' '
+	cp UTF-16 file &&
+	git add file &&
+	git commit -m "add file in UTF-16" &&
+	test_tick &&
+	echo "file encoding=UTF-16" >.gitattributes
+'
+
+test_expect_success 'diff against local change' '
+	cp file2 file &&
+	test_tick &&
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09..e98d27a 100644
+	a is converted to UTF-8 from UTF-16
+	b is converted to UTF-8 from UTF-16
+	--- a/file
+	+++ b/file
+	@@ -1,3 +1,3 @@
+	-ärger
+	 öse
+	 übel
+	+åst
+EOF
+	git diff file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'diff --binary against local change' '
+	cp file2 file &&
+	test_tick &&
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09b0aad19fb22566956d1a39cb4e2a3b420..e98d27acfb90cfcfc84fcc5173baa4aa7828290f 100644
+	GIT binary patch
+	literal 28
+	ecmezW?;ArgLn;Fo!ykquAe{qbJq3!C0BHb{ln3Pi
+
+	literal 32
+	icmezW?+HT@Lpnn$kmO?c#!w7oaWVX1NCMJ1Ko$VA_z0~4
+
+EOF
+	git diff --binary file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'commit local change' '
+	git add file &&
+	git commit -m "add file V2 in UTF-16" &&
+	test_tick
+'
+
+test_expect_success 'diff HEAD against HEAD^' '
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09..e98d27a 100644
+	a is converted to UTF-8 from UTF-16
+	b is converted to UTF-8 from UTF-16
+	--- a/file
+	+++ b/file
+	@@ -1,3 +1,3 @@
+	-ärger
+	 öse
+	 übel
+	+åst
+EOF
+	git diff HEAD^ HEAD -- file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'diff --binary HEAD against HEAD^' '
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09b0aad19fb22566956d1a39cb4e2a3b420..e98d27acfb90cfcfc84fcc5173baa4aa7828290f 100644
+	GIT binary patch
+	literal 28
+	ecmezW?;ArgLn;Fo!ykquAe{qbJq3!C0BHb{ln3Pi
+
+	literal 32
+	icmezW?+HT@Lpnn$kmO?c#!w7oaWVX1NCMJ1Ko$VA_z0~4
+	
+EOF
+	git diff --binary HEAD^ HEAD -- file >actual &&
+	test_cmp expect actual
+'
+
+test_done
diff --git a/utf8.h b/utf8.h
index 6bbcf31a83..a2184d0300 100644
--- a/utf8.h
+++ b/utf8.h
@@ -16,6 +16,17 @@ int utf8_fprintf(FILE *, const char *, ...);
 extern const char utf8_bom[];
 extern int skip_utf8_bom(char **, size_t);
 
+static inline int buffer_has_utf16_bom(const void *buf, size_t len) {
+  const unsigned char *text = (unsigned char *)buf;
+  if (!text ||  len < 2)
+    return 0;
+  if (text[0] == 0xff && text[1] == 0xfe)
+    return 1;
+  if (text[0] == 0xfe && text[1] == 0xff)
+    return 1;
+  return 0;
+}
+
 void strbuf_add_wrapped_text(struct strbuf *buf,
 		const char *text, int indent, int indent2, int width);
 void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len,
-- 
2.16.1.194.gb2e45c695d


  parent reply	other threads:[~2018-02-26 17:27 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-12-11 15:50 [PATCH v1] convert: add support for 'encoding' attribute lars.schneider
2017-12-11 18:39 ` Eric Sunshine
2017-12-11 23:47   ` Lars Schneider
2017-12-11 23:58     ` Eric Sunshine
2017-12-12 10:58       ` Lars Schneider
2017-12-11 20:47 ` Johannes Sixt
2017-12-11 23:42   ` Lars Schneider
2017-12-12  0:59     ` Junio C Hamano
2017-12-12  7:15       ` Johannes Sixt
2017-12-12 10:55         ` Lars Schneider
2017-12-12 19:31           ` Junio C Hamano
2017-12-13 17:57             ` Lars Schneider
2017-12-13 18:11               ` Junio C Hamano
2017-12-13 23:02                 ` Lars Schneider
2017-12-14 23:01                   ` Junio C Hamano
2017-12-12  7:09     ` Johannes Sixt
2017-12-18 10:13   ` Torsten Bögershausen
2017-12-18 13:12     ` Jeff King
2017-12-23  8:08       ` Torsten Bögershausen
2017-12-29 13:28       ` [PATCH/RFC 0/2] git diff --UTF-8 tboegi
2017-12-29 13:28       ` [PATCH/RFC 1/2] convert_to_git(): checksafe becomes an integer tboegi
2017-12-29 13:28       ` [PATCH/RFC 2/2] git diff: Allow to reencode into UTF-8 tboegi
2018-02-26 17:27       ` tboegi [this message]
2018-02-26 18:43         ` [PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8 Peter Krefting
2018-02-27 22:39         ` Jeff King
2017-12-18 18:02     ` [PATCH v1] convert: add support for 'encoding' attribute Junio C Hamano
2017-12-18 21:55     ` Johannes Sixt
2017-12-15  9:58 ` Jeff King
2017-12-18 10:54   ` Lars Schneider
2017-12-18 12:59     ` Jeff King
2017-12-17 17:14 ` Torsten Bögershausen
2017-12-28 16:14   ` Lars Schneider
2017-12-29 12:59     ` Torsten Bögershausen
2017-12-29 13:56       ` Lars Schneider
2018-01-03 19:15       ` Junio C Hamano
2018-01-03 20:45         ` Lars Schneider

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180226172706.7822-1-tboegi@web.de \
    --to=tboegi@web.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=j6t@kdbg.org \
    --cc=lars.schneider@autodesk.com \
    --cc=larsxschneider@gmail.com \
    --cc=patrick@luehne.de \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.