All of lore.kernel.org
 help / color / mirror / Atom feed
From: "brian m. carlson" <sandals@crustytoothpaste.net>
To: git@vger.kernel.org
Cc: gitster@pobox.com
Subject: [PATCH v2 2/2] commit: reject overlong UTF-8 sequences
Date: Thu, 4 Jul 2013 17:20:34 +0000	[thread overview]
Message-ID: <20130704172034.GB267700@vauxhall.crustytoothpaste.net> (raw)
In-Reply-To: <cover.1372957719.git.sandals@crustytoothpaste.net>

The commit code accepts pseudo-UTF-8 sequences that encode a character with more
bytes than necessary.  Reject such sequences, since they are not valid UTF-8.

Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
---
 commit.c               | 17 +++++++++++------
 t/t3900-i18n-commit.sh | 11 +++++++++++
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/commit.c b/commit.c
index 2264106..b59c187 100644
--- a/commit.c
+++ b/commit.c
@@ -1240,11 +1240,15 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
 static int find_invalid_utf8(const char *buf, int len)
 {
 	int offset = 0;
+	static const unsigned int max_codepoint[] = {
+		0x7f, 0x7ff, 0xffff, 0x10ffff
+	};
 
 	while (len) {
 		unsigned char c = *buf++;
 		int bytes, bad_offset;
 		unsigned int codepoint;
+		unsigned int min_val, max_val;
 
 		len--;
 		offset++;
@@ -1276,8 +1280,12 @@ static int find_invalid_utf8(const char *buf, int len)
 		if (len < bytes)
 			return bad_offset;
 
-		/* Place the encoded bits at the bottom of the value. */
+		/* Place the encoded bits at the bottom of the value and compute the
+		 * valid range.
+		 */
 		codepoint = (c & 0x7f) >> bytes;
+		min_val = max_codepoint[bytes-1] + 1;
+		max_val = max_codepoint[bytes];
 
 		offset += bytes;
 		len -= bytes;
@@ -1290,8 +1298,8 @@ static int find_invalid_utf8(const char *buf, int len)
 				return bad_offset;
 		} while (--bytes);
 
-		/* No codepoints can ever be allocated beyond U+10FFFF. */
-		if (codepoint > 0x10ffff)
+		/* Reject codepoints that are out of range for the sequence length. */
+		if (codepoint < min_val || codepoint > max_val)
 			return bad_offset;
 		/* Surrogates are only for UTF-16 and cannot be encoded in UTF-8. */
 		if ((codepoint & 0x1ff800) == 0xd800)
@@ -1308,9 +1316,6 @@ static int find_invalid_utf8(const char *buf, int len)
  *
  * If it isn't, it assumes any non-utf8 characters are Latin1,
  * and does the conversion.
- *
- * Fixme: we should probably also disallow overlong forms.
- * But we don't do that currently.
  */
 static int verify_utf8(struct strbuf *buf)
 {
diff --git a/t/t3900-i18n-commit.sh b/t/t3900-i18n-commit.sh
index ee8ba6c..94fa1e8 100755
--- a/t/t3900-i18n-commit.sh
+++ b/t/t3900-i18n-commit.sh
@@ -50,6 +50,17 @@ test_expect_success 'UTF-8 invalid characters refused' '
 	grep "did not conform" "$HOME"/stderr
 '
 
+test_expect_success 'UTF-8 overlong sequences rejected' '
+	test_when_finished "rm -f $HOME/stderr $HOME/invalid" &&
+	rm -f "$HOME/stderr" "$HOME/invalid" &&
+	echo "UTF-8 overlong" >F &&
+	printf "\340\202\251ommit message\n\nThis is not a space:\300\240\n" \
+		>"$HOME/invalid" &&
+	git commit -a -F "$HOME/invalid" \
+		2>"$HOME"/stderr &&
+	grep "did not conform" "$HOME"/stderr
+'
+
 rm -f "$HOME/stderr"
 
 for H in ISO8859-1 eucJP ISO-2022-JP
-- 
1.8.3.1

      parent reply	other threads:[~2013-07-04 17:27 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-07-04 17:17 [PATCH v2 0/2] commit: improve UTF-8 validation brian m. carlson
2013-07-04 17:19 ` [PATCH v2 1/2] commit: reject invalid UTF-8 codepoints brian m. carlson
2013-07-04 19:58   ` Torsten Bögershausen
2013-07-04 20:39     ` brian m. carlson
2013-07-05 12:51   ` Peter Krefting
2013-07-08 19:36     ` Junio C Hamano
2013-07-09 11:16       ` [PATCH] commit: reject non-characters Peter Krefting
2013-08-05 12:48         ` Peter Krefting
2013-08-05 16:54           ` Junio C Hamano
2013-08-06  7:03             ` Peter Krefting
2013-07-04 17:20 ` brian m. carlson [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130704172034.GB267700@vauxhall.crustytoothpaste.net \
    --to=sandals@crustytoothpaste.net \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.