All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jan Kara <jack@suse.cz>
To: <linux-fsdevel@vger.kernel.org>
Cc: Andrew Gabbasov <andrew_gabbasov@mentor.com>,
	arthur200126@gmail.com, Jan Kara <jack@suse.cz>
Subject: [PATCH 7/7] udf: Add support for decoding UTF-16 characters
Date: Tue, 17 Apr 2018 14:55:29 +0200	[thread overview]
Message-ID: <20180417125529.6975-8-jack@suse.cz> (raw)
In-Reply-To: <20180417125529.6975-1-jack@suse.cz>

Add support to decode characters outside of Base Multilingual Plane of
UTF-16 encoded in CS0 charset of UDF.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 103 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 68 insertions(+), 35 deletions(-)

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 616ffee441c5..45234791fec2 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -36,25 +36,6 @@
 #define SURROGATE_CHAR_BITS 10
 #define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)
 
-static int udf_uni2char_utf8(wchar_t uni,
-			     unsigned char *out,
-			     int boundlen)
-{
-	int u_len = 0;
-
-	if (boundlen <= 0)
-		return -ENAMETOOLONG;
-
-	u_len = utf32_to_utf8(uni, out, boundlen);
-	if (u_len < 0) {
-		if (uni > UNICODE_MAX ||
-		    (uni & SURROGATE_MASK) == SURROGATE_PAIR)
-			return -EINVAL;
-		return -ENAMETOOLONG;
-	}
-	return u_len;
-}
-
 #define ILLEGAL_CHAR_MARK	'_'
 #define EXT_MARK		'.'
 #define CRC_MARK		'#'
@@ -62,6 +43,50 @@ static int udf_uni2char_utf8(wchar_t uni,
 /* Number of chars we need to store generated CRC to make filename unique */
 #define CRC_LEN			5
 
+static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len,
+				int str_i_idx, int u_ch, unicode_t *ret)
+{
+	unicode_t c;
+	int start_idx = str_i_idx;
+
+	/* Expand OSTA compressed Unicode to Unicode */
+	c = str_i[str_i_idx++];
+	if (u_ch > 1)
+		c = (c << 8) | str_i[str_i_idx++];
+	if ((c & SURROGATE_MASK) == SURROGATE_PAIR) {
+		unicode_t next;
+
+		/* Trailing surrogate char */
+		if (str_i_idx >= str_i_max_len) {
+			c = UNICODE_MAX + 1;
+			goto out;
+		}
+
+		/* Low surrogate must follow the high one... */
+		if (c & SURROGATE_LOW) {
+			c = UNICODE_MAX + 1;
+			goto out;
+		}
+
+		WARN_ON_ONCE(u_ch != 2);
+		next = str_i[str_i_idx++] << 8;
+		next |= str_i[str_i_idx++];
+		if ((next & SURROGATE_MASK) != SURROGATE_PAIR ||
+		    !(next & SURROGATE_LOW)) {
+			c = UNICODE_MAX + 1;
+			goto out;
+		}
+
+		c = PLANE_SIZE +
+		    ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) +
+		    (next & SURROGATE_CHAR_MASK);
+	}
+out:
+	*ret = c;
+	return str_i_idx - start_idx;
+}
+
+
 static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 			      int *str_o_idx,
 			      const uint8_t *str_i, int str_i_max_len,
@@ -70,27 +95,29 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 			      int (*conv_f)(wchar_t, unsigned char *, int),
 			      int translate)
 {
-	uint32_t c;
+	unicode_t c;
 	int illChar = 0;
 	int len, gotch = 0;
 
-	for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) {
+	while (!gotch && *str_i_idx < str_i_max_len) {
 		if (*str_o_idx >= str_o_max_len) {
 			*needsCRC = 1;
 			return gotch;
 		}
 
-		/* Expand OSTA compressed Unicode to Unicode */
-		c = str_i[*str_i_idx];
-		if (u_ch > 1)
-			c = (c << 8) | str_i[*str_i_idx + 1];
-
-		if (translate && (c == '/' || c == 0))
+		len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch,
+				     &c);
+		/* These chars cannot be converted. Replace them. */
+		if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) ||
+		    (translate && c == '/')) {
 			illChar = 1;
-		else if (illChar)
+			if (!translate)
+				gotch = 1;
+		} else if (illChar)
 			break;
 		else
 			gotch = 1;
+		*str_i_idx += len;
 	}
 	if (illChar) {
 		*needsCRC = 1;
@@ -98,7 +125,15 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 		gotch = 1;
 	}
 	if (gotch) {
-		len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
+		if (conv_f) {
+			len = conv_f(c, &str_o[*str_o_idx],
+				     str_o_max_len - *str_o_idx);
+		} else {
+			len = utf32_to_utf8(c, &str_o[*str_o_idx],
+					    str_o_max_len - *str_o_idx);
+			if (len < 0)
+				len = -ENAMETOOLONG;
+		}
 		/* Valid character? */
 		if (len >= 0)
 			*str_o_idx += len;
@@ -106,7 +141,7 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 			*needsCRC = 1;
 			gotch = 0;
 		} else {
-			str_o[(*str_o_idx)++] = '?';
+			str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK;
 			*needsCRC = 1;
 		}
 	}
@@ -142,12 +177,10 @@ static int udf_name_from_CS0(struct super_block *sb,
 		return 0;
 	}
 
-	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		conv_f = udf_uni2char_utf8;
-	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
 		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
-	} else
-		BUG();
+	else
+		conv_f = NULL;
 
 	cmp_id = ocu[0];
 	if (cmp_id != 8 && cmp_id != 16) {
-- 
2.13.6

      parent reply	other threads:[~2018-04-17 12:55 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
2018-04-17 12:55 ` [PATCH 1/7] udf: Fix leak of UTF-16 surrogates into encoded strings Jan Kara
2018-04-17 12:55 ` [PATCH 2/7] udf: Always require NLS support Jan Kara
2018-04-17 12:55 ` [PATCH 3/7] udf: Use UTF-32 <-> UTF-8 conversion functions from NLS Jan Kara
2018-04-17 12:55 ` [PATCH 4/7] udf: Convert ident strings to proper charset Jan Kara
2018-04-17 12:55 ` [PATCH 5/7] udf: Push sb argument to udf_name_[to|from]_CS0() Jan Kara
2018-04-17 12:55 ` [PATCH 6/7] udf: Add support for encoding UTF-16 characters Jan Kara
2018-04-17 12:55 ` Jan Kara [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180417125529.6975-8-jack@suse.cz \
    --to=jack@suse.cz \
    --cc=andrew_gabbasov@mentor.com \
    --cc=arthur200126@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.