linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/7] udf: UTF handling fix and cleanups
@ 2018-04-17 12:55 Jan Kara
  2018-04-17 12:55 ` [PATCH 1/7] udf: Fix leak of UTF-16 surrogates into encoded strings Jan Kara
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: Jan Kara @ 2018-04-17 12:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: Andrew Gabbasov, arthur200126, Jan Kara

Hello,

the first patch in the series fixes a possible leak of UTF-16 surrogates into
the resulting UTF-8 string decoded from on-disk names. The rest of the series
cleans up the Unicode conversion functions somewhat and adds support for
full encoding and decoding of UTF-16 characters outside of the Basic
Multilingual Plane (characters needing more than one UTF-16 code unit). Review
and testing are welcome.

								Honza

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/7] udf: Fix leak of UTF-16 surrogates into encoded strings
  2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
@ 2018-04-17 12:55 ` Jan Kara
  2018-04-17 12:55 ` [PATCH 2/7] udf: Always require NLS support Jan Kara
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Jan Kara @ 2018-04-17 12:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: Andrew Gabbasov, arthur200126, Jan Kara, stable, #

The OSTA UDF specification does not mention whether the CS0 charset in case
of two bytes per character encoding should be treated as UTF-16 or
UCS-2. The sample code in the standard does not treat UTF-16 surrogates
in any special way, but on systems such as Windows which work in UTF-16
internally, filenames would effectively be treated as being in UTF-16.
In Linux it is more difficult to handle characters outside of the Basic
Multilingual Plane (beyond 0xffff) as the NLS framework works with 2-byte
characters only. For now, just make sure we don't leak UTF-16 surrogates
into the resulting string when loading names from the filesystem.

CC: stable@vger.kernel.org # >= v4.6
Reported-by: Mingye Wang <arthur200126@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index f897e55f2cd0..16a8ad21b77e 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,6 +28,9 @@
 
 #include "udf_sb.h"
 
+#define SURROGATE_MASK 0xfffff800
+#define SURROGATE_PAIR 0x0000d800
+
 static int udf_uni2char_utf8(wchar_t uni,
 			     unsigned char *out,
 			     int boundlen)
@@ -37,6 +40,9 @@ static int udf_uni2char_utf8(wchar_t uni,
 	if (boundlen <= 0)
 		return -ENAMETOOLONG;
 
+	if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
+		return -EINVAL;
+
 	if (uni < 0x80) {
 		out[u_len++] = (unsigned char)uni;
 	} else if (uni < 0x800) {
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/7] udf: Always require NLS support
  2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
  2018-04-17 12:55 ` [PATCH 1/7] udf: Fix leak of UTF-16 surrogates into encoded strings Jan Kara
@ 2018-04-17 12:55 ` Jan Kara
  2018-04-17 12:55 ` [PATCH 3/7] udf: Use UTF-32 <-> UTF-8 conversion functions from NLS Jan Kara
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Jan Kara @ 2018-04-17 12:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: Andrew Gabbasov, arthur200126, Jan Kara

UDF needs to convert strings between the OSTA CS0 charset and standard UTF-8.
Currently we implement our own UTF-16 <-> UTF-8 translations, which is
unnecessary code duplication. Always select NLS so that we can use the
translation functions from there.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/Kconfig | 6 +-----
 fs/udf/super.c | 8 --------
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index c6e17a744c3b..aa415054ad0a 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,7 @@
 config UDF_FS
 	tristate "UDF file system support"
 	select CRC_ITU_T
+	select NLS
 	help
 	  This is a file system used on some CD-ROMs and DVDs. Since the
 	  file system is supported by multiple operating systems and is more
@@ -13,8 +14,3 @@ config UDF_FS
 	  module will be called udf.
 
 	  If unsure, say N.
-
-config UDF_NLS
-	bool
-	default y
-	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7949c338efa5..37d2565a7f78 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -572,7 +572,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 		case Opt_utf8:
 			uopt->flags |= (1 << UDF_FLAG_UTF8);
 			break;
-#ifdef CONFIG_UDF_NLS
 		case Opt_iocharset:
 			if (!remount) {
 				if (uopt->nls_map)
@@ -581,7 +580,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 				uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
 			}
 			break;
-#endif
 		case Opt_uforget:
 			uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
 			break;
@@ -2117,7 +2115,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		udf_err(sb, "utf8 cannot be combined with iocharset\n");
 		goto parse_options_failure;
 	}
-#ifdef CONFIG_UDF_NLS
 	if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) {
 		uopt.nls_map = load_nls_default();
 		if (!uopt.nls_map)
@@ -2125,7 +2122,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		else
 			udf_debug("Using default NLS map\n");
 	}
-#endif
 	if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP)))
 		uopt.flags |= (1 << UDF_FLAG_UTF8);
 
@@ -2279,10 +2275,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 error_out:
 	iput(sbi->s_vat_inode);
 parse_options_failure:
-#ifdef CONFIG_UDF_NLS
 	if (uopt.nls_map)
 		unload_nls(uopt.nls_map);
-#endif
 	if (lvid_open)
 		udf_close_lvid(sb);
 	brelse(sbi->s_lvid_bh);
@@ -2332,10 +2326,8 @@ static void udf_put_super(struct super_block *sb)
 	sbi = UDF_SB(sb);
 
 	iput(sbi->s_vat_inode);
-#ifdef CONFIG_UDF_NLS
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
 		unload_nls(sbi->s_nls_map);
-#endif
 	if (!sb_rdonly(sb))
 		udf_close_lvid(sb);
 	brelse(sbi->s_lvid_bh);
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/7] udf: Use UTF-32 <-> UTF-8 conversion functions from NLS
  2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
  2018-04-17 12:55 ` [PATCH 1/7] udf: Fix leak of UTF-16 surrogates into encoded strings Jan Kara
  2018-04-17 12:55 ` [PATCH 2/7] udf: Always require NLS support Jan Kara
@ 2018-04-17 12:55 ` Jan Kara
  2018-04-17 12:55 ` [PATCH 4/7] udf: Convert ident strings to proper charset Jan Kara
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Jan Kara @ 2018-04-17 12:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: Andrew Gabbasov, arthur200126, Jan Kara

Instead of implementing our own functions converting to and from UTF-8,
use the ones provided by NLS.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 80 ++++++++++++--------------------------------------------
 1 file changed, 17 insertions(+), 63 deletions(-)

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 16a8ad21b77e..18df831afd3d 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,6 +28,7 @@
 
 #include "udf_sb.h"
 
+#define UNICODE_MAX 0x10ffff
 #define SURROGATE_MASK 0xfffff800
 #define SURROGATE_PAIR 0x0000d800
 
@@ -40,22 +41,12 @@ static int udf_uni2char_utf8(wchar_t uni,
 	if (boundlen <= 0)
 		return -ENAMETOOLONG;
 
-	if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
-		return -EINVAL;
-
-	if (uni < 0x80) {
-		out[u_len++] = (unsigned char)uni;
-	} else if (uni < 0x800) {
-		if (boundlen < 2)
-			return -ENAMETOOLONG;
-		out[u_len++] = (unsigned char)(0xc0 | (uni >> 6));
-		out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
-	} else {
-		if (boundlen < 3)
-			return -ENAMETOOLONG;
-		out[u_len++] = (unsigned char)(0xe0 | (uni >> 12));
-		out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
-		out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
+	u_len = utf32_to_utf8(uni, out, boundlen);
+	if (u_len < 0) {
+		if (uni > UNICODE_MAX ||
+		    (uni & SURROGATE_MASK) == SURROGATE_PAIR)
+			return -EINVAL;
+		return -ENAMETOOLONG;
 	}
 	return u_len;
 }
@@ -64,56 +55,19 @@ static int udf_char2uni_utf8(const unsigned char *in,
 			     int boundlen,
 			     wchar_t *uni)
 {
-	unsigned int utf_char;
-	unsigned char c;
-	int utf_cnt, u_len;
-
-	utf_char = 0;
-	utf_cnt = 0;
-	for (u_len = 0; u_len < boundlen;) {
-		c = in[u_len++];
-
-		/* Complete a multi-byte UTF-8 character */
-		if (utf_cnt) {
-			utf_char = (utf_char << 6) | (c & 0x3f);
-			if (--utf_cnt)
-				continue;
-		} else {
-			/* Check for a multi-byte UTF-8 character */
-			if (c & 0x80) {
-				/* Start a multi-byte UTF-8 character */
-				if ((c & 0xe0) == 0xc0) {
-					utf_char = c & 0x1f;
-					utf_cnt = 1;
-				} else if ((c & 0xf0) == 0xe0) {
-					utf_char = c & 0x0f;
-					utf_cnt = 2;
-				} else if ((c & 0xf8) == 0xf0) {
-					utf_char = c & 0x07;
-					utf_cnt = 3;
-				} else if ((c & 0xfc) == 0xf8) {
-					utf_char = c & 0x03;
-					utf_cnt = 4;
-				} else if ((c & 0xfe) == 0xfc) {
-					utf_char = c & 0x01;
-					utf_cnt = 5;
-				} else {
-					utf_cnt = -1;
-					break;
-				}
-				continue;
-			} else {
-				/* Single byte UTF-8 character (most common) */
-				utf_char = c;
-			}
-		}
-		*uni = utf_char;
-		break;
-	}
-	if (utf_cnt) {
+	int u_len;
+	unicode_t c;
+
+	u_len = utf8_to_utf32(in, boundlen, &c);
+	if (u_len < 0) {
 		*uni = '?';
 		return -EINVAL;
 	}
+
+	if (c > MAX_WCHAR_T)
+		*uni = '?';
+	else
+		*uni = c;
 	return u_len;
 }
 
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/7] udf: Convert ident strings to proper charset
  2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
                   ` (2 preceding siblings ...)
  2018-04-17 12:55 ` [PATCH 3/7] udf: Use UTF-32 <-> UTF-8 conversion functions from NLS Jan Kara
@ 2018-04-17 12:55 ` Jan Kara
  2018-04-17 12:55 ` [PATCH 5/7] udf: Push sb argument to udf_name_[to|from]_CS0() Jan Kara
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Jan Kara @ 2018-04-17 12:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: Andrew Gabbasov, arthur200126, Jan Kara

iocharset= mount option specifies the character set used on *console*
(not on disk). So even dstrings from VRS need to be converted from CS0
to the specified charset and not always UTF-8. This is barely user
visible as those strings are shown only in UDF debug messages.

CC: Andrew Gabbasov <andrew_gabbasov@mentor.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c   |  4 ++--
 fs/udf/udfdecl.h |  3 ++-
 fs/udf/unicode.c | 13 ++++++++++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 37d2565a7f78..0d27d41f5c6e 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -890,14 +890,14 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 #endif
 	}
 
-	ret = udf_dstrCS0toUTF8(outstr, 31, pvoldesc->volIdent, 32);
+	ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32);
 	if (ret < 0)
 		goto out_bh;
 
 	strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
 	udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
 
-	ret = udf_dstrCS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128);
+	ret = udf_dstrCS0toChar(sb, outstr, 127, pvoldesc->volSetIdent, 128);
 	if (ret < 0)
 		goto out_bh;
 
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 68e8a64d22e0..fc8d1b3384d2 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -220,7 +220,8 @@ extern int udf_get_filename(struct super_block *, const uint8_t *, int,
 			    uint8_t *, int);
 extern int udf_put_filename(struct super_block *, const uint8_t *, int,
 			    uint8_t *, int);
-extern int udf_dstrCS0toUTF8(uint8_t *, int, const uint8_t *, int);
+extern int udf_dstrCS0toChar(struct super_block *, uint8_t *, int,
+			     const uint8_t *, int);
 
 /* ialloc.c */
 extern void udf_free_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 18df831afd3d..ad806c3125c1 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -295,9 +295,10 @@ static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
 	return u_len;
 }
 
-int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
+int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
 		      const uint8_t *ocu_i, int i_len)
 {
+	int (*conv_f)(wchar_t, unsigned char *, int);
 	int s_len = 0;
 
 	if (i_len > 0) {
@@ -309,8 +310,14 @@ int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
 		}
 	}
 
-	return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len,
-				 udf_uni2char_utf8, 0);
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
+		conv_f = udf_uni2char_utf8;
+	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
+		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
+	} else
+		BUG();
+
+	return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, conv_f, 0);
 }
 
 int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 5/7] udf: Push sb argument to udf_name_[to|from]_CS0()
  2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
                   ` (3 preceding siblings ...)
  2018-04-17 12:55 ` [PATCH 4/7] udf: Convert ident strings to proper charset Jan Kara
@ 2018-04-17 12:55 ` Jan Kara
  2018-04-17 12:55 ` [PATCH 6/7] udf: Add support for encoding UTF-16 characters Jan Kara
  2018-04-17 12:55 ` [PATCH 7/7] udf: Add support for decoding " Jan Kara
  6 siblings, 0 replies; 8+ messages in thread
From: Jan Kara @ 2018-04-17 12:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: Andrew Gabbasov, arthur200126, Jan Kara

Push superblock argument to udf_name_[to|from]_CS0() functions so that
we can decide about character conversion functions there.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 57 ++++++++++++++++++++++++--------------------------------
 1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index ad806c3125c1..329be783f98a 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -129,9 +129,9 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 	return gotch;
 }
 
-static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
+static int udf_name_from_CS0(struct super_block *sb,
+			     uint8_t *str_o, int str_max_len,
 			     const uint8_t *ocu, int ocu_len,
-			     int (*conv_f)(wchar_t, unsigned char *, int),
 			     int translate)
 {
 	uint32_t c;
@@ -148,6 +148,7 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
 	unsigned short valueCRC;
 	uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
 	uint8_t crc[CRC_LEN];
+	int (*conv_f)(wchar_t, unsigned char *, int);
 
 	if (str_max_len <= 0)
 		return 0;
@@ -157,6 +158,13 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
 		return 0;
 	}
 
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
+		conv_f = udf_uni2char_utf8;
+	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
+		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
+	} else
+		BUG();
+
 	cmp_id = ocu[0];
 	if (cmp_id != 8 && cmp_id != 16) {
 		memset(str_o, 0, str_max_len);
@@ -247,18 +255,26 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
 	return str_o_len;
 }
 
-static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
-			   const uint8_t *str_i, int str_len,
-			   int (*conv_f)(const unsigned char *, int, wchar_t *))
+static int udf_name_to_CS0(struct super_block *sb,
+			   uint8_t *ocu, int ocu_max_len,
+			   const uint8_t *str_i, int str_len)
 {
 	int i, len;
 	unsigned int max_val;
 	wchar_t uni_char;
 	int u_len, u_ch;
+	int (*conv_f)(const unsigned char *, int, wchar_t *);
 
 	if (ocu_max_len <= 0)
 		return 0;
 
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
+		conv_f = udf_char2uni_utf8;
+	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
+		conv_f = UDF_SB(sb)->s_nls_map->char2uni;
+	} else
+		BUG();
+
 	memset(ocu, 0, ocu_max_len);
 	ocu[0] = 8;
 	max_val = 0xff;
@@ -298,7 +314,6 @@ static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
 int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
 		      const uint8_t *ocu_i, int i_len)
 {
-	int (*conv_f)(wchar_t, unsigned char *, int);
 	int s_len = 0;
 
 	if (i_len > 0) {
@@ -310,20 +325,12 @@ int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
 		}
 	}
 
-	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		conv_f = udf_uni2char_utf8;
-	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
-		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
-	} else
-		BUG();
-
-	return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, conv_f, 0);
+	return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0);
 }
 
 int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
 		     uint8_t *dname, int dlen)
 {
-	int (*conv_f)(wchar_t, unsigned char *, int);
 	int ret;
 
 	if (!slen)
@@ -332,14 +339,7 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
 	if (dlen <= 0)
 		return 0;
 
-	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		conv_f = udf_uni2char_utf8;
-	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
-		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
-	} else
-		BUG();
-
-	ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
+	ret = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1);
 	/* Zero length filename isn't valid... */
 	if (ret == 0)
 		ret = -EINVAL;
@@ -349,15 +349,6 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
 int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
 		     uint8_t *dname, int dlen)
 {
-	int (*conv_f)(const unsigned char *, int, wchar_t *);
-
-	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		conv_f = udf_char2uni_utf8;
-	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
-		conv_f = UDF_SB(sb)->s_nls_map->char2uni;
-	} else
-		BUG();
-
-	return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
+	return udf_name_to_CS0(sb, dname, dlen, sname, slen);
 }
 
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 6/7] udf: Add support for encoding UTF-16 characters
  2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
                   ` (4 preceding siblings ...)
  2018-04-17 12:55 ` [PATCH 5/7] udf: Push sb argument to udf_name_[to|from]_CS0() Jan Kara
@ 2018-04-17 12:55 ` Jan Kara
  2018-04-17 12:55 ` [PATCH 7/7] udf: Add support for decoding " Jan Kara
  6 siblings, 0 replies; 8+ messages in thread
From: Jan Kara @ 2018-04-17 12:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: Andrew Gabbasov, arthur200126, Jan Kara

Add support for storing characters outside of the Basic Multilingual Plane
(encoded as UTF-16 surrogate pairs) in the CS0 encoding of UDF.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 79 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 36 deletions(-)

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 329be783f98a..616ffee441c5 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,9 +28,13 @@
 
 #include "udf_sb.h"
 
+#define PLANE_SIZE 0x10000
 #define UNICODE_MAX 0x10ffff
 #define SURROGATE_MASK 0xfffff800
 #define SURROGATE_PAIR 0x0000d800
+#define SURROGATE_LOW  0x00000400
+#define SURROGATE_CHAR_BITS 10
+#define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)
 
 static int udf_uni2char_utf8(wchar_t uni,
 			     unsigned char *out,
@@ -51,26 +55,6 @@ static int udf_uni2char_utf8(wchar_t uni,
 	return u_len;
 }
 
-static int udf_char2uni_utf8(const unsigned char *in,
-			     int boundlen,
-			     wchar_t *uni)
-{
-	int u_len;
-	unicode_t c;
-
-	u_len = utf8_to_utf32(in, boundlen, &c);
-	if (u_len < 0) {
-		*uni = '?';
-		return -EINVAL;
-	}
-
-	if (c > MAX_WCHAR_T)
-		*uni = '?';
-	else
-		*uni = c;
-	return u_len;
-}
-
 #define ILLEGAL_CHAR_MARK	'_'
 #define EXT_MARK		'.'
 #define CRC_MARK		'#'
@@ -261,19 +245,17 @@ static int udf_name_to_CS0(struct super_block *sb,
 {
 	int i, len;
 	unsigned int max_val;
-	wchar_t uni_char;
 	int u_len, u_ch;
+	unicode_t uni_char;
 	int (*conv_f)(const unsigned char *, int, wchar_t *);
 
 	if (ocu_max_len <= 0)
 		return 0;
 
-	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		conv_f = udf_char2uni_utf8;
-	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
 		conv_f = UDF_SB(sb)->s_nls_map->char2uni;
-	} else
-		BUG();
+	else
+		conv_f = NULL;
 
 	memset(ocu, 0, ocu_max_len);
 	ocu[0] = 8;
@@ -282,30 +264,55 @@ static int udf_name_to_CS0(struct super_block *sb,
 
 try_again:
 	u_len = 1;
-	for (i = 0; i < str_len; i++) {
+	for (i = 0; i < str_len; i += len) {
 		/* Name didn't fit? */
 		if (u_len + u_ch > ocu_max_len)
 			return 0;
-		len = conv_f(&str_i[i], str_len - i, &uni_char);
-		if (!len)
-			continue;
+		if (conv_f) {
+			wchar_t wchar;
+
+			len = conv_f(&str_i[i], str_len - i, &wchar);
+			if (len > 0)
+				uni_char = wchar;
+		} else {
+			len = utf8_to_utf32(&str_i[i], str_len - i,
+					    &uni_char);
+		}
 		/* Invalid character, deal with it */
-		if (len < 0) {
+		if (len <= 0 || uni_char > UNICODE_MAX) {
 			len = 1;
 			uni_char = '?';
 		}
 
 		if (uni_char > max_val) {
-			max_val = 0xffff;
-			ocu[0] = 0x10;
-			u_ch = 2;
-			goto try_again;
+			unicode_t c;
+
+			if (max_val == 0xff) {
+				max_val = 0xffff;
+				ocu[0] = 0x10;
+				u_ch = 2;
+				goto try_again;
+			}
+			/*
+			 * Use UTF-16 surrogate pairs for chars that we
+			 * cannot encode directly.
+			 */
+			if (u_len + 2 * u_ch > ocu_max_len)
+				return 0;
+
+			uni_char -= PLANE_SIZE;
+			c = SURROGATE_PAIR |
+			    ((uni_char >> SURROGATE_CHAR_BITS) &
+			     SURROGATE_CHAR_MASK);
+			ocu[u_len++] = (uint8_t)(c >> 8);
+			ocu[u_len++] = (uint8_t)(c & 0xff);
+			uni_char = SURROGATE_PAIR | SURROGATE_LOW |
+					(uni_char & SURROGATE_CHAR_MASK);
 		}
 
 		if (max_val == 0xffff)
 			ocu[u_len++] = (uint8_t)(uni_char >> 8);
 		ocu[u_len++] = (uint8_t)(uni_char & 0xff);
-		i += len - 1;
 	}
 
 	return u_len;
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 7/7] udf: Add support for decoding UTF-16 characters
  2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
                   ` (5 preceding siblings ...)
  2018-04-17 12:55 ` [PATCH 6/7] udf: Add support for encoding UTF-16 characters Jan Kara
@ 2018-04-17 12:55 ` Jan Kara
  6 siblings, 0 replies; 8+ messages in thread
From: Jan Kara @ 2018-04-17 12:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: Andrew Gabbasov, arthur200126, Jan Kara

Add support for decoding characters outside of the Basic Multilingual Plane
that are encoded as UTF-16 surrogate pairs in the CS0 charset of UDF.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 103 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 68 insertions(+), 35 deletions(-)

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 616ffee441c5..45234791fec2 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -36,25 +36,6 @@
 #define SURROGATE_CHAR_BITS 10
 #define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)
 
-static int udf_uni2char_utf8(wchar_t uni,
-			     unsigned char *out,
-			     int boundlen)
-{
-	int u_len = 0;
-
-	if (boundlen <= 0)
-		return -ENAMETOOLONG;
-
-	u_len = utf32_to_utf8(uni, out, boundlen);
-	if (u_len < 0) {
-		if (uni > UNICODE_MAX ||
-		    (uni & SURROGATE_MASK) == SURROGATE_PAIR)
-			return -EINVAL;
-		return -ENAMETOOLONG;
-	}
-	return u_len;
-}
-
 #define ILLEGAL_CHAR_MARK	'_'
 #define EXT_MARK		'.'
 #define CRC_MARK		'#'
@@ -62,6 +43,50 @@ static int udf_uni2char_utf8(wchar_t uni,
 /* Number of chars we need to store generated CRC to make filename unique */
 #define CRC_LEN			5
 
+static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len,
+				int str_i_idx, int u_ch, unicode_t *ret)
+{
+	unicode_t c;
+	int start_idx = str_i_idx;
+
+	/* Expand OSTA compressed Unicode to Unicode */
+	c = str_i[str_i_idx++];
+	if (u_ch > 1)
+		c = (c << 8) | str_i[str_i_idx++];
+	if ((c & SURROGATE_MASK) == SURROGATE_PAIR) {
+		unicode_t next;
+
+		/* Trailing surrogate char */
+		if (str_i_idx >= str_i_max_len) {
+			c = UNICODE_MAX + 1;
+			goto out;
+		}
+
+		/* Low surrogate must follow the high one... */
+		if (c & SURROGATE_LOW) {
+			c = UNICODE_MAX + 1;
+			goto out;
+		}
+
+		WARN_ON_ONCE(u_ch != 2);
+		next = str_i[str_i_idx++] << 8;
+		next |= str_i[str_i_idx++];
+		if ((next & SURROGATE_MASK) != SURROGATE_PAIR ||
+		    !(next & SURROGATE_LOW)) {
+			c = UNICODE_MAX + 1;
+			goto out;
+		}
+
+		c = PLANE_SIZE +
+		    ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) +
+		    (next & SURROGATE_CHAR_MASK);
+	}
+out:
+	*ret = c;
+	return str_i_idx - start_idx;
+}
+
+
 static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 			      int *str_o_idx,
 			      const uint8_t *str_i, int str_i_max_len,
@@ -70,27 +95,29 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 			      int (*conv_f)(wchar_t, unsigned char *, int),
 			      int translate)
 {
-	uint32_t c;
+	unicode_t c;
 	int illChar = 0;
 	int len, gotch = 0;
 
-	for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) {
+	while (!gotch && *str_i_idx < str_i_max_len) {
 		if (*str_o_idx >= str_o_max_len) {
 			*needsCRC = 1;
 			return gotch;
 		}
 
-		/* Expand OSTA compressed Unicode to Unicode */
-		c = str_i[*str_i_idx];
-		if (u_ch > 1)
-			c = (c << 8) | str_i[*str_i_idx + 1];
-
-		if (translate && (c == '/' || c == 0))
+		len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch,
+				     &c);
+		/* These chars cannot be converted. Replace them. */
+		if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) ||
+		    (translate && c == '/')) {
 			illChar = 1;
-		else if (illChar)
+			if (!translate)
+				gotch = 1;
+		} else if (illChar)
 			break;
 		else
 			gotch = 1;
+		*str_i_idx += len;
 	}
 	if (illChar) {
 		*needsCRC = 1;
@@ -98,7 +125,15 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 		gotch = 1;
 	}
 	if (gotch) {
-		len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
+		if (conv_f) {
+			len = conv_f(c, &str_o[*str_o_idx],
+				     str_o_max_len - *str_o_idx);
+		} else {
+			len = utf32_to_utf8(c, &str_o[*str_o_idx],
+					    str_o_max_len - *str_o_idx);
+			if (len < 0)
+				len = -ENAMETOOLONG;
+		}
 		/* Valid character? */
 		if (len >= 0)
 			*str_o_idx += len;
@@ -106,7 +141,7 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
 			*needsCRC = 1;
 			gotch = 0;
 		} else {
-			str_o[(*str_o_idx)++] = '?';
+			str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK;
 			*needsCRC = 1;
 		}
 	}
@@ -142,12 +177,10 @@ static int udf_name_from_CS0(struct super_block *sb,
 		return 0;
 	}
 
-	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		conv_f = udf_uni2char_utf8;
-	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
 		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
-	} else
-		BUG();
+	else
+		conv_f = NULL;
 
 	cmp_id = ocu[0];
 	if (cmp_id != 8 && cmp_id != 16) {
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-04-17 12:55 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-17 12:55 [PATCH 0/7] udf: UTF handling fix and cleanups Jan Kara
2018-04-17 12:55 ` [PATCH 1/7] udf: Fix leak of UTF-16 surrogates into encoded strings Jan Kara
2018-04-17 12:55 ` [PATCH 2/7] udf: Always require NLS support Jan Kara
2018-04-17 12:55 ` [PATCH 3/7] udf: Use UTF-32 <-> UTF-8 conversion functions from NLS Jan Kara
2018-04-17 12:55 ` [PATCH 4/7] udf: Convert ident strings to proper charset Jan Kara
2018-04-17 12:55 ` [PATCH 5/7] udf: Push sb argument to udf_name_[to|from]_CS0() Jan Kara
2018-04-17 12:55 ` [PATCH 6/7] udf: Add support for encoding UTF-16 characters Jan Kara
2018-04-17 12:55 ` [PATCH 7/7] udf: Add support for decoding " Jan Kara

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).