From: "Pali Rohár" <pali@kernel.org>
To: linux-fsdevel@vger.kernel.org,
linux-ntfs-dev@lists.sourceforge.net, linux-cifs@vger.kernel.org,
jfs-discussion@lists.sourceforge.net,
linux-kernel@vger.kernel.org,
Alexander Viro <viro@zeniv.linux.org.uk>, Jan Kara <jack@suse.cz>,
"Theodore Y . Ts'o" <tytso@mit.edu>,
Anton Altaparmakov <anton@tuxera.com>,
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>,
Luis de Bethencourt <luisbg@kernel.org>,
Salah Triki <salah.triki@gmail.com>,
Steve French <sfrench@samba.org>, Paulo Alcantara <pc@cjr.nz>,
Ronnie Sahlberg <lsahlber@redhat.com>,
Shyam Prasad N <sprasad@microsoft.com>,
Tom Talpey <tom@talpey.com>, Dave Kleikamp <shaggy@kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Pavel Machek <pavel@ucw.cz>,
Christoph Hellwig <hch@infradead.org>,
Kari Argillander <kari.argillander@gmail.com>,
Viacheslav Dubeyko <slava@dubeyko.com>
Subject: [RFC PATCH v2 16/18] cifs: Do not use broken utf8 NLS table for iocharset=utf8 mount option
Date: Mon, 26 Dec 2022 15:21:48 +0100 [thread overview]
Message-ID: <20221226142150.13324-17-pali@kernel.org> (raw)
In-Reply-To: <20221226142150.13324-1-pali@kernel.org>
NLS table for utf8 is broken and cannot be fixed.
So instead of the broken utf8 NLS functions char2uni() and uni2char(), use
the functions utf8s_to_utf16s() and utf16s_to_utf8s(), which implement correct
conversion between UTF-16 and UTF-8.
When iocharset=utf8 is used, set ctx->iocharset to NULL and use that to
distinguish whether the NLS table or the native UTF-8 functions should
be used.
Signed-off-by: Pali Rohár <pali@kernel.org>
---
fs/cifs/cifs_unicode.c | 128 +++++++++++++++++++++++++++--------------
fs/cifs/cifs_unicode.h | 2 +-
fs/cifs/cifsfs.c | 2 +
fs/cifs/connect.c | 8 ++-
fs/cifs/dir.c | 28 +++++++--
fs/cifs/winucase.c | 14 +++--
6 files changed, 124 insertions(+), 58 deletions(-)
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index e7582dd79179..94b861e666e3 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -130,20 +130,17 @@ cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
convert_sfu_char(src_char, target))
return len;
- /* if character not one of seven in special remap set */
- len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
- if (len <= 0)
- goto surrogate_pair;
-
- return len;
+ if (cp) {
+ /* if character not one of seven in special remap set */
+ len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
+ if (len <= 0)
+ goto unknown;
+ } else {
+ len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
+ if (len <= 0)
+ goto unknown;
+ }
-surrogate_pair:
- /* convert SURROGATE_PAIR and IVS */
- if (strcmp(cp->charset, "utf8"))
- goto unknown;
- len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
- if (len <= 0)
- goto unknown;
return len;
unknown:
@@ -239,6 +236,37 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
return outlen;
}
+static int cifs_utf8s_to_utf16s(const char *s, int inlen, __le16 *pwcs)
+{
+ __le16 *op;
+ int size;
+ unicode_t u;
+
+ op = pwcs;
+ while (inlen > 0 && *s) {
+ if (*s & 0x80) {
+ size = utf8_to_utf32(s, inlen, &u);
+ if (size <= 0) {
+ u = 0x003f; /* A question mark */
+ size = 1;
+ }
+ s += size;
+ inlen -= size;
+ if (u >= 0x10000) {
+ u -= 0x10000;
+ *op++ = __cpu_to_le16(0xd800 | ((u >> 10) & 0x03ff));
+ *op++ = __cpu_to_le16(0xdc00 | (u & 0x03ff));
+ } else {
+ *op++ = __cpu_to_le16(u);
+ }
+ } else {
+ *op++ = __cpu_to_le16(*s++);
+ inlen--;
+ }
+ }
+ return op - pwcs;
+}
+
/*
* NAME: cifs_strtoUTF16()
*
@@ -254,24 +282,14 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
wchar_t wchar_to; /* needed to quiet sparse */
/* special case for utf8 to handle no plane0 chars */
- if (!strcmp(codepage->charset, "utf8")) {
+ if (!codepage) {
/*
* convert utf8 -> utf16, we assume we have enough space
* as caller should have assumed conversion does not overflow
- * in destination len is length in wchar_t units (16bits)
- */
- i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
- (wchar_t *) to, len);
-
- /* if success terminate and exit */
- if (i >= 0)
- goto success;
- /*
- * if fails fall back to UCS encoding as this
- * function should not return negative values
- * currently can fail only if source contains
- * invalid encoded characters
+ * in destination len is length in __le16 units
*/
+ i = cifs_utf8s_to_utf16s(from, len, to);
+ goto success;
}
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
@@ -502,25 +520,29 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
* as they use backslash as separator.
*/
if (dst_char == 0) {
- charlen = cp->char2uni(source + i, srclen - i, &tmp);
- dst_char = cpu_to_le16(tmp);
-
- /*
- * if no match, use question mark, which at least in
- * some cases serves as wild card
- */
- if (charlen > 0)
- goto ctoUTF16;
-
- /* convert SURROGATE_PAIR */
- if (strcmp(cp->charset, "utf8") || !wchar_to)
- goto unknown;
- if (*(source + i) & 0x80) {
- charlen = utf8_to_utf32(source + i, 6, &u);
- if (charlen < 0)
+ if (cp) {
+ charlen = cp->char2uni(source + i, srclen - i, &tmp);
+ dst_char = cpu_to_le16(tmp);
+
+ /*
+ * if no match, use question mark, which at least in
+ * some cases serves as wild card
+ */
+ if (charlen > 0)
+ goto ctoUTF16;
+ else
goto unknown;
- } else
+ }
+
+ /* UTF-8 to UTF-16 conversion */
+
+ if (!wchar_to)
goto unknown;
+
+ charlen = utf8_to_utf32(source + i, 6, &u);
+ if (charlen < 0)
+ goto unknown;
+
ret = utf8s_to_utf16s(source + i, charlen,
UTF16_LITTLE_ENDIAN,
wchar_to, 6);
@@ -589,8 +611,26 @@ cifs_local_to_utf16_bytes(const char *from, int len,
{
int charlen;
int i;
+ int outlen;
+ unicode_t u_to;
wchar_t wchar_to;
+ if (!codepage) {
+ outlen = 0;
+ for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
+ charlen = utf8_to_utf32(from, len, &u_to);
+ /* Failed conversion defaults to a question mark */
+ if (charlen < 1) {
+ charlen = 1;
+ outlen += 2;
+ } else if (u_to <= 0xFFFF)
+ outlen += 2;
+ else
+ outlen += 4;
+ }
+ return outlen;
+ }
+
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
charlen = codepage->char2uni(from, len, &wchar_to);
/* Failed conversion defaults to a question mark */
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 80b3d845419f..b9a3290faaf7 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -106,7 +106,7 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
int remap);
#endif
-wchar_t cifs_toupper(wchar_t in);
+unicode_t cifs_toupper(unicode_t in);
/*
* UniStrcat: Concatenate the second string to the first
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 10e00c624922..1537bc8bb698 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -591,6 +591,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
cifs_sb->ctx->dir_mode);
if (cifs_sb->ctx->iocharset)
seq_printf(s, ",iocharset=%s", cifs_sb->ctx->iocharset);
+ else
+ seq_puts(s, ",iocharset=utf8");
if (tcon->seal)
seq_puts(s, ",seal");
else if (tcon->ses->server->ignore_signature)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d371259d6808..fb841a7baef6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2676,7 +2676,11 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
old->ctx->dir_mode != new->ctx->dir_mode)
return 0;
- if (strcmp(old->local_nls->charset, new->local_nls->charset))
+ if (old->local_nls && !new->local_nls)
+ return 0;
+ if (!old->local_nls && new->local_nls)
+ return 0;
+ if (old->local_nls && new->local_nls && strcmp(old->local_nls->charset, new->local_nls->charset))
return 0;
if (old->ctx->acregmax != new->ctx->acregmax)
@@ -3162,7 +3166,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
if (ctx->iocharset == NULL) {
/* load_nls_default cannot return null */
cifs_sb->local_nls = load_nls_default();
- } else {
+ } else if (strcmp(ctx->iocharset, "utf8") != 0) {
cifs_sb->local_nls = load_nls(ctx->iocharset);
if (cifs_sb->local_nls == NULL) {
cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n",
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ad4208bf1e32..83deba65e188 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -804,16 +804,22 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
{
struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
unsigned long hash;
+ unicode_t u;
wchar_t c;
int i, charlen;
hash = init_name_hash(dentry);
for (i = 0; i < q->len; i += charlen) {
- charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+ if (codepage) {
+ charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+ if (likely(charlen > 0))
+ u = c;
+ } else
+ charlen = utf8_to_utf32(&q->name[i], q->len - i, &u);
/* error out if we can't convert the character */
if (unlikely(charlen < 0))
return charlen;
- hash = partial_name_hash(cifs_toupper(c), hash);
+ hash = partial_name_hash(cifs_toupper(u), hash);
}
q->hash = end_name_hash(hash);
@@ -824,6 +830,7 @@ static int cifs_ci_compare(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
+ unicode_t u1, u2;
wchar_t c1, c2;
int i, l1, l2;
@@ -837,9 +844,18 @@ static int cifs_ci_compare(const struct dentry *dentry,
return 1;
for (i = 0; i < len; i += l1) {
- /* Convert characters in both strings to UTF-16. */
- l1 = codepage->char2uni(&str[i], len - i, &c1);
- l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+ /* Convert characters in both strings to UTF-32. */
+ if (codepage) {
+ l1 = codepage->char2uni(&str[i], len - i, &c1);
+ l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+ if (likely(l1 > 0))
+ u1 = c1;
+ if (likely(l2 > 0))
+ u2 = c2;
+ } else {
+ l1 = utf8_to_utf32(&str[i], len - i, &u1);
+ l2 = utf8_to_utf32(&name->name[i], name->len - i, &u2);
+ }
/*
* If we can't convert either character, just declare it to
@@ -860,7 +876,7 @@ static int cifs_ci_compare(const struct dentry *dentry,
return 1;
/* Now compare uppercase versions of these characters */
- if (cifs_toupper(c1) != cifs_toupper(c2))
+ if (cifs_toupper(u1) != cifs_toupper(u2))
return 1;
}
diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c
index 2f075b5b50df..b3647b35a7e1 100644
--- a/fs/cifs/winucase.c
+++ b/fs/cifs/winucase.c
@@ -17,7 +17,7 @@
#include <linux/nls.h>
-wchar_t cifs_toupper(wchar_t in); /* quiet sparse */
+unicode_t cifs_toupper(unicode_t in); /* quiet sparse */
static const wchar_t t2_00[256] = {
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
@@ -615,20 +615,24 @@ static const wchar_t *const toplevel[256] = {
};
/**
- * cifs_toupper - convert a wchar_t from lower to uppercase
+ * cifs_toupper - convert a unicode_t from lower to uppercase
* @in: character to convert from lower to uppercase
*
- * This function consults the static tables above to convert a wchar_t from
+ * This function consults the static tables above to convert a unicode_t from
* lower to uppercase. In the event that there is no mapping, the original
* "in" character is returned.
*/
-wchar_t
-cifs_toupper(wchar_t in)
+unicode_t
+cifs_toupper(unicode_t in)
{
unsigned char idx;
const wchar_t *tbl;
wchar_t out;
+ /* cifs_toupper table has only defines for plane-0 */
+ if (in > 0xffff)
+ return in;
+
/* grab upper byte */
idx = (in & 0xff00) >> 8;
--
2.20.1
next prev parent reply other threads:[~2022-12-26 14:23 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-12-26 14:21 [RFC PATCH v2 00/18] fs: Remove usage of broken nls_utf8 and drop it Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 01/18] fat: Fix iocharset=utf8 mount option Pali Rohár
2023-01-10 9:17 ` OGAWA Hirofumi
2023-02-04 10:57 ` Pali Rohár
2023-02-08 10:10 ` OGAWA Hirofumi
2022-12-26 14:21 ` [RFC PATCH v2 02/18] hfsplus: Add iocharset= mount option as alias for nls= Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 03/18] ntfs: Undeprecate iocharset= mount option Pali Rohár
2023-01-01 19:02 ` Kari Argillander
2023-01-01 19:06 ` Pali Rohár
2023-01-01 23:02 ` Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 04/18] ntfs: Fix error processing when load_nls() fails Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 05/18] befs: Fix printing iocharset= mount option Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 06/18] befs: Rename enum value Opt_charset to Opt_iocharset to match " Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 07/18] befs: Fix error processing when load_nls() fails Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 08/18] befs: Allow to use native UTF-8 mode Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 09/18] hfs: Explicitly set hsb->nls_disk when hsb->nls_io is set Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 10/18] hfs: Do not use broken utf8 NLS table for iocharset=utf8 mount option Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 11/18] hfsplus: " Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 12/18] jfs: Remove custom iso8859-1 implementation Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 13/18] jfs: Fix buffer overflow in jfs_strfromUCS_le() function Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 14/18] jfs: Do not use broken utf8 NLS table for iocharset=utf8 mount option Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 15/18] ntfs: " Pali Rohár
2022-12-26 14:21 ` Pali Rohár [this message]
2022-12-26 14:21 ` [RFC PATCH v2 17/18] cifs: Remove usage of load_nls_default() calls Pali Rohár
2022-12-26 14:21 ` [RFC PATCH v2 18/18] nls: Drop broken nls_utf8 module Pali Rohár
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20221226142150.13324-17-pali@kernel.org \
--to=pali@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=anton@tuxera.com \
--cc=hch@infradead.org \
--cc=hirofumi@mail.parknet.co.jp \
--cc=jack@suse.cz \
--cc=jfs-discussion@lists.sourceforge.net \
--cc=kari.argillander@gmail.com \
--cc=linux-cifs@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-ntfs-dev@lists.sourceforge.net \
--cc=lsahlber@redhat.com \
--cc=luisbg@kernel.org \
--cc=pavel@ucw.cz \
--cc=pc@cjr.nz \
--cc=salah.triki@gmail.com \
--cc=sfrench@samba.org \
--cc=shaggy@kernel.org \
--cc=slava@dubeyko.com \
--cc=sprasad@microsoft.com \
--cc=tom@talpey.com \
--cc=tytso@mit.edu \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).