All of lore.kernel.org
 help / color / mirror / Atom feed
From: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
To: linux-fsdevel@vger.kernel.org
Cc: jra@google.com, tytso@mit.edu, olaf@sgi.com,
	darrick.wong@oracle.com, kernel@lists.collabora.co.uk,
	Gabriel Krisman Bertazi <krisman@collabora.co.uk>
Subject: [PATCH 14/15] nls: utf8norm: Integrate utf8norm code with NLS subsystem
Date: Wed,  9 May 2018 03:55:26 -0300	[thread overview]
Message-ID: <20180509065527.4177-5-krisman@collabora.co.uk> (raw)
In-Reply-To: <20180509065527.4177-1-krisman@collabora.co.uk>

Changes since RFC v2:
  - Integrate with NLS

Changes since RFC v1:
  - Change error return code from EIO to EINVAL. (Olaf Weber)
  - Fix issues with strncmp/strcmp.  (Olaf Weber)
  - Remove stack buffer in normalization/casefold. (Olaf Weber)
  - Include length parameter for second string on comparison functions.
  - Change length type to size_t.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
---
 fs/nls/Makefile         |   2 +-
 fs/nls/nls_utf8n-core.c | 291 ++++++++++++++++++++++++++++++++++++++++
 fs/nls/nls_utf8n-norm.c |   6 +
 fs/nls/utf8n.h          |   1 +
 4 files changed, 299 insertions(+), 1 deletion(-)
 create mode 100644 fs/nls/nls_utf8n-core.c

diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index d7735173117f..0d05674707dd 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_NLS_MAC_ROMANIAN)  += mac-romanian.o
 obj-$(CONFIG_NLS_MAC_ROMAN)     += mac-roman.o
 obj-$(CONFIG_NLS_MAC_TURKISH)   += mac-turkish.o
 
-nls_utf8n-y += nls_utf8n-norm.o
+nls_utf8n-y += nls_utf8n-norm.o nls_utf8n-core.o
 obj-$(CONFIG_NLS_UTF8_NORMALIZATION) += nls_utf8n.o
 
 $(obj)/nls_utf8n-norm.o: $(obj)/utf8data.h
diff --git a/fs/nls/nls_utf8n-core.c b/fs/nls/nls_utf8n-core.c
new file mode 100644
index 000000000000..56e1dd07047c
--- /dev/null
+++ b/fs/nls/nls_utf8n-core.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2017 Collabora Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/nls.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/string.h>
+
+#include "utf8n.h"
+
+static struct nls_charset utf8norm_info; /* tentative definition; the initializer appears later in this file */
+
+/*
+ * Compare the byte streams that utf8byte() yields for two strings under
+ * @data.  Shared by utf8_strncmp() and utf8_strncasecmp(), which differ
+ * only in the utf8data they select.
+ *
+ * Returns 0 if the streams are identical, 1 if they differ, and -EINVAL
+ * if a cursor cannot be initialized or utf8byte() reports an error.
+ */
+static int utf8_compare(const struct utf8data *data,
+			const unsigned char *str1, size_t len1,
+			const unsigned char *str2, size_t len2)
+{
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+
+	if (utf8ncursor(&cur1, data, str1, len1) < 0)
+		return -EINVAL;
+	if (utf8ncursor(&cur2, data, str2, len2) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);	/* c1 == c2 here, so a 0 ends both streams */
+
+	return 0;
+}
+
+/*
+ * Compare @str1 and @str2 using the utf8nfkdi() data for the table's
+ * version.  The result only tells equal (0) from different (1); it does
+ * not order the strings.  Returns -EINVAL on error.
+ */
+static int utf8_strncmp(const struct nls_table *charset,
+			const unsigned char *str1, size_t len1,
+			const unsigned char *str2, size_t len2)
+{
+	return utf8_compare(utf8nfkdi(charset->version),
+			    str1, len1, str2, len2);
+}
+
+/*
+ * Same as utf8_strncmp(), but using the utf8nfkdicf() data for the
+ * table's version.  Returns 0 if the strings are equal, 1 if they
+ * differ, and -EINVAL on error.
+ */
+static int utf8_strncasecmp(const struct nls_table *charset,
+			    const unsigned char *str1, size_t len1,
+			    const unsigned char *str2, size_t len2)
+{
+	return utf8_compare(utf8nfkdicf(charset->version),
+			    str1, len1, str2, len2);
+}
+
+/*
+ * Fill a kmalloc()ed buffer with the bytes utf8byte() yields for @str
+ * under @data.  On success *@out is set to the NUL-terminated buffer
+ * and the length computed by utf8nlen() is returned; on failure *@out is
+ * left untouched and -EINVAL (bad input) or -ENOMEM is returned.
+ */
+static int utf8_transform(const struct utf8data *data,
+			  const unsigned char *str, size_t len,
+			  unsigned char **out)
+{
+	struct utf8cursor cur;
+	unsigned char *buf;
+	ssize_t nlen, i;
+	int c;
+
+	nlen = utf8nlen(data, str, len);
+	if (nlen < 0 || utf8ncursor(&cur, data, str, len) < 0)
+		return -EINVAL;
+
+	buf = kmalloc(nlen + 1, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
+	for (i = 0, c = 1; c > 0 && i <= nlen; i++) {	/* stay in bounds */
+		c = utf8byte(&cur);
+		buf[i] = c;
+	}
+	if (c) {	/* utf8byte() failed, or no NUL within nlen + 1 bytes */
+		kfree(buf);
+		return -EINVAL;
+	}
+	*out = buf;
+	return nlen;
+}
+
+/* Casefold hook: transform @str with the utf8nfkdicf() data. */
+static int utf8_casefold(const struct nls_table *charset,
+			 const unsigned char *str, size_t len,
+			 unsigned char **folded)
+{
+	return utf8_transform(utf8nfkdicf(charset->version), str, len, folded);
+}
+
+/* Normalize hook: transform @str with the utf8nfkdi() data. */
+static int utf8_normalize(const struct nls_table *charset,
+			  const unsigned char *str,
+			  size_t len, unsigned char **normalization)
+{
+	return utf8_transform(utf8nfkdi(charset->version), str, len,
+			      normalization);
+}
+
+/* Encode @uni as UTF-8 into @out via utf32_to_utf8(); '?' on failure. */
+static int utf8_uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	int written;
+
+	if (boundlen <= 0)	/* no room for even one byte */
+		return -ENAMETOOLONG;
+
+	written = utf32_to_utf8(uni, out, boundlen);
+	if (written >= 0)
+		return written;
+	*out = '?';	/* placeholder when utf32_to_utf8() fails */
+	return -EINVAL;
+}
+
+/* Decode one character from @rawstring into @uni; '?' on failure. */
+static int utf8_char2uni(const unsigned char *rawstring, int boundlen,
+			 wchar_t *uni)
+{
+	unicode_t cp;
+	int used = utf8_to_utf32(rawstring, boundlen, &cp);
+
+	if (used >= 0 && cp <= MAX_WCHAR_T) {
+		*uni = (wchar_t) cp;
+		return used;
+	}
+	*uni = 0x003f;	/* '?': decode failed or value exceeds MAX_WCHAR_T */
+	return -EINVAL;
+}
+
+static unsigned char utf8_tolower(const struct nls_table *table,
+				  unsigned int c)
+{
+	return (unsigned char) c; /* no case mapping: @c narrowed as-is */
+}
+
+static unsigned char utf8_toupper(const struct nls_table *table,
+				  unsigned int c)
+{
+	return (unsigned char) c; /* no case mapping: @c narrowed as-is */
+}
+
+static const struct nls_ops utf8_ops = {	/* set as ->ops by utf8_load_charset() */
+	.uni2char = utf8_uni2char,
+	.char2uni = utf8_char2uni,
+	.lowercase = utf8_tolower,	/* identity */
+	.uppercase = utf8_toupper,	/* identity */
+	.normalize = utf8_normalize,
+	.casefold = utf8_casefold,
+	.strncmp = utf8_strncmp,
+	.strncasecmp = utf8_strncasecmp,
+};
+
+/*
+ * Parse a "maj.min.rev" version string into @maj, @min and @rev.
+ * Returns 0 on success, -ENOMEM if the working copy of @version cannot
+ * be allocated, and -EINVAL if the string does not match.
+ */
+static int utf8_parse_version(const char *version, unsigned int *maj,
+			      unsigned int *min, unsigned int *rev)
+{
+	static const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+	substring_t args[3];
+	int ret = -EINVAL;
+	char *tmp;
+
+	tmp = kstrdup(version, GFP_KERNEL);
+	if (!tmp)	/* allocation failed: nothing to parse */
+		return -ENOMEM;
+
+	if (match_token(tmp, token, args) == 1 && !match_int(&args[0], maj) &&
+	    !match_int(&args[1], min) && !match_int(&args[2], rev))
+		ret = 0;
+	kfree(tmp);
+	return ret;
+}
+
+/*
+ * Return the utf8n table for @version ("maj.min.rev"; NULL selects the
+ * latest supported version), reusing an existing table when one matches.
+ * Returns ERR_PTR(-EINVAL) for a malformed or unsupported version and
+ * ERR_PTR(-ENOMEM) if a new table cannot be allocated.
+ * NOTE(review): the table list is used without locking here - confirm.
+ */
+static struct nls_table *utf8_load_charset(const char *version)
+{
+	struct nls_table *tbl;
+	unsigned int maj, min, rev, nls_version;
+
+	if (version) {
+		if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+			return ERR_PTR(-EINVAL);
+		if (maj > 0xff || min > 0xff || rev > 0xff ||	/* callee takes u8 */
+		    !utf8version_is_supported(maj, min, rev))
+			return ERR_PTR(-EINVAL);
+		nls_version = UNICODE_AGE(maj, min, rev);
+	} else {
+		nls_version = utf8version_latest();
+		pr_warn("utf8norm version not specified. Assuming latest supported version (%u.%u.%u).\n",
+			(nls_version >> 16) & 0xff, (nls_version >> 8) & 0xff,
+			nls_version & 0xff);
+	}
+
+	/* Try an already loaded table first. */
+	for (tbl = utf8norm_info.tables; tbl; tbl = tbl->next) {
+		if (tbl->version == nls_version)
+			return tbl;
+	}
+
+	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);	/* unset members start zeroed */
+	if (!tbl)
+		return ERR_PTR(-ENOMEM);
+	tbl->charset = &utf8norm_info;
+	tbl->version = nls_version;
+	tbl->ops = &utf8_ops;
+	tbl->next = utf8norm_info.tables;
+	utf8norm_info.tables = tbl;
+	return tbl;
+}
+
+/* Free every table on utf8norm_info.tables and leave the list empty. */
+static void utf8_cleanup_tables(void)
+{
+	struct nls_table *next, *cur;
+
+	for (cur = utf8norm_info.tables; cur; cur = next) {
+		next = cur->next;	/* read the link before freeing */
+		kfree(cur);
+	}
+	utf8norm_info.tables = NULL;
+}
+
+static struct nls_charset utf8norm_info = {	/* registered by init_utf8() */
+	.load_table = utf8_load_charset,
+	.charset = "utf8n",
+};
+
+/* Module init: register the utf8n charset and propagate any failure. */
+static int __init init_utf8(void)
+{
+	return register_nls(&utf8norm_info);
+}
+
+static void __exit exit_utf8(void)
+{
+	unregister_nls(&utf8norm_info); /* unregister the charset first ... */
+	utf8_cleanup_tables(); /* ... then free the tables it handed out */
+}
+
+module_init(init_utf8);
+module_exit(exit_utf8);
+MODULE_AUTHOR("SGI, Gabriel Krisman Bertazi");
+MODULE_DESCRIPTION("UTF-8 charset operations for filesystems");
+MODULE_LICENSE("GPL");
diff --git a/fs/nls/nls_utf8n-norm.c b/fs/nls/nls_utf8n-norm.c
index 64c3cc74a2ca..abee8b376a87 100644
--- a/fs/nls/nls_utf8n-norm.c
+++ b/fs/nls/nls_utf8n-norm.c
@@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 }
 EXPORT_SYMBOL(utf8version_is_supported);
 
+int utf8version_latest(void)	/* (void): a prototype, as declared in utf8n.h */
+{
+	return utf8vers;	/* callers report this as the latest supported version */
+}
+EXPORT_SYMBOL(utf8version_latest);
+
 /*
  * UTF-8 valid ranges.
  *
diff --git a/fs/nls/utf8n.h b/fs/nls/utf8n.h
index f60827663503..b4697f9bfbab 100644
--- a/fs/nls/utf8n.h
+++ b/fs/nls/utf8n.h
@@ -32,6 +32,7 @@
 
 /* Highest unicode version supported by the data tables. */
 extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+extern int utf8version_latest(void);
 
 /*
  * Look for the correct const struct utf8data for a unicode version.
-- 
2.17.0

  parent reply	other threads:[~2018-05-09  6:56 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-05-09  6:47 [PATCH 00/15] NLS refactor and UTF-8 normalization Gabriel Krisman Bertazi
2018-05-09  6:47 ` [PATCH 01/15] nls: Wrap uni2char/char2uni callers Gabriel Krisman Bertazi
2018-05-09  6:47 ` [PATCH 02/15] nls: Wrap charset field access Gabriel Krisman Bertazi
2018-05-09  6:47 ` [PATCH 03/15] nls: Wrap charset hooks in ops structure Gabriel Krisman Bertazi
2018-05-09  6:47 ` [PATCH 04/15] nls: Split default charset from NLS core Gabriel Krisman Bertazi
2018-05-09 14:52   ` kbuild test robot
2018-05-15  2:45     ` Gabriel Krisman Bertazi
2018-05-09  6:47 ` [PATCH 05/15] nls: Split struct nls_charset from struct nls_table Gabriel Krisman Bertazi
2018-05-09 14:30   ` kbuild test robot
2018-05-15  2:41     ` Gabriel Krisman Bertazi
2018-05-09  6:47 ` [PATCH 06/15] nls: Add support for multiple versions of an encoding Gabriel Krisman Bertazi
2018-05-09  6:47 ` [PATCH 07/15] nls: Add new interface for string comparisons Gabriel Krisman Bertazi
2018-05-09  6:47 ` [PATCH 08/15] nls: Let charsets define the behavior of tolower/toupper Gabriel Krisman Bertazi
2018-05-09  6:48 ` [PATCH 09/15] nls: Add optional normalization and casefold hooks Gabriel Krisman Bertazi
2018-05-09  6:55 ` [PATCH 10/15] nls: utf8norm: Add unicode character database files Gabriel Krisman Bertazi
2018-05-09  6:55   ` [PATCH 11/15] scripts: add trie generator for UTF-8 Gabriel Krisman Bertazi
2018-05-09  6:55   ` [PATCH 12/15] nls: utf8norm: Introduce code for UTF-8 normalization Gabriel Krisman Bertazi
2018-05-09 17:02     ` kbuild test robot
2018-05-09 18:46       ` Gabriel Krisman Bertazi
2018-05-09  6:55   ` [PATCH 13/15] nls: utf8norm: reduce the size of utf8data[] Gabriel Krisman Bertazi
2018-05-09  6:55   ` Gabriel Krisman Bertazi [this message]
2018-05-09  6:55   ` [PATCH 15/15] nls: utf8norm: Introduce test module for utf8norm implementation Gabriel Krisman Bertazi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180509065527.4177-5-krisman@collabora.co.uk \
    --to=krisman@collabora.co.uk \
    --cc=darrick.wong@oracle.com \
    --cc=jra@google.com \
    --cc=kernel@lists.collabora.co.uk \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=olaf@sgi.com \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.