All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/8] shell: Call setlocale
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
@ 2024-04-16 10:03 ` Herbert Xu
  2024-04-16 10:38 ` [PATCH 2/8] shell: Use strcoll instead of strcmp where applicable Herbert Xu
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-16 10:03 UTC (permalink / raw)
  To: DASH Mailing List

Call setlocale to initialise locale settings for libc.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/main.c b/src/main.c
index 7beb280..1e192f8 100644
--- a/src/main.c
+++ b/src/main.c
@@ -32,6 +32,7 @@
  * SUCH DAMAGE.
  */
 
+#include <locale.h>
 #include <stdio.h>
 #include <signal.h>
 #include <sys/stat.h>
@@ -101,6 +102,9 @@ main(int argc, char **argv)
 #if PROFILE
 	monitor(4, etext, profile_buf, sizeof profile_buf, 50);
 #endif
+
+	setlocale(LC_ALL, "");
+
 	state = 0;
 	if (unlikely(setjmp(main_handler.loc))) {
 		int e;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/8] shell: Use strcoll instead of strcmp where applicable
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
  2024-04-16 10:03 ` [PATCH 1/8] shell: Call setlocale Herbert Xu
@ 2024-04-16 10:38 ` Herbert Xu
  2024-04-16 23:13 ` [PATCH 3/8] expand: Count multi-byte characters for VSLENGTH Herbert Xu
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-16 10:38 UTC (permalink / raw)
  To: DASH Mailing List

Use strcoll instead of strcmp so that the locale is taken into
account when sorting strings during pathname expansion, and for
the built-in test(1) string comparison operators.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/bltin/test.c | 8 ++++----
 src/expand.c     | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/bltin/test.c b/src/bltin/test.c
index fd8a43b..2db4d0f 100644
--- a/src/bltin/test.c
+++ b/src/bltin/test.c
@@ -353,13 +353,13 @@ binop(void)
 		/* NOTREACHED */
 #endif
 	case STREQ:
-		return strcmp(opnd1, opnd2) == 0;
+		return strcoll(opnd1, opnd2) == 0;
 	case STRNE:
-		return strcmp(opnd1, opnd2) != 0;
+		return strcoll(opnd1, opnd2) != 0;
 	case STRLT:
-		return strcmp(opnd1, opnd2) < 0;
+		return strcoll(opnd1, opnd2) < 0;
 	case STRGT:
-		return strcmp(opnd1, opnd2) > 0;
+		return strcoll(opnd1, opnd2) > 0;
 	case INTEQ:
 		return getn(opnd1) == getn(opnd2);
 	case INTNE:
diff --git a/src/expand.c b/src/expand.c
index 0db2b29..9ac981e 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -1476,7 +1476,7 @@ msort(struct strlist *list, int len)
 	p = msort(p, len - half);		/* sort second half */
 	lpp = &list;
 	for (;;) {
-		if (strcmp(p->text, q->text) < 0) {
+		if (strcoll(p->text, q->text) < 0) {
 			*lpp = p;
 			lpp = &p->next;
 			if ((p = *lpp) == NULL) {
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/8] expand: Count multi-byte characters for VSLENGTH
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
  2024-04-16 10:03 ` [PATCH 1/8] shell: Call setlocale Herbert Xu
  2024-04-16 10:38 ` [PATCH 2/8] shell: Use strcoll instead of strcmp where applicable Herbert Xu
@ 2024-04-16 23:13 ` Herbert Xu
  2024-04-18  8:59 ` [PATCH 4/8] expand: Process multi-byte characters in subevalvar Herbert Xu
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-16 23:13 UTC (permalink / raw)
  To: DASH Mailing List

Count multi-byte characters in variables rather than bytes
and return that count as the length expansion.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c | 62 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 18 deletions(-)

diff --git a/src/expand.c b/src/expand.c
index 9ac981e..ad186b0 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -53,6 +53,7 @@
 #endif
 #include <ctype.h>
 #include <stdbool.h>
+#include <wchar.h>
 
 /*
  * Routines to expand arguments to commands.  We have to deal with
@@ -796,6 +797,18 @@ really_record:
 	return p;
 }
 
+static char *chtodest(int c, int flags, char *out)
+{
+	const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX;
+
+	if ((flags & QUOTES_ESC) &&
+	    ((syntax[c] == CCTL) ||
+	     (flags & EXP_QUOTED && syntax[c] == CBACK)))
+		USTPUTC(CTLESC, out);
+	USTPUTC(c, out);
+
+	return out;
+}
 
 /*
  * Put a string on the stack.
@@ -803,38 +816,48 @@ really_record:
 
 static size_t memtodest(const char *p, size_t len, int flags)
 {
-	const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX;
+	size_t count = 0;
 	char *q;
-	char *s;
+	int c;
 
 	if (unlikely(!len))
 		return 0;
 
 	q = makestrspace(len * 2, expdest);
-	s = q;
 
 	do {
-		int c = (signed char)*p++;
-		if (c) {
-			if ((flags & QUOTES_ESC) &&
-			    ((syntax[c] == CCTL) ||
-			     (flags & EXP_QUOTED && syntax[c] == CBACK)))
-				USTPUTC(CTLESC, q);
-		} else if (!(flags & EXP_KEEPNUL))
+		c = (signed char)*p++;
+
+		if (c)
+			count++;
+		else if (!(flags & EXP_KEEPNUL))
 			continue;
-		USTPUTC(c, q);
+
+		if (c < 0) {
+			mbstate_t mbs = {};
+
+			p--;
+			do {
+				q = chtodest(c, flags, q);
+			} while (mbrlen(p++, 1, &mbs) == -2 &&
+				 (c = *p, --len));
+			if (!len)
+				break;
+			continue;
+		}
+
+		q = chtodest(c, flags, q);
 	} while (--len);
 
 	expdest = q;
-	return q - s;
+	return count;
 }
 
 
 static size_t strtodest(const char *p, int flags)
 {
 	size_t len = strlen(p);
-	memtodest(p, len, flags);
-	return len;
+	return memtodest(p, len, flags);
 }
 
 
@@ -856,6 +879,7 @@ varvalue(char *name, int varflags, int flags, int quoted)
 	int discard = (subtype == VSPLUS || subtype == VSLENGTH) |
 		      (flags & EXP_DISCARD);
 	ssize_t len = 0;
+	size_t start;
 	char c;
 
 	if (!subtype) {
@@ -865,9 +889,9 @@ varvalue(char *name, int varflags, int flags, int quoted)
 		sh_error("Bad substitution");
 	}
 
-	flags |= EXP_KEEPNUL;
 	flags &= discard ? ~QUOTES_ESC : ~0;
 	sep = (flags & EXP_FULL) << CHAR_BIT;
+	start = expdest - (char *)stackblock();
 
 	switch (*name) {
 	case '$':
@@ -927,7 +951,7 @@ param:
 
 			if (*ap && sep) {
 				len++;
-				memtodest(&sepc, 1, flags);
+				memtodest(&sepc, 1, flags | EXP_KEEPNUL);
 			}
 		}
 		break;
@@ -957,7 +981,7 @@ value:
 	}
 
 	if (discard)
-		STADJUST(-len, expdest);
+		expdest = (char *)stackblock() + start;
 
 	return len;
 }
@@ -1758,11 +1782,13 @@ casematch(union node *pattern, char *val)
 
 static size_t cvtnum(intmax_t num, int flags)
 {
+	size_t start = expdest - (char *)stackblock();
 	int len = max_int_length(sizeof(num));
 	char buf[len];
 
 	len = fmtstr(buf, len, "%" PRIdMAX, num);
-	return memtodest(buf, len, flags);
+	memtodest(buf, len, flags);
+	return (expdest - (char *)stackblock()) - start;
 }
 
 STATIC void
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/8] expand: Process multi-byte characters in subevalvar
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
                   ` (2 preceding siblings ...)
  2024-04-16 23:13 ` [PATCH 3/8] expand: Count multi-byte characters for VSLENGTH Herbert Xu
@ 2024-04-18  8:59 ` Herbert Xu
  2024-04-20 13:46 ` [PATCH 5/8] expand: Process multi-byte characters in expmeta Herbert Xu
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-18  8:59 UTC (permalink / raw)
  To: DASH Mailing List

When trimming variables in subevalvar, process multi-byte characters
as one unit instead of their constituent bytes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c   | 194 ++++++++++++++++++++++++++++++++++---------------
 src/expand.h   |   1 +
 src/mystring.c |   2 +-
 src/parser.h   |   1 +
 4 files changed, 138 insertions(+), 60 deletions(-)

diff --git a/src/expand.c b/src/expand.c
index ad186b0..14c6a15 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -32,27 +32,27 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/stat.h>
+#include <ctype.h>
 #include <dirent.h>
-#include <unistd.h>
-#ifdef HAVE_GETPWNAM
-#include <pwd.h>
-#endif
-#include <stdlib.h>
-#include <stdio.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <string.h>
 #ifdef HAVE_FNMATCH
 #include <fnmatch.h>
 #endif
 #ifdef HAVE_GLOB
 #include <glob.h>
 #endif
-#include <ctype.h>
+#include <inttypes.h>
+#include <limits.h>
+#ifdef HAVE_GETPWNAM
+#include <pwd.h>
+#endif
+#include <string.h>
 #include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <unistd.h>
 #include <wchar.h>
 
 /*
@@ -550,8 +550,10 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 	loc = startp;
 	loc2 = rmesc;
 	do {
-		int match;
 		const char *s = loc2;
+		unsigned ml;
+		int match;
+
 		c = *loc2;
 		if (zero) {
 			*loc2 = '\0';
@@ -560,12 +562,26 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 		match = pmatch(str, s);
 		*loc2 = c;
 		if (match)
-			return loc;
-		if (quotes && *loc == (char)CTLESC)
+			return quotes ? loc : loc2;
+
+		if (!c)
+			break;
+
+		if (*loc != (char)CTLMBCHAR) {
+			if (*loc == (char)CTLESC)
+				loc++;
 			loc++;
-		loc++;
-		loc2++;
-	} while (c);
+			loc2++;
+			continue;
+		}
+
+		if (*++loc == (char)CTLESC)
+			loc++;
+
+		ml = (unsigned char)*loc;
+		loc += ml + 3;
+		loc2 += ml;
+	} while (1);
 	return 0;
 }
 
@@ -573,14 +589,16 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 static char *scanright(char *startp, char *endp, char *rmesc, char *rmescend,
 		       char *str, int quotes, int zero
 ) {
-	int esc = 0;
+	size_t esc = 0;
 	char *loc;
 	char *loc2;
 
 	for (loc = endp, loc2 = rmescend; loc >= startp; loc2--) {
-		int match;
-		char c = *loc2;
 		const char *s = loc2;
+		char c = *loc2;
+		unsigned ml;
+		int match;
+
 		if (zero) {
 			*loc2 = '\0';
 			s = rmesc;
@@ -588,17 +606,25 @@ static char *scanright(char *startp, char *endp, char *rmesc, char *rmescend,
 		match = pmatch(str, s);
 		*loc2 = c;
 		if (match)
-			return loc;
+			return quotes ? loc : loc2;
 		loc--;
-		if (quotes) {
-			if (--esc < 0) {
-				esc = esclen(startp, loc);
-			}
-			if (esc % 2) {
-				esc--;
-				loc--;
-			}
+		if (!esc--) {
+			esc = esclen(startp, loc);
 		}
+		if (esc % 2) {
+			esc--;
+			loc--;
+			continue;
+		}
+		if (loc[1] != (char)CTLMBCHAR)
+			continue;
+
+		ml = (unsigned char)*loc;
+		loc -= ml - 1;
+		if (*loc == (char)CTLESC)
+			loc--;
+		loc--;
+		loc2 -= ml - 1;
 	}
 	return 0;
 }
@@ -652,14 +678,11 @@ static char *subevalvar(char *start, char *str, int strloc, int startloc,
 		nstrloc = str - (char *)stackblock();
 	}
 
-	rmesc = startp;
-	if (quotes) {
-		rmesc = _rmescapes(startp, RMESCAPE_ALLOC | RMESCAPE_GROW);
-		if (rmesc != startp)
-			rmescend = expdest;
-		startp = stackblock() + startloc;
-		str = stackblock() + nstrloc;
-	}
+	rmesc = _rmescapes(startp, RMESCAPE_ALLOC | RMESCAPE_GROW);
+	if (rmesc != startp)
+		rmescend = expdest;
+	startp = stackblock() + startloc;
+	str = stackblock() + nstrloc;
 	rmescend--;
 
 	/* zero = subtype == VSTRIMLEFT || subtype == VSTRIMLEFTMAX */
@@ -669,16 +692,29 @@ static char *subevalvar(char *start, char *str, int strloc, int startloc,
 
 	endp = stackblock() + strloc - 1;
 	loc = scan(startp, endp, rmesc, rmescend, str, quotes, zero);
-	if (loc) {
-		if (zero) {
-			memmove(startp, loc, endp - loc);
-			loc = startp + (endp - loc);
+	if (!loc) {
+		if (quotes) {
+			rmesc = startp;
+			rmescend = endp;
 		}
-		*loc = '\0';
-	} else
-		loc = endp;
+	} else if (!quotes) {
+		if (zero)
+			rmesc = loc;
+		else
+			rmescend = loc;
+	} else if (zero) {
+		rmesc = loc;
+		rmescend = endp;
+	} else {
+		rmesc = startp;
+		rmescend = loc;
+	}
+
+	memmove(startp, rmesc, rmescend - rmesc);
+	loc = startp + (rmescend - rmesc);
 
 out:
+	*loc = '\0';
 	amount = loc - expdest;
 	STADJUST(amount, expdest);
 
@@ -704,6 +740,7 @@ evalvar(char *p, int flag)
 	ssize_t varlen;
 	int discard;
 	int quoted;
+	int mbchar;
 
 	varflags = *p++ & ~VSBIT;
 	subtype = varflags & VSTYPE;
@@ -713,8 +750,18 @@ evalvar(char *p, int flag)
 	startloc = expdest - (char *)stackblock();
 	p = strchr(p, '=') + 1;
 
+	mbchar = 0;
+	switch (subtype) {
+	case VSTRIMLEFT:
+	case VSTRIMLEFTMAX:
+	case VSTRIMRIGHT:
+	case VSTRIMRIGHTMAX:
+		mbchar = EXP_MBCHAR;
+		break;
+	}
+
 again:
-	varlen = varvalue(var, varflags, flag, quoted);
+	varlen = varvalue(var, varflags, flag | mbchar, quoted);
 	if (varflags & VSNUL)
 		varlen--;
 
@@ -801,7 +848,7 @@ static char *chtodest(int c, int flags, char *out)
 {
 	const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX;
 
-	if ((flags & QUOTES_ESC) &&
+	if ((flags & (QUOTES_ESC | EXP_MBCHAR)) &&
 	    ((syntax[c] == CCTL) ||
 	     (flags & EXP_QUOTED && syntax[c] == CBACK)))
 		USTPUTC(CTLESC, out);
@@ -823,9 +870,13 @@ static size_t memtodest(const char *p, size_t len, int flags)
 	if (unlikely(!len))
 		return 0;
 
-	q = makestrspace(len * 2, expdest);
+	/* CTLMBCHAR, 2, c, c, 2, CTLMBCHAR */
+	q = makestrspace(len * 3, expdest);
 
 	do {
+		mbstate_t mbs = {};
+		size_t ml;
+
 		c = (signed char)*p++;
 
 		if (c)
@@ -833,19 +884,30 @@ static size_t memtodest(const char *p, size_t len, int flags)
 		else if (!(flags & EXP_KEEPNUL))
 			continue;
 
-		if (c < 0) {
-			mbstate_t mbs = {};
+		if (c >= 0)
+			goto copy;
 
-			p--;
-			do {
-				q = chtodest(c, flags, q);
-			} while (mbrlen(p++, 1, &mbs) == -2 &&
-				 (c = *p, --len));
-			if (!len)
-				break;
-			continue;
+		ml = mbrlen(p - 1, len, &mbs);
+		if (ml == -1 || ml == -2 || ml < 2 || ml > MB_LEN_MAX)
+			goto copy;
+
+		if ((flags & (QUOTES_ESC | EXP_MBCHAR))) {
+			USTPUTC(CTLMBCHAR, q);
+			USTPUTC(ml, q);
 		}
 
+		q = mempcpy(q, p - 1, ml);
+
+		if ((flags & (QUOTES_ESC | EXP_MBCHAR))) {
+			USTPUTC(ml, q);
+			USTPUTC(CTLMBCHAR, q);
+		}
+
+		p += ml - 1;
+		len -= ml - 1;
+		continue;
+
+copy:
 		q = chtodest(c, flags, q);
 	} while (--len);
 
@@ -1720,6 +1782,8 @@ _rmescapes(char *str, int flag)
 	inquotes = 0;
 	notescaped = globbing;
 	while (*p) {
+		unsigned ml;
+
 		if (*p == (char)CTLQUOTEMARK) {
 			p++;
 			inquotes ^= globbing;
@@ -1743,6 +1807,18 @@ add_escape:
 			}
 		}
 		notescaped = globbing;
+
+		if (*p != (char)CTLMBCHAR)
+			goto copy;
+
+		if (*++p == (char)CTLESC)
+			p++;
+
+		ml = (unsigned char)*p++;
+		q = mempcpy(q, p, ml);
+		p += ml + 2;
+		continue;
+
 copy:
 		*q++ = *p++;
 	}
diff --git a/src/expand.h b/src/expand.h
index 49a18f9..e5a990e 100644
--- a/src/expand.h
+++ b/src/expand.h
@@ -60,6 +60,7 @@ struct arglist {
 #define EXP_QUOTED	0x100	/* expand word in double quotes */
 #define EXP_KEEPNUL	0x200	/* do not skip NUL characters */
 #define EXP_DISCARD	0x400	/* discard result of expansion */
+#define EXP_MBCHAR	0x800	/* mark multi-byte characters */
 
 
 struct jmploc;
diff --git a/src/mystring.c b/src/mystring.c
index 5eace6c..77b457c 100644
--- a/src/mystring.c
+++ b/src/mystring.c
@@ -67,7 +67,7 @@ const char cqchars[] = {
 #ifdef HAVE_FNMATCH
 	'^',
 #endif
-	CTLESC, CTLQUOTEMARK, 0
+	CTLESC, CTLMBCHAR, CTLQUOTEMARK, 0
 };
 const char illnum[] = "Illegal number: %s";
 const char homestr[] = "HOME";
diff --git a/src/parser.h b/src/parser.h
index 433573d..14bfc4f 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -44,6 +44,7 @@ union node;
 #define CTLVAR -126		/* variable defn */
 #define CTLENDVAR -125
 #define CTLBACKQ -124
+#define CTLMBCHAR -123
 #define	CTLARI -122		/* arithmetic expression */
 #define	CTLENDARI -121
 #define	CTLQUOTEMARK -120
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 5/8] expand: Process multi-byte characters in expmeta
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
                   ` (3 preceding siblings ...)
  2024-04-18  8:59 ` [PATCH 4/8] expand: Process multi-byte characters in subevalvar Herbert Xu
@ 2024-04-20 13:46 ` Herbert Xu
  2024-04-23 11:17 ` [PATCH 6/8] expand: Support multi-byte characters during field splitting Herbert Xu
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-20 13:46 UTC (permalink / raw)
  To: DASH Mailing List

When glob(3) is not in use, make sure that expmeta processes
multi-byte characters correctly.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c | 107 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 73 insertions(+), 34 deletions(-)

diff --git a/src/expand.c b/src/expand.c
index 14c6a15..1e86058 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -84,6 +84,7 @@
 #define RMESCAPE_GLOB	0x2	/* Add backslashes for glob */
 #define RMESCAPE_GROW	0x8	/* Grow strings instead of stalloc */
 #define RMESCAPE_HEAP	0x10	/* Malloc strings instead of stalloc */
+#define RMESCAPE_EMETA	0x20	/* Remove backslashes too */
 
 /* Add CTLESC when necessary. */
 #define QUOTES_ESC	(EXP_FULL | EXP_CASE)
@@ -1349,15 +1350,13 @@ expandmeta(struct strlist *str)
 		savelastp = exparg.lastp;
 
 		INTOFF;
-		p = preglob(str->text, RMESCAPE_ALLOC | RMESCAPE_HEAP);
+		p = str->text;
 		len = strlen(p);
 		expdir_max = len + PATH_MAX;
 		expdir = ckmalloc(expdir_max);
 
 		expmeta(p, len, 0);
 		ckfree(expdir);
-		if (p != str->text)
-			ckfree(p);
 		INTON;
 		if (exparg.lastp == savelastp) {
 			/*
@@ -1378,6 +1377,41 @@ nometa:
 	}
 }
 
+static void expmeta_rmescapes(char *enddir, char *name)
+{
+	preglob(strcpy(enddir, name), RMESCAPE_EMETA);
+}
+
+static unsigned mbcharlen(char *p)
+{
+	int esc = 0;
+
+	if (*++p == (char)CTLESC)
+		esc++;
+
+	return esc + 3 + (unsigned char)p[esc];
+}
+
+static int skipesc(char *p)
+{
+	int esc = 0;
+
+	if (p[esc] == (char)CTLMBCHAR)
+		return esc + mbcharlen(p);
+
+	if (*p == (char)CTLESC)
+		esc++;
+
+	if (p[esc] == '\\' && p[esc + 1]) {
+		esc++;
+		if (p[esc] == (char)CTLMBCHAR)
+			return esc + mbcharlen(p + esc);
+		if (p[esc] == (char)CTLESC)
+			esc++;
+	}
+
+	return esc;
+}
 
 /*
  * Do metacharacter (i.e. *, ?, [...]) expansion.
@@ -1387,17 +1421,18 @@ STATIC void
 expmeta(char *name, unsigned name_len, unsigned expdir_len)
 {
 	char *enddir = expdir + expdir_len;
-	char *p;
+	struct stat64 statb;
+	struct dirent64 *dp;
 	const char *cp;
-	char *start;
 	char *endname;
 	int metaflag;
-	struct stat64 statb;
-	DIR *dirp;
-	struct dirent64 *dp;
-	int atend;
 	int matchdot;
+	char *start;
+	DIR *dirp;
+	char *pat;
+	char *p;
 	int esc;
+	int c;
 
 	metaflag = 0;
 	start = name;
@@ -1409,8 +1444,7 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
 			if (*q == '!')
 				q++;
 			for (;;) {
-				if (*q == '\\')
-					q++;
+				q += skipesc(q);
 				if (*q == '/' || *q == '\0')
 					break;
 				if (*++q == ']') {
@@ -1419,8 +1453,8 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
 				}
 			}
 		} else {
-			if (*p == '\\' && p[1])
-				esc++;
+			esc = skipesc(p);
+
 			if (p[esc] == '/') {
 				if (metaflag)
 					break;
@@ -1431,24 +1465,18 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
 	if (metaflag == 0) {	/* we've reached the end of the file name */
 		if (!expdir_len)
 			return;
-		p = name;
-		do {
-			if (*p == '\\' && p[1])
-				p++;
-			*enddir++ = *p;
-		} while (*p++);
+		expmeta_rmescapes(enddir, name);
 		if (lstat64(expdir, &statb) >= 0)
 			addfname(expdir);
 		return;
 	}
 	endname = p;
 	if (name < start) {
-		p = name;
-		do {
-			if (*p == '\\' && p[1])
-				p++;
-			*enddir++ = *p++;
-		} while (p < start);
+		c = *start;
+		*start = 0;
+		expmeta_rmescapes(enddir, name);
+		*start = c;
+		enddir += strlen(enddir);
 	}
 	*enddir = 0;
 	cp = expdir;
@@ -1457,25 +1485,26 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
 		cp = ".";
 	if ((dirp = opendir(cp)) == NULL)
 		return;
-	if (*endname == 0) {
-		atend = 1;
-	} else {
-		atend = 0;
+	c = *endname;
+	if (c) {
 		*endname = '\0';
 		endname += esc + 1;
 	}
 	name_len -= endname - name;
 	matchdot = 0;
 	p = start;
+	if (*p == (char)CTLESC)
+		p++;
 	if (*p == '\\')
 		p++;
 	if (*p == '.')
 		matchdot++;
+	pat = preglob(start, RMESCAPE_ALLOC | RMESCAPE_HEAP);
 	while (! int_pending() && (dp = readdir64(dirp)) != NULL) {
 		if (dp->d_name[0] == '.' && ! matchdot)
 			continue;
-		if (pmatch(start, dp->d_name)) {
-			if (atend) {
+		if (pmatch(pat, dp->d_name)) {
+			if (!c) {
 				scopy(dp->d_name, enddir);
 				addfname(expdir);
 			} else {
@@ -1498,9 +1527,11 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
 			}
 		}
 	}
+	if (pat != start)
+		ckfree(pat);
 	closedir(dirp);
-	if (! atend)
-		endname[-esc - 1] = esc ? '\\' : '/';
+	if (c)
+		endname[-esc - 1] = c;
 }
 #endif	/* HAVE_GLOB */
 
@@ -1745,6 +1776,7 @@ _rmescapes(char *str, int flag)
 	int notescaped;
 	int globbing;
 	int inquotes;
+	int expmeta;
 
 	p = strpbrk(str, cqchars);
 	if (!p) {
@@ -1753,6 +1785,7 @@ _rmescapes(char *str, int flag)
 	q = p;
 	r = str;
 	globbing = flag & RMESCAPE_GLOB;
+	expmeta = (flag & RMESCAPE_EMETA) ? RMESCAPE_GLOB : 0;
 
 	if (flag & RMESCAPE_ALLOC) {
 		size_t len = p - str;
@@ -1792,6 +1825,10 @@ _rmescapes(char *str, int flag)
 		if (*p == '\\') {
 			/* naked back slash */
 			notescaped ^= globbing;
+			if (expmeta & ~notescaped) {
+				p++;
+				continue;
+			}
 			goto copy;
 		}
 		if (FNMATCH_IS_ENABLED && *p == '^')
@@ -1799,7 +1836,9 @@ _rmescapes(char *str, int flag)
 		if (*p == (char)CTLESC) {
 			p++;
 add_escape:
-			if (notescaped)
+			if (expmeta)
+				;
+			else if (notescaped)
 				*q++ = '\\';
 			else if (inquotes) {
 				*q++ = '\\';
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 6/8] expand: Support multi-byte characters during field splitting
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
                   ` (4 preceding siblings ...)
  2024-04-20 13:46 ` [PATCH 5/8] expand: Process multi-byte characters in expmeta Herbert Xu
@ 2024-04-23 11:17 ` Herbert Xu
  2024-04-27  8:15 ` [PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-23 11:17 UTC (permalink / raw)
  To: DASH Mailing List

When IFS contains multi-byte characters, each one is treated as a
single separator during field splitting.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c | 201 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 140 insertions(+), 61 deletions(-)

diff --git a/src/expand.c b/src/expand.c
index 1e86058..679bbb8 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -54,6 +54,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <wchar.h>
+#include <wctype.h>
 
 /*
  * Routines to expand arguments to commands.  We have to deal with
@@ -164,6 +165,30 @@ esclen(const char *start, const char *p) {
 	return esc;
 }
 
+static __attribute__((noinline)) unsigned mbnext(const char *p)
+{
+	unsigned start = 0;
+	unsigned end = 0;
+	unsigned ml;
+	int c;
+
+	c = p[end++];
+
+	switch (c) {
+	case CTLMBCHAR:
+		if (p[end] == CTLESC)
+			end++;
+		ml = (unsigned char)p[end++];
+		start = end;
+		end = ml + 2;
+		break;
+	case CTLESC:
+		start++;
+		break;
+	}
+
+	return start | end << 8;
+}
 
 static inline const char *getpwhome(const char *name)
 {
@@ -552,6 +577,7 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 	loc2 = rmesc;
 	do {
 		const char *s = loc2;
+		unsigned mb;
 		unsigned ml;
 		int match;
 
@@ -568,19 +594,9 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 		if (!c)
 			break;
 
-		if (*loc != (char)CTLMBCHAR) {
-			if (*loc == (char)CTLESC)
-				loc++;
-			loc++;
-			loc2++;
-			continue;
-		}
-
-		if (*++loc == (char)CTLESC)
-			loc++;
-
-		ml = (unsigned char)*loc;
-		loc += ml + 3;
+		mb = mbnext(loc);
+		loc += (mb & 0xff) + (mb >> 8);
+		ml = (mb >> 8) > 3 ? (mb >> 8) - 2 : 1;
 		loc2 += ml;
 	} while (1);
 	return 0;
@@ -932,18 +948,22 @@ static size_t strtodest(const char *p, int flags)
 STATIC ssize_t
 varvalue(char *name, int varflags, int flags, int quoted)
 {
+	int subtype = varflags & VSTYPE;
+	const char *seps;
+	ssize_t len = 0;
+	unsigned seplen;
+	size_t start;
+	int discard;
+	char sepc;
+	char **ap;
+	int sep;
 	int num;
 	char *p;
 	int i;
-	int sep;
-	char sepc;
-	char **ap;
-	int subtype = varflags & VSTYPE;
-	int discard = (subtype == VSPLUS || subtype == VSLENGTH) |
-		      (flags & EXP_DISCARD);
-	ssize_t len = 0;
-	size_t start;
-	char c;
+	int c;
+
+	discard = (subtype == VSPLUS || subtype == VSLENGTH) |
+		  (flags & EXP_DISCARD);
 
 	if (!subtype) {
 		if (discard)
@@ -1006,15 +1026,27 @@ numvar:
 		sep &= ~quoted;
 		sep |= ifsset() ? (unsigned char)(c & ifsval()[0]) : ' ';
 param:
-		sepc = sep;
 		if (!(ap = shellparam.p))
 			return -1;
+		sepc = sep;
+		seps = &sepc;
+		seplen = 1;
+		if (sepc < 0) {
+			mbstate_t mbs = {};
+			size_t ml;
+
+			ml = mbrlen(ifsval(), strlen(ifsval()), &mbs);
+			if (ml != -1 && ml != -2 && ml > 1) {
+				seps = ifsval();
+				seplen = ml;
+			}
+		}
 		while ((p = *ap++)) {
 			len += strtodest(p, flags);
 
 			if (*ap && sep) {
 				len++;
-				memtodest(&sepc, 1, flags | EXP_KEEPNUL);
+				memtodest(seps, seplen, flags | EXP_KEEPNUL);
 			}
 		}
 		break;
@@ -1076,7 +1108,54 @@ recordregion(int start, int end, int nulonly)
 	ifslastp->nulonly = nulonly;
 }
 
+static __attribute__((noinline)) unsigned ifsisifs(
+	const char *p, unsigned ml, const char *ifs, size_t ifslen)
+{
+	bool isdefifs = false;
+	size_t slen = ifslen;
+	const char *s = ifs;
+	wchar_t c = *p;
+	bool isifs;
 
+	isifs = !c;
+	if (isifs) {
+		p = ifs;
+		c = *p;
+		slen = 0;
+	}
+
+	while (slen) {
+		mbstate_t mbst = {};
+		size_t ifsml;
+		wchar_t c2;
+
+		if ((signed char)*s > 0 ||
+		    (ifsml = mbrtowc(&c2, s, slen, &mbst),
+		     ifsml == -2 || ifsml == -1 || ifsml < 2)) {
+			if (c == *s) {
+				isifs = true;
+				break;
+			}
+			s++;
+			slen--;
+			continue;
+		}
+
+		if (ifsml == ml && !memcmp(p, s, ifsml)) {
+			isifs = true;
+			c = c2;
+			break;
+		}
+
+		s += ifsml;
+		slen -= ifsml;
+	}
+
+	if (isifs)
+		isdefifs = iswspace(c);
+
+	return isifs | isdefifs << 1;
+}
 
 /*
  * Break the argument string into pieces based upon IFS and add the
@@ -1088,16 +1167,16 @@ recordregion(int start, int end, int nulonly)
 void
 ifsbreakup(char *string, int maxargs, struct arglist *arglist)
 {
+	const char *ifs, *realifs;
 	struct ifsregion *ifsp;
 	struct strlist *sp;
+	char *r = NULL;
+	size_t ifslen;
 	char *start;
+	int nulonly;
+	int ifsspc;
 	char *p;
 	char *q;
-	char *r = NULL;
-	const char *ifs, *realifs;
-	int ifsspc;
-	int nulonly;
-
 
 	start = string;
 	if (ifslastp != NULL) {
@@ -1112,21 +1191,27 @@ ifsbreakup(char *string, int maxargs, struct arglist *arglist)
 			afternul = nulonly;
 			nulonly = ifsp->nulonly;
 			ifs = nulonly ? nullstr : realifs;
+			ifslen = strlen(ifs);
 			ifsspc = 0;
 			while (p < string + ifsp->endoff) {
-				int c;
-				bool isifs;
+				unsigned ifschar;
+				unsigned sisifs;
 				bool isdefifs;
+				unsigned ml;
+				bool isifs;
 
 				q = p;
-				c = *p++;
-				if (c == (char)CTLESC)
-					c = *p++;
 
-				isifs = strchr(ifs, c);
-				isdefifs = false;
-				if (isifs)
-					isdefifs = strchr(defifs, c);
+				ifschar = mbnext(p);
+				p += ifschar & 0xff;
+				ml = (ifschar >> 8) > 3 ?
+				     (ifschar >> 8) - 2 : 0;
+
+				sisifs = ifsisifs(p, ml, ifs, ifslen);
+				p += ifschar >> 8;
+
+				isifs = sisifs & 1;
+				isdefifs = sisifs >> 1;
 
 				/* If only reading one more argument:
 				 * If we have exactly one field,
@@ -1382,32 +1467,24 @@ static void expmeta_rmescapes(char *enddir, char *name)
 	preglob(strcpy(enddir, name), RMESCAPE_EMETA);
 }
 
-static unsigned mbcharlen(char *p)
-{
-	int esc = 0;
-
-	if (*++p == (char)CTLESC)
-		esc++;
-
-	return esc + 3 + (unsigned char)p[esc];
-}
-
 static int skipesc(char *p)
 {
+	unsigned short mb;
 	int esc = 0;
 
-	if (p[esc] == (char)CTLMBCHAR)
-		return esc + mbcharlen(p);
+	mb = mbnext(p);
+	if ((mb >> 8) > 3)
+		return (mb & 0xff) + (mb >> 8) - 1;
 
-	if (*p == (char)CTLESC)
-		esc++;
+	esc = mb & 0xff;
 
 	if (p[esc] == '\\' && p[esc + 1]) {
 		esc++;
-		if (p[esc] == (char)CTLMBCHAR)
-			return esc + mbcharlen(p + esc);
-		if (p[esc] == (char)CTLESC)
-			esc++;
+		mb = mbnext(p + esc);
+		if ((mb >> 8) > 3)
+			return esc + (mb & 0xff) + (mb >> 8) - 1;
+
+		esc += mb & 0xff;
 	}
 
 	return esc;
@@ -1815,6 +1892,7 @@ _rmescapes(char *str, int flag)
 	inquotes = 0;
 	notescaped = globbing;
 	while (*p) {
+		unsigned mb;
 		unsigned ml;
 
 		if (*p == (char)CTLQUOTEMARK) {
@@ -1847,13 +1925,14 @@ add_escape:
 		}
 		notescaped = globbing;
 
-		if (*p != (char)CTLMBCHAR)
+		mb = mbnext(p);
+		ml = mb >> 8;
+
+		if (ml <= 3)
 			goto copy;
 
-		if (*++p == (char)CTLESC)
-			p++;
-
-		ml = (unsigned char)*p++;
+		ml -= 2;
+		p += mb & 0xff;
 		q = mempcpy(q, p, ml);
 		p += ml + 2;
 		continue;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
                   ` (5 preceding siblings ...)
  2024-04-23 11:17 ` [PATCH 6/8] expand: Support multi-byte characters during field splitting Herbert Xu
@ 2024-04-27  8:15 ` Herbert Xu
  2024-04-27  8:41 ` [PATCH 8/8] parser: Add support for multi-byte characters Herbert Xu
  2024-04-27 21:31 ` [PATCH 0/8] Add multi-byte support Christoph Anton Mitterer
  8 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-27  8:15 UTC (permalink / raw)
  To: DASH Mailing List

In order to parse multi-byte characters which may be up to MB_LEN_MAX
bytes long, allow enough calls to pungetc to undo a single multi-byte
character.

Also add a function pungetn to do multiple pungetc calls in a row.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/input.c | 58 ++++++++++++++++++++++++++++++++++-------------------
 src/input.h | 11 +++++-----
 2 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/src/input.c b/src/input.c
index fb9858f..c7805ad 100644
--- a/src/input.c
+++ b/src/input.c
@@ -56,7 +56,7 @@
 #include "main.h"
 #include "myhistedit.h"
 
-#define IBUFSIZ (BUFSIZ + 1)
+#define IBUFSIZ (BUFSIZ + PUNGETC_MAX + 1)
 
 
 MKINIT struct parsefile basepf;	/* top level input file */
@@ -83,13 +83,16 @@ INIT {
 }
 
 RESET {
+	int c;
+
 	/* clear input buffer */
 	popallfiles();
-	basepf.unget = 0;
-	while (basepf.lastc[0] != '\n' &&
-	       basepf.lastc[0] != PEOF &&
-	       !int_pending())
-		pgetc();
+
+	c = PEOF;
+	if (basepf.nextc - basebuf > basepf.unget)
+		c = basepf.nextc[-basepf.unget];
+	while (c != '\n' && c != PEOF && !int_pending())
+		c = pgetc();
 }
 
 FORKRESET {
@@ -131,17 +134,20 @@ static int __pgetc(void)
 {
 	int c;
 
-	if (parsefile->unget)
-		return parsefile->lastc[--parsefile->unget];
+	if (parsefile->unget) {
+		long unget = -(long)(unsigned)parsefile->unget--;
+
+		if (parsefile->nleft < 0)
+			return preadbuffer();
+
+		return parsefile->nextc[unget];
+	}
 
 	if (--parsefile->nleft >= 0)
 		c = (signed char)*parsefile->nextc++;
 	else
 		c = preadbuffer();
 
-	parsefile->lastc[1] = parsefile->lastc[0];
-	parsefile->lastc[0] = c;
-
 	return c;
 }
 
@@ -176,9 +182,16 @@ static int stdin_clear_nonblock(void)
 static int
 preadfd(void)
 {
+	char *buf = parsefile->buf;
+	int unget;
 	int nr;
-	char *buf =  parsefile->buf;
-	parsefile->nextc = buf;
+
+	unget = parsefile->nextc - buf;
+	if (unget > PUNGETC_MAX)
+		unget = PUNGETC_MAX;
+
+	memmove(buf, parsefile->nextc - unget, unget);
+	parsefile->nextc = buf += unget;
 
 retry:
 #ifndef SMALL
@@ -196,8 +209,8 @@ retry:
 			nr = 0;
 		else {
 			nr = el_len;
-			if (nr > IBUFSIZ - 1)
-				nr = IBUFSIZ - 1;
+			if (nr > BUFSIZ)
+				nr = BUFSIZ;
 			memcpy(buf, rl_cp, nr);
 			if (nr != el_len) {
 				el_len -= nr;
@@ -209,9 +222,9 @@ retry:
 	} else
 #endif
 	if (parsefile->fd)
-		nr = read(parsefile->fd, buf, IBUFSIZ - 1);
+		nr = read(parsefile->fd, buf, BUFSIZ);
 	else {
-		unsigned len = IBUFSIZ - 1;
+		unsigned len = BUFSIZ;
 
 		nr = 0;
 
@@ -348,6 +361,11 @@ done:
 	return (signed char)*parsefile->nextc++;
 }
 
+void pungetn(int n)
+{
+	parsefile->unget += n;
+}
+
 /*
  * Undo a call to pgetc.  Only two characters may be pushed back.
  * PEOF may be pushed back.
@@ -356,7 +374,7 @@ done:
 void
 pungetc(void)
 {
-	parsefile->unget++;
+	pungetn(1);
 }
 
 /*
@@ -383,7 +401,6 @@ pushstring(char *s, void *ap)
 	sp->prevnleft = parsefile->nleft;
 	sp->unget = parsefile->unget;
 	sp->spfree = parsefile->spfree;
-	memcpy(sp->lastc, parsefile->lastc, sizeof(sp->lastc));
 	sp->ap = (struct alias *)ap;
 	if (ap) {
 		((struct alias *)ap)->flag |= ALIASINUSE;
@@ -413,7 +430,6 @@ static void popstring(void)
 	parsefile->nextc = sp->prevstring;
 	parsefile->nleft = sp->prevnleft;
 	parsefile->unget = sp->unget;
-	memcpy(parsefile->lastc, sp->lastc, sizeof(sp->lastc));
 /*dprintf("*** calling popstring: restoring to '%s'\n", parsenextc);*/
 	parsefile->strpush = sp->prev;
 	parsefile->spfree = sp;
@@ -457,7 +473,7 @@ setinputfd(int fd, int push)
 	}
 	parsefile->fd = fd;
 	if (parsefile->buf == NULL)
-		parsefile->buf = ckmalloc(IBUFSIZ);
+		parsefile->nextc = parsefile->buf = ckmalloc(IBUFSIZ);
 	input_set_lleft(parsefile, parsefile->nleft = 0);
 	plinno = 1;
 }
diff --git a/src/input.h b/src/input.h
index 1ff5773..5b4a045 100644
--- a/src/input.h
+++ b/src/input.h
@@ -34,12 +34,16 @@
  *	@(#)input.h	8.2 (Berkeley) 5/4/95
  */
 
+#include <limits.h>
+
 #ifdef SMALL
 #define IS_DEFINED_SMALL 1
 #else
 #define IS_DEFINED_SMALL 0
 #endif
 
+#define PUNGETC_MAX (MB_LEN_MAX > 16 ? MB_LEN_MAX : 16)
+
 /* PEOF (the end of file marker) is defined in syntax.h */
 
 enum {
@@ -59,9 +63,6 @@ struct strpush {
 	/* Delay freeing so we can stop nested aliases. */
 	struct strpush *spfree;
 
-	/* Remember last two characters for pungetc. */
-	int lastc[2];
-
 	/* Number of outstanding calls to pungetc. */
 	int unget;
 };
@@ -87,9 +88,6 @@ struct parsefile {
 	/* Delay freeing so we can stop nested aliases. */
 	struct strpush *spfree;
 
-	/* Remember last two characters for pungetc. */
-	int lastc[2];
-
 	/* Number of outstanding calls to pungetc. */
 	int unget;
 };
@@ -106,6 +104,7 @@ extern struct parsefile *parsefile;
 int pgetc(void);
 int pgetc2(void);
 void pungetc(void);
+void pungetn(int);
 void pushstring(char *, void *);
 int setinputfile(const char *, int);
 void setinputstring(char *);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 8/8] parser: Add support for multi-byte characters
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
                   ` (6 preceding siblings ...)
  2024-04-27  8:15 ` [PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
@ 2024-04-27  8:41 ` Herbert Xu
  2024-04-27 21:31 ` [PATCH 0/8] Add multi-byte support Christoph Anton Mitterer
  8 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-27  8:41 UTC (permalink / raw)
  To: DASH Mailing List

Add the requisite markers for multi-byte characters so that the
expansion code can recognise them.  Also allow wide blank characters
to terminate words.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c |  19 ++++++++
 src/parser.c | 127 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 121 insertions(+), 25 deletions(-)

diff --git a/src/expand.c b/src/expand.c
index 679bbb8..7c3f350 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -265,6 +265,7 @@ static char *argstr(char *p, int flag)
 		CTLESC,
 		CTLVAR,
 		CTLBACKQ,
+		CTLMBCHAR,
 		CTLARI,
 		CTLENDARI,
 		0
@@ -289,6 +290,8 @@ tilde:
 start:
 	startloc = expdest - (char *)stackblock();
 	for (;;) {
+		unsigned ml;
+		unsigned mb;
 		int end;
 
 		length += strcspn(p + length, reject);
@@ -351,6 +354,22 @@ addquote:
 				startloc++;
 			}
 			break;
+		case CTLMBCHAR:
+			c = (signed char)*p--;
+			mb = mbnext(p);
+			ml = (mb >> 8) - 2;
+			if (flag & QUOTES_ESC) {
+				length = (mb >> 8) + (mb & 0xff);
+				if (c == (char)CTLESC)
+					startloc += length;
+				break;
+			}
+			if (c == CTLESC)
+				startloc += ml;
+			p += mb & 0xff;
+			expdest = stnputs(p, ml, expdest);
+			p += mb >> 8;
+			break;
 		case CTLESC:
 			startloc++;
 			length++;
diff --git a/src/parser.c b/src/parser.c
index 27611f0..c23cc9b 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -36,7 +36,11 @@
 #include <alloca.h>
 #endif
 
+#include <limits.h>
+#include <stdbool.h>
 #include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "shell.h"
 #include "parser.h"
@@ -876,7 +880,53 @@ static void synstack_pop(struct synstack **stack)
 	*stack = (*stack)->next;
 }
 
+static unsigned getmbc(int c, char *out, int mode)
+{
+	char *const start = out;
+	mbstate_t mbst = {};
+	unsigned ml = 0;
+	size_t ml2;
+	wchar_t wc;
+	char *mbc;
 
+	if (likely(c >= 0))
+		return 0;
+
+	mbc = (mode & 3) < 2 ? out + 2 + (mode == 1) : out;
+	mbc[ml] = c;
+	while ((ml2 = mbrtowc(&wc, mbc + ml++, 1, &mbst)) == -2) {
+		if (ml >= MB_LEN_MAX)
+			break;
+		c = pgetc();
+		if (c == PEOF)
+			break;
+		mbc[ml] = c;
+	}
+
+	if (ml2 == 1 && ml > 1) {
+		if (mode == 4 && iswblank(wc))
+			return 1;
+
+		if ((mode & 3) < 2) {
+			USTPUTC(CTLMBCHAR, out);
+			if (mode == 1)
+				USTPUTC(CTLESC, out);
+			USTPUTC(ml, out);
+		}
+		STADJUST(ml, out);
+		if ((mode & 3) < 2) {
+			USTPUTC(ml, out);
+			USTPUTC(CTLMBCHAR, out);
+		}
+
+		return out - start;
+	}
+
+	if (ml > 1)
+		pungetn(ml - 1);
+
+	return 0;
+}
 
 /*
  * If eofmark is NULL, read a word or a redirection symbol.  If eofmark
@@ -929,12 +979,27 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 		}
 #endif
 		CHECKEND();	/* set c to PEOF if at end of here document */
-		for (;;) {	/* until end of line or end of word */
-			CHECKSTRSPACE(4, out);	/* permit 4 calls to USTPUTC */
+		/* Until end of line or end of word */
+		for (;; c = pgetc_top(synstack)) {
+			int fieldsplitting;
+			unsigned ml;
+
+			/* Permit max(MB_LEN_MAX, 16) + 7 calls to USTPUTC. */
+			CHECKSTRSPACE((MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + 7,
+				      out);
+			fieldsplitting = synstack->syntax == BASESYNTAX &&
+					 !synstack->varnest ? 4 : 0;
+			ml = getmbc(c, out, fieldsplitting);
+			if (ml == 1) {
+				c = pgetc();
+				break;
+			}
+			out += ml;
+			if (ml)
+				continue;
 			switch(synstack->syntax[c]) {
 			case CNL:	/* '\n' */
-				if (synstack->syntax == BASESYNTAX &&
-				    !synstack->varnest)
+				if (fieldsplitting)
 					goto endword;	/* exit outer loop */
 				USTPUTC(c, out);
 				nlprompt();
@@ -956,26 +1021,33 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 					USTPUTC(CTLESC, out);
 					USTPUTC('\\', out);
 					pungetc();
-				} else {
-					if (
-						synstack->dblquote &&
-						c != '\\' && c != '`' &&
-						c != '$' && (
-							c != '"' ||
-							(eofmark != NULL &&
-							 !synstack->varnest)
-						) && (
-							c != '}' ||
-							!synstack->varnest
-						)
-					) {
-						USTPUTC(CTLESC, out);
-						USTPUTC('\\', out);
-					}
-					USTPUTC(CTLESC, out);
-					USTPUTC(c, out);
-					quotef++;
+					break;
 				}
+
+				if (
+					synstack->dblquote &&
+					c != '\\' && c != '`' &&
+					c != '$' && (
+						c != '"' ||
+						(eofmark != NULL &&
+						 !synstack->varnest)
+					) && (
+						c != '}' ||
+						!synstack->varnest
+					)
+				) {
+					USTPUTC(CTLESC, out);
+					USTPUTC('\\', out);
+				}
+				quotef++;
+
+				ml = getmbc(c, out, 1);
+				out += ml;
+				if (ml)
+					break;
+
+				USTPUTC(CTLESC, out);
+				USTPUTC(c, out);
 				break;
 			case CSQUOTE:
 				synstack->syntax = SQSYNTAX;
@@ -1053,11 +1125,10 @@ toggledq:
 			case CEOF:
 				goto endword;		/* exit outer loop */
 			default:
-				if (synstack->varnest == 0)
+				if (fieldsplitting)
 					goto endword;	/* exit outer loop */
 				USTPUTC(c, out);
 			}
-			c = pgetc_top(synstack);
 		}
 	}
 endword:
@@ -1384,6 +1455,7 @@ parsebackq: {
 	size_t psavelen;
 	size_t savelen;
 	union node *n;
+	unsigned ml;
 	char *pstr;
 	char *str;
 
@@ -1415,6 +1487,11 @@ parsebackq: {
                                 if (pc != '\\' && pc != '`' && pc != '$'
                                     && (!synstack->dblquote || pc != '"'))
                                         STPUTC('\\', pout);
+				CHECKSTRSPACE(MB_LEN_MAX, pout);
+				ml = getmbc(pc, pout, 2);
+				pout += ml;
+				if (ml)
+					continue;
 				break;
 
 			case PEOF:
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 0/8] Add multi-byte support
@ 2024-04-27 11:03 Herbert Xu
  2024-04-16 10:03 ` [PATCH 1/8] shell: Call setlocale Herbert Xu
                   ` (8 more replies)
  0 siblings, 9 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-27 11:03 UTC (permalink / raw)
  To: DASH Mailing List

This patch series adds multi-byte support to dash.  For now only
fnmatch is supported as the native pmatch function has not been
modified to support multi-byte characters.

Herbert Xu (8):
  shell: Call setlocale
  shell: Use strcoll instead of strcmp where applicable
  expand: Count multi-byte characters for VSLENGTH
  expand: Process multi-byte characters in subevalvar
  expand: Process multi-byte characters in expmeta
  expand: Support multi-byte characters during field splitting
  input: Allow MB_LEN_MAX calls to pungetc
  parser: Add support for multi-byte characters

 src/bltin/test.c |   8 +-
 src/expand.c     | 489 +++++++++++++++++++++++++++++++++++------------
 src/expand.h     |   1 +
 src/input.c      |  58 ++++--
 src/input.h      |  11 +-
 src/main.c       |   4 +
 src/mystring.c   |   2 +-
 src/parser.c     | 127 +++++++++---
 src/parser.h     |   1 +
 9 files changed, 519 insertions(+), 182 deletions(-)

-- 
2.39.2


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] Add multi-byte support
  2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
                   ` (7 preceding siblings ...)
  2024-04-27  8:41 ` [PATCH 8/8] parser: Add support for multi-byte characters Herbert Xu
@ 2024-04-27 21:31 ` Christoph Anton Mitterer
  2024-04-28  0:49   ` Herbert Xu
  8 siblings, 1 reply; 17+ messages in thread
From: Christoph Anton Mitterer @ 2024-04-27 21:31 UTC (permalink / raw)
  To: Herbert Xu, DASH Mailing List

Hey.


On Sat, 2024-04-27 at 19:03 +0800, Herbert Xu wrote:
> This patch series adds multi-byte support to dash.  For now only
> fnmatch is supported as the native pmatch function has not been
> modified to support multi-byte characters.

Nothing against the functionality per se, but I think for all scripts
that assumed dash's (and thus on many systems /bin/sh's) current
behaviour of being C locale only even without explicitly setting
LC_ALL=C, this may have quite some subtle issues.


AFAIU, in the C locale, every byte is a character, and thus in
particular pattern matching notation is defined for every defined
outcome of command substitution respectively every content of variables
(that is: in every(!) locale every byte other than NUL).


For example:
************
A while ago I've asked on the Austin Group mailing list for a portable
way to get command substitution without stripping of trailing newlines.

Long story short:
The recommended way was to add a sentinel character '.' at the end of
the output within the command substitution and strip that off later
with parameter expansion.
But despite the very special properties[0] of '.', it's apparently
still required to set LC_ALL=C when stripping the sentinel, because the
pattern matching notation in ${foo%.} is defined only on strings of
characters, not on strings of bytes.

Back then, Harald van Dijk had some ideas how that might be resolved
for good, but IIRC none of the shell implementors seemed to really have
interest.

My goal was to make a portable function like
   command_subst_with_newlines "eval-ed-command-string" "target-variable-name"
which, with the requirement of setting LC_ALL proved more or less
impossible when the function should have no side effects (like keeping
the LC_ALL overridden, or possibly overriding some existing var like
OLD_LC_ALL).


Anyway... I could imagine that if dash becomes multi-byte aware, there
might be more or less subtle surprises.


Cheers,
Chris.


[0] https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap06.html
"The encoded values associated with <period>, <slash>, <newline>, and
<carriage-return> shall be invariant across all locales supported by
the implementation."

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] Add multi-byte support
  2024-04-27 21:31 ` [PATCH 0/8] Add multi-byte support Christoph Anton Mitterer
@ 2024-04-28  0:49   ` Herbert Xu
  2024-04-28  1:19     ` Christoph Anton Mitterer
  2024-04-28 14:50     ` Harald van Dijk
  0 siblings, 2 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-28  0:49 UTC (permalink / raw)
  To: Christoph Anton Mitterer; +Cc: DASH Mailing List

On Sat, Apr 27, 2024 at 11:31:43PM +0200, Christoph Anton Mitterer wrote:
>
> Long story short:
> The recommended way was to add a sentinel character '.' at the end of
> the output within the command substitution and strip that off later
> with parameter expansion.
> But despite the very special properties[0] of '.', it's apparently
> still required to set LC_ALL=C when stripping the sentinel, because the
> pattern matching notation in ${foo%.} is defined only on strings of
> characters, not on strings of bytes.

Are you talking about a theoretical undefined condition, or an
actual one?  Which shell doesn't deal with ${foo%.} correctly?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] Add multi-byte support
  2024-04-28  0:49   ` Herbert Xu
@ 2024-04-28  1:19     ` Christoph Anton Mitterer
  2024-04-28  1:35       ` Lawrence Velázquez
  2024-04-28  2:03       ` Christoph Anton Mitterer
  2024-04-28 14:50     ` Harald van Dijk
  1 sibling, 2 replies; 17+ messages in thread
From: Christoph Anton Mitterer @ 2024-04-28  1:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: DASH Mailing List

On Sun, 2024-04-28 at 08:49 +0800, Herbert Xu wrote:
> 
> Are you talking about a theoretical undefined condition, or an
> actual one?  Which shell doesn't deal with ${foo%.} correctly?

Well my main point for this mail was how dash does it (and not just
about '.').
I guess it simply resorts to fnmatch(3) so it will probably do whatever
the system's libc does?

But it's really not just about '.' (which is more or less a rather safe
case).
If someone assumes dash would always be LC_ALL=C, then any such
operation where e.g. a byte is used that is part of a multi-byte
character might, AFAIU, lead to unexpected results.
E.g. an fnmatch implementation may just decide to stop and give an
error at the first byte that's not a valid character, right?

Also there were some locales like Big5, which had the weird property of
having multibyte chars that contain byte sequences that form other
valid chars (see [0]).
Not sure if I remember that correctly, but it might have been undefined
when stripping off the "shorter" character from that.
Which again, couldn't have happened so far in dash, as it simply was C
locale only.


Harald van Dijk made some extensive tests back then, how different
shells behave.
I think the austin-group-l mailing list archive is not publicly
available, but if you have an account it was in that mail:
https://collaboration.opengroup.org/operational/mailarch.php?soph=N&action=show&archive=austin-group-l&num=34339&limit=100&offset=0
which showed that shells do indeed behave differently (the tests
weren't for '.').


Cheers,
Chris.

[0] https://unix.stackexchange.com/questions/383217/shell-keep-trailing-newlines-n-in-command-substitution/383411#383411

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] Add multi-byte support
  2024-04-28  1:19     ` Christoph Anton Mitterer
@ 2024-04-28  1:35       ` Lawrence Velázquez
  2024-04-28  1:50         ` Christoph Anton Mitterer
  2024-04-28  2:03       ` Christoph Anton Mitterer
  1 sibling, 1 reply; 17+ messages in thread
From: Lawrence Velázquez @ 2024-04-28  1:35 UTC (permalink / raw)
  To: Christoph Anton Mitterer; +Cc: Herbert Xu, dash

On Sat, Apr 27, 2024, at 9:19 PM, Christoph Anton Mitterer wrote:
> If someone assumes dash would always be LC_ALL=C, then any such
> operation where e.g. a byte is used that is part of a multi-byte
> character might, AFAIU, lead to unexpected results.

What's your point?  Do you think dash should not acquire multibyte
support, to avoid breaking scripts that make such assumptions?

-- 
vq

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] Add multi-byte support
  2024-04-28  1:35       ` Lawrence Velázquez
@ 2024-04-28  1:50         ` Christoph Anton Mitterer
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Anton Mitterer @ 2024-04-28  1:50 UTC (permalink / raw)
  To: Lawrence Velázquez; +Cc: Herbert Xu, dash

On Sat, 2024-04-27 at 21:35 -0400, Lawrence Velázquez wrote:
> What's your point?  Do you think dash should not acquire multibyte
> support, to avoid breaking scripts that make such assumptions?

No, as I've said I have nothing against the functionality per se, but
just wanted to give a heads up that this might have some not
immediately obvious consequences.

Maybe it makes sense to add some big fat warning in a NEWS or changelog
file, telling people that their scripts may now run under another
locale and perhaps also giving some explanation on possible subtle
consequences?


Cheers,
Chris.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] Add multi-byte support
  2024-04-28  1:19     ` Christoph Anton Mitterer
  2024-04-28  1:35       ` Lawrence Velázquez
@ 2024-04-28  2:03       ` Christoph Anton Mitterer
  1 sibling, 0 replies; 17+ messages in thread
From: Christoph Anton Mitterer @ 2024-04-28  2:03 UTC (permalink / raw)
  To: DASH Mailing List

Just to give a simple example:

x.sh:
   char=ä
   printf '%s' "$char" | hd
   
   byte="$( printf '\244' )"
   printf '%s' "$byte" | hd
   
   printf '%s' "${char%$byte}" | hd


$ LC_ALL=en_US.UTF-8 bash x.sh 
00000000  c3 a4                                             |..|
00000002
00000000  a4                                                |.|
00000001
00000000  c3                                                |.|
00000001
$ LC_ALL=C bash x.sh 
00000000  c3 a4                                             |..|
00000002
00000000  a4                                                |.|
00000001
00000000  c3                                                |.|
00000001

$ dash x.sh 
00000000  c3 a4                                             |..|
00000002
00000000  a4                                                |.|
00000001
00000000  c3                                                |.|
00000001

but:

$ LC_ALL=en_US.UTF-8 zsh x.sh 
00000000  c3 a4                                             |..|
00000002
00000000  a4                                                |.|
00000001
00000000  c3 a4                                             |..|
00000002
$ LC_ALL=C zsh x.sh 
00000000  c3 a4                                             |..|
00000002
00000000  a4                                                |.|
00000001
00000000  c3                                                |.|
00000001


Again, not saying that this means dash shouldn't support other
locales... but people may (of course wrongly) rely on the behaviour
that dash is always in the C locale.

Even if that doesn't change right now with the fnmatch()
implementation, it could in principle do so with any change there.


Cheers,
Chris.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] Add multi-byte support
  2024-04-28  0:49   ` Herbert Xu
  2024-04-28  1:19     ` Christoph Anton Mitterer
@ 2024-04-28 14:50     ` Harald van Dijk
  2024-04-29 13:12       ` Herbert Xu
  1 sibling, 1 reply; 17+ messages in thread
From: Harald van Dijk @ 2024-04-28 14:50 UTC (permalink / raw)
  To: Herbert Xu, Christoph Anton Mitterer; +Cc: DASH Mailing List

On 28/04/2024 01:49, Herbert Xu wrote:
> On Sat, Apr 27, 2024 at 11:31:43PM +0200, Christoph Anton Mitterer wrote:
>>
>> Long story short:
>> The recommended way was to add a sentinel character '.' at the end of
>> the output within the command substitution and strip that off later
>> with parameter expansion.
>> But despite the very special properties[0] of '.', it's apparently
>> still required to set LC_ALL=C when stripping the sentinel, because the
>> pattern matching notation in ${foo%.} is defined only on strings of
>> characters, not on strings of bytes.
> 
> Are you talking about a theoretical undefined condition, or an
> actual one?  Which shell doesn't deal with ${foo%.} correctly?

The way you are implementing it, once you get to pmatch(), arguably you 
will not handle ${foo%.} correctly.

Consider an UTF-8 locale, where '\303' is not a valid multibyte 
character. In this locale, consider

   foo=$(printf '\303.')
   foo=${foo%.}

This is something I expect to set foo to '\303', and it does in all 
shells I know of, despite POSIX not saying this needs to work. The way 
you are implementing multibyte character support, if I am reading it 
right, as long as a full multibyte character has not been read, the next 
byte will be taken as part of that multibyte character, meaning you will 
take '\303.' as a single invalid multibyte character.

At the same time, '\303\251' is a valid multibyte character, and '\251' 
is not. So also consider

   foo=$(printf '\303\251')
   foo=${foo%$(printf '\251')}

Here, it is not clear what the correct result is, and indeed, shells 
disagree. bosh, ksh, zsh, and my shell do not break up characters, which 
I believe to be the most sensible behaviour. bash and mksh do.

The corner cases need to be carefully considered in order to figure out 
how to write the multibyte character support core functionality.

Cheers,
Harald van Dijk

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] Add multi-byte support
  2024-04-28 14:50     ` Harald van Dijk
@ 2024-04-29 13:12       ` Herbert Xu
  0 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2024-04-29 13:12 UTC (permalink / raw)
  To: Harald van Dijk; +Cc: Christoph Anton Mitterer, DASH Mailing List

On Sun, Apr 28, 2024 at 03:50:57PM +0100, Harald van Dijk wrote:
>
> The way you are implementing it, once you get to pmatch(), arguably you will
> not handle ${foo%.} correctly.

I took the easy way out so for now only the fnmatch path works
properly.  But eventually I will get to the pmatch path too,
especially because fnmatch(3) seems to slow down quite a bit
once locale is set.
 
> This is something I expect to set foo to '\303', and it does in all shells I
> know of, despite POSIX not saying this needs to work. The way you are
> implementing multibyte character support, if I am reading it right, as long
> as a full multibyte character has not been read, the next byte will be taken
> as part of that multibyte character, meaning you will take '\303.' as a
> single invalid multibyte character.

A multi-byte character has to be valid according to mbrtowc
before I mark it with MBCHAR.  Otherwise it'll be treated as
just a single-byte character.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2024-04-29 13:12 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
2024-04-16 10:03 ` [PATCH 1/8] shell: Call setlocale Herbert Xu
2024-04-16 10:38 ` [PATCH 2/8] shell: Use strcoll instead of strcmp where applicable Herbert Xu
2024-04-16 23:13 ` [PATCH 3/8] expand: Count multi-byte characters for VSLENGTH Herbert Xu
2024-04-18  8:59 ` [PATCH 4/8] expand: Process multi-byte characters in subevalvar Herbert Xu
2024-04-20 13:46 ` [PATCH 5/8] expand: Process multi-byte characters in expmeta Herbert Xu
2024-04-23 11:17 ` [PATCH 6/8] expand: Support multi-byte characters during field splitting Herbert Xu
2024-04-27  8:15 ` [PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
2024-04-27  8:41 ` [PATCH 8/8] parser: Add support for multi-byte characters Herbert Xu
2024-04-27 21:31 ` [PATCH 0/8] Add multi-byte support Christoph Anton Mitterer
2024-04-28  0:49   ` Herbert Xu
2024-04-28  1:19     ` Christoph Anton Mitterer
2024-04-28  1:35       ` Lawrence Velázquez
2024-04-28  1:50         ` Christoph Anton Mitterer
2024-04-28  2:03       ` Christoph Anton Mitterer
2024-04-28 14:50     ` Harald van Dijk
2024-04-29 13:12       ` Herbert Xu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.