All of lore.kernel.org
 help / color / mirror / Atom feed
From: Herbert Xu <herbert@gondor.apana.org.au>
To: DASH Mailing List <dash@vger.kernel.org>
Subject: [v2 PATCH 4/8] expand: Process multi-byte characters in subevalvar
Date: Sun, 28 Apr 2024 11:57:05 +0800	[thread overview]
Message-ID: <008ebecbab03a2504589f69ae9c2ed1353f7b6a3.1714276539.git.herbert@gondor.apana.org.au> (raw)
In-Reply-To: <cover.1714276539.git.herbert@gondor.apana.org.au>

When trimming variables in subevalvar, process multi-byte characters
as one unit instead of their constituent bytes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c   | 192 ++++++++++++++++++++++++++++++++++---------------
 src/expand.h   |   1 +
 src/mystring.c |   2 +-
 src/parser.h   |   1 +
 4 files changed, 136 insertions(+), 60 deletions(-)

diff --git a/src/expand.c b/src/expand.c
index ad186b0..60a51b1 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -32,27 +32,27 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/stat.h>
+#include <ctype.h>
 #include <dirent.h>
-#include <unistd.h>
-#ifdef HAVE_GETPWNAM
-#include <pwd.h>
-#endif
-#include <stdlib.h>
-#include <stdio.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <string.h>
 #ifdef HAVE_FNMATCH
 #include <fnmatch.h>
 #endif
 #ifdef HAVE_GLOB
 #include <glob.h>
 #endif
-#include <ctype.h>
+#include <inttypes.h>
+#include <limits.h>
+#ifdef HAVE_GETPWNAM
+#include <pwd.h>
+#endif
+#include <string.h>
 #include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <unistd.h>
 #include <wchar.h>
 
 /*
@@ -550,8 +550,10 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 	loc = startp;
 	loc2 = rmesc;
 	do {
-		int match;
 		const char *s = loc2;
+		unsigned ml;
+		int match;
+
 		c = *loc2;
 		if (zero) {
 			*loc2 = '\0';
@@ -560,12 +562,26 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 		match = pmatch(str, s);
 		*loc2 = c;
 		if (match)
-			return loc;
-		if (quotes && *loc == (char)CTLESC)
+			return quotes ? loc : loc2;
+
+		if (!c)
+			break;
+
+		if (*loc != (char)CTLMBCHAR) {
+			if (*loc == (char)CTLESC)
+				loc++;
 			loc++;
-		loc++;
-		loc2++;
-	} while (c);
+			loc2++;
+			continue;
+		}
+
+		if (*++loc == (char)CTLESC)
+			loc++;
+
+		ml = (unsigned char)*loc;
+		loc += ml + 3;
+		loc2 += ml;
+	} while (1);
 	return 0;
 }
 
@@ -573,14 +589,16 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 static char *scanright(char *startp, char *endp, char *rmesc, char *rmescend,
 		       char *str, int quotes, int zero
 ) {
-	int esc = 0;
+	size_t esc = 0;
 	char *loc;
 	char *loc2;
 
 	for (loc = endp, loc2 = rmescend; loc >= startp; loc2--) {
-		int match;
-		char c = *loc2;
 		const char *s = loc2;
+		char c = *loc2;
+		unsigned ml;
+		int match;
+
 		if (zero) {
 			*loc2 = '\0';
 			s = rmesc;
@@ -588,17 +606,23 @@ static char *scanright(char *startp, char *endp, char *rmesc, char *rmescend,
 		match = pmatch(str, s);
 		*loc2 = c;
 		if (match)
-			return loc;
+			return quotes ? loc : loc2;
 		loc--;
-		if (quotes) {
-			if (--esc < 0) {
-				esc = esclen(startp, loc);
-			}
-			if (esc % 2) {
-				esc--;
-				loc--;
-			}
+		if (!esc--)
+			esc = esclen(startp, loc);
+		if (esc % 2) {
+			esc--;
+			loc--;
+			continue;
 		}
+		if (*loc != (char)CTLMBCHAR)
+			continue;
+
+		ml = (unsigned char)*--loc;
+		loc -= ml + 2;
+		if (*loc == (char)CTLESC)
+			loc--;
+		loc2 -= ml - 1;
 	}
 	return 0;
 }
@@ -652,14 +676,11 @@ static char *subevalvar(char *start, char *str, int strloc, int startloc,
 		nstrloc = str - (char *)stackblock();
 	}
 
-	rmesc = startp;
-	if (quotes) {
-		rmesc = _rmescapes(startp, RMESCAPE_ALLOC | RMESCAPE_GROW);
-		if (rmesc != startp)
-			rmescend = expdest;
-		startp = stackblock() + startloc;
-		str = stackblock() + nstrloc;
-	}
+	rmesc = _rmescapes(startp, RMESCAPE_ALLOC | RMESCAPE_GROW);
+	if (rmesc != startp)
+		rmescend = expdest;
+	startp = stackblock() + startloc;
+	str = stackblock() + nstrloc;
 	rmescend--;
 
 	/* zero = subtype == VSTRIMLEFT || subtype == VSTRIMLEFTMAX */
@@ -669,16 +690,29 @@ static char *subevalvar(char *start, char *str, int strloc, int startloc,
 
 	endp = stackblock() + strloc - 1;
 	loc = scan(startp, endp, rmesc, rmescend, str, quotes, zero);
-	if (loc) {
-		if (zero) {
-			memmove(startp, loc, endp - loc);
-			loc = startp + (endp - loc);
+	if (!loc) {
+		if (quotes) {
+			rmesc = startp;
+			rmescend = endp;
 		}
-		*loc = '\0';
-	} else
-		loc = endp;
+	} else if (!quotes) {
+		if (zero)
+			rmesc = loc;
+		else
+			rmescend = loc;
+	} else if (zero) {
+		rmesc = loc;
+		rmescend = endp;
+	} else {
+		rmesc = startp;
+		rmescend = loc;
+	}
+
+	memmove(startp, rmesc, rmescend - rmesc);
+	loc = startp + (rmescend - rmesc);
 
 out:
+	*loc = '\0';
 	amount = loc - expdest;
 	STADJUST(amount, expdest);
 
@@ -704,6 +738,7 @@ evalvar(char *p, int flag)
 	ssize_t varlen;
 	int discard;
 	int quoted;
+	int mbchar;
 
 	varflags = *p++ & ~VSBIT;
 	subtype = varflags & VSTYPE;
@@ -713,8 +748,18 @@ evalvar(char *p, int flag)
 	startloc = expdest - (char *)stackblock();
 	p = strchr(p, '=') + 1;
 
+	mbchar = 0;
+	switch (subtype) {
+	case VSTRIMLEFT:
+	case VSTRIMLEFTMAX:
+	case VSTRIMRIGHT:
+	case VSTRIMRIGHTMAX:
+		mbchar = EXP_MBCHAR;
+		break;
+	}
+
 again:
-	varlen = varvalue(var, varflags, flag, quoted);
+	varlen = varvalue(var, varflags, flag | mbchar, quoted);
 	if (varflags & VSNUL)
 		varlen--;
 
@@ -801,7 +846,7 @@ static char *chtodest(int c, int flags, char *out)
 {
 	const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX;
 
-	if ((flags & QUOTES_ESC) &&
+	if ((flags & (QUOTES_ESC | EXP_MBCHAR)) &&
 	    ((syntax[c] == CCTL) ||
 	     (flags & EXP_QUOTED && syntax[c] == CBACK)))
 		USTPUTC(CTLESC, out);
@@ -823,9 +868,13 @@ static size_t memtodest(const char *p, size_t len, int flags)
 	if (unlikely(!len))
 		return 0;
 
-	q = makestrspace(len * 2, expdest);
+	/* CTLMBCHAR, 2, c, c, 2, CTLMBCHAR */
+	q = makestrspace(len * 3, expdest);
 
 	do {
+		mbstate_t mbs = {};
+		size_t ml;
+
 		c = (signed char)*p++;
 
 		if (c)
@@ -833,19 +882,30 @@ static size_t memtodest(const char *p, size_t len, int flags)
 		else if (!(flags & EXP_KEEPNUL))
 			continue;
 
-		if (c < 0) {
-			mbstate_t mbs = {};
+		if (c >= 0)
+			goto copy;
 
-			p--;
-			do {
-				q = chtodest(c, flags, q);
-			} while (mbrlen(p++, 1, &mbs) == -2 &&
-				 (c = *p, --len));
-			if (!len)
-				break;
-			continue;
+		ml = mbrlen(p - 1, len, &mbs);
+		if (ml == -1 || ml == -2 || ml < 2 || ml > MB_LEN_MAX)
+			goto copy;
+
+		if ((flags & (QUOTES_ESC | EXP_MBCHAR))) {
+			USTPUTC(CTLMBCHAR, q);
+			USTPUTC(ml, q);
 		}
 
+		q = mempcpy(q, p - 1, ml);
+
+		if ((flags & (QUOTES_ESC | EXP_MBCHAR))) {
+			USTPUTC(ml, q);
+			USTPUTC(CTLMBCHAR, q);
+		}
+
+		p += ml - 1;
+		len -= ml - 1;
+		continue;
+
+copy:
 		q = chtodest(c, flags, q);
 	} while (--len);
 
@@ -1720,6 +1780,8 @@ _rmescapes(char *str, int flag)
 	inquotes = 0;
 	notescaped = globbing;
 	while (*p) {
+		unsigned ml;
+
 		if (*p == (char)CTLQUOTEMARK) {
 			p++;
 			inquotes ^= globbing;
@@ -1743,6 +1805,18 @@ add_escape:
 			}
 		}
 		notescaped = globbing;
+
+		if (*p != (char)CTLMBCHAR)
+			goto copy;
+
+		if (*++p == (char)CTLESC)
+			p++;
+
+		ml = (unsigned char)*p++;
+		q = mempcpy(q, p, ml);
+		p += ml + 2;
+		continue;
+
 copy:
 		*q++ = *p++;
 	}
diff --git a/src/expand.h b/src/expand.h
index 49a18f9..e5a990e 100644
--- a/src/expand.h
+++ b/src/expand.h
@@ -60,6 +60,7 @@ struct arglist {
 #define EXP_QUOTED	0x100	/* expand word in double quotes */
 #define EXP_KEEPNUL	0x200	/* do not skip NUL characters */
 #define EXP_DISCARD	0x400	/* discard result of expansion */
+#define EXP_MBCHAR	0x800	/* mark multi-byte characters */
 
 
 struct jmploc;
diff --git a/src/mystring.c b/src/mystring.c
index 5eace6c..77b457c 100644
--- a/src/mystring.c
+++ b/src/mystring.c
@@ -67,7 +67,7 @@ const char cqchars[] = {
 #ifdef HAVE_FNMATCH
 	'^',
 #endif
-	CTLESC, CTLQUOTEMARK, 0
+	CTLESC, CTLMBCHAR, CTLQUOTEMARK, 0
 };
 const char illnum[] = "Illegal number: %s";
 const char homestr[] = "HOME";
diff --git a/src/parser.h b/src/parser.h
index 433573d..14bfc4f 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -44,6 +44,7 @@ union node;
 #define CTLVAR -126		/* variable defn */
 #define CTLENDVAR -125
 #define CTLBACKQ -124
+#define CTLMBCHAR -123
 #define	CTLARI -122		/* arithmetic expression */
 #define	CTLENDARI -121
 #define	CTLQUOTEMARK -120
-- 
2.39.2


  parent reply	other threads:[~2024-04-28  3:56 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-28  3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
2024-04-28  3:56 ` [v2 PATCH 1/8] shell: Call setlocale Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 2/8] shell: Use strcoll instead of strcmp where applicable Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 3/8] expand: Count multi-byte characters for VSLENGTH Herbert Xu
2024-04-28  3:57 ` Herbert Xu [this message]
2024-04-28  3:57 ` [v2 PATCH 5/8] expand: Process multi-byte characters in expmeta Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 6/8] expand: Support multi-byte characters during field splitting Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 8/8] parser: Add support for multi-byte characters Herbert Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=008ebecbab03a2504589f69ae9c2ed1353f7b6a3.1714276539.git.herbert@gondor.apana.org.au \
    --to=herbert@gondor.apana.org.au \
    --cc=dash@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.