All of lore.kernel.org
 help / color / mirror / Atom feed
From: Herbert Xu <herbert@gondor.apana.org.au>
To: DASH Mailing List <dash@vger.kernel.org>
Subject: [v2 PATCH 6/8] expand: Support multi-byte characters during field splitting
Date: Sun, 28 Apr 2024 11:57:09 +0800	[thread overview]
Message-ID: <ffd890700a18b9beaad65e26876b1d7932b0d018.1714276539.git.herbert@gondor.apana.org.au> (raw)
In-Reply-To: <cover.1714276539.git.herbert@gondor.apana.org.au>

When multi-byte characters are used in IFS, they will be used
for field splitting.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c | 201 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 140 insertions(+), 61 deletions(-)

diff --git a/src/expand.c b/src/expand.c
index 0e85025..dd2b71e 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -54,6 +54,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <wchar.h>
+#include <wctype.h>
 
 /*
  * Routines to expand arguments to commands.  We have to deal with
@@ -164,6 +165,30 @@ esclen(const char *start, const char *p) {
 	return esc;
 }
 
+static __attribute__((noinline)) unsigned mbnext(const char *p)	/* measure the (possibly escaped or multi-byte) character at p */
+{
+	unsigned start = 0;	/* bytes to skip before the character data */
+	unsigned end = 0;	/* read cursor; ends up as the byte count to advance after 'start' */
+	unsigned ml;	/* value of the length byte that follows CTLMBCHAR */
+	int c;
+
+	c = p[end++];
+
+	switch (c) {	/* NOTE(review): lines removed elsewhere in this patch compare against (char)CTLESC; confirm the uncast cases match where plain char is unsigned */
+	case CTLMBCHAR:
+		if (p[end] == CTLESC)	/* skip a CTLESC placed in front of the length byte */
+			end++;
+		ml = (unsigned char)p[end++];
+		start = end;	/* character data begins right after the length byte */
+		end = ml + 2;	/* ml data bytes plus two more bytes that callers also step over (presumably trailer bytes; confirm against the parser) */
+		break;
+	case CTLESC:
+		start++;	/* skip the escape byte; end stays 1 for the byte after it */
+		break;
+	}
+
+	return start | end << 8;	/* low 8 bits: start; bits 8 and up: end */
+}
 
 static inline const char *getpwhome(const char *name)
 {
@@ -552,6 +577,7 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 	loc2 = rmesc;
 	do {
 		const char *s = loc2;
+		unsigned mb;
 		unsigned ml;
 		int match;
 
@@ -568,19 +594,9 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 		if (!c)
 			break;
 
-		if (*loc != (char)CTLMBCHAR) {
-			if (*loc == (char)CTLESC)
-				loc++;
-			loc++;
-			loc2++;
-			continue;
-		}
-
-		if (*++loc == (char)CTLESC)
-			loc++;
-
-		ml = (unsigned char)*loc;
-		loc += ml + 3;
+		mb = mbnext(loc);
+		loc += (mb & 0xff) + (mb >> 8);
+		ml = (mb >> 8) > 3 ? (mb >> 8) - 2 : 1;
 		loc2 += ml;
 	} while (1);
 	return 0;
@@ -930,18 +946,22 @@ static size_t strtodest(const char *p, int flags)
 STATIC ssize_t
 varvalue(char *name, int varflags, int flags, int quoted)
 {
+	int subtype = varflags & VSTYPE;
+	const char *seps;
+	ssize_t len = 0;
+	unsigned seplen;
+	size_t start;
+	int discard;
+	char sepc;
+	char **ap;
+	int sep;
 	int num;
 	char *p;
 	int i;
-	int sep;
-	char sepc;
-	char **ap;
-	int subtype = varflags & VSTYPE;
-	int discard = (subtype == VSPLUS || subtype == VSLENGTH) |
-		      (flags & EXP_DISCARD);
-	ssize_t len = 0;
-	size_t start;
-	char c;
+	int c;
+
+	discard = (subtype == VSPLUS || subtype == VSLENGTH) |
+		  (flags & EXP_DISCARD);
 
 	if (!subtype) {
 		if (discard)
@@ -1004,15 +1024,27 @@ numvar:
 		sep &= ~quoted;
 		sep |= ifsset() ? (unsigned char)(c & ifsval()[0]) : ' ';
 param:
-		sepc = sep;
 		if (!(ap = shellparam.p))
 			return -1;
+		sepc = sep;
+		seps = &sepc;
+		seplen = 1;
+		if (sepc < 0) {
+			mbstate_t mbs = {};
+			size_t ml;
+
+			ml = mbrlen(ifsval(), strlen(ifsval()), &mbs);
+			if (ml != -1 && ml != -2 && ml > 1) {
+				seps = ifsval();
+				seplen = ml;
+			}
+		}
 		while ((p = *ap++)) {
 			len += strtodest(p, flags);
 
 			if (*ap && sep) {
 				len++;
-				memtodest(&sepc, 1, flags | EXP_KEEPNUL);
+				memtodest(seps, seplen, flags | EXP_KEEPNUL);
 			}
 		}
 		break;
@@ -1074,7 +1106,54 @@ recordregion(int start, int end, int nulonly)
 	ifslastp->nulonly = nulonly;
 }
 
+static __attribute__((noinline)) unsigned ifsisifs(	/* report whether the character at p occurs in ifs */
+	const char *p, unsigned ml, const char *ifs, size_t ifslen)
+{
+	bool isdefifs = false;
+	size_t slen = ifslen;	/* bytes of ifs still to scan */
+	const char *s = ifs;	/* current scan position within ifs */
+	wchar_t c = *p;
+	bool isifs;
 
+	isifs = !c;	/* a NUL byte at p is reported as IFS without scanning */
+	if (isifs) {
+		p = ifs;
+		c = *p;	/* the whitespace test below then uses the first byte of ifs */
+		slen = 0;	/* makes the scan loop below a no-op */
+	}
+
+	while (slen) {
+		mbstate_t mbst = {};	/* fresh conversion state for each decode */
+		size_t ifsml;
+		wchar_t c2;
+
+		if ((signed char)*s > 0 ||	/* positive as signed char: handled as one byte, mbrtowc is not called */
+		    (ifsml = mbrtowc(&c2, s, slen, &mbst),
+		     ifsml == -2 || ifsml == -1 || ifsml < 2)) {	/* mbrtowc error/incomplete result or a length below 2: also handled as one byte */
+			if (c == *s) {	/* compare the first byte at p with this ifs byte */
+				isifs = true;
+				break;
+			}
+			s++;
+			slen--;
+			continue;
+		}
+
+		if (ifsml == ml && !memcmp(p, s, ifsml)) {	/* multi-byte match: same length and identical bytes */
+			isifs = true;
+			c = c2;	/* classify using the decoded wide character */
+			break;
+		}
+
+		s += ifsml;
+		slen -= ifsml;
+	}
+
+	if (isifs)
+		isdefifs = iswspace(c);	/* set when the matched character is whitespace per iswspace() */
+
+	return isifs | isdefifs << 1;	/* bit 0: isifs; bit 1: isdefifs */
+}
 
 /*
  * Break the argument string into pieces based upon IFS and add the
@@ -1086,16 +1165,16 @@ recordregion(int start, int end, int nulonly)
 void
 ifsbreakup(char *string, int maxargs, struct arglist *arglist)
 {
+	const char *ifs, *realifs;
 	struct ifsregion *ifsp;
 	struct strlist *sp;
+	char *r = NULL;
+	size_t ifslen;
 	char *start;
+	int nulonly;
+	int ifsspc;
 	char *p;
 	char *q;
-	char *r = NULL;
-	const char *ifs, *realifs;
-	int ifsspc;
-	int nulonly;
-
 
 	start = string;
 	if (ifslastp != NULL) {
@@ -1110,21 +1189,27 @@ ifsbreakup(char *string, int maxargs, struct arglist *arglist)
 			afternul = nulonly;
 			nulonly = ifsp->nulonly;
 			ifs = nulonly ? nullstr : realifs;
+			ifslen = strlen(ifs);
 			ifsspc = 0;
 			while (p < string + ifsp->endoff) {
-				int c;
-				bool isifs;
+				unsigned ifschar;
+				unsigned sisifs;
 				bool isdefifs;
+				unsigned ml;
+				bool isifs;
 
 				q = p;
-				c = *p++;
-				if (c == (char)CTLESC)
-					c = *p++;
 
-				isifs = strchr(ifs, c);
-				isdefifs = false;
-				if (isifs)
-					isdefifs = strchr(defifs, c);
+				ifschar = mbnext(p);
+				p += ifschar & 0xff;
+				ml = (ifschar >> 8) > 3 ?
+				     (ifschar >> 8) - 2 : 0;
+
+				sisifs = ifsisifs(p, ml, ifs, ifslen);
+				p += ifschar >> 8;
+
+				isifs = sisifs & 1;
+				isdefifs = sisifs >> 1;
 
 				/* If only reading one more argument:
 				 * If we have exactly one field,
@@ -1380,32 +1465,24 @@ static void expmeta_rmescapes(char *enddir, char *name)
 	preglob(strcpy(enddir, name), RMESCAPE_EMETA);
 }
 
-static unsigned mbcharlen(char *p)
-{
-	int esc = 0;
-
-	if (*++p == (char)CTLESC)
-		esc++;
-
-	return esc + 3 + (unsigned char)p[esc];
-}
-
 static int skipesc(char *p)
 {
+	unsigned short mb;
 	int esc = 0;
 
-	if (p[esc] == (char)CTLMBCHAR)
-		return esc + mbcharlen(p);
+	mb = mbnext(p);
+	if ((mb >> 8) > 3)
+		return (mb & 0xff) + (mb >> 8) - 1;
 
-	if (*p == (char)CTLESC)
-		esc++;
+	esc = mb & 0xff;
 
 	if (p[esc] == '\\' && p[esc + 1]) {
 		esc++;
-		if (p[esc] == (char)CTLMBCHAR)
-			return esc + mbcharlen(p + esc);
-		if (p[esc] == (char)CTLESC)
-			esc++;
+		mb = mbnext(p + esc);
+		if ((mb >> 8) > 3)
+			return esc + (mb & 0xff) + (mb >> 8) - 1;
+
+		esc += mb & 0xff;
 	}
 
 	return esc;
@@ -1813,6 +1890,7 @@ _rmescapes(char *str, int flag)
 	inquotes = 0;
 	notescaped = globbing;
 	while (*p) {
+		unsigned mb;
 		unsigned ml;
 
 		if (*p == (char)CTLQUOTEMARK) {
@@ -1845,13 +1923,14 @@ add_escape:
 		}
 		notescaped = globbing;
 
-		if (*p != (char)CTLMBCHAR)
+		mb = mbnext(p);
+		ml = mb >> 8;
+
+		if (ml <= 3)
 			goto copy;
 
-		if (*++p == (char)CTLESC)
-			p++;
-
-		ml = (unsigned char)*p++;
+		ml -= 2;
+		p += mb & 0xff;
 		q = mempcpy(q, p, ml);
 		p += ml + 2;
 		continue;
-- 
2.39.2


  parent reply	other threads:[~2024-04-28  3:56 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-28  3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
2024-04-28  3:56 ` [v2 PATCH 1/8] shell: Call setlocale Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 2/8] shell: Use strcoll instead of strcmp where applicable Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 3/8] expand: Count multi-byte characters for VSLENGTH Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 4/8] expand: Process multi-byte characters in subevalvar Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 5/8] expand: Process multi-byte characters in expmeta Herbert Xu
2024-04-28  3:57 ` Herbert Xu [this message]
2024-04-28  3:57 ` [v2 PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
2024-04-28  3:57 ` [v2 PATCH 8/8] parser: Add support for multi-byte characters Herbert Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ffd890700a18b9beaad65e26876b1d7932b0d018.1714276539.git.herbert@gondor.apana.org.au \
    --to=herbert@gondor.apana.org.au \
    --cc=dash@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.