* [v2 PATCH 1/8] shell: Call setlocale
2024-04-28 3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
@ 2024-04-28 3:56 ` Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 2/8] shell: Use strcoll instead of strcmp where applicable Herbert Xu
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Herbert Xu @ 2024-04-28 3:56 UTC (permalink / raw)
To: DASH Mailing List
Call setlocale to initialise locale settings for libc.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/main.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/src/main.c b/src/main.c
index 7beb280..1e192f8 100644
--- a/src/main.c
+++ b/src/main.c
@@ -32,6 +32,7 @@
* SUCH DAMAGE.
*/
+#include <locale.h>
#include <stdio.h>
#include <signal.h>
#include <sys/stat.h>
@@ -101,6 +102,9 @@ main(int argc, char **argv)
#if PROFILE
monitor(4, etext, profile_buf, sizeof profile_buf, 50);
#endif
+
+ setlocale(LC_ALL, "");
+
state = 0;
if (unlikely(setjmp(main_handler.loc))) {
int e;
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [v2 PATCH 2/8] shell: Use strcoll instead of strcmp where applicable
2024-04-28 3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
2024-04-28 3:56 ` [v2 PATCH 1/8] shell: Call setlocale Herbert Xu
@ 2024-04-28 3:57 ` Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 3/8] expand: Count multi-byte characters for VSLENGTH Herbert Xu
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Herbert Xu @ 2024-04-28 3:57 UTC (permalink / raw)
To: DASH Mailing List
Use strcoll instead of strcmp so that the locale is taken into
account when sorting strings during pathname expansion, and for
the built-in test(1) string comparison operators.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/bltin/test.c | 8 ++++----
src/expand.c | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/bltin/test.c b/src/bltin/test.c
index fd8a43b..2db4d0f 100644
--- a/src/bltin/test.c
+++ b/src/bltin/test.c
@@ -353,13 +353,13 @@ binop(void)
/* NOTREACHED */
#endif
case STREQ:
- return strcmp(opnd1, opnd2) == 0;
+ return strcoll(opnd1, opnd2) == 0;
case STRNE:
- return strcmp(opnd1, opnd2) != 0;
+ return strcoll(opnd1, opnd2) != 0;
case STRLT:
- return strcmp(opnd1, opnd2) < 0;
+ return strcoll(opnd1, opnd2) < 0;
case STRGT:
- return strcmp(opnd1, opnd2) > 0;
+ return strcoll(opnd1, opnd2) > 0;
case INTEQ:
return getn(opnd1) == getn(opnd2);
case INTNE:
diff --git a/src/expand.c b/src/expand.c
index 0db2b29..9ac981e 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -1476,7 +1476,7 @@ msort(struct strlist *list, int len)
p = msort(p, len - half); /* sort second half */
lpp = &list;
for (;;) {
- if (strcmp(p->text, q->text) < 0) {
+ if (strcoll(p->text, q->text) < 0) {
*lpp = p;
lpp = &p->next;
if ((p = *lpp) == NULL) {
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [v2 PATCH 3/8] expand: Count multi-byte characters for VSLENGTH
2024-04-28 3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
2024-04-28 3:56 ` [v2 PATCH 1/8] shell: Call setlocale Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 2/8] shell: Use strcoll instead of strcmp where applicable Herbert Xu
@ 2024-04-28 3:57 ` Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 4/8] expand: Process multi-byte characters in subevalvar Herbert Xu
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Herbert Xu @ 2024-04-28 3:57 UTC (permalink / raw)
To: DASH Mailing List
Count multi-byte characters in variables rather than bytes
and return that as the length expansion.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/expand.c | 62 +++++++++++++++++++++++++++++++++++++---------------
1 file changed, 44 insertions(+), 18 deletions(-)
diff --git a/src/expand.c b/src/expand.c
index 9ac981e..ad186b0 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -53,6 +53,7 @@
#endif
#include <ctype.h>
#include <stdbool.h>
+#include <wchar.h>
/*
* Routines to expand arguments to commands. We have to deal with
@@ -796,6 +797,18 @@ really_record:
return p;
}
+static char *chtodest(int c, int flags, char *out)
+{
+ const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX;
+
+ if ((flags & QUOTES_ESC) &&
+ ((syntax[c] == CCTL) ||
+ (flags & EXP_QUOTED && syntax[c] == CBACK)))
+ USTPUTC(CTLESC, out);
+ USTPUTC(c, out);
+
+ return out;
+}
/*
* Put a string on the stack.
@@ -803,38 +816,48 @@ really_record:
static size_t memtodest(const char *p, size_t len, int flags)
{
- const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX;
+ size_t count = 0;
char *q;
- char *s;
+ int c;
if (unlikely(!len))
return 0;
q = makestrspace(len * 2, expdest);
- s = q;
do {
- int c = (signed char)*p++;
- if (c) {
- if ((flags & QUOTES_ESC) &&
- ((syntax[c] == CCTL) ||
- (flags & EXP_QUOTED && syntax[c] == CBACK)))
- USTPUTC(CTLESC, q);
- } else if (!(flags & EXP_KEEPNUL))
+ c = (signed char)*p++;
+
+ if (c)
+ count++;
+ else if (!(flags & EXP_KEEPNUL))
continue;
- USTPUTC(c, q);
+
+ if (c < 0) {
+ mbstate_t mbs = {};
+
+ p--;
+ do {
+ q = chtodest(c, flags, q);
+ } while (mbrlen(p++, 1, &mbs) == -2 &&
+ (c = *p, --len));
+ if (!len)
+ break;
+ continue;
+ }
+
+ q = chtodest(c, flags, q);
} while (--len);
expdest = q;
- return q - s;
+ return count;
}
static size_t strtodest(const char *p, int flags)
{
size_t len = strlen(p);
- memtodest(p, len, flags);
- return len;
+ return memtodest(p, len, flags);
}
@@ -856,6 +879,7 @@ varvalue(char *name, int varflags, int flags, int quoted)
int discard = (subtype == VSPLUS || subtype == VSLENGTH) |
(flags & EXP_DISCARD);
ssize_t len = 0;
+ size_t start;
char c;
if (!subtype) {
@@ -865,9 +889,9 @@ varvalue(char *name, int varflags, int flags, int quoted)
sh_error("Bad substitution");
}
- flags |= EXP_KEEPNUL;
flags &= discard ? ~QUOTES_ESC : ~0;
sep = (flags & EXP_FULL) << CHAR_BIT;
+ start = expdest - (char *)stackblock();
switch (*name) {
case '$':
@@ -927,7 +951,7 @@ param:
if (*ap && sep) {
len++;
- memtodest(&sepc, 1, flags);
+ memtodest(&sepc, 1, flags | EXP_KEEPNUL);
}
}
break;
@@ -957,7 +981,7 @@ value:
}
if (discard)
- STADJUST(-len, expdest);
+ expdest = (char *)stackblock() + start;
return len;
}
@@ -1758,11 +1782,13 @@ casematch(union node *pattern, char *val)
static size_t cvtnum(intmax_t num, int flags)
{
+ size_t start = expdest - (char *)stackblock();
int len = max_int_length(sizeof(num));
char buf[len];
len = fmtstr(buf, len, "%" PRIdMAX, num);
- return memtodest(buf, len, flags);
+ memtodest(buf, len, flags);
+ return (expdest - (char *)stackblock()) - start;
}
STATIC void
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [v2 PATCH 4/8] expand: Process multi-byte characters in subevalvar
2024-04-28 3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
` (2 preceding siblings ...)
2024-04-28 3:57 ` [v2 PATCH 3/8] expand: Count multi-byte characters for VSLENGTH Herbert Xu
@ 2024-04-28 3:57 ` Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 5/8] expand: Process multi-byte characters in expmeta Herbert Xu
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Herbert Xu @ 2024-04-28 3:57 UTC (permalink / raw)
To: DASH Mailing List
When trimming variables in subevalvar, process multi-byte characters
as one unit instead of their constituent bytes.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/expand.c | 192 ++++++++++++++++++++++++++++++++++---------------
src/expand.h | 1 +
src/mystring.c | 2 +-
src/parser.h | 1 +
4 files changed, 136 insertions(+), 60 deletions(-)
diff --git a/src/expand.c b/src/expand.c
index ad186b0..60a51b1 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -32,27 +32,27 @@
* SUCH DAMAGE.
*/
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/stat.h>
+#include <ctype.h>
#include <dirent.h>
-#include <unistd.h>
-#ifdef HAVE_GETPWNAM
-#include <pwd.h>
-#endif
-#include <stdlib.h>
-#include <stdio.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <string.h>
#ifdef HAVE_FNMATCH
#include <fnmatch.h>
#endif
#ifdef HAVE_GLOB
#include <glob.h>
#endif
-#include <ctype.h>
+#include <inttypes.h>
+#include <limits.h>
+#ifdef HAVE_GETPWNAM
+#include <pwd.h>
+#endif
+#include <string.h>
#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <unistd.h>
#include <wchar.h>
/*
@@ -550,8 +550,10 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
loc = startp;
loc2 = rmesc;
do {
- int match;
const char *s = loc2;
+ unsigned ml;
+ int match;
+
c = *loc2;
if (zero) {
*loc2 = '\0';
@@ -560,12 +562,26 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
match = pmatch(str, s);
*loc2 = c;
if (match)
- return loc;
- if (quotes && *loc == (char)CTLESC)
+ return quotes ? loc : loc2;
+
+ if (!c)
+ break;
+
+ if (*loc != (char)CTLMBCHAR) {
+ if (*loc == (char)CTLESC)
+ loc++;
loc++;
- loc++;
- loc2++;
- } while (c);
+ loc2++;
+ continue;
+ }
+
+ if (*++loc == (char)CTLESC)
+ loc++;
+
+ ml = (unsigned char)*loc;
+ loc += ml + 3;
+ loc2 += ml;
+ } while (1);
return 0;
}
@@ -573,14 +589,16 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
static char *scanright(char *startp, char *endp, char *rmesc, char *rmescend,
char *str, int quotes, int zero
) {
- int esc = 0;
+ size_t esc = 0;
char *loc;
char *loc2;
for (loc = endp, loc2 = rmescend; loc >= startp; loc2--) {
- int match;
- char c = *loc2;
const char *s = loc2;
+ char c = *loc2;
+ unsigned ml;
+ int match;
+
if (zero) {
*loc2 = '\0';
s = rmesc;
@@ -588,17 +606,23 @@ static char *scanright(char *startp, char *endp, char *rmesc, char *rmescend,
match = pmatch(str, s);
*loc2 = c;
if (match)
- return loc;
+ return quotes ? loc : loc2;
loc--;
- if (quotes) {
- if (--esc < 0) {
- esc = esclen(startp, loc);
- }
- if (esc % 2) {
- esc--;
- loc--;
- }
+ if (!esc--)
+ esc = esclen(startp, loc);
+ if (esc % 2) {
+ esc--;
+ loc--;
+ continue;
}
+ if (*loc != (char)CTLMBCHAR)
+ continue;
+
+ ml = (unsigned char)*--loc;
+ loc -= ml + 2;
+ if (*loc == (char)CTLESC)
+ loc--;
+ loc2 -= ml - 1;
}
return 0;
}
@@ -652,14 +676,11 @@ static char *subevalvar(char *start, char *str, int strloc, int startloc,
nstrloc = str - (char *)stackblock();
}
- rmesc = startp;
- if (quotes) {
- rmesc = _rmescapes(startp, RMESCAPE_ALLOC | RMESCAPE_GROW);
- if (rmesc != startp)
- rmescend = expdest;
- startp = stackblock() + startloc;
- str = stackblock() + nstrloc;
- }
+ rmesc = _rmescapes(startp, RMESCAPE_ALLOC | RMESCAPE_GROW);
+ if (rmesc != startp)
+ rmescend = expdest;
+ startp = stackblock() + startloc;
+ str = stackblock() + nstrloc;
rmescend--;
/* zero = subtype == VSTRIMLEFT || subtype == VSTRIMLEFTMAX */
@@ -669,16 +690,29 @@ static char *subevalvar(char *start, char *str, int strloc, int startloc,
endp = stackblock() + strloc - 1;
loc = scan(startp, endp, rmesc, rmescend, str, quotes, zero);
- if (loc) {
- if (zero) {
- memmove(startp, loc, endp - loc);
- loc = startp + (endp - loc);
+ if (!loc) {
+ if (quotes) {
+ rmesc = startp;
+ rmescend = endp;
}
- *loc = '\0';
- } else
- loc = endp;
+ } else if (!quotes) {
+ if (zero)
+ rmesc = loc;
+ else
+ rmescend = loc;
+ } else if (zero) {
+ rmesc = loc;
+ rmescend = endp;
+ } else {
+ rmesc = startp;
+ rmescend = loc;
+ }
+
+ memmove(startp, rmesc, rmescend - rmesc);
+ loc = startp + (rmescend - rmesc);
out:
+ *loc = '\0';
amount = loc - expdest;
STADJUST(amount, expdest);
@@ -704,6 +738,7 @@ evalvar(char *p, int flag)
ssize_t varlen;
int discard;
int quoted;
+ int mbchar;
varflags = *p++ & ~VSBIT;
subtype = varflags & VSTYPE;
@@ -713,8 +748,18 @@ evalvar(char *p, int flag)
startloc = expdest - (char *)stackblock();
p = strchr(p, '=') + 1;
+ mbchar = 0;
+ switch (subtype) {
+ case VSTRIMLEFT:
+ case VSTRIMLEFTMAX:
+ case VSTRIMRIGHT:
+ case VSTRIMRIGHTMAX:
+ mbchar = EXP_MBCHAR;
+ break;
+ }
+
again:
- varlen = varvalue(var, varflags, flag, quoted);
+ varlen = varvalue(var, varflags, flag | mbchar, quoted);
if (varflags & VSNUL)
varlen--;
@@ -801,7 +846,7 @@ static char *chtodest(int c, int flags, char *out)
{
const char *syntax = flags & EXP_QUOTED ? DQSYNTAX : BASESYNTAX;
- if ((flags & QUOTES_ESC) &&
+ if ((flags & (QUOTES_ESC | EXP_MBCHAR)) &&
((syntax[c] == CCTL) ||
(flags & EXP_QUOTED && syntax[c] == CBACK)))
USTPUTC(CTLESC, out);
@@ -823,9 +868,13 @@ static size_t memtodest(const char *p, size_t len, int flags)
if (unlikely(!len))
return 0;
- q = makestrspace(len * 2, expdest);
+ /* CTLMBCHAR, 2, c, c, 2, CTLMBCHAR */
+ q = makestrspace(len * 3, expdest);
do {
+ mbstate_t mbs = {};
+ size_t ml;
+
c = (signed char)*p++;
if (c)
@@ -833,19 +882,30 @@ static size_t memtodest(const char *p, size_t len, int flags)
else if (!(flags & EXP_KEEPNUL))
continue;
- if (c < 0) {
- mbstate_t mbs = {};
+ if (c >= 0)
+ goto copy;
- p--;
- do {
- q = chtodest(c, flags, q);
- } while (mbrlen(p++, 1, &mbs) == -2 &&
- (c = *p, --len));
- if (!len)
- break;
- continue;
+ ml = mbrlen(p - 1, len, &mbs);
+ if (ml == -1 || ml == -2 || ml < 2 || ml > MB_LEN_MAX)
+ goto copy;
+
+ if ((flags & (QUOTES_ESC | EXP_MBCHAR))) {
+ USTPUTC(CTLMBCHAR, q);
+ USTPUTC(ml, q);
}
+ q = mempcpy(q, p - 1, ml);
+
+ if ((flags & (QUOTES_ESC | EXP_MBCHAR))) {
+ USTPUTC(ml, q);
+ USTPUTC(CTLMBCHAR, q);
+ }
+
+ p += ml - 1;
+ len -= ml - 1;
+ continue;
+
+copy:
q = chtodest(c, flags, q);
} while (--len);
@@ -1720,6 +1780,8 @@ _rmescapes(char *str, int flag)
inquotes = 0;
notescaped = globbing;
while (*p) {
+ unsigned ml;
+
if (*p == (char)CTLQUOTEMARK) {
p++;
inquotes ^= globbing;
@@ -1743,6 +1805,18 @@ add_escape:
}
}
notescaped = globbing;
+
+ if (*p != (char)CTLMBCHAR)
+ goto copy;
+
+ if (*++p == (char)CTLESC)
+ p++;
+
+ ml = (unsigned char)*p++;
+ q = mempcpy(q, p, ml);
+ p += ml + 2;
+ continue;
+
copy:
*q++ = *p++;
}
diff --git a/src/expand.h b/src/expand.h
index 49a18f9..e5a990e 100644
--- a/src/expand.h
+++ b/src/expand.h
@@ -60,6 +60,7 @@ struct arglist {
#define EXP_QUOTED 0x100 /* expand word in double quotes */
#define EXP_KEEPNUL 0x200 /* do not skip NUL characters */
#define EXP_DISCARD 0x400 /* discard result of expansion */
+#define EXP_MBCHAR 0x800 /* mark multi-byte characters */
struct jmploc;
diff --git a/src/mystring.c b/src/mystring.c
index 5eace6c..77b457c 100644
--- a/src/mystring.c
+++ b/src/mystring.c
@@ -67,7 +67,7 @@ const char cqchars[] = {
#ifdef HAVE_FNMATCH
'^',
#endif
- CTLESC, CTLQUOTEMARK, 0
+ CTLESC, CTLMBCHAR, CTLQUOTEMARK, 0
};
const char illnum[] = "Illegal number: %s";
const char homestr[] = "HOME";
diff --git a/src/parser.h b/src/parser.h
index 433573d..14bfc4f 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -44,6 +44,7 @@ union node;
#define CTLVAR -126 /* variable defn */
#define CTLENDVAR -125
#define CTLBACKQ -124
+#define CTLMBCHAR -123
#define CTLARI -122 /* arithmetic expression */
#define CTLENDARI -121
#define CTLQUOTEMARK -120
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [v2 PATCH 5/8] expand: Process multi-byte characters in expmeta
2024-04-28 3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
` (3 preceding siblings ...)
2024-04-28 3:57 ` [v2 PATCH 4/8] expand: Process multi-byte characters in subevalvar Herbert Xu
@ 2024-04-28 3:57 ` Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 6/8] expand: Support multi-byte characters during field splitting Herbert Xu
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Herbert Xu @ 2024-04-28 3:57 UTC (permalink / raw)
To: DASH Mailing List
When glob(3) is not in use, make sure that expmeta processes
multi-byte characters correctly.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/expand.c | 107 +++++++++++++++++++++++++++++++++++----------------
1 file changed, 73 insertions(+), 34 deletions(-)
diff --git a/src/expand.c b/src/expand.c
index 60a51b1..0e85025 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -84,6 +84,7 @@
#define RMESCAPE_GLOB 0x2 /* Add backslashes for glob */
#define RMESCAPE_GROW 0x8 /* Grow strings instead of stalloc */
#define RMESCAPE_HEAP 0x10 /* Malloc strings instead of stalloc */
+#define RMESCAPE_EMETA 0x20 /* Remove backslashes too */
/* Add CTLESC when necessary. */
#define QUOTES_ESC (EXP_FULL | EXP_CASE)
@@ -1347,15 +1348,13 @@ expandmeta(struct strlist *str)
savelastp = exparg.lastp;
INTOFF;
- p = preglob(str->text, RMESCAPE_ALLOC | RMESCAPE_HEAP);
+ p = str->text;
len = strlen(p);
expdir_max = len + PATH_MAX;
expdir = ckmalloc(expdir_max);
expmeta(p, len, 0);
ckfree(expdir);
- if (p != str->text)
- ckfree(p);
INTON;
if (exparg.lastp == savelastp) {
/*
@@ -1376,6 +1375,41 @@ nometa:
}
}
+static void expmeta_rmescapes(char *enddir, char *name)
+{
+ preglob(strcpy(enddir, name), RMESCAPE_EMETA);
+}
+
+static unsigned mbcharlen(char *p)
+{
+ int esc = 0;
+
+ if (*++p == (char)CTLESC)
+ esc++;
+
+ return esc + 3 + (unsigned char)p[esc];
+}
+
+static int skipesc(char *p)
+{
+ int esc = 0;
+
+ if (p[esc] == (char)CTLMBCHAR)
+ return esc + mbcharlen(p);
+
+ if (*p == (char)CTLESC)
+ esc++;
+
+ if (p[esc] == '\\' && p[esc + 1]) {
+ esc++;
+ if (p[esc] == (char)CTLMBCHAR)
+ return esc + mbcharlen(p + esc);
+ if (p[esc] == (char)CTLESC)
+ esc++;
+ }
+
+ return esc;
+}
/*
* Do metacharacter (i.e. *, ?, [...]) expansion.
@@ -1385,17 +1419,18 @@ STATIC void
expmeta(char *name, unsigned name_len, unsigned expdir_len)
{
char *enddir = expdir + expdir_len;
- char *p;
+ struct stat64 statb;
+ struct dirent64 *dp;
const char *cp;
- char *start;
char *endname;
int metaflag;
- struct stat64 statb;
- DIR *dirp;
- struct dirent64 *dp;
- int atend;
int matchdot;
+ char *start;
+ DIR *dirp;
+ char *pat;
+ char *p;
int esc;
+ int c;
metaflag = 0;
start = name;
@@ -1407,8 +1442,7 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
if (*q == '!')
q++;
for (;;) {
- if (*q == '\\')
- q++;
+ q += skipesc(q);
if (*q == '/' || *q == '\0')
break;
if (*++q == ']') {
@@ -1417,8 +1451,8 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
}
}
} else {
- if (*p == '\\' && p[1])
- esc++;
+ esc = skipesc(p);
+
if (p[esc] == '/') {
if (metaflag)
break;
@@ -1429,24 +1463,18 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
if (metaflag == 0) { /* we've reached the end of the file name */
if (!expdir_len)
return;
- p = name;
- do {
- if (*p == '\\' && p[1])
- p++;
- *enddir++ = *p;
- } while (*p++);
+ expmeta_rmescapes(enddir, name);
if (lstat64(expdir, &statb) >= 0)
addfname(expdir);
return;
}
endname = p;
if (name < start) {
- p = name;
- do {
- if (*p == '\\' && p[1])
- p++;
- *enddir++ = *p++;
- } while (p < start);
+ c = *start;
+ *start = 0;
+ expmeta_rmescapes(enddir, name);
+ *start = c;
+ enddir += strlen(enddir);
}
*enddir = 0;
cp = expdir;
@@ -1455,25 +1483,26 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
cp = ".";
if ((dirp = opendir(cp)) == NULL)
return;
- if (*endname == 0) {
- atend = 1;
- } else {
- atend = 0;
+ c = *endname;
+ if (c) {
*endname = '\0';
endname += esc + 1;
}
name_len -= endname - name;
matchdot = 0;
p = start;
+ if (*p == (char)CTLESC)
+ p++;
if (*p == '\\')
p++;
if (*p == '.')
matchdot++;
+ pat = preglob(start, RMESCAPE_ALLOC | RMESCAPE_HEAP);
while (! int_pending() && (dp = readdir64(dirp)) != NULL) {
if (dp->d_name[0] == '.' && ! matchdot)
continue;
- if (pmatch(start, dp->d_name)) {
- if (atend) {
+ if (pmatch(pat, dp->d_name)) {
+ if (!c) {
scopy(dp->d_name, enddir);
addfname(expdir);
} else {
@@ -1496,9 +1525,11 @@ expmeta(char *name, unsigned name_len, unsigned expdir_len)
}
}
}
+ if (pat != start)
+ ckfree(pat);
closedir(dirp);
- if (! atend)
- endname[-esc - 1] = esc ? '\\' : '/';
+ if (c)
+ endname[-esc - 1] = c;
}
#endif /* HAVE_GLOB */
@@ -1743,6 +1774,7 @@ _rmescapes(char *str, int flag)
int notescaped;
int globbing;
int inquotes;
+ int expmeta;
p = strpbrk(str, cqchars);
if (!p) {
@@ -1751,6 +1783,7 @@ _rmescapes(char *str, int flag)
q = p;
r = str;
globbing = flag & RMESCAPE_GLOB;
+ expmeta = (flag & RMESCAPE_EMETA) ? RMESCAPE_GLOB : 0;
if (flag & RMESCAPE_ALLOC) {
size_t len = p - str;
@@ -1790,6 +1823,10 @@ _rmescapes(char *str, int flag)
if (*p == '\\') {
/* naked back slash */
notescaped ^= globbing;
+ if (expmeta & ~notescaped) {
+ p++;
+ continue;
+ }
goto copy;
}
if (FNMATCH_IS_ENABLED && *p == '^')
@@ -1797,7 +1834,9 @@ _rmescapes(char *str, int flag)
if (*p == (char)CTLESC) {
p++;
add_escape:
- if (notescaped)
+ if (expmeta)
+ ;
+ else if (notescaped)
*q++ = '\\';
else if (inquotes) {
*q++ = '\\';
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [v2 PATCH 6/8] expand: Support multi-byte characters during field splitting
2024-04-28 3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
` (4 preceding siblings ...)
2024-04-28 3:57 ` [v2 PATCH 5/8] expand: Process multi-byte characters in expmeta Herbert Xu
@ 2024-04-28 3:57 ` Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 8/8] parser: Add support for multi-byte characters Herbert Xu
7 siblings, 0 replies; 9+ messages in thread
From: Herbert Xu @ 2024-04-28 3:57 UTC (permalink / raw)
To: DASH Mailing List
When IFS contains multi-byte characters, each one is recognised as
a single delimiter during field splitting.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/expand.c | 201 +++++++++++++++++++++++++++++++++++----------------
1 file changed, 140 insertions(+), 61 deletions(-)
diff --git a/src/expand.c b/src/expand.c
index 0e85025..dd2b71e 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -54,6 +54,7 @@
#include <sys/stat.h>
#include <unistd.h>
#include <wchar.h>
+#include <wctype.h>
/*
* Routines to expand arguments to commands. We have to deal with
@@ -164,6 +165,30 @@ esclen(const char *start, const char *p) {
return esc;
}
+static __attribute__((noinline)) unsigned mbnext(const char *p)
+{
+ unsigned start = 0;
+ unsigned end = 0;
+ unsigned ml;
+ int c;
+
+ c = p[end++];
+
+ switch (c) {
+ case CTLMBCHAR:
+ if (p[end] == CTLESC)
+ end++;
+ ml = (unsigned char)p[end++];
+ start = end;
+ end = ml + 2;
+ break;
+ case CTLESC:
+ start++;
+ break;
+ }
+
+ return start | end << 8;
+}
static inline const char *getpwhome(const char *name)
{
@@ -552,6 +577,7 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
loc2 = rmesc;
do {
const char *s = loc2;
+ unsigned mb;
unsigned ml;
int match;
@@ -568,19 +594,9 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
if (!c)
break;
- if (*loc != (char)CTLMBCHAR) {
- if (*loc == (char)CTLESC)
- loc++;
- loc++;
- loc2++;
- continue;
- }
-
- if (*++loc == (char)CTLESC)
- loc++;
-
- ml = (unsigned char)*loc;
- loc += ml + 3;
+ mb = mbnext(loc);
+ loc += (mb & 0xff) + (mb >> 8);
+ ml = (mb >> 8) > 3 ? (mb >> 8) - 2 : 1;
loc2 += ml;
} while (1);
return 0;
@@ -930,18 +946,22 @@ static size_t strtodest(const char *p, int flags)
STATIC ssize_t
varvalue(char *name, int varflags, int flags, int quoted)
{
+ int subtype = varflags & VSTYPE;
+ const char *seps;
+ ssize_t len = 0;
+ unsigned seplen;
+ size_t start;
+ int discard;
+ char sepc;
+ char **ap;
+ int sep;
int num;
char *p;
int i;
- int sep;
- char sepc;
- char **ap;
- int subtype = varflags & VSTYPE;
- int discard = (subtype == VSPLUS || subtype == VSLENGTH) |
- (flags & EXP_DISCARD);
- ssize_t len = 0;
- size_t start;
- char c;
+ int c;
+
+ discard = (subtype == VSPLUS || subtype == VSLENGTH) |
+ (flags & EXP_DISCARD);
if (!subtype) {
if (discard)
@@ -1004,15 +1024,27 @@ numvar:
sep &= ~quoted;
sep |= ifsset() ? (unsigned char)(c & ifsval()[0]) : ' ';
param:
- sepc = sep;
if (!(ap = shellparam.p))
return -1;
+ sepc = sep;
+ seps = &sepc;
+ seplen = 1;
+ if (sepc < 0) {
+ mbstate_t mbs = {};
+ size_t ml;
+
+ ml = mbrlen(ifsval(), strlen(ifsval()), &mbs);
+ if (ml != -1 && ml != -2 && ml > 1) {
+ seps = ifsval();
+ seplen = ml;
+ }
+ }
while ((p = *ap++)) {
len += strtodest(p, flags);
if (*ap && sep) {
len++;
- memtodest(&sepc, 1, flags | EXP_KEEPNUL);
+ memtodest(seps, seplen, flags | EXP_KEEPNUL);
}
}
break;
@@ -1074,7 +1106,54 @@ recordregion(int start, int end, int nulonly)
ifslastp->nulonly = nulonly;
}
+static __attribute__((noinline)) unsigned ifsisifs(
+ const char *p, unsigned ml, const char *ifs, size_t ifslen)
+{
+ bool isdefifs = false;
+ size_t slen = ifslen;
+ const char *s = ifs;
+ wchar_t c = *p;
+ bool isifs;
+ isifs = !c;
+ if (isifs) {
+ p = ifs;
+ c = *p;
+ slen = 0;
+ }
+
+ while (slen) {
+ mbstate_t mbst = {};
+ size_t ifsml;
+ wchar_t c2;
+
+ if ((signed char)*s > 0 ||
+ (ifsml = mbrtowc(&c2, s, slen, &mbst),
+ ifsml == -2 || ifsml == -1 || ifsml < 2)) {
+ if (c == *s) {
+ isifs = true;
+ break;
+ }
+ s++;
+ slen--;
+ continue;
+ }
+
+ if (ifsml == ml && !memcmp(p, s, ifsml)) {
+ isifs = true;
+ c = c2;
+ break;
+ }
+
+ s += ifsml;
+ slen -= ifsml;
+ }
+
+ if (isifs)
+ isdefifs = iswspace(c);
+
+ return isifs | isdefifs << 1;
+}
/*
* Break the argument string into pieces based upon IFS and add the
@@ -1086,16 +1165,16 @@ recordregion(int start, int end, int nulonly)
void
ifsbreakup(char *string, int maxargs, struct arglist *arglist)
{
+ const char *ifs, *realifs;
struct ifsregion *ifsp;
struct strlist *sp;
+ char *r = NULL;
+ size_t ifslen;
char *start;
+ int nulonly;
+ int ifsspc;
char *p;
char *q;
- char *r = NULL;
- const char *ifs, *realifs;
- int ifsspc;
- int nulonly;
-
start = string;
if (ifslastp != NULL) {
@@ -1110,21 +1189,27 @@ ifsbreakup(char *string, int maxargs, struct arglist *arglist)
afternul = nulonly;
nulonly = ifsp->nulonly;
ifs = nulonly ? nullstr : realifs;
+ ifslen = strlen(ifs);
ifsspc = 0;
while (p < string + ifsp->endoff) {
- int c;
- bool isifs;
+ unsigned ifschar;
+ unsigned sisifs;
bool isdefifs;
+ unsigned ml;
+ bool isifs;
q = p;
- c = *p++;
- if (c == (char)CTLESC)
- c = *p++;
- isifs = strchr(ifs, c);
- isdefifs = false;
- if (isifs)
- isdefifs = strchr(defifs, c);
+ ifschar = mbnext(p);
+ p += ifschar & 0xff;
+ ml = (ifschar >> 8) > 3 ?
+ (ifschar >> 8) - 2 : 0;
+
+ sisifs = ifsisifs(p, ml, ifs, ifslen);
+ p += ifschar >> 8;
+
+ isifs = sisifs & 1;
+ isdefifs = sisifs >> 1;
/* If only reading one more argument:
* If we have exactly one field,
@@ -1380,32 +1465,24 @@ static void expmeta_rmescapes(char *enddir, char *name)
preglob(strcpy(enddir, name), RMESCAPE_EMETA);
}
-static unsigned mbcharlen(char *p)
-{
- int esc = 0;
-
- if (*++p == (char)CTLESC)
- esc++;
-
- return esc + 3 + (unsigned char)p[esc];
-}
-
static int skipesc(char *p)
{
+ unsigned short mb;
int esc = 0;
- if (p[esc] == (char)CTLMBCHAR)
- return esc + mbcharlen(p);
+ mb = mbnext(p);
+ if ((mb >> 8) > 3)
+ return (mb & 0xff) + (mb >> 8) - 1;
- if (*p == (char)CTLESC)
- esc++;
+ esc = mb & 0xff;
if (p[esc] == '\\' && p[esc + 1]) {
esc++;
- if (p[esc] == (char)CTLMBCHAR)
- return esc + mbcharlen(p + esc);
- if (p[esc] == (char)CTLESC)
- esc++;
+ mb = mbnext(p + esc);
+ if ((mb >> 8) > 3)
+ return esc + (mb & 0xff) + (mb >> 8) - 1;
+
+ esc += mb & 0xff;
}
return esc;
@@ -1813,6 +1890,7 @@ _rmescapes(char *str, int flag)
inquotes = 0;
notescaped = globbing;
while (*p) {
+ unsigned mb;
unsigned ml;
if (*p == (char)CTLQUOTEMARK) {
@@ -1845,13 +1923,14 @@ add_escape:
}
notescaped = globbing;
- if (*p != (char)CTLMBCHAR)
+ mb = mbnext(p);
+ ml = mb >> 8;
+
+ if (ml <= 3)
goto copy;
- if (*++p == (char)CTLESC)
- p++;
-
- ml = (unsigned char)*p++;
+ ml -= 2;
+ p += mb & 0xff;
q = mempcpy(q, p, ml);
p += ml + 2;
continue;
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [v2 PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc
2024-04-28 3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
` (5 preceding siblings ...)
2024-04-28 3:57 ` [v2 PATCH 6/8] expand: Support multi-byte characters during field splitting Herbert Xu
@ 2024-04-28 3:57 ` Herbert Xu
2024-04-28 3:57 ` [v2 PATCH 8/8] parser: Add support for multi-byte characters Herbert Xu
7 siblings, 0 replies; 9+ messages in thread
From: Herbert Xu @ 2024-04-28 3:57 UTC (permalink / raw)
To: DASH Mailing List
In order to parse multi-byte characters which may be up to MB_LEN_MAX
bytes long, allow enough calls to pungetc to undo a single multi-byte
character.
Also add a function pungetn to do multiple pungetc calls in a row.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/input.c | 58 ++++++++++++++++++++++++++++++++++-------------------
src/input.h | 11 +++++-----
2 files changed, 42 insertions(+), 27 deletions(-)
diff --git a/src/input.c b/src/input.c
index fb9858f..c7805ad 100644
--- a/src/input.c
+++ b/src/input.c
@@ -56,7 +56,7 @@
#include "main.h"
#include "myhistedit.h"
-#define IBUFSIZ (BUFSIZ + 1)
+#define IBUFSIZ (BUFSIZ + PUNGETC_MAX + 1)
MKINIT struct parsefile basepf; /* top level input file */
@@ -83,13 +83,16 @@ INIT {
}
RESET {
+ int c;
+
/* clear input buffer */
popallfiles();
- basepf.unget = 0;
- while (basepf.lastc[0] != '\n' &&
- basepf.lastc[0] != PEOF &&
- !int_pending())
- pgetc();
+
+ c = PEOF;
+ if (basepf.nextc - basebuf > basepf.unget)
+ c = basepf.nextc[-basepf.unget];
+ while (c != '\n' && c != PEOF && !int_pending())
+ c = pgetc();
}
FORKRESET {
@@ -131,17 +134,20 @@ static int __pgetc(void)
{
int c;
- if (parsefile->unget)
- return parsefile->lastc[--parsefile->unget];
+ if (parsefile->unget) {
+ long unget = -(long)(unsigned)parsefile->unget--;
+
+ if (parsefile->nleft < 0)
+ return preadbuffer();
+
+ return parsefile->nextc[unget];
+ }
if (--parsefile->nleft >= 0)
c = (signed char)*parsefile->nextc++;
else
c = preadbuffer();
- parsefile->lastc[1] = parsefile->lastc[0];
- parsefile->lastc[0] = c;
-
return c;
}
@@ -176,9 +182,16 @@ static int stdin_clear_nonblock(void)
static int
preadfd(void)
{
+ char *buf = parsefile->buf;
+ int unget;
int nr;
- char *buf = parsefile->buf;
- parsefile->nextc = buf;
+
+ unget = parsefile->nextc - buf;
+ if (unget > PUNGETC_MAX)
+ unget = PUNGETC_MAX;
+
+ memmove(buf, parsefile->nextc - unget, unget);
+ parsefile->nextc = buf += unget;
retry:
#ifndef SMALL
@@ -196,8 +209,8 @@ retry:
nr = 0;
else {
nr = el_len;
- if (nr > IBUFSIZ - 1)
- nr = IBUFSIZ - 1;
+ if (nr > BUFSIZ)
+ nr = BUFSIZ;
memcpy(buf, rl_cp, nr);
if (nr != el_len) {
el_len -= nr;
@@ -209,9 +222,9 @@ retry:
} else
#endif
if (parsefile->fd)
- nr = read(parsefile->fd, buf, IBUFSIZ - 1);
+ nr = read(parsefile->fd, buf, BUFSIZ);
else {
- unsigned len = IBUFSIZ - 1;
+ unsigned len = BUFSIZ;
nr = 0;
@@ -348,6 +361,11 @@ done:
return (signed char)*parsefile->nextc++;
}
+void pungetn(int n)
+{
+ parsefile->unget += n;
+}
+
/*
* Undo a call to pgetc. Only two characters may be pushed back.
* PEOF may be pushed back.
@@ -356,7 +374,7 @@ done:
void
pungetc(void)
{
- parsefile->unget++;
+ pungetn(1);
}
/*
@@ -383,7 +401,6 @@ pushstring(char *s, void *ap)
sp->prevnleft = parsefile->nleft;
sp->unget = parsefile->unget;
sp->spfree = parsefile->spfree;
- memcpy(sp->lastc, parsefile->lastc, sizeof(sp->lastc));
sp->ap = (struct alias *)ap;
if (ap) {
((struct alias *)ap)->flag |= ALIASINUSE;
@@ -413,7 +430,6 @@ static void popstring(void)
parsefile->nextc = sp->prevstring;
parsefile->nleft = sp->prevnleft;
parsefile->unget = sp->unget;
- memcpy(parsefile->lastc, sp->lastc, sizeof(sp->lastc));
/*dprintf("*** calling popstring: restoring to '%s'\n", parsenextc);*/
parsefile->strpush = sp->prev;
parsefile->spfree = sp;
@@ -457,7 +473,7 @@ setinputfd(int fd, int push)
}
parsefile->fd = fd;
if (parsefile->buf == NULL)
- parsefile->buf = ckmalloc(IBUFSIZ);
+ parsefile->nextc = parsefile->buf = ckmalloc(IBUFSIZ);
input_set_lleft(parsefile, parsefile->nleft = 0);
plinno = 1;
}
diff --git a/src/input.h b/src/input.h
index 1ff5773..5b4a045 100644
--- a/src/input.h
+++ b/src/input.h
@@ -34,12 +34,16 @@
* @(#)input.h 8.2 (Berkeley) 5/4/95
*/
+#include <limits.h>
+
#ifdef SMALL
#define IS_DEFINED_SMALL 1
#else
#define IS_DEFINED_SMALL 0
#endif
+#define PUNGETC_MAX (MB_LEN_MAX > 16 ? MB_LEN_MAX : 16)
+
/* PEOF (the end of file marker) is defined in syntax.h */
enum {
@@ -59,9 +63,6 @@ struct strpush {
/* Delay freeing so we can stop nested aliases. */
struct strpush *spfree;
- /* Remember last two characters for pungetc. */
- int lastc[2];
-
/* Number of outstanding calls to pungetc. */
int unget;
};
@@ -87,9 +88,6 @@ struct parsefile {
/* Delay freeing so we can stop nested aliases. */
struct strpush *spfree;
- /* Remember last two characters for pungetc. */
- int lastc[2];
-
/* Number of outstanding calls to pungetc. */
int unget;
};
@@ -106,6 +104,7 @@ extern struct parsefile *parsefile;
int pgetc(void);
int pgetc2(void);
void pungetc(void);
+void pungetn(int);
void pushstring(char *, void *);
int setinputfile(const char *, int);
void setinputstring(char *);
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [v2 PATCH 8/8] parser: Add support for multi-byte characters
2024-04-28 3:56 [v2 PATCH 0/8] Add multi-byte support Herbert Xu
` (6 preceding siblings ...)
2024-04-28 3:57 ` [v2 PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
@ 2024-04-28 3:57 ` Herbert Xu
7 siblings, 0 replies; 9+ messages in thread
From: Herbert Xu @ 2024-04-28 3:57 UTC (permalink / raw)
To: DASH Mailing List
Add the requisite markers for multi-byte characters so that the
expansion code can recognise them. Also allow wide blank characters
to terminate words.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/expand.c | 19 ++++++++
src/parser.c | 127 +++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 121 insertions(+), 25 deletions(-)
diff --git a/src/expand.c b/src/expand.c
index dd2b71e..402289f 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -265,6 +265,7 @@ static char *argstr(char *p, int flag)
CTLESC,
CTLVAR,
CTLBACKQ,
+ CTLMBCHAR,
CTLARI,
CTLENDARI,
0
@@ -289,6 +290,8 @@ tilde:
start:
startloc = expdest - (char *)stackblock();
for (;;) {
+ unsigned ml;
+ unsigned mb;
int end;
length += strcspn(p + length, reject);
@@ -351,6 +354,22 @@ addquote:
startloc++;
}
break;
+ case CTLMBCHAR:
+ c = (signed char)*p--;
+ mb = mbnext(p);
+ ml = (mb >> 8) - 2;
+ if (flag & QUOTES_ESC) {
+ length = (mb >> 8) + (mb & 0xff);
+ if (c == (char)CTLESC)
+ startloc += length;
+ break;
+ }
+ if (c == CTLESC)
+ startloc += ml;
+ p += mb & 0xff;
+ expdest = stnputs(p, ml, expdest);
+ p += mb >> 8;
+ break;
case CTLESC:
startloc++;
length++;
diff --git a/src/parser.c b/src/parser.c
index 27611f0..c23cc9b 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -36,7 +36,11 @@
#include <alloca.h>
#endif
+#include <limits.h>
+#include <stdbool.h>
#include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
#include "shell.h"
#include "parser.h"
@@ -876,7 +880,53 @@ static void synstack_pop(struct synstack **stack)
*stack = (*stack)->next;
}
+static unsigned getmbc(int c, char *out, int mode)
+{
+ char *const start = out;
+ mbstate_t mbst = {};
+ unsigned ml = 0;
+ size_t ml2;
+ wchar_t wc;
+ char *mbc;
+ if (likely(c >= 0))
+ return 0;
+
+ mbc = (mode & 3) < 2 ? out + 2 + (mode == 1) : out;
+ mbc[ml] = c;
+ while ((ml2 = mbrtowc(&wc, mbc + ml++, 1, &mbst)) == -2) {
+ if (ml >= MB_LEN_MAX)
+ break;
+ c = pgetc();
+ if (c == PEOF)
+ break;
+ mbc[ml] = c;
+ }
+
+ if (ml2 == 1 && ml > 1) {
+ if (mode == 4 && iswblank(wc))
+ return 1;
+
+ if ((mode & 3) < 2) {
+ USTPUTC(CTLMBCHAR, out);
+ if (mode == 1)
+ USTPUTC(CTLESC, out);
+ USTPUTC(ml, out);
+ }
+ STADJUST(ml, out);
+ if ((mode & 3) < 2) {
+ USTPUTC(ml, out);
+ USTPUTC(CTLMBCHAR, out);
+ }
+
+ return out - start;
+ }
+
+ if (ml > 1)
+ pungetn(ml - 1);
+
+ return 0;
+}
/*
* If eofmark is NULL, read a word or a redirection symbol. If eofmark
@@ -929,12 +979,27 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
}
#endif
CHECKEND(); /* set c to PEOF if at end of here document */
- for (;;) { /* until end of line or end of word */
- CHECKSTRSPACE(4, out); /* permit 4 calls to USTPUTC */
+ /* Until end of line or end of word */
+ for (;; c = pgetc_top(synstack)) {
+ int fieldsplitting;
+ unsigned ml;
+
+ /* Permit max(MB_LEN_MAX, 16) + 7 calls to USTPUTC. */
+ CHECKSTRSPACE((MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + 7,
+ out);
+ fieldsplitting = synstack->syntax == BASESYNTAX &&
+ !synstack->varnest ? 4 : 0;
+ ml = getmbc(c, out, fieldsplitting);
+ if (ml == 1) {
+ c = pgetc();
+ break;
+ }
+ out += ml;
+ if (ml)
+ continue;
switch(synstack->syntax[c]) {
case CNL: /* '\n' */
- if (synstack->syntax == BASESYNTAX &&
- !synstack->varnest)
+ if (fieldsplitting)
goto endword; /* exit outer loop */
USTPUTC(c, out);
nlprompt();
@@ -956,26 +1021,33 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
USTPUTC(CTLESC, out);
USTPUTC('\\', out);
pungetc();
- } else {
- if (
- synstack->dblquote &&
- c != '\\' && c != '`' &&
- c != '$' && (
- c != '"' ||
- (eofmark != NULL &&
- !synstack->varnest)
- ) && (
- c != '}' ||
- !synstack->varnest
- )
- ) {
- USTPUTC(CTLESC, out);
- USTPUTC('\\', out);
- }
- USTPUTC(CTLESC, out);
- USTPUTC(c, out);
- quotef++;
+ break;
}
+
+ if (
+ synstack->dblquote &&
+ c != '\\' && c != '`' &&
+ c != '$' && (
+ c != '"' ||
+ (eofmark != NULL &&
+ !synstack->varnest)
+ ) && (
+ c != '}' ||
+ !synstack->varnest
+ )
+ ) {
+ USTPUTC(CTLESC, out);
+ USTPUTC('\\', out);
+ }
+ quotef++;
+
+ ml = getmbc(c, out, 1);
+ out += ml;
+ if (ml)
+ break;
+
+ USTPUTC(CTLESC, out);
+ USTPUTC(c, out);
break;
case CSQUOTE:
synstack->syntax = SQSYNTAX;
@@ -1053,11 +1125,10 @@ toggledq:
case CEOF:
goto endword; /* exit outer loop */
default:
- if (synstack->varnest == 0)
+ if (fieldsplitting)
goto endword; /* exit outer loop */
USTPUTC(c, out);
}
- c = pgetc_top(synstack);
}
}
endword:
@@ -1384,6 +1455,7 @@ parsebackq: {
size_t psavelen;
size_t savelen;
union node *n;
+ unsigned ml;
char *pstr;
char *str;
@@ -1415,6 +1487,11 @@ parsebackq: {
if (pc != '\\' && pc != '`' && pc != '$'
&& (!synstack->dblquote || pc != '"'))
STPUTC('\\', pout);
+ CHECKSTRSPACE(MB_LEN_MAX, pout);
+ ml = getmbc(pc, pout, 2);
+ pout += ml;
+ if (ml)
+ continue;
break;
case PEOF:
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread