From: Herbert Xu <herbert@gondor.apana.org.au>
To: DASH Mailing List <dash@vger.kernel.org>
Subject: [PATCH 8/8] parser: Add support for multi-byte characters
Date: Sat, 27 Apr 2024 16:41:00 +0800 [thread overview]
Message-ID: <dce77dcec0adb6d2dbaecdb3025b35425981a41c.1714215826.git.herbert@gondor.apana.org.au> (raw)
In-Reply-To: <cover.1714215826.git.herbert@gondor.apana.org.au>
Add the requisite markers for multi-byte characters so that the
expansion code can recognise them. Also allow wide blank characters
to terminate words.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/expand.c | 19 ++++++++
src/parser.c | 127 +++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 121 insertions(+), 25 deletions(-)
diff --git a/src/expand.c b/src/expand.c
index 679bbb8..7c3f350 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -265,6 +265,7 @@ static char *argstr(char *p, int flag)
CTLESC,
CTLVAR,
CTLBACKQ,
+ CTLMBCHAR,
CTLARI,
CTLENDARI,
0
@@ -289,6 +290,8 @@ tilde:
start:
startloc = expdest - (char *)stackblock();
for (;;) {
+ unsigned ml;
+ unsigned mb;
int end;
length += strcspn(p + length, reject);
@@ -351,6 +354,22 @@ addquote:
startloc++;
}
break;
+ case CTLMBCHAR:
+ c = (signed char)*p--;
+ mb = mbnext(p);
+ ml = (mb >> 8) - 2;
+ if (flag & QUOTES_ESC) {
+ length = (mb >> 8) + (mb & 0xff);
+ if (c == (char)CTLESC)
+ startloc += length;
+ break;
+ }
+ if (c == CTLESC)
+ startloc += ml;
+ p += mb & 0xff;
+ expdest = stnputs(p, ml, expdest);
+ p += mb >> 8;
+ break;
case CTLESC:
startloc++;
length++;
diff --git a/src/parser.c b/src/parser.c
index 27611f0..c23cc9b 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -36,7 +36,11 @@
#include <alloca.h>
#endif
+#include <limits.h>
+#include <stdbool.h>
#include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
#include "shell.h"
#include "parser.h"
@@ -876,7 +880,53 @@ static void synstack_pop(struct synstack **stack)
*stack = (*stack)->next;
}
+/*
+ * Try to complete a multi-byte character whose first byte is c.
+ *
+ * A non-negative c is rejected immediately with a return value of 0.
+ * Otherwise further bytes are fetched with pgetc() and fed to mbrtowc()
+ * one at a time until it stops reporting an incomplete sequence, pgetc()
+ * yields PEOF, or MB_LEN_MAX bytes have been collected.
+ *
+ * mode, as interpreted by the code below:
+ *   (mode & 3) < 2  wrap the bytes: CTLMBCHAR, len, bytes, len, CTLMBCHAR
+ *   mode == 1       as above, with CTLESC placed right after the leading
+ *                   CTLMBCHAR
+ *   mode == 4       a character for which iswblank() is true is not
+ *                   stored; 1 is returned instead
+ *   otherwise       the raw bytes are stored without any markers
+ *
+ * Returns out - start (how far out was advanced) on success, 1 for the
+ * blank case above, or 0 when no character longer than one byte was
+ * decoded.  A successful store consumes at least two bytes, so 1 should
+ * not collide with a stored length (assuming STADJUST advances out by ml).
+ *
+ * NOTE(review): no space check is done here; presumably callers reserve
+ * room at out beforehand -- confirm.  out is a local copy, so the caller
+ * has to advance its own pointer by the returned count.
+ */
+static unsigned getmbc(int c, char *out, int mode)
+{
+ char *const start = out;
+ /* Fresh conversion state for this character. */
+ mbstate_t mbst = {};
+ /* ml counts the bytes staged at mbc; ml2 holds the last mbrtowc() result. */
+ unsigned ml = 0;
+ size_t ml2;
+ wchar_t wc;
+ char *mbc;
+ if (likely(c >= 0))
+ return 0;
+
+ /*
+  * Stage the raw bytes where they will finally live: past the header
+  * (2 bytes, or 3 when mode == 1) if markers are going to be written,
+  * otherwise directly at out.
+  */
+ mbc = (mode & 3) < 2 ? out + 2 + (mode == 1) : out;
+ mbc[ml] = c;
+ /*
+  * Feed exactly one byte per call; a result of (size_t)-2 means the
+  * sequence is still incomplete and another byte is needed.
+  */
+ while ((ml2 = mbrtowc(&wc, mbc + ml++, 1, &mbst)) == -2) {
+ if (ml >= MB_LEN_MAX)
+ break;
+ c = pgetc();
+ if (c == PEOF)
+ break;
+ mbc[ml] = c;
+ }
+
+ /*
+  * Each call passes a single byte, so a completed character reports 1;
+  * ml > 1 restricts this branch to characters longer than one byte.
+  */
+ if (ml2 == 1 && ml > 1) {
+ /* Wide blank in mode 4: report it via the special value 1, store nothing. */
+ if (mode == 4 && iswblank(wc))
+ return 1;
+
+ /* Leading markers: CTLMBCHAR, optional CTLESC, then the byte count. */
+ if ((mode & 3) < 2) {
+ USTPUTC(CTLMBCHAR, out);
+ if (mode == 1)
+ USTPUTC(CTLESC, out);
+ USTPUTC(ml, out);
+ }
+ /* Step over the bytes already staged at mbc (presumably out += ml). */
+ STADJUST(ml, out);
+ /* Trailing markers mirror the header: byte count, then CTLMBCHAR. */
+ if ((mode & 3) < 2) {
+ USTPUTC(ml, out);
+ USTPUTC(CTLMBCHAR, out);
+ }
+
+ return out - start;
+ }
+
+ /*
+  * Not a usable multi-byte character: presumably pungetn() hands the
+  * ml - 1 extra bytes back to the input, leaving only c consumed.
+  */
+ if (ml > 1)
+ pungetn(ml - 1);
+
+ return 0;
+}
/*
* If eofmark is NULL, read a word or a redirection symbol. If eofmark
@@ -929,12 +979,27 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
}
#endif
CHECKEND(); /* set c to PEOF if at end of here document */
- for (;;) { /* until end of line or end of word */
- CHECKSTRSPACE(4, out); /* permit 4 calls to USTPUTC */
+ /* Until end of line or end of word */
+ for (;; c = pgetc_top(synstack)) {
+ int fieldsplitting;
+ unsigned ml;
+
+ /* Permit max(MB_LEN_MAX, 16) + 7 calls to USTPUTC. */
+ CHECKSTRSPACE((MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + 7,
+ out);
+ fieldsplitting = synstack->syntax == BASESYNTAX &&
+ !synstack->varnest ? 4 : 0;
+ ml = getmbc(c, out, fieldsplitting);
+ if (ml == 1) {
+ c = pgetc();
+ break;
+ }
+ out += ml;
+ if (ml)
+ continue;
switch(synstack->syntax[c]) {
case CNL: /* '\n' */
- if (synstack->syntax == BASESYNTAX &&
- !synstack->varnest)
+ if (fieldsplitting)
goto endword; /* exit outer loop */
USTPUTC(c, out);
nlprompt();
@@ -956,26 +1021,33 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
USTPUTC(CTLESC, out);
USTPUTC('\\', out);
pungetc();
- } else {
- if (
- synstack->dblquote &&
- c != '\\' && c != '`' &&
- c != '$' && (
- c != '"' ||
- (eofmark != NULL &&
- !synstack->varnest)
- ) && (
- c != '}' ||
- !synstack->varnest
- )
- ) {
- USTPUTC(CTLESC, out);
- USTPUTC('\\', out);
- }
- USTPUTC(CTLESC, out);
- USTPUTC(c, out);
- quotef++;
+ break;
}
+
+ if (
+ synstack->dblquote &&
+ c != '\\' && c != '`' &&
+ c != '$' && (
+ c != '"' ||
+ (eofmark != NULL &&
+ !synstack->varnest)
+ ) && (
+ c != '}' ||
+ !synstack->varnest
+ )
+ ) {
+ USTPUTC(CTLESC, out);
+ USTPUTC('\\', out);
+ }
+ quotef++;
+
+ ml = getmbc(c, out, 1);
+ out += ml;
+ if (ml)
+ break;
+
+ USTPUTC(CTLESC, out);
+ USTPUTC(c, out);
break;
case CSQUOTE:
synstack->syntax = SQSYNTAX;
@@ -1053,11 +1125,10 @@ toggledq:
case CEOF:
goto endword; /* exit outer loop */
default:
- if (synstack->varnest == 0)
+ if (fieldsplitting)
goto endword; /* exit outer loop */
USTPUTC(c, out);
}
- c = pgetc_top(synstack);
}
}
endword:
@@ -1384,6 +1455,7 @@ parsebackq: {
size_t psavelen;
size_t savelen;
union node *n;
+ unsigned ml;
char *pstr;
char *str;
@@ -1415,6 +1487,11 @@ parsebackq: {
if (pc != '\\' && pc != '`' && pc != '$'
&& (!synstack->dblquote || pc != '"'))
STPUTC('\\', pout);
+ CHECKSTRSPACE(MB_LEN_MAX, pout);
+ ml = getmbc(pc, pout, 2);
+ pout += ml;
+ if (ml)
+ continue;
break;
case PEOF:
--
2.39.2
next prev parent reply other threads:[~2024-04-27 11:07 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-27 11:03 [PATCH 0/8] Add multi-byte support Herbert Xu
2024-04-16 10:03 ` [PATCH 1/8] shell: Call setlocale Herbert Xu
2024-04-16 10:38 ` [PATCH 2/8] shell: Use strcoll instead of strcmp where applicable Herbert Xu
2024-04-16 23:13 ` [PATCH 3/8] expand: Count multi-byte characters for VSLENGTH Herbert Xu
2024-04-18 8:59 ` [PATCH 4/8] expand: Process multi-byte characters in subevalvar Herbert Xu
2024-04-20 13:46 ` [PATCH 5/8] expand: Process multi-byte characters in expmeta Herbert Xu
2024-04-23 11:17 ` [PATCH 6/8] expand: Support multi-byte characters during field splitting Herbert Xu
2024-04-27 8:15 ` [PATCH 7/8] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
2024-04-27 8:41 ` Herbert Xu [this message]
2024-04-27 21:31 ` [PATCH 0/8] Add multi-byte support Christoph Anton Mitterer
2024-04-28 0:49 ` Herbert Xu
2024-04-28 1:19 ` Christoph Anton Mitterer
2024-04-28 1:35 ` Lawrence Velázquez
2024-04-28 1:50 ` Christoph Anton Mitterer
2024-04-28 2:03 ` Christoph Anton Mitterer
2024-04-28 14:50 ` Harald van Dijk
2024-04-29 13:12 ` Herbert Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=dce77dcec0adb6d2dbaecdb3025b35425981a41c.1714215826.git.herbert@gondor.apana.org.au \
--to=herbert@gondor.apana.org.au \
--cc=dash@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.