From: Harald van Dijk <harald@gigawatt.nl>
To: Martijn Dekker <martijn@inlv.org>, dash@vger.kernel.org
Subject: Re: dash tested against ash testsuite: 17 failures
Date: Mon, 10 Oct 2016 22:20:07 +0200 [thread overview]
Message-ID: <503584db-0131-3264-397e-0bc784eed58d@gigawatt.nl> (raw)
In-Reply-To: <7d291bb2-a968-471d-d2a0-87adfd0bc38d@inlv.org>
[-- Attachment #1: Type: text/plain, Size: 2358 bytes --]
On 08/10/16 21:42, Martijn Dekker wrote:
> Op 01-10-16 om 19:17 schreef Denys Vlasenko:
>> ash-vars/var_unbackslash.tests
>
> ITYM ash-vars/var_unbackslash1.tests
>
>> echo Forty two:$\
>> (\
>> (\
>> 42\
>> )\
>> )
>> dash says: Syntax error: Missing '))'
>
> Yes, but it's not clear to me that it shouldn't.
>
> Hmm... maybe this is indeed a bug:
> http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_02_01
> "A <backslash> that is not quoted shall preserve the literal value of
> the following character, with the exception of a <newline>. If a
> <newline> follows the <backslash>, the shell shall interpret this as
> line continuation. The <backslash> and <newline> shall be removed before
> splitting the input into tokens. Since the escaped <newline> is removed
> entirely from the input and is not replaced by any white space, it
> cannot serve as a token separator."
>
> So, unless I'm misreading this, it looks like backslashes need to be
> parsed before *any* other kind of lexical analysis.
There does appear to be one exception: a comment may end with a
backslash. This does not cause the next line to be treated as a comment:
once a # is seen, the remaining characters on the line are not subjected
to the regular lexical analysis, so the above does not apply.
I would have expected another exception to be in alias expansions that
end in a backslash. Shells are not entirely in agreement there, but most
appear to treat this the regular way, meaning
dash -c 'alias bs=\\
bs
'
prints nothing.
dash has a pgetc_eatbnl function already in parser.c which skips any
backslash-newline combinations. It's not used everywhere it could be.
There is also some duplicated backslash-newline handling elsewhere in
parser.c. Replacing all the calls to pgetc() with calls to pgetc_eatbnl(),
with the exception of the one that handles comments, and
removing the duplicated backslash-newline handling, lets this test case
work, as well as several other similar ones, such as:
: &\
& :
: \
<\
<\
EO\
F
123
E\
OF
A nice side benefit is that removing the duplicated backslash-newline
(BSNL) handling reduces the code size.
There are probably a few corner cases I'm not handling correctly in this
patch, though. Feedback welcome.
Cheers,
Harald van Dijk
[-- Attachment #2: parser-bnl.patch --]
[-- Type: text/x-patch, Size: 4477 bytes --]
--- a/src/parser.c
+++ b/src/parser.c
@@ -106,6 +106,7 @@ STATIC void parseheredoc(void);
STATIC int peektoken(void);
STATIC int readtoken(void);
STATIC int xxreadtoken(void);
+STATIC int pgetc_eatbnl();
STATIC int readtoken1(int, char const *, char *, int);
STATIC void synexpect(int) __attribute__((__noreturn__));
STATIC void synerror(const char *) __attribute__((__noreturn__));
@@ -656,7 +657,7 @@ parseheredoc(void)
if (needprompt) {
setprompt(2);
}
- readtoken1(pgetc(), here->here->type == NHERE? SQSYNTAX : DQSYNTAX,
+ readtoken1(pgetc_eatbnl(), here->here->type == NHERE? SQSYNTAX : DQSYNTAX,
here->eofmark, here->striptabs);
n = (union node *)stalloc(sizeof (struct narg));
n->narg.type = NARG;
@@ -782,7 +783,7 @@ xxreadtoken(void)
setprompt(2);
}
for (;;) { /* until token or start of word found */
- c = pgetc();
+ c = pgetc_eatbnl();
switch (c) {
case ' ': case '\t':
case PEOA:
@@ -791,30 +792,23 @@ xxreadtoken(void)
while ((c = pgetc()) != '\n' && c != PEOF);
pungetc();
continue;
- case '\\':
- if (pgetc() == '\n') {
- nlprompt();
- continue;
- }
- pungetc();
- goto breakloop;
case '\n':
nlnoprompt();
RETURN(TNL);
case PEOF:
RETURN(TEOF);
case '&':
- if (pgetc() == '&')
+ if (pgetc_eatbnl() == '&')
RETURN(TAND);
pungetc();
RETURN(TBACKGND);
case '|':
- if (pgetc() == '|')
+ if (pgetc_eatbnl() == '|')
RETURN(TOR);
pungetc();
RETURN(TPIPE);
case ';':
- if (pgetc() == ';')
+ if (pgetc_eatbnl() == ';')
RETURN(TENDCASE);
pungetc();
RETURN(TSEMI);
@@ -822,11 +816,9 @@ xxreadtoken(void)
RETURN(TLP);
case ')':
RETURN(TRP);
- default:
- goto breakloop;
}
+ break;
}
-breakloop:
return readtoken1(c, BASESYNTAX, (char *)NULL, 0);
#undef RETURN
}
@@ -903,7 +895,7 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
attyline();
if (syntax == BASESYNTAX)
return readtoken();
- c = pgetc();
+ c = pgetc_eatbnl();
goto loop;
}
#endif
@@ -916,7 +908,7 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
goto endword; /* exit outer loop */
USTPUTC(c, out);
nlprompt();
- c = pgetc();
+ c = pgetc_eatbnl();
goto loop; /* continue outer loop */
case CWORD:
USTPUTC(c, out);
@@ -997,7 +989,7 @@ quotemark:
USTPUTC(c, out);
--parenlevel;
} else {
- if (pgetc() == ')') {
+ if (pgetc_eatbnl() == ')') {
USTPUTC(CTLENDARI, out);
if (!--arinest)
syntax = prevsyntax;
@@ -1025,7 +1017,7 @@ quotemark:
USTPUTC(c, out);
}
}
- c = pgetc();
+ c = pgetc_eatbnl();
}
}
endword:
@@ -1132,7 +1124,7 @@ parseredir: {
np = (union node *)stalloc(sizeof (struct nfile));
if (c == '>') {
np->nfile.fd = 1;
- c = pgetc();
+ c = pgetc_eatbnl();
if (c == '>')
np->type = NAPPEND;
else if (c == '|')
@@ -1145,7 +1137,7 @@ parseredir: {
}
} else { /* c == '<' */
np->nfile.fd = 0;
- switch (c = pgetc()) {
+ switch (c = pgetc_eatbnl()) {
case '<':
if (sizeof (struct nfile) != sizeof (struct nhere)) {
np = (union node *)stalloc(sizeof (struct nhere));
@@ -1154,7 +1146,7 @@ parseredir: {
np->type = NHERE;
heredoc = (struct heredoc *)stalloc(sizeof (struct heredoc));
heredoc->here = np;
- if ((c = pgetc()) == '-') {
+ if ((c = pgetc_eatbnl()) == '-') {
heredoc->striptabs = 1;
} else {
heredoc->striptabs = 0;
@@ -1336,21 +1328,12 @@ parsebackq: {
if (needprompt) {
setprompt(2);
}
- switch (pc = pgetc()) {
+ switch (pc = pgetc_eatbnl()) {
case '`':
goto done;
case '\\':
- if ((pc = pgetc()) == '\n') {
- nlprompt();
- /*
- * If eating a newline, avoid putting
- * the newline into the new character
- * stream (via the STPUTC after the
- * switch).
- */
- continue;
- }
+ pc = pgetc_eatbnl();
if (pc != '\\' && pc != '`' && pc != '$'
&& (!dblquote || pc != '"'))
STPUTC('\\', pout);
@@ -1529,7 +1512,7 @@ expandstr(const char *ps)
saveprompt = doprompt;
doprompt = 0;
- readtoken1(pgetc(), DQSYNTAX, FAKEEOFMARK, 0);
+ readtoken1(pgetc_eatbnl(), DQSYNTAX, FAKEEOFMARK, 0);
doprompt = saveprompt;
next prev parent reply other threads:[~2016-10-10 20:27 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-10-01 17:17 dash tested against ash testsuite: 17 failures Denys Vlasenko
2016-10-08 19:42 ` Martijn Dekker
2016-10-10 20:20 ` Harald van Dijk [this message]
2016-10-12 17:24 ` Harald van Dijk
2016-10-10 21:51 ` Jilles Tjoelker
2016-10-10 22:30 ` Harald van Dijk
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=503584db-0131-3264-397e-0bc784eed58d@gigawatt.nl \
--to=harald@gigawatt.nl \
--cc=dash@vger.kernel.org \
--cc=martijn@inlv.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).