From 082052b5a841edd32c1d3bbebba3da805d639fab Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Wed, 22 Jul 2020 20:11:25 -0700 Subject: [PATCH] Document double-quoted string escape sequences --- pkg/parse/parse.go | 43 +++++++++++++------------ pkg/parse/parse_test.go | 17 ++++++++-- website/ref/language.md | 71 ++++++++++++++++++++++++++++++++--------- 3 files changed, 93 insertions(+), 38 deletions(-) diff --git a/pkg/parse/parse.go b/pkg/parse/parse.go index b4897228..9fed3a9a 100644 --- a/pkg/parse/parse.go +++ b/pkg/parse/parse.go @@ -53,20 +53,19 @@ func ParseAs(src Source, n Node, w io.Writer) error { // Errors. var ( - errShouldBeForm = newError("", "form") - errBadLHS = errors.New("bad assignment LHS") - errBadRedirSign = newError("bad redir sign", "'<'", "'>'", "'>>'", "'<>'") - errShouldBeFD = newError("", "a composite term representing fd") - errShouldBeFilename = newError("", "a composite term representing filename") - errShouldBeArray = newError("", "spaced") - errStringUnterminated = newError("string not terminated") - errChainedAssignment = newError("chained assignment not yet supported") - errInvalidEscape = newError("invalid escape sequence") - errInvalidEscapeOct = newError("invalid escape sequence", "octal digit") - errInvalidEscapeHex = newError("invalid escape sequence", "hex digit") - errInvalidEscapeControl = newError("invalid control sequence", "a rune between @ (0x40) and _(0x5F)") - errShouldBePrimary = newError("", - "single-quoted string", "double-quoted string", "bareword") + errShouldBeForm = newError("", "form") + errBadLHS = errors.New("bad assignment LHS") + errBadRedirSign = newError("bad redir sign", "'<'", "'>'", "'>>'", "'<>'") + errShouldBeFD = newError("", "a composite term representing fd") + errShouldBeFilename = newError("", "a composite term representing filename") + errShouldBeArray = newError("", "spaced") + errStringUnterminated = newError("string not terminated") + errChainedAssignment = newError("chained assignment 
not yet supported") + errInvalidEscape = newError("invalid escape sequence") + errInvalidEscapeOct = newError("invalid escape sequence", "octal digit") + errInvalidEscapeHex = newError("invalid escape sequence", "hex digit") + errInvalidEscapeControl = newError("invalid control sequence", "a codepoint between 0x3F and 0x5F") + errShouldBePrimary = newError("", "single-quoted string", "double-quoted string", "bareword") errShouldBeVariableName = newError("", "variable name") errShouldBeRBracket = newError("", "']'") errShouldBeRBrace = newError("", "'}'") @@ -585,16 +584,19 @@ func (pn *Primary) doubleQuoted(ps *parser) { return case '\\': switch r := ps.next(); r { - case 'c', '^': - // Control sequence + case 'c', '^': // control sequence r := ps.next() - if r < 0x40 || r >= 0x60 { + if r < 0x3F || r > 0x5F { ps.backup() ps.error(errInvalidEscapeControl) ps.next() } - buf.WriteByte(byte(r - 0x40)) - case 'x', 'u', 'U': + if byte(r) == '?' { // special-case: \c? => del + buf.WriteByte(byte(0x7F)) + } else { + buf.WriteByte(byte(r - 0x40)) + } + case 'x', 'u', 'U': // two, four, or eight hex digits var n int switch r { case 'x': @@ -615,8 +617,7 @@ func (pn *Primary) doubleQuoted(ps *parser) { rr = rr*16 + d } buf.WriteRune(rr) - case '0', '1', '2', '3', '4', '5', '6', '7': - // 2 more octal digits + case '0', '1', '2', '3', '4', '5', '6', '7': // three octal digits rr := r - '0' for i := 0; i < 2; i++ { r := ps.next() diff --git a/pkg/parse/parse_test.go b/pkg/parse/parse_test.go index c19c31e7..c8ebdbd4 100644 --- a/pkg/parse/parse_test.go +++ b/pkg/parse/parse_test.go @@ -94,7 +94,19 @@ var goodCases = []struct { "Type": SingleQuoted, "Value": "'x'y'", }})}, // Double quote - {`a "b\^[\x1b\u548c\U0002CE23\123\n\t\\"`, + {`a "[\c?\c@\cI\^I\^[]"`, // control char sequences + a(ast{"Compound/Indexing/Primary", fs{ + "Type": DoubleQuoted, + "Value": "[\x7f\x00\t\t\x1b]", + }})}, + + {`a "[\n\t\a\v\\\"]"`, // single char sequences + a(ast{"Compound/Indexing/Primary", 
fs{ + "Type": DoubleQuoted, + "Value": "[\n\t\a\v\\\"]", + }})}, + + {`a "b\^[\x1b\u548c\U0002CE23\123\n\t\\"`, // numeric sequences a(ast{"Compound/Indexing/Primary", fs{ "Type": DoubleQuoted, "Value": "b\x1b\x1b\u548c\U0002CE23\123\n\t\\", @@ -364,7 +376,8 @@ var parseErrorTests = []struct { {src: "'a", errAtEnd: true, errMsg: "string not terminated"}, {src: `"a`, errAtEnd: true, errMsg: "string not terminated"}, // Bad escape sequence. - {src: `a "\^0"`, errPart: "0", errMsg: "invalid control sequence, should be a rune between @ (0x40) and _(0x5F)"}, + {src: `a "\^` + "\t", errPart: "\t", + errMsg: "invalid control sequence, should be a codepoint between 0x3F and 0x5F"}, {src: `a "\xQQ"`, errPart: "Q", errMsg: "invalid escape sequence, should be hex digit"}, {src: `a "\1ab"`, errPart: "a", errMsg: "invalid escape sequence, should be octal digit"}, {src: `a "\i"`, errPart: "i", errMsg: "invalid escape sequence"}, diff --git a/website/ref/language.md b/website/ref/language.md index 1fed0c13..e6e9e200 100644 --- a/website/ref/language.md +++ b/website/ref/language.md @@ -55,30 +55,71 @@ some values. (The traditional terms for the two levels are "commands" and ## String The most common data structure in shells is the string. String literals can be -quoted or unquoted (barewords). +quoted or unquoted (barewords). There are two types of quoted strings in Elvish: +single-quoted strings and double-quoted strings. -### Quoted +### Single-Quoted String -There are two types of quoted strings in Elvish, single-quoted strings and -double-quoted strings. - -In single-quoted strings, all characters represent themselves, except single +In single-quoted strings all characters represent themselves, except single quotes, which need to be doubled. For instance, `'*\'` evaluates to `*\`, and `'it''s'` evaluates to `it's`. -In double-quoted strings, the backslash `\` introduces a **escape sequence**. 
-For instance, `"\n"` evaluates to a newline; `"\\"` evaluates to a backslash; -invalid escape sequences like `"\*"` result in a syntax error. +### Double-Quoted String -**TODO**: Document the full list of supported escape sequences. +In double-quoted strings the backslash, `\`, introduces an **escape sequence**. +For instance, `\n` evaluates to a newline and `\\` evaluates to a backslash. +Invalid escape sequences like `\*` result in a syntax error when the program is +compiled. -Unlike most other shells, double-quoted strings do not support interpolation. -For instance, `"$USER"` simply evaluates to the string `$USER`. To get a similar -effect, simply concatenate strings: instead of `"my name is $name"`, write -`"my name is "$name`. Under the hood this is a +Unlike most other shells, double-quoted strings in Elvish do not support +interpolation. For instance, `"$name"` simply evaluates to the string `$name`. +To get a similar effect, simply concatenate strings: instead of +`"my name is $name"`, write `"my name is "$name`. Under the hood this is a [compound expression](#compound-expression-and-braced-lists). -### Barewords +The following escape sequences are recognized in double-quoted strings: + +- `\cX`, where _X_ is a character with codepoint between 0x40 and 0x5F, + represents the codepoint that is 0x40 lower than _X_. For example, `\cI` is + the tab character: 0x49 (`I`) - 0x40 = 0x09 (tab). There is one special + case: A question-mark is converted to del; i.e., `\c?` or `\^?` is + equivalent to `\x7F`. + +- `\^X` is the same as `\cX`. + +- `\[0..7][0..7][0..7]` is a byte written as an octal value. There must be + three octal digits following the backslash. For example, `\000` is the nul + character, and `\101` is the same as `A`, but `\0` is an invalid escape + sequence (too few digits). + +- `\x..` is a Unicode code point represented by two hexadecimal digits. + +- `\u....` is a Unicode code point represented by four hexadecimal digits. 
+ +- `\U........` is a Unicode code point represented by eight hexadecimal digits. + +- The following single character escape sequences: + + - `\a` is the "bel" character, equivalent to `\007` or `\x07`. + + - `\b` is the "backspace" character, equivalent to `\010` or `\x08`. + + - `\f` is the "formfeed" (aka "np") character, equivalent to `\014` or + `\x0c`. + + - `\n` is the "nl" character, equivalent to `\012` or `\x0a`. + + - `\r` is the "cr" character, equivalent to `\015` or `\x0d`. + + - `\t` is the "tab" character, equivalent to `\011` or `\x09`. + + - `\v` is the "vt" character, equivalent to `\013` or `\x0b`. + + - `\\` is the "backslash" character, equivalent to `\134` or `\x5c`. + + - `\"` is the "double-quote" character, equivalent to `\042` or `\x22`. + +### Bareword String If a string only consists of bareword characters, it can be written without any quote; this is called a **bareword**. Examples are `a.txt`, `long-bareword`, and