722 lines
24 KiB
JavaScript
Raw Normal View History

2024-07-07 18:49:38 -07:00
import { BOM, DOCUMENT, FLOW_END, SCALAR } from './cst.js';
/*
START -> stream
stream
directive -> line-end -> stream
indent + line-end -> stream
[else] -> line-start
line-end
comment -> line-end
newline -> .
input-end -> END
line-start
doc-start -> doc
doc-end -> stream
[else] -> indent -> block-start
block-start
seq-item-start -> block-start
explicit-key-start -> block-start
map-value-start -> block-start
[else] -> doc
doc
line-end -> line-start
spaces -> doc
anchor -> doc
tag -> doc
flow-start -> flow -> doc
flow-end -> error -> doc
seq-item-start -> error -> doc
explicit-key-start -> error -> doc
map-value-start -> doc
alias -> doc
quote-start -> quoted-scalar -> doc
block-scalar-header -> line-end -> block-scalar(min) -> line-start
[else] -> plain-scalar(false, min) -> doc
flow
line-end -> flow
spaces -> flow
anchor -> flow
tag -> flow
flow-start -> flow -> flow
flow-end -> .
seq-item-start -> error -> flow
explicit-key-start -> flow
map-value-start -> flow
alias -> flow
quote-start -> quoted-scalar -> flow
comma -> flow
[else] -> plain-scalar(true, 0) -> flow
quoted-scalar
quote-end -> .
[else] -> quoted-scalar
block-scalar(min)
newline + peek(indent < min) -> .
[else] -> block-scalar(min)
plain-scalar(is-flow, min)
scalar-end(is-flow) -> .
peek(newline + (indent < min)) -> .
[else] -> plain-scalar(min)
*/
function isEmpty(ch) {
switch (ch) {
case undefined:
case ' ':
case '\n':
case '\r':
case '\t':
return true;
default:
return false;
}
}
const hexDigits = new Set('0123456789ABCDEFabcdef');
const tagChars = new Set("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-#;/?:@&=+$_.!~*'()");
const flowIndicatorChars = new Set(',[]{}');
const invalidAnchorChars = new Set(' ,[]{}\n\r\t');
const isNotAnchorChar = (ch) => !ch || invalidAnchorChars.has(ch);
/**
* Splits an input string into lexical tokens, i.e. smaller strings that are
* easily identifiable by `tokens.tokenType()`.
*
* Lexing starts always in a "stream" context. Incomplete input may be buffered
* until a complete token can be emitted.
*
* In addition to slices of the original input, the following control characters
* may also be emitted:
*
* - `\x02` (Start of Text): A document starts with the next token
* - `\x18` (Cancel): Unexpected end of flow-mode (indicates an error)
* - `\x1f` (Unit Separator): Next token is a scalar value
* - `\u{FEFF}` (Byte order mark): Emitted separately outside documents
*/
class Lexer {
constructor() {
/**
* Flag indicating whether the end of the current buffer marks the end of
* all input
*/
this.atEnd = false;
/**
* Explicit indent set in block scalar header, as an offset from the current
* minimum indent, so e.g. set to 1 from a header `|2+`. Set to -1 if not
* explicitly set.
*/
this.blockScalarIndent = -1;
/**
* Block scalars that include a + (keep) chomping indicator in their header
* include trailing empty lines, which are otherwise excluded from the
* scalar's contents.
*/
this.blockScalarKeep = false;
/** Current input */
this.buffer = '';
/**
* Flag noting whether the map value indicator : can immediately follow this
* node within a flow context.
*/
this.flowKey = false;
/** Count of surrounding flow collection levels. */
this.flowLevel = 0;
/**
* Minimum level of indentation required for next lines to be parsed as a
* part of the current scalar value.
*/
this.indentNext = 0;
/** Indentation level of the current line. */
this.indentValue = 0;
/** Position of the next \n character. */
this.lineEndPos = null;
/** Stores the state of the lexer if reaching the end of incpomplete input */
this.next = null;
/** A pointer to `buffer`; the current position of the lexer. */
this.pos = 0;
}
/**
* Generate YAML tokens from the `source` string. If `incomplete`,
* a part of the last line may be left as a buffer for the next call.
*
* @returns A generator of lexical tokens
*/
*lex(source, incomplete = false) {
if (source) {
if (typeof source !== 'string')
throw TypeError('source is not a string');
this.buffer = this.buffer ? this.buffer + source : source;
this.lineEndPos = null;
}
this.atEnd = !incomplete;
let next = this.next ?? 'stream';
while (next && (incomplete || this.hasChars(1)))
next = yield* this.parseNext(next);
}
atLineEnd() {
let i = this.pos;
let ch = this.buffer[i];
while (ch === ' ' || ch === '\t')
ch = this.buffer[++i];
if (!ch || ch === '#' || ch === '\n')
return true;
if (ch === '\r')
return this.buffer[i + 1] === '\n';
return false;
}
charAt(n) {
return this.buffer[this.pos + n];
}
continueScalar(offset) {
let ch = this.buffer[offset];
if (this.indentNext > 0) {
let indent = 0;
while (ch === ' ')
ch = this.buffer[++indent + offset];
if (ch === '\r') {
const next = this.buffer[indent + offset + 1];
if (next === '\n' || (!next && !this.atEnd))
return offset + indent + 1;
}
return ch === '\n' || indent >= this.indentNext || (!ch && !this.atEnd)
? offset + indent
: -1;
}
if (ch === '-' || ch === '.') {
const dt = this.buffer.substr(offset, 3);
if ((dt === '---' || dt === '...') && isEmpty(this.buffer[offset + 3]))
return -1;
}
return offset;
}
getLine() {
let end = this.lineEndPos;
if (typeof end !== 'number' || (end !== -1 && end < this.pos)) {
end = this.buffer.indexOf('\n', this.pos);
this.lineEndPos = end;
}
if (end === -1)
return this.atEnd ? this.buffer.substring(this.pos) : null;
if (this.buffer[end - 1] === '\r')
end -= 1;
return this.buffer.substring(this.pos, end);
}
hasChars(n) {
return this.pos + n <= this.buffer.length;
}
setNext(state) {
this.buffer = this.buffer.substring(this.pos);
this.pos = 0;
this.lineEndPos = null;
this.next = state;
return null;
}
peek(n) {
return this.buffer.substr(this.pos, n);
}
*parseNext(next) {
switch (next) {
case 'stream':
return yield* this.parseStream();
case 'line-start':
return yield* this.parseLineStart();
case 'block-start':
return yield* this.parseBlockStart();
case 'doc':
return yield* this.parseDocument();
case 'flow':
return yield* this.parseFlowCollection();
case 'quoted-scalar':
return yield* this.parseQuotedScalar();
case 'block-scalar':
return yield* this.parseBlockScalar();
case 'plain-scalar':
return yield* this.parsePlainScalar();
}
}
*parseStream() {
let line = this.getLine();
if (line === null)
return this.setNext('stream');
if (line[0] === BOM) {
yield* this.pushCount(1);
line = line.substring(1);
}
if (line[0] === '%') {
let dirEnd = line.length;
let cs = line.indexOf('#');
while (cs !== -1) {
const ch = line[cs - 1];
if (ch === ' ' || ch === '\t') {
dirEnd = cs - 1;
break;
}
else {
cs = line.indexOf('#', cs + 1);
}
}
while (true) {
const ch = line[dirEnd - 1];
if (ch === ' ' || ch === '\t')
dirEnd -= 1;
else
break;
}
const n = (yield* this.pushCount(dirEnd)) + (yield* this.pushSpaces(true));
yield* this.pushCount(line.length - n); // possible comment
this.pushNewline();
return 'stream';
}
if (this.atLineEnd()) {
const sp = yield* this.pushSpaces(true);
yield* this.pushCount(line.length - sp);
yield* this.pushNewline();
return 'stream';
}
yield DOCUMENT;
return yield* this.parseLineStart();
}
*parseLineStart() {
const ch = this.charAt(0);
if (!ch && !this.atEnd)
return this.setNext('line-start');
if (ch === '-' || ch === '.') {
if (!this.atEnd && !this.hasChars(4))
return this.setNext('line-start');
const s = this.peek(3);
if (s === '---' && isEmpty(this.charAt(3))) {
yield* this.pushCount(3);
this.indentValue = 0;
this.indentNext = 0;
return 'doc';
}
else if (s === '...' && isEmpty(this.charAt(3))) {
yield* this.pushCount(3);
return 'stream';
}
}
this.indentValue = yield* this.pushSpaces(false);
if (this.indentNext > this.indentValue && !isEmpty(this.charAt(1)))
this.indentNext = this.indentValue;
return yield* this.parseBlockStart();
}
*parseBlockStart() {
const [ch0, ch1] = this.peek(2);
if (!ch1 && !this.atEnd)
return this.setNext('block-start');
if ((ch0 === '-' || ch0 === '?' || ch0 === ':') && isEmpty(ch1)) {
const n = (yield* this.pushCount(1)) + (yield* this.pushSpaces(true));
this.indentNext = this.indentValue + 1;
this.indentValue += n;
return yield* this.parseBlockStart();
}
return 'doc';
}
*parseDocument() {
yield* this.pushSpaces(true);
const line = this.getLine();
if (line === null)
return this.setNext('doc');
let n = yield* this.pushIndicators();
switch (line[n]) {
case '#':
yield* this.pushCount(line.length - n);
// fallthrough
case undefined:
yield* this.pushNewline();
return yield* this.parseLineStart();
case '{':
case '[':
yield* this.pushCount(1);
this.flowKey = false;
this.flowLevel = 1;
return 'flow';
case '}':
case ']':
// this is an error
yield* this.pushCount(1);
return 'doc';
case '*':
yield* this.pushUntil(isNotAnchorChar);
return 'doc';
case '"':
case "'":
return yield* this.parseQuotedScalar();
case '|':
case '>':
n += yield* this.parseBlockScalarHeader();
n += yield* this.pushSpaces(true);
yield* this.pushCount(line.length - n);
yield* this.pushNewline();
return yield* this.parseBlockScalar();
default:
return yield* this.parsePlainScalar();
}
}
*parseFlowCollection() {
let nl, sp;
let indent = -1;
do {
nl = yield* this.pushNewline();
if (nl > 0) {
sp = yield* this.pushSpaces(false);
this.indentValue = indent = sp;
}
else {
sp = 0;
}
sp += yield* this.pushSpaces(true);
} while (nl + sp > 0);
const line = this.getLine();
if (line === null)
return this.setNext('flow');
if ((indent !== -1 && indent < this.indentNext && line[0] !== '#') ||
(indent === 0 &&
(line.startsWith('---') || line.startsWith('...')) &&
isEmpty(line[3]))) {
// Allowing for the terminal ] or } at the same (rather than greater)
// indent level as the initial [ or { is technically invalid, but
// failing here would be surprising to users.
const atFlowEndMarker = indent === this.indentNext - 1 &&
this.flowLevel === 1 &&
(line[0] === ']' || line[0] === '}');
if (!atFlowEndMarker) {
// this is an error
this.flowLevel = 0;
yield FLOW_END;
return yield* this.parseLineStart();
}
}
let n = 0;
while (line[n] === ',') {
n += yield* this.pushCount(1);
n += yield* this.pushSpaces(true);
this.flowKey = false;
}
n += yield* this.pushIndicators();
switch (line[n]) {
case undefined:
return 'flow';
case '#':
yield* this.pushCount(line.length - n);
return 'flow';
case '{':
case '[':
yield* this.pushCount(1);
this.flowKey = false;
this.flowLevel += 1;
return 'flow';
case '}':
case ']':
yield* this.pushCount(1);
this.flowKey = true;
this.flowLevel -= 1;
return this.flowLevel ? 'flow' : 'doc';
case '*':
yield* this.pushUntil(isNotAnchorChar);
return 'flow';
case '"':
case "'":
this.flowKey = true;
return yield* this.parseQuotedScalar();
case ':': {
const next = this.charAt(1);
if (this.flowKey || isEmpty(next) || next === ',') {
this.flowKey = false;
yield* this.pushCount(1);
yield* this.pushSpaces(true);
return 'flow';
}
}
// fallthrough
default:
this.flowKey = false;
return yield* this.parsePlainScalar();
}
}
*parseQuotedScalar() {
const quote = this.charAt(0);
let end = this.buffer.indexOf(quote, this.pos + 1);
if (quote === "'") {
while (end !== -1 && this.buffer[end + 1] === "'")
end = this.buffer.indexOf("'", end + 2);
}
else {
// double-quote
while (end !== -1) {
let n = 0;
while (this.buffer[end - 1 - n] === '\\')
n += 1;
if (n % 2 === 0)
break;
end = this.buffer.indexOf('"', end + 1);
}
}
// Only looking for newlines within the quotes
const qb = this.buffer.substring(0, end);
let nl = qb.indexOf('\n', this.pos);
if (nl !== -1) {
while (nl !== -1) {
const cs = this.continueScalar(nl + 1);
if (cs === -1)
break;
nl = qb.indexOf('\n', cs);
}
if (nl !== -1) {
// this is an error caused by an unexpected unindent
end = nl - (qb[nl - 1] === '\r' ? 2 : 1);
}
}
if (end === -1) {
if (!this.atEnd)
return this.setNext('quoted-scalar');
end = this.buffer.length;
}
yield* this.pushToIndex(end + 1, false);
return this.flowLevel ? 'flow' : 'doc';
}
*parseBlockScalarHeader() {
this.blockScalarIndent = -1;
this.blockScalarKeep = false;
let i = this.pos;
while (true) {
const ch = this.buffer[++i];
if (ch === '+')
this.blockScalarKeep = true;
else if (ch > '0' && ch <= '9')
this.blockScalarIndent = Number(ch) - 1;
else if (ch !== '-')
break;
}
return yield* this.pushUntil(ch => isEmpty(ch) || ch === '#');
}
*parseBlockScalar() {
let nl = this.pos - 1; // may be -1 if this.pos === 0
let indent = 0;
let ch;
loop: for (let i = this.pos; (ch = this.buffer[i]); ++i) {
switch (ch) {
case ' ':
indent += 1;
break;
case '\n':
nl = i;
indent = 0;
break;
case '\r': {
const next = this.buffer[i + 1];
if (!next && !this.atEnd)
return this.setNext('block-scalar');
if (next === '\n')
break;
} // fallthrough
default:
break loop;
}
}
if (!ch && !this.atEnd)
return this.setNext('block-scalar');
if (indent >= this.indentNext) {
if (this.blockScalarIndent === -1)
this.indentNext = indent;
else {
this.indentNext =
this.blockScalarIndent + (this.indentNext === 0 ? 1 : this.indentNext);
}
do {
const cs = this.continueScalar(nl + 1);
if (cs === -1)
break;
nl = this.buffer.indexOf('\n', cs);
} while (nl !== -1);
if (nl === -1) {
if (!this.atEnd)
return this.setNext('block-scalar');
nl = this.buffer.length;
}
}
// Trailing insufficiently indented tabs are invalid.
// To catch that during parsing, we include them in the block scalar value.
let i = nl + 1;
ch = this.buffer[i];
while (ch === ' ')
ch = this.buffer[++i];
if (ch === '\t') {
while (ch === '\t' || ch === ' ' || ch === '\r' || ch === '\n')
ch = this.buffer[++i];
nl = i - 1;
}
else if (!this.blockScalarKeep) {
do {
let i = nl - 1;
let ch = this.buffer[i];
if (ch === '\r')
ch = this.buffer[--i];
const lastChar = i; // Drop the line if last char not more indented
while (ch === ' ')
ch = this.buffer[--i];
if (ch === '\n' && i >= this.pos && i + 1 + indent > lastChar)
nl = i;
else
break;
} while (true);
}
yield SCALAR;
yield* this.pushToIndex(nl + 1, true);
return yield* this.parseLineStart();
}
*parsePlainScalar() {
const inFlow = this.flowLevel > 0;
let end = this.pos - 1;
let i = this.pos - 1;
let ch;
while ((ch = this.buffer[++i])) {
if (ch === ':') {
const next = this.buffer[i + 1];
if (isEmpty(next) || (inFlow && flowIndicatorChars.has(next)))
break;
end = i;
}
else if (isEmpty(ch)) {
let next = this.buffer[i + 1];
if (ch === '\r') {
if (next === '\n') {
i += 1;
ch = '\n';
next = this.buffer[i + 1];
}
else
end = i;
}
if (next === '#' || (inFlow && flowIndicatorChars.has(next)))
break;
if (ch === '\n') {
const cs = this.continueScalar(i + 1);
if (cs === -1)
break;
i = Math.max(i, cs - 2); // to advance, but still account for ' #'
}
}
else {
if (inFlow && flowIndicatorChars.has(ch))
break;
end = i;
}
}
if (!ch && !this.atEnd)
return this.setNext('plain-scalar');
yield SCALAR;
yield* this.pushToIndex(end + 1, true);
return inFlow ? 'flow' : 'doc';
}
*pushCount(n) {
if (n > 0) {
yield this.buffer.substr(this.pos, n);
this.pos += n;
return n;
}
return 0;
}
*pushToIndex(i, allowEmpty) {
const s = this.buffer.slice(this.pos, i);
if (s) {
yield s;
this.pos += s.length;
return s.length;
}
else if (allowEmpty)
yield '';
return 0;
}
*pushIndicators() {
switch (this.charAt(0)) {
case '!':
return ((yield* this.pushTag()) +
(yield* this.pushSpaces(true)) +
(yield* this.pushIndicators()));
case '&':
return ((yield* this.pushUntil(isNotAnchorChar)) +
(yield* this.pushSpaces(true)) +
(yield* this.pushIndicators()));
case '-': // this is an error
case '?': // this is an error outside flow collections
case ':': {
const inFlow = this.flowLevel > 0;
const ch1 = this.charAt(1);
if (isEmpty(ch1) || (inFlow && flowIndicatorChars.has(ch1))) {
if (!inFlow)
this.indentNext = this.indentValue + 1;
else if (this.flowKey)
this.flowKey = false;
return ((yield* this.pushCount(1)) +
(yield* this.pushSpaces(true)) +
(yield* this.pushIndicators()));
}
}
}
return 0;
}
*pushTag() {
if (this.charAt(1) === '<') {
let i = this.pos + 2;
let ch = this.buffer[i];
while (!isEmpty(ch) && ch !== '>')
ch = this.buffer[++i];
return yield* this.pushToIndex(ch === '>' ? i + 1 : i, false);
}
else {
let i = this.pos + 1;
let ch = this.buffer[i];
while (ch) {
if (tagChars.has(ch))
ch = this.buffer[++i];
else if (ch === '%' &&
hexDigits.has(this.buffer[i + 1]) &&
hexDigits.has(this.buffer[i + 2])) {
ch = this.buffer[(i += 3)];
}
else
break;
}
return yield* this.pushToIndex(i, false);
}
}
*pushNewline() {
const ch = this.buffer[this.pos];
if (ch === '\n')
return yield* this.pushCount(1);
else if (ch === '\r' && this.charAt(1) === '\n')
return yield* this.pushCount(2);
else
return 0;
}
*pushSpaces(allowTabs) {
let i = this.pos - 1;
let ch;
do {
ch = this.buffer[++i];
} while (ch === ' ' || (allowTabs && ch === '\t'));
const n = i - this.pos;
if (n > 0) {
yield this.buffer.substr(this.pos, n);
this.pos = i;
}
return n;
}
*pushUntil(test) {
let i = this.pos;
let ch = this.buffer[i];
while (!test(ch))
ch = this.buffer[++i];
return yield* this.pushToIndex(i, false);
}
}
export { Lexer };