173 lines
5.7 KiB
TypeScript
173 lines
5.7 KiB
TypeScript
/** All the states the tokenizer can be in. */
|
|
declare const enum State {
|
|
Text = 1,
|
|
BeforeTagName = 2,
|
|
InTagName = 3,
|
|
InSelfClosingTag = 4,
|
|
BeforeClosingTagName = 5,
|
|
InClosingTagName = 6,
|
|
AfterClosingTagName = 7,
|
|
BeforeAttributeName = 8,
|
|
InAttributeName = 9,
|
|
AfterAttributeName = 10,
|
|
BeforeAttributeValue = 11,
|
|
InAttributeValueDq = 12,
|
|
InAttributeValueSq = 13,
|
|
InAttributeValueNq = 14,
|
|
BeforeDeclaration = 15,
|
|
InDeclaration = 16,
|
|
InProcessingInstruction = 17,
|
|
BeforeComment = 18,
|
|
CDATASequence = 19,
|
|
InSpecialComment = 20,
|
|
InCommentLike = 21,
|
|
BeforeSpecialS = 22,
|
|
SpecialStartSequence = 23,
|
|
InSpecialTag = 24,
|
|
BeforeEntity = 25,
|
|
BeforeNumericEntity = 26,
|
|
InNamedEntity = 27,
|
|
InNumericEntity = 28,
|
|
InHexEntity = 29
|
|
}
|
|
export interface Callbacks {
|
|
onattribdata(value: string): void;
|
|
onattribend(quote: string | undefined | null): void;
|
|
onattribname(name: string): void;
|
|
oncdata(data: string): void;
|
|
onclosetag(name: string): void;
|
|
oncomment(data: string): void;
|
|
ondeclaration(content: string): void;
|
|
onend(): void;
|
|
onerror(error: Error, state?: State): void;
|
|
onopentagend(): void;
|
|
onopentagname(name: string): void;
|
|
onprocessinginstruction(instruction: string): void;
|
|
onselfclosingtag(): void;
|
|
ontext(value: string): void;
|
|
}
|
|
export default class Tokenizer {
|
|
private readonly cbs;
|
|
/** The current state the tokenizer is in. */
|
|
private _state;
|
|
/** The read buffer. */
|
|
private buffer;
|
|
/** The beginning of the section that is currently being read. */
|
|
sectionStart: number;
|
|
/** The index within the buffer that we are currently looking at. */
|
|
private _index;
|
|
/**
|
|
* Data that has already been processed will be removed from the buffer occasionally.
|
|
* `_bufferOffset` keeps track of how many characters have been removed, to make sure position information is accurate.
|
|
*/
|
|
private bufferOffset;
|
|
/** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
|
|
private baseState;
|
|
/** For special parsing behavior inside of script and style tags. */
|
|
private isSpecial;
|
|
/** Indicates whether the tokenizer has been paused. */
|
|
private running;
|
|
/** Indicates whether the tokenizer has finished running / `.end` has been called. */
|
|
private ended;
|
|
private readonly xmlMode;
|
|
private readonly decodeEntities;
|
|
private readonly entityTrie;
|
|
constructor({ xmlMode, decodeEntities, }: {
|
|
xmlMode?: boolean;
|
|
decodeEntities?: boolean;
|
|
}, cbs: Callbacks);
|
|
reset(): void;
|
|
write(chunk: string): void;
|
|
end(chunk?: string): void;
|
|
pause(): void;
|
|
resume(): void;
|
|
/**
|
|
* The start of the current section.
|
|
*/
|
|
getAbsoluteSectionStart(): number;
|
|
/**
|
|
* The current index within all of the written data.
|
|
*/
|
|
getAbsoluteIndex(): number;
|
|
private stateText;
|
|
private currentSequence;
|
|
private sequenceIndex;
|
|
private stateSpecialStartSequence;
|
|
/** Look for an end tag. For <title> tags, also decode entities. */
|
|
private stateInSpecialTag;
|
|
private stateCDATASequence;
|
|
/**
|
|
* When we wait for one specific character, we can speed things up
|
|
* by skipping through the buffer until we find it.
|
|
*
|
|
* @returns Whether the character was found.
|
|
*/
|
|
private fastForwardTo;
|
|
/**
|
|
* Comments and CDATA end with `-->` and `]]>`.
|
|
*
|
|
* Their common qualities are:
|
|
* - Their end sequences have a distinct character they start with.
|
|
* - That character is then repeated, so we have to check multiple repeats.
|
|
* - All characters but the start character of the sequence can be skipped.
|
|
*/
|
|
private stateInCommentLike;
|
|
/**
|
|
* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
|
|
*
|
|
* XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
|
|
* We allow anything that wouldn't end the tag.
|
|
*/
|
|
private isTagStartChar;
|
|
private startSpecial;
|
|
private stateBeforeTagName;
|
|
private stateInTagName;
|
|
private stateBeforeClosingTagName;
|
|
private stateInClosingTagName;
|
|
private stateAfterClosingTagName;
|
|
private stateBeforeAttributeName;
|
|
private stateInSelfClosingTag;
|
|
private stateInAttributeName;
|
|
private stateAfterAttributeName;
|
|
private stateBeforeAttributeValue;
|
|
private handleInAttributeValue;
|
|
private stateInAttributeValueDoubleQuotes;
|
|
private stateInAttributeValueSingleQuotes;
|
|
private stateInAttributeValueNoQuotes;
|
|
private stateBeforeDeclaration;
|
|
private stateInDeclaration;
|
|
private stateInProcessingInstruction;
|
|
private stateBeforeComment;
|
|
private stateInSpecialComment;
|
|
private stateBeforeSpecialS;
|
|
private trieIndex;
|
|
private trieCurrent;
|
|
private trieResult;
|
|
private entityExcess;
|
|
private stateBeforeEntity;
|
|
private stateInNamedEntity;
|
|
private emitNamedEntity;
|
|
private stateBeforeNumericEntity;
|
|
private decodeNumericEntity;
|
|
private stateInNumericEntity;
|
|
private stateInHexEntity;
|
|
private allowLegacyEntity;
|
|
/**
|
|
* Remove data that has already been consumed from the buffer.
|
|
*/
|
|
private cleanup;
|
|
private shouldContinue;
|
|
/**
|
|
* Iterates through the buffer, calling the function corresponding to the current state.
|
|
*
|
|
* States that are more likely to be hit are higher up, as a performance improvement.
|
|
*/
|
|
private parse;
|
|
private finish;
|
|
/** Handle any trailing data. */
|
|
private handleTrailingData;
|
|
private getSection;
|
|
private emitPartial;
|
|
}
|
|
export {};
|
|
//# sourceMappingURL=Tokenizer.d.ts.map
|