x/leac/leac.ts

Lexer / tokenizer

import { createPositionQuery } from './positionQuery.ts';
/** Lexer options (not many so far). */export type Options = { /** * Enable line and column numbers computation. */ lineNumbers?: boolean};
/** Result returned by a lexer function. */
export type LexerResult = {
  /** Array of tokens. */
  tokens: Token[],
  /** Final offset. */
  offset: number,
  /**
   * True if the whole input string was processed.
   *
   * Check this to see whether some input was left untokenized.
   */
  complete: boolean
};
/**
 * Lexer function.
 *
 * @param str - A string to tokenize.
 * @param offset - Initial offset. Used when composing lexers.
 */
export type Lexer = (
  str: string,
  offset?: number
) => LexerResult;
/** Token object, the result of matching an individual lexing rule. */
export type Token = {
  /** Name of the lexer containing the rule that produced this token. */
  state: string;
  /** Name of the rule that produced this token. */
  name: string;
  /** Text matched by the rule. _(Unless a replace value was used by a RegexRule.)_ */
  text: string;
  /** Start index of the match in the input string. */
  offset: number;
  /**
   * The length of the matched substring.
   *
   * _(Might be different from the text length in case a replace value
   * was used in a RegexRule.)_
   */
  len: number;
  /**
   * Line number in the source string (1-based).
   *
   * _(Always zero if not enabled in the lexer options.)_
   */
  line: number;
  /**
   * Column number within the line in the source string (1-based).
   *
   * _(Always zero if line numbers are not enabled in the lexer options.)_
   */
  column: number;
}
/**
 * Lexing rule.
 *
 * A base rule looks for an exact match by its name.
 *
 * If the name and the lookup string have to be different,
 * specify the `str` property as defined in {@link StringRule}.
 */
export interface Rule {
  /** The name of the rule, also the name of tokens produced by this rule. */
  name: string;
  /**
   * Matched token won't be added to the output array if this is set to `true`.
   *
   * (_Think twice before using this._)
   */
  discard?: boolean;
  /**
   * Switch to another lexer function after this match,
   * concatenate its results and continue from where it stopped.
   */
  push?: Lexer;
  /**
   * Stop after this match and return.
   *
   * If there is a parent lexer, it will continue from this point.
   */
  pop?: boolean;
}
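// Illustrative sketch (not part of the module): composing lexers with `push`
// and `pop`. The rule and state names below are made up for this example.
// The inner lexer tokenizes the contents of a double-quoted string and pops
// back out on the closing quote; the outer lexer pushes into it on the
// opening quote and continues from where the inner lexer stopped.
const exampleStringLexer = createLexer([
  { name: 'closeQuote', str: '"', pop: true },
  { name: 'chars', regex: /[^"]+/ },
], 'string');

const exampleMainLexer = createLexer([
  { name: 'openQuote', str: '"', push: exampleStringLexer },
  { name: 'space', regex: /\s+/, discard: true },
  { name: 'word', regex: /\w+/ },
], 'main');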
/**
 * String rule - looks for an exact string match that
 * can be different from the name of the rule.
 */
export interface StringRule extends Rule {
  /**
   * Specify the exact string to match
   * if it is different from the name of the rule.
   */
  str: string;
}
/**
 * Regex rule - looks for a regular expression match.
 */
export interface RegexRule extends Rule {
  /**
   * Regular expression to match.
   *
   * - Can't have the global flag.
   *
   * - All regular expressions are used as sticky,
   *   you don't have to specify the sticky flag.
   *
   * - Empty matches are considered as non-matches -
   *   no token will be emitted in that case.
   */
  regex: RegExp;
  /**
   * Replacement string, which can include patterns,
   * the same as [String.prototype.replace()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace#specifying_a_string_as_a_parameter).
   *
   * This only affects the text property of an output token, not its offset or length.
   *
   * Note: the regex has to be able to match the matched substring when taken out of context
   * in order for replace to work - boundary/neighborhood conditions may prevent this.
   */
  replace?: string;
}
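// Illustrative sketch (not part of the module): a regex rule that uses
// `replace` to strip the surrounding quotes from the token text. The token's
// `text` becomes the capture group, while `offset` and `len` still describe
// the full quoted match. The rule name is made up for this example.
const exampleQuotedRule: RegexRule = {
  name: 'quoted',
  regex: /"([^"]*)"/,
  replace: '$1'
};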
/** Type guard: the rule has a `regex` property. */
function isRegexRule (r: Rule): r is RegexRule {
  return Object.prototype.hasOwnProperty.call(r, 'regex');
}
/** Type guard: the rule has a `str` property. */
function isStringRule (r: Rule): r is StringRule {
  return Object.prototype.hasOwnProperty.call(r, 'str');
}
/**
 * Non-empty array of rules.
 *
 * Rules are processed in the provided order, the first match is taken.
 *
 * Rules can have the same name. For example, you can have
 * separate rules for various keywords and use the same name "keyword".
 */
export type Rules = [
  (Rule|StringRule|RegexRule),
  ...(Rule|StringRule|RegexRule)[]
];
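// Illustrative sketch (not part of the module): several rules sharing the
// name "keyword" alongside a generic identifier rule. Rules are tried in
// order and the first match is taken. All names here are made up for the
// example.
const exampleRules: Rules = [
  { name: 'keyword', str: 'let' },
  { name: 'keyword', str: 'const' },
  { name: 'identifier', regex: /[a-zA-Z_]\w*/ },
  { name: 'space', regex: /\s+/, discard: true },
];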

/**
 * Create a lexer function.
 *
 * @param rules - Non-empty array of lexing rules.
 *
 * Rules are processed in the provided order, the first match is taken.
 *
 * Rules can have the same name - you can have separate rules
 * for keywords and use the same name "keyword" for example.
 *
 * @param state - The name of this lexer. Use when composing lexers.
 * Empty string by default.
 *
 * @param options - Lexer options object.
 */
export function createLexer (
  rules: Rules,
  state?: string,
  options?: Options
): Lexer;
/**
 * Create a lexer function.
 *
 * @param rules - Non-empty array of lexing rules.
 *
 * Rules are processed in the provided order, the first match is taken.
 *
 * Rules can have the same name - you can have separate rules
 * for keywords and use the same name "keyword" for example.
 *
 * @param options - Lexer options object.
 */
export function createLexer (
  rules: Rules,
  options?: Options
): Lexer;
export function createLexer (
  rules: Rules,
  state: string|Options = '',
  options: Options = {}
): Lexer {
  const options1 = (typeof state !== 'string') ? state : options;
  const state1 = (typeof state === 'string') ? state : '';
  const regexRules: RegexRule[] = rules.map(toRegexRule);
  const isLineNumbers = !!options1.lineNumbers;
  return function (str: string, offset = 0) {
    const positionQuery = (isLineNumbers)
      ? createPositionQuery(str)
      : () => ({ line: 0, column: 0 });
    let currentIndex = offset;
    const tokens: Token[] = [];
    loopStr:
    while (currentIndex < str.length) {
      let anyMatch = false;
      for (const rule of regexRules) {
        rule.regex.lastIndex = currentIndex;
        const match = rule.regex.exec(str);
        if (match && match[0].length > 0) {
          if (!rule.discard) {
            const position = positionQuery(currentIndex);
            const text = (typeof rule.replace === 'string')
              ? match[0].replace(
                new RegExp(rule.regex.source, rule.regex.flags),
                rule.replace
              )
              : match[0];
            tokens.push({
              state: state1,
              name: rule.name,
              text: text,
              offset: currentIndex,
              len: match[0].length,
              line: position.line,
              column: position.column
            });
          }
          currentIndex = rule.regex.lastIndex;
          anyMatch = true;
          if (rule.push) {
            const r = rule.push(str, currentIndex);
            tokens.push(...r.tokens);
            currentIndex = r.offset;
          }
          if (rule.pop) { break loopStr; }
          break;
        }
      }
      if (!anyMatch) { break; }
    }
    return {
      tokens: tokens,
      offset: currentIndex,
      complete: str.length <= currentIndex
    };
  };
}
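// Illustrative usage sketch (not part of the module): create a lexer with
// line numbers enabled and tokenize a short string. The rule names and input
// are made up for this example; `complete` tells whether the whole input was
// consumed.
const exampleLexer = createLexer([
  { name: 'word', regex: /[a-z]+/ },
  { name: 'number', regex: /\d+/ },
  { name: 'space', regex: /\s+/, discard: true },
], 'example', { lineNumbers: true });

const exampleResult = exampleLexer('two 2 tango');
// exampleResult.complete === true
// exampleResult.tokens[1] is roughly:
// { state: 'example', name: 'number', text: '2', offset: 4, len: 1, line: 1, column: 5 }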
/** Convert any rule into a RegexRule by building its matching RegExp. */
function toRegexRule (r: Rule, i: number): RegexRule {
  return {
    ...r,
    regex: toRegExp(r, i)
  };
}
/** Build the sticky RegExp used to match a rule, validating its name and `str`. */
function toRegExp (r: Rule, i: number): RegExp {
  if (r.name.length === 0) {
    throw new Error(
      `Rule #${i} has empty name, which is not allowed.`
    );
  }
  if (isRegexRule(r)) { return toSticky(r.regex); }
  if (isStringRule(r)) {
    if (r.str.length === 0) {
      throw new Error(
        `Rule #${i} ("${r.name}") has empty "str" property, which is not allowed.`
      );
    }
    return new RegExp(escapeRegExp(r.str), 'y');
  }
  return new RegExp(escapeRegExp(r.name), 'y');
}
/** Escape special characters so a literal string can be used inside a RegExp. */
function escapeRegExp (str: string) {
  return str.replace(/[-[\]{}()*+!<=:?./\\^$|#\s,]/g, '\\$&');
}
/** Reject the global flag and return a sticky version of the given RegExp. */
function toSticky (re: RegExp) {
  if (re.global) {
    throw new Error(
      `Regular expression /${re.source}/${re.flags} contains the global flag, which is not allowed.`
    );
  }
  return (re.sticky) ? re : new RegExp(re.source, re.flags + 'y');
}