Skip to main content
Module

x/sax_ts/src/sax.ts

SAX-style (Simple API for XML) in TypeScript
Go to Latest
File
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506
/* eslint-disable no-constant-condition *//* eslint-disable @typescript-eslint/no-explicit-any *//* eslint-disable @typescript-eslint/no-this-alias *//* eslint-disable no-empty *//* eslint-disable no-prototype-builtins *//* eslint-disable no-case-declarations *//* eslint-disable no-misleading-character-class */// TODO: remove all "any" types and fix eslint issues
/** Matches one character accepted as the first character of a tag or attribute name. */
const nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;

/** Matches one character accepted after the first character of a tag or attribute name. */
const nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/;

/** Same set as nameStart plus '#', used for the first character of an entity reference. */
const entityStart = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;

/** Same set as nameBody plus '#', used for the remaining characters of an entity reference. */
const entityBody = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/;

/**
 * Named character entities.
 *
 * Entries are written either as the replacement string itself or as a numeric
 * character code; the loop below this table converts every numeric entry to
 * its one-character string, so after module load all values are strings.
 */
export const ENTITIES: {[key: string]: number | string} = {
  amp: '&',
  gt: '>',
  lt: '<',
  quot: '"',
  apos: "'",
  AElig: 198, Aacute: 193, Acirc: 194, Agrave: 192, Aring: 197, Atilde: 195,
  Auml: 196, Ccedil: 199, ETH: 208, Eacute: 201, Ecirc: 202, Egrave: 200,
  Euml: 203, Iacute: 205, Icirc: 206, Igrave: 204, Iuml: 207, Ntilde: 209,
  Oacute: 211, Ocirc: 212, Ograve: 210, Oslash: 216, Otilde: 213, Ouml: 214,
  THORN: 222, Uacute: 218, Ucirc: 219, Ugrave: 217, Uuml: 220, Yacute: 221,
  aacute: 225, acirc: 226, aelig: 230, agrave: 224, aring: 229, atilde: 227,
  auml: 228, ccedil: 231, eacute: 233, ecirc: 234, egrave: 232, eth: 240,
  euml: 235, iacute: 237, icirc: 238, igrave: 236, iuml: 239, ntilde: 241,
  oacute: 243, ocirc: 244, ograve: 242, oslash: 248, otilde: 245, ouml: 246,
  szlig: 223, thorn: 254, uacute: 250, ucirc: 251, ugrave: 249, uuml: 252,
  yacute: 253, yuml: 255, copy: 169, reg: 174, nbsp: 160, iexcl: 161,
  cent: 162, pound: 163, curren: 164, yen: 165, brvbar: 166, sect: 167,
  uml: 168, ordf: 170, laquo: 171, not: 172, shy: 173, macr: 175, deg: 176,
  plusmn: 177, sup1: 185, sup2: 178, sup3: 179, acute: 180, micro: 181,
  para: 182, middot: 183, cedil: 184, ordm: 186, raquo: 187, frac14: 188,
  frac12: 189, frac34: 190, iquest: 191, times: 215, divide: 247, OElig: 338,
  oelig: 339, Scaron: 352, scaron: 353, Yuml: 376, fnof: 402, circ: 710,
  tilde: 732, Alpha: 913, Beta: 914, Gamma: 915, Delta: 916, Epsilon: 917,
  Zeta: 918, Eta: 919, Theta: 920, Iota: 921, Kappa: 922, Lambda: 923,
  Mu: 924, Nu: 925, Xi: 926, Omicron: 927, Pi: 928, Rho: 929, Sigma: 931,
  Tau: 932, Upsilon: 933, Phi: 934, Chi: 935, Psi: 936, Omega: 937,
  alpha: 945, beta: 946, gamma: 947, delta: 948, epsilon: 949, zeta: 950,
  eta: 951, theta: 952, iota: 953, kappa: 954, lambda: 955, mu: 956, nu: 957,
  xi: 958, omicron: 959, pi: 960, rho: 961, sigmaf: 962, sigma: 963,
  tau: 964, upsilon: 965, phi: 966, chi: 967, psi: 968, omega: 969,
  thetasym: 977, upsih: 978, piv: 982, ensp: 8194, emsp: 8195, thinsp: 8201,
  zwnj: 8204, zwj: 8205, lrm: 8206, rlm: 8207, ndash: 8211, mdash: 8212,
  lsquo: 8216, rsquo: 8217, sbquo: 8218, ldquo: 8220, rdquo: 8221,
  bdquo: 8222, dagger: 8224, Dagger: 8225, bull: 8226, hellip: 8230,
  permil: 8240, prime: 8242, Prime: 8243, lsaquo: 8249, rsaquo: 8250,
  oline: 8254, frasl: 8260, euro: 8364, image: 8465, weierp: 8472,
  real: 8476, trade: 8482, alefsym: 8501, larr: 8592, uarr: 8593, rarr: 8594,
  darr: 8595, harr: 8596, crarr: 8629, lArr: 8656, uArr: 8657, rArr: 8658,
  dArr: 8659, hArr: 8660, forall: 8704, part: 8706, exist: 8707, empty: 8709,
  nabla: 8711, isin: 8712, notin: 8713, ni: 8715, prod: 8719, sum: 8721,
  minus: 8722, lowast: 8727, radic: 8730, prop: 8733, infin: 8734, ang: 8736,
  and: 8743, or: 8744, cap: 8745, cup: 8746, int: 8747, there4: 8756,
  sim: 8764, cong: 8773, asymp: 8776, ne: 8800, equiv: 8801, le: 8804,
  ge: 8805, sub: 8834, sup: 8835, nsub: 8836, sube: 8838, supe: 8839,
  oplus: 8853, otimes: 8855, perp: 8869, sdot: 8901, lceil: 8968,
  rceil: 8969, lfloor: 8970, rfloor: 8971, lang: 9001, rang: 9002, loz: 9674,
  spades: 9824, clubs: 9827, hearts: 9829, diams: 9830,
};

// Replace every numeric entry with the character it denotes; string entries
// are already in their final form.
for (const [entityName, entityValue] of Object.entries(ENTITIES)) {
  if (typeof entityValue === 'number') {
    ENTITIES[entityName] = String.fromCharCode(entityValue);
  }
}
/**
 * Open-ended shape implemented by the SAX class: any string key may be read
 * or written on an instance, with values of any type.
 */
interface SAXInterface { [key: string]: any;}
export class SAX implements SAXInterface { [key: string]: any;
EVENTS: string[]; ENTITIES: {[key: string]: number | string} = { // TODO: make it readonly, needed for entity-mega test // amp, gt, lt, quot and apos are resolved to strings instead of numerical // codes, IDK why ...ENTITIES, }; protected XML_ENTITIES: {[key: string]: string} = { amp: '&', gt: '>', lt: '<', quot: '"', apos: "'", }; protected S: any = 0; protected opt: any; protected trackPosition = false; protected column = 0; protected line = 0; protected c = ''; protected error: any; protected q = ''; protected bufferCheckPosition: any; protected closed = false; protected tags: any[] = []; protected looseCase = ''; protected closedRoot = false; protected sawRoot = false; protected strict = false; protected tag: any; protected strictEntities: any; protected state: any; protected noscript = false; protected attribList: any[] = []; protected ns: any; protected position = 0; private STATE: {[index: string]: any} = { BEGIN: this.S++, // leading byte order mark or whitespace BEGIN_WHITESPACE: this.S++, // leading whitespace TEXT: this.S++, // general stuff TEXT_ENTITY: this.S++, // &amp and such. OPEN_WAKA: this.S++, // < SGML_DECL: this.S++, // <!BLARG SGML_DECL_QUOTED: this.S++, // <!BLARG foo "bar DOCTYPE: this.S++, // <!DOCTYPE DOCTYPE_QUOTED: this.S++, // <!DOCTYPE "//blah DOCTYPE_DTD: this.S++, // <!DOCTYPE "//blah" [ ... DOCTYPE_DTD_QUOTED: this.S++, // <!DOCTYPE "//blah" [ "foo COMMENT_STARTING: this.S++, // <!- COMMENT: this.S++, // <!-- COMMENT_ENDING: this.S++, // <!-- blah - COMMENT_ENDED: this.S++, // <!-- blah -- CDATA: this.S++, // <![CDATA[ something CDATA_ENDING: this.S++, // ] CDATA_ENDING_2: this.S++, // ]] PROC_INST: this.S++, // <?hi PROC_INST_BODY: this.S++, // <?hi there PROC_INST_ENDING: this.S++, // <?hi "there" ? 
OPEN_TAG: this.S++, // <strong OPEN_TAG_SLASH: this.S++, // <strong / ATTRIB: this.S++, // <a ATTRIB_NAME: this.S++, // <a foo ATTRIB_NAME_SAW_WHITE: this.S++, // <a foo _ ATTRIB_VALUE: this.S++, // <a foo= ATTRIB_VALUE_QUOTED: this.S++, // <a foo="bar ATTRIB_VALUE_CLOSED: this.S++, // <a foo="bar" ATTRIB_VALUE_UNQUOTED: this.S++, // <a foo=bar ATTRIB_VALUE_ENTITY_Q: this.S++, // <foo bar="&quot;" ATTRIB_VALUE_ENTITY_U: this.S++, // <foo bar=&quot CLOSE_TAG: this.S++, // </a CLOSE_TAG_SAW_WHITE: this.S++, // </a > SCRIPT: this.S++, // <script> ... SCRIPT_ENDING: this.S++, // <script> ... < }; private readonly BUFFERS: string[]; private parser: (strict: boolean, opt: any) => SAXParser; private CDATA = '[CDATA['; private DOCTYPE = 'DOCTYPE'; private XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'; private XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'; protected rootNS: {} = { xml: this.XML_NAMESPACE, xmlns: this.XMLNS_NAMESPACE, }; private comment: any; private sgmlDecl: any; private textNode = ''; private tagName: any; private doctype: any; private procInstName: any; private procInstBody: any; private entity = ''; private attribName: any; private attribValue: any; private cdata = ''; private script = ''; private startTagPosition = 0;
/**
 * Initialises the buffer-name and event-name lists, adds reverse
 * (state number -> state name) entries to the state table, aliases `S` to
 * that table, and installs the `parser` factory.
 */
constructor() {
  this.BUFFERS = [
    'comment',
    'sgmlDecl',
    'textNode',
    'tagName',
    'doctype',
    'procInstName',
    'procInstBody',
    'entity',
    'attribName',
    'attribValue',
    'cdata',
    'script',
  ];
  this.EVENTS = [
    'text',
    'processinginstruction',
    'sgmldeclaration',
    'doctype',
    'comment',
    'opentagstart',
    'attribute',
    'opentag',
    'closetag',
    'opencdata',
    'cdata',
    'closecdata',
    'error',
    'end',
    'ready',
    'script',
    'opennamespace',
    'closenamespace',
  ];

  // Add the reverse lookup next to the forward entries. The keys are
  // snapshotted first: the loop adds new (numeric) keys to the same object,
  // and whether a for..in loop visits properties added during iteration is
  // not guaranteed. Visiting one would overwrite a forward entry.
  for (const stateName of Object.keys(this.STATE)) {
    this.STATE[this.STATE[stateName]] = stateName;
  }

  // shorthand
  this.S = this.STATE;

  this.parser = (strict, opt) => new SAXParser(strict, opt);
}
/**
 * Returns the character of `chunk` at index `i`, or the empty string when
 * `i` is not below the chunk length.
 */
private static charAt(chunk: string, i: number) {
  return i < chunk.length ? chunk.charAt(i) : '';
}
/** True when `c` is exactly a space, line feed, carriage return or tab. */
private static isWhitespace(c: string) {
  switch (c) {
    case ' ':
    case '\n':
    case '\r':
    case '\t':
      return true;
    default:
      return false;
  }
}
/** True when `c` is a single or a double quote character. */
private static isQuote(c: string) {
  switch (c) {
    case '"':
    case "'":
      return true;
    default:
      return false;
  }
}
/** True when `c` ends an unquoted attribute value: '>' or whitespace. */
private static isAttribEnd(c: string) {
  if (c === '>') {
    return true;
  }
  return SAX.isWhitespace(c);
}
/** Reports whether `regex` matches the string `c`. */
private static isMatch(regex: RegExp, c: string) {
  const matched = regex.test(c);
  return matched;
}
/** Negation of isMatch: true when `regex` does not match `c`. */
private static notMatch(regex: RegExp, c: string) {
  if (SAX.isMatch(regex, c)) {
    return false;
  }
  return true;
}
/**
 * Splits a name into `{prefix, local}`.
 *
 * A name without ':' has an empty prefix. Otherwise the name is split on ':'
 * and the first two pieces are used. When `attribute` is truthy and the name
 * is exactly 'xmlns', the result is prefix 'xmlns' with an empty local part.
 */
private static qname(name: string, attribute?: string | boolean) {
  // <x "xmlns"="http://foo">
  if (attribute && name === 'xmlns') {
    return {prefix: 'xmlns', local: ''};
  }

  const parts = name.indexOf(':') < 0 ? ['', name] : name.split(':');
  const prefix = parts[0];
  const local = parts[1];
  return {prefix, local};
}
/**
 * Feeds one chunk of input through the parser state machine, one character
 * at a time.
 *
 * - Throws the stored error if `this.error` is set.
 * - Reports an error through errorFunction (and returns its result) when the
 *   parser is already closed.
 * - A `null` chunk ends the parse: the result of end() is returned.
 * - An object chunk is converted with its toString() before parsing.
 *
 * After the chunk is consumed, the buffer-length check runs if the current
 * position has reached `bufferCheckPosition`.
 *
 * @param chunk text to parse, an object convertible with toString(), or null
 * @returns `this` after a normal write
 * @throws the stored error, or an Error for an unknown parser state
 */
write(chunk: null | object | string) {
  if (this.error) {
    throw this.error;
  }
  if (this.closed) {
    return this.errorFunction(
      'Cannot write after close. Assign an onready handler.'
    );
  }
  if (chunk === null) {
    return this.end();
  }
  if (typeof chunk === 'object') {
    chunk = chunk.toString();
  }
  let i = 0;
  let c: string;
  while (true) {
    c = SAX.charAt(chunk, i++);
    this.c = c;

    // charAt yields '' once the chunk is exhausted.
    if (!c) {
      break;
    }

    if (this.trackPosition) {
      this.position++;
      if (c === '\n') {
        this.line++;
        this.column = 0;
      } else {
        this.column++;
      }
    }

    switch (this.state) {
      case this.S.BEGIN:
        this.state = this.S.BEGIN_WHITESPACE;
        if (c === '\uFEFF') {
          continue;
        }
        this.beginWhiteSpace(c);
        continue;

      case this.S.BEGIN_WHITESPACE:
        this.beginWhiteSpace(c);
        continue;

      case this.S.TEXT:
        if (this.sawRoot && !this.closedRoot) {
          // Fast path: consume a whole run of plain text in one go.
          const starti = i - 1;
          while (c && c !== '<' && c !== '&') {
            c = SAX.charAt(chunk, i++);
            if (c && this.trackPosition) {
              this.position++;
              if (c === '\n') {
                this.line++;
                this.column = 0;
              } else {
                this.column++;
              }
            }
          }
          this.textNode += chunk.substring(starti, i - 1);
        }
        if (c === '<' && !(this.sawRoot && this.closedRoot && !this.strict)) {
          this.state = this.S.OPEN_WAKA;
          this.startTagPosition = this.position;
        } else {
          if (!SAX.isWhitespace(c) && (!this.sawRoot || this.closedRoot)) {
            this.strictFail('Text data outside of root node.');
          }
          if (c === '&') {
            this.state = this.S.TEXT_ENTITY;
          } else {
            this.textNode += c;
          }
        }
        continue;

      case this.S.SCRIPT:
        // only non-strict
        if (c === '<') {
          this.state = this.S.SCRIPT_ENDING;
        } else {
          this.script += c;
        }
        continue;

      case this.S.SCRIPT_ENDING:
        if (c === '/') {
          this.state = this.S.CLOSE_TAG;
        } else {
          this.script += '<' + c;
          this.state = this.S.SCRIPT;
        }
        continue;

      case this.S.OPEN_WAKA:
        // either a /, ?, !, or text is coming next.
        if (c === '!') {
          this.state = this.S.SGML_DECL;
          this.sgmlDecl = '';
        } else if (SAX.isWhitespace(c)) {
          // wait for it...
        } else if (SAX.isMatch(nameStart, c)) {
          this.state = this.S.OPEN_TAG;
          this.tagName = c;
        } else if (c === '/') {
          this.state = this.S.CLOSE_TAG;
          this.tagName = '';
        } else if (c === '?') {
          this.state = this.S.PROC_INST;
          this.procInstName = this.procInstBody = '';
        } else {
          this.strictFail('Unencoded <');
          // if there was some whitespace, then add that in.
          if (this.startTagPosition + 1 < this.position) {
            const pad = this.position - this.startTagPosition;
            c = new Array(pad).join(' ') + c;
          }
          this.textNode += '<' + c;
          this.state = this.S.TEXT;
        }
        continue;

      case this.S.SGML_DECL:
        if ((this.sgmlDecl + c).toUpperCase() === this.CDATA) {
          this.emitNode('onopencdata');
          this.state = this.S.CDATA;
          this.sgmlDecl = '';
          this.cdata = '';
        } else if (this.sgmlDecl + c === '--') {
          this.state = this.S.COMMENT;
          this.comment = '';
          this.sgmlDecl = '';
        } else if ((this.sgmlDecl + c).toUpperCase() === this.DOCTYPE) {
          this.state = this.S.DOCTYPE;
          if (this.doctype || this.sawRoot) {
            this.strictFail('Inappropriately located doctype declaration');
          }
          this.doctype = '';
          this.sgmlDecl = '';
        } else if (c === '>') {
          this.emitNode('onsgmldeclaration', this.sgmlDecl);
          this.sgmlDecl = '';
          this.state = this.S.TEXT;
        } else if (SAX.isQuote(c)) {
          this.state = this.S.SGML_DECL_QUOTED;
          // Remember which quote opened the string. SGML_DECL_QUOTED leaves
          // only when it sees `this.q` again; without this assignment it
          // compared against '' and the quoted section could never end.
          this.q = c;
          this.sgmlDecl += c;
        } else {
          this.sgmlDecl += c;
        }
        continue;

      case this.S.SGML_DECL_QUOTED:
        if (c === this.q) {
          this.state = this.S.SGML_DECL;
          this.q = '';
        }
        this.sgmlDecl += c;
        continue;

      case this.S.DOCTYPE:
        if (c === '>') {
          this.state = this.S.TEXT;
          this.emitNode('ondoctype', this.doctype);
          this.doctype = true; // just remember that we saw it.
        } else {
          this.doctype += c;
          if (c === '[') {
            this.state = this.S.DOCTYPE_DTD;
          } else if (SAX.isQuote(c)) {
            this.state = this.S.DOCTYPE_QUOTED;
            this.q = c;
          }
        }
        continue;

      case this.S.DOCTYPE_QUOTED:
        this.doctype += c;
        if (c === this.q) {
          this.q = '';
          this.state = this.S.DOCTYPE;
        }
        continue;

      case this.S.DOCTYPE_DTD:
        this.doctype += c;
        if (c === ']') {
          this.state = this.S.DOCTYPE;
        } else if (SAX.isQuote(c)) {
          this.state = this.S.DOCTYPE_DTD_QUOTED;
          this.q = c;
        }
        continue;

      case this.S.DOCTYPE_DTD_QUOTED:
        this.doctype += c;
        if (c === this.q) {
          this.state = this.S.DOCTYPE_DTD;
          this.q = '';
        }
        continue;

      case this.S.COMMENT:
        if (c === '-') {
          this.state = this.S.COMMENT_ENDING;
        } else {
          this.comment += c;
        }
        continue;

      case this.S.COMMENT_ENDING:
        if (c === '-') {
          this.state = this.S.COMMENT_ENDED;
          this.comment = this.textApplyOptions(this.comment);
          if (this.comment) {
            this.emitNode('oncomment', this.comment);
          }
          this.comment = '';
        } else {
          this.comment += '-' + c;
          this.state = this.S.COMMENT;
        }
        continue;

      case this.S.COMMENT_ENDED:
        if (c !== '>') {
          this.strictFail('Malformed comment');
          // allow <!-- blah -- bloo --> in non-strict mode,
          // which is a comment of " blah -- bloo "
          this.comment += '--' + c;
          this.state = this.S.COMMENT;
        } else {
          this.state = this.S.TEXT;
        }
        continue;

      case this.S.CDATA:
        if (c === ']') {
          this.state = this.S.CDATA_ENDING;
        } else {
          this.cdata += c;
        }
        continue;

      case this.S.CDATA_ENDING:
        if (c === ']') {
          this.state = this.S.CDATA_ENDING_2;
        } else {
          this.cdata += ']' + c;
          this.state = this.S.CDATA;
        }
        continue;

      case this.S.CDATA_ENDING_2:
        if (c === '>') {
          if (this.cdata) {
            this.emitNode('oncdata', this.cdata);
          }
          this.emitNode('onclosecdata');
          this.cdata = '';
          this.state = this.S.TEXT;
        } else if (c === ']') {
          this.cdata += ']';
        } else {
          this.cdata += ']]' + c;
          this.state = this.S.CDATA;
        }
        continue;

      case this.S.PROC_INST:
        if (c === '?') {
          this.state = this.S.PROC_INST_ENDING;
        } else if (SAX.isWhitespace(c)) {
          this.state = this.S.PROC_INST_BODY;
        } else {
          this.procInstName += c;
        }
        continue;

      case this.S.PROC_INST_BODY:
        if (!this.procInstBody && SAX.isWhitespace(c)) {
          continue;
        } else if (c === '?') {
          this.state = this.S.PROC_INST_ENDING;
        } else {
          this.procInstBody += c;
        }
        continue;

      case this.S.PROC_INST_ENDING:
        if (c === '>') {
          this.emitNode('onprocessinginstruction', {
            name: this.procInstName,
            body: this.procInstBody,
          });
          this.procInstName = this.procInstBody = '';
          this.state = this.S.TEXT;
        } else {
          this.procInstBody += '?' + c;
          this.state = this.S.PROC_INST_BODY;
        }
        continue;

      case this.S.OPEN_TAG:
        if (SAX.isMatch(nameBody, c)) {
          this.tagName += c;
        } else {
          this.newTag();
          if (c === '>') {
            this.openTag();
          } else if (c === '/') {
            this.state = this.S.OPEN_TAG_SLASH;
          } else {
            if (!SAX.isWhitespace(c)) {
              this.strictFail('Invalid character in tag name');
            }
            this.state = this.S.ATTRIB;
          }
        }
        continue;

      case this.S.OPEN_TAG_SLASH:
        if (c === '>') {
          this.openTag(true);
          this.closeTag();
        } else {
          this.strictFail('Forward-slash in opening tag not followed by >');
          this.state = this.S.ATTRIB;
        }
        continue;

      case this.S.ATTRIB:
        // haven't read the attribute name yet.
        if (SAX.isWhitespace(c)) {
          continue;
        } else if (c === '>') {
          this.openTag();
        } else if (c === '/') {
          this.state = this.S.OPEN_TAG_SLASH;
        } else if (SAX.isMatch(nameStart, c)) {
          this.attribName = c;
          this.attribValue = '';
          this.state = this.S.ATTRIB_NAME;
        } else {
          this.strictFail('Invalid attribute name');
        }
        continue;

      case this.S.ATTRIB_NAME:
        if (c === '=') {
          this.state = this.S.ATTRIB_VALUE;
        } else if (c === '>') {
          this.strictFail('Attribute without value');
          this.attribValue = this.attribName;
          this.attrib();
          this.openTag();
        } else if (SAX.isWhitespace(c)) {
          this.state = this.S.ATTRIB_NAME_SAW_WHITE;
        } else if (SAX.isMatch(nameBody, c)) {
          this.attribName += c;
        } else {
          this.strictFail('Invalid attribute name');
        }
        continue;

      case this.S.ATTRIB_NAME_SAW_WHITE:
        if (c === '=') {
          this.state = this.S.ATTRIB_VALUE;
        } else if (SAX.isWhitespace(c)) {
          continue;
        } else {
          this.strictFail('Attribute without value');
          this.tag.attributes[this.attribName] = '';
          this.attribValue = '';
          this.emitNode('onattribute', {
            name: this.attribName,
            value: '',
          });
          this.attribName = '';
          if (c === '>') {
            this.openTag();
          } else if (SAX.isMatch(nameStart, c)) {
            this.attribName = c;
            this.state = this.S.ATTRIB_NAME;
          } else {
            this.strictFail('Invalid attribute name');
            this.state = this.S.ATTRIB;
          }
        }
        continue;

      case this.S.ATTRIB_VALUE:
        if (SAX.isWhitespace(c)) {
          continue;
        } else if (SAX.isQuote(c)) {
          this.q = c;
          this.state = this.S.ATTRIB_VALUE_QUOTED;
        } else {
          this.strictFail('Unquoted attribute value');
          this.state = this.S.ATTRIB_VALUE_UNQUOTED;
          this.attribValue = c;
        }
        continue;

      case this.S.ATTRIB_VALUE_QUOTED:
        if (c !== this.q) {
          if (c === '&') {
            this.state = this.S.ATTRIB_VALUE_ENTITY_Q;
          } else {
            this.attribValue += c;
          }
          continue;
        }
        this.attrib();
        this.q = '';
        this.state = this.S.ATTRIB_VALUE_CLOSED;
        continue;

      case this.S.ATTRIB_VALUE_CLOSED:
        if (SAX.isWhitespace(c)) {
          this.state = this.S.ATTRIB;
        } else if (c === '>') {
          this.openTag();
        } else if (c === '/') {
          this.state = this.S.OPEN_TAG_SLASH;
        } else if (SAX.isMatch(nameStart, c)) {
          this.strictFail('No whitespace between attributes');
          this.attribName = c;
          this.attribValue = '';
          this.state = this.S.ATTRIB_NAME;
        } else {
          this.strictFail('Invalid attribute name');
        }
        continue;

      case this.S.ATTRIB_VALUE_UNQUOTED:
        if (!SAX.isAttribEnd(c)) {
          if (c === '&') {
            this.state = this.S.ATTRIB_VALUE_ENTITY_U;
          } else {
            this.attribValue += c;
          }
          continue;
        }
        this.attrib();
        if (c === '>') {
          this.openTag();
        } else {
          this.state = this.S.ATTRIB;
        }
        continue;

      case this.S.CLOSE_TAG:
        if (!this.tagName) {
          if (SAX.isWhitespace(c)) {
            continue;
          } else if (SAX.notMatch(nameStart, c)) {
            if (this.script) {
              this.script += '</' + c;
              this.state = this.S.SCRIPT;
            } else {
              this.strictFail('Invalid tagname in closing tag.');
            }
          } else {
            this.tagName = c;
          }
        } else if (c === '>') {
          this.closeTag();
        } else if (SAX.isMatch(nameBody, c)) {
          this.tagName += c;
        } else if (this.script) {
          this.script += '</' + this.tagName;
          this.tagName = '';
          this.state = this.S.SCRIPT;
        } else {
          if (!SAX.isWhitespace(c)) {
            this.strictFail('Invalid tagname in closing tag');
          }
          this.state = this.S.CLOSE_TAG_SAW_WHITE;
        }
        continue;

      case this.S.CLOSE_TAG_SAW_WHITE:
        if (SAX.isWhitespace(c)) {
          continue;
        }
        if (c === '>') {
          this.closeTag();
        } else {
          this.strictFail('Invalid characters in closing tag');
        }
        continue;

      case this.S.TEXT_ENTITY:
      case this.S.ATTRIB_VALUE_ENTITY_Q:
      case this.S.ATTRIB_VALUE_ENTITY_U: {
        // Pick the state to return to and the buffer that receives the
        // decoded entity, depending on where the '&' was seen.
        let returnState;
        let buffer;
        switch (this.state) {
          case this.S.TEXT_ENTITY:
            returnState = this.S.TEXT;
            buffer = 'textNode';
            break;

          case this.S.ATTRIB_VALUE_ENTITY_Q:
            returnState = this.S.ATTRIB_VALUE_QUOTED;
            buffer = 'attribValue';
            break;

          case this.S.ATTRIB_VALUE_ENTITY_U:
            returnState = this.S.ATTRIB_VALUE_UNQUOTED;
            buffer = 'attribValue';
            break;

          default:
            throw new Error('Unknown state: ' + this.state);
        }

        if (c === ';') {
          this[buffer] += this.parseEntity();
          this.entity = '';
          this.state = returnState;
        } else if (
          SAX.isMatch(this.entity.length ? entityBody : entityStart, c)
        ) {
          this.entity += c;
        } else {
          this.strictFail('Invalid character in entity name');
          this[buffer] += '&' + this.entity + c;
          this.entity = '';
          this.state = returnState;
        }

        continue;
      }

      default:
        throw new Error('Unknown state: ' + this.state);
    }
  } // while

  if (this.position >= this.bufferCheckPosition) {
    this.checkBufferLength();
  }
  return this;
}
/**
 * Calls the handler stored under `event` with `data`, but only when that
 * handler is an own property of this instance; otherwise does nothing.
 */
protected emit(event: string, data?: Error | {}): void {
  if (!this.hasOwnProperty(event)) {
    return;
  }
  this[event](data);
}
/**
 * Resets every buffer named in `BUFFERS` to the empty string.
 */
protected clearBuffers() {
  // Index by the buffer *name*. The previous `this[this[i]]` looked the loop
  // counter up on the instance, which is undefined, so it wrote to a property
  // called "undefined" and left the real buffers untouched.
  for (const bufferName of this.BUFFERS) {
    this[bufferName] = '';
  }
}
/**
 * Emits whatever is pending: first the text node (via closeText), then a
 * non-empty cdata buffer as 'oncdata', then a non-empty script buffer as
 * 'onscript'. Each emitted buffer is reset to the empty string.
 */
protected flushBuffers() {
  this.closeText();

  const pending: Array<[string, string]> = [
    ['cdata', 'oncdata'],
    ['script', 'onscript'],
  ];
  for (const [bufferName, eventName] of pending) {
    if (this[bufferName] === '') {
      continue;
    }
    this.emitNode(eventName, this[bufferName]);
    this[bufferName] = '';
  }
}
/**
 * Finishes the parse.
 *
 * Reports 'Unclosed root tag' through strictFail when a root was opened but
 * not closed, and 'Unexpected end' when the parser is in any state other than
 * BEGIN, BEGIN_WHITESPACE or TEXT. Flushes pending text, marks the parser
 * closed and emits 'onend'.
 *
 * @returns a new SAXParser built from this parser's `strict` and `opt`
 */
protected end() {
  const rootLeftOpen = this.sawRoot && !this.closedRoot;
  if (rootLeftOpen) {
    this.strictFail('Unclosed root tag');
  }

  const restingStates = [
    this.S.BEGIN,
    this.S.BEGIN_WHITESPACE,
    this.S.TEXT,
  ];
  if (!restingStates.includes(this.state)) {
    this.errorFunction('Unexpected end');
  }

  this.closeText();
  this.c = '';
  this.closed = true;
  this.emit('onend');
  return new SAXParser(this.strict, this.opt);
}
/**
 * Records and reports an error.
 *
 * Pending text is flushed first. When position tracking is on, the current
 * line, column and character are appended to the message. The resulting
 * Error is stored in `this.error` and emitted as 'onerror'.
 *
 * @param er the error message
 * @returns `this`
 */
protected errorFunction(er: string) {
  this.closeText();

  let message = er;
  if (this.trackPosition) {
    message =
      message +
      '\nLine: ' +
      this.line +
      '\nColumn: ' +
      this.column +
      '\nChar: ' +
      this.c;
  }

  const failure = new Error(message);
  this.error = failure;
  this.emit('onerror', failure);
  return this;
}
/**
 * Completes the attribute currently held in `attribName` / `attribValue`.
 *
 * In non-strict mode the name is first case-converted with the method named
 * by `looseCase`. A name that was already seen on the current tag is dropped.
 * In xmlns mode, `xmlns` bindings are validated and pushed into the tag's
 * namespace scope, and the attribute is queued in `attribList`. Otherwise the
 * attribute is stored on the tag and 'onattribute' is emitted immediately.
 * Both attribute buffers are cleared afterwards.
 */
private attrib() {
  if (!this.strict) {
    this.attribName = this.attribName[this.looseCase]();
  }

  const name = this.attribName;
  // attribList holds [name, value] pairs, so a duplicate has to be found by
  // comparing the first element of each pair; indexOf(name) on the list of
  // pairs can never match a string.
  const alreadyQueued = this.attribList.some(pair => pair[0] === name);
  // Call hasOwnProperty through Object.prototype: attribute names come from
  // the document, and an attribute literally named "hasOwnProperty" would
  // otherwise shadow the method and make the next call throw.
  const alreadyStored = Object.prototype.hasOwnProperty.call(
    this.tag.attributes,
    name
  );
  if (alreadyQueued || alreadyStored) {
    this.attribName = this.attribValue = '';
    return;
  }

  if (this.opt.xmlns) {
    const qn = SAX.qname(this.attribName, true);
    const prefix = qn.prefix;
    const local = qn.local;

    if (prefix === 'xmlns') {
      // namespace binding attribute. push the binding into scope
      if (local === 'xml' && this.attribValue !== this.XML_NAMESPACE) {
        this.strictFail(
          'xml: prefix must be bound to ' +
            this.XML_NAMESPACE +
            '\n' +
            'Actual: ' +
            this.attribValue
        );
      } else if (
        local === 'xmlns' &&
        this.attribValue !== this.XMLNS_NAMESPACE
      ) {
        this.strictFail(
          'xmlns: prefix must be bound to ' +
            this.XMLNS_NAMESPACE +
            '\n' +
            'Actual: ' +
            this.attribValue
        );
      } else {
        const tag = this.tag;
        const parent = this.tags[this.tags.length - 1] || this;
        // Give the tag its own scope object the first time it adds a binding.
        if (tag.ns === parent.ns) {
          tag.ns = Object.create(parent.ns);
        }
        tag.ns[local] = this.attribValue;
      }
    }

    // defer onattribute events until all attributes have been seen
    // so any new bindings can take effect. preserve attribute order
    // so deferred events can be emitted in document order
    this.attribList.push([this.attribName, this.attribValue]);
  } else {
    // in non-xmlns mode, we can emit the event right away
    this.tag.attributes[this.attribName] = this.attribValue;
    this.emitNode('onattribute', {
      name: this.attribName,
      value: this.attribValue,
    });
  }

  this.attribName = this.attribValue = '';
}
/**
 * Starts a new tag object from the collected `tagName`.
 *
 * In non-strict mode the name is case-converted with the method named by
 * `looseCase`. The tag starts with an empty attribute map; in xmlns mode it
 * shares the namespace scope of the innermost open tag (or of the parser
 * itself when no tag is open). Clears `attribList` and emits
 * 'onopentagstart'.
 */
private newTag() {
  if (!this.strict) {
    this.tagName = this.tagName[this.looseCase]();
  }

  const tag: any = {name: this.tagName, attributes: {}};
  this.tag = tag;

  // will be overridden if tag contains an xmlns="foo" or xmlns:foo="bar"
  if (this.opt.xmlns) {
    const enclosing = this.tags[this.tags.length - 1] || this;
    tag.ns = enclosing.ns;
  }

  this.attribList.length = 0;
  this.emitNode('onopentagstart', tag);
}
/**
 * Resolves the entity name collected in `this.entity`.
 *
 * Lookup order: the name as written in `ENTITIES`, then its lower-cased form,
 * then a numeric character reference (`#123` or `#x7b`). Anything else is
 * reported through strictFail and returned unchanged as `&name;`.
 *
 * @returns the replacement text for the entity
 */
private parseEntity() {
  let entity = this.entity;
  const entityLC = entity.toLowerCase();
  let num = NaN;
  let numStr = '';

  // Accept only string/number table entries. A plain property read would
  // also find members inherited from Object.prototype, so "&constructor;"
  // or "&toString;" would resolve to a function.
  const lookup = (key: string) => {
    const value = this.ENTITIES[key];
    return typeof value === 'string' || typeof value === 'number'
      ? value
      : undefined;
  };

  const exact = lookup(entity);
  if (exact) {
    return exact;
  }
  const lowered = lookup(entityLC);
  if (lowered) {
    return lowered;
  }

  entity = entityLC;
  if (entity.charAt(0) === '#') {
    if (entity.charAt(1) === 'x') {
      entity = entity.slice(2);
      num = parseInt(entity, 16);
      numStr = num.toString(16);
    } else {
      entity = entity.slice(1);
      num = parseInt(entity, 10);
      numStr = num.toString(10);
    }
  }

  // Leading zeros are dropped so the digits can be compared with the
  // round-tripped number; a mismatch means the reference had trailing junk.
  entity = entity.replace(/^0+/, '');
  // String.fromCodePoint throws RangeError outside 0..0x10FFFF, so such
  // references are treated as invalid instead of aborting the write.
  const outOfRange = num < 0 || num > 0x10ffff;
  if (isNaN(num) || numStr.toLowerCase() !== entity || outOfRange) {
    this.strictFail('Invalid character entity');
    return '&' + this.entity + ';';
  }

  return String.fromCodePoint(num);
}
/**
 * Handles one character seen before the first tag.
 *
 * '<' switches to OPEN_WAKA and records the tag start position. Whitespace
 * is ignored. Any other character is reported through strictFail and then
 * becomes the start of a text node.
 */
private beginWhiteSpace(c: string) {
  if (c === '<') {
    this.state = this.S.OPEN_WAKA;
    this.startTagPosition = this.position;
    return;
  }

  if (SAX.isWhitespace(c)) {
    return;
  }

  // have to process this as a text node.
  // weird, but happens.
  this.strictFail('Non-whitespace before first tag.');
  this.textNode = c;
  this.state = this.S.TEXT;
}
/**
 * Reports `message` through errorFunction when the parser is in strict mode;
 * does nothing otherwise.
 *
 * @throws Error('bad call to strictFail') when `this` is not a SAXParser
 */
private strictFail(message: string) {
  const calledOnParser = typeof this === 'object' && this instanceof SAXParser;
  if (!calledOnParser) {
    throw new Error('bad call to strictFail');
  }
  if (!this.strict) {
    return;
  }
  this.errorFunction(message);
}
/**
 * Applies the text options to `text`: trims it when `opt.trim` is set, then
 * collapses every whitespace run to a single space when `opt.normalize` is
 * set.
 */
private textApplyOptions(text: string): string {
  const trimmed = this.opt.trim ? text.trim() : text;
  return this.opt.normalize ? trimmed.replace(/\s+/g, ' ') : trimmed;
}
/**
 * Emits `nodeType` with `data`, flushing any buffered text node first.
 */
private emitNode(nodeType: string, data?: {}) {
  const hasPendingText = Boolean(this.textNode);
  if (hasPendingText) {
    this.closeText();
  }
  this.emit(nodeType, data);
}
/**
 * Applies the text options to the buffered text node, emits it as 'ontext'
 * unless it is undefined, empty or the literal string "undefined", and then
 * resets the buffer to the empty string.
 */
private closeText() {
  this.textNode = this.textApplyOptions(this.textNode);

  // TODO: figure out why this.textNode can be "" and "undefined"
  const text = this.textNode;
  const nothingToEmit =
    text === undefined || text === '' || text === 'undefined';
  if (!nothingToEmit) {
    this.emit('ontext', text);
  }

  this.textNode = '';
}
/**
 * Checks every buffer named in `BUFFERS` against the allowed maximum
 * (`opt.MAX_BUFFER_LENGTH`, but at least 10).
 *
 * Oversized text, cdata and script buffers are emitted early and reset; any
 * other oversized buffer is reported through errorFunction. Finally the next
 * check is scheduled by setting `bufferCheckPosition`.
 */
private checkBufferLength() {
  const maxAllowed = Math.max(this.opt.MAX_BUFFER_LENGTH, 10);
  let maxActual = 0;
  for (const bufferName of this.BUFFERS) {
    // Only string buffers have a meaningful length. Some buffers can hold
    // other values (doctype is set to `true` once a doctype was seen, and
    // several start out undefined); measuring those gave `undefined`, which
    // turned maxActual and the next check position into NaN, or threw.
    const value = this.hasOwnProperty(bufferName)
      ? this[bufferName]
      : undefined;
    const len = typeof value === 'string' ? value.length : 0;
    if (len > maxAllowed) {
      // Text/cdata nodes can get big, and since they're buffered,
      // we can get here under normal conditions.
      // Avoid issues by emitting the text node now,
      // so at least it won't get any bigger.
      switch (bufferName) {
        case 'textNode':
          this.closeText();
          break;
        case 'cdata':
          this.emitNode('oncdata', this.cdata);
          this.cdata = '';
          break;
        case 'script':
          this.emitNode('onscript', this.script);
          this.script = '';
          break;
        default:
          this.errorFunction('Max buffer length exceeded: ' + bufferName);
      }
    }
    maxActual = Math.max(maxActual, len);
  }
  // schedule the next check for the earliest possible buffer overrun.
  const m = this.opt.MAX_BUFFER_LENGTH - maxActual;
  this.bufferCheckPosition = m + this.position;
}
/**
 * Finishes the opening tag held in `this.tag`.
 *
 * In xmlns mode this resolves the tag's prefix/local/uri, emits
 * 'onopennamespace' for the bindings reachable by Object.keys on the tag's
 * scope when that scope differs from the parent's, and emits the deferred
 * 'onattribute' events with namespace information. In every mode it records
 * whether the tag is self-closing, pushes it on the tag stack and emits
 * 'onopentag'. For a tag that is not self-closing the parser moves to SCRIPT
 * (for a `script` tag unless `noscript` is set) or TEXT.
 *
 * @param selfClosing true when the tag was written as `<name/>`
 */
private openTag(selfClosing?: boolean) {
  if (this.opt.xmlns) {
    // emit namespace binding events
    const tag = this.tag;

    // add namespace info to tag
    const tagQName = SAX.qname(this.tagName);
    tag.prefix = tagQName.prefix;
    tag.local = tagQName.local;
    tag.uri = tag.ns[tagQName.prefix] || '';

    if (tag.prefix && !tag.uri) {
      this.strictFail(
        'Unbound namespace prefix: ' + JSON.stringify(this.tagName)
      );
      tag.uri = tagQName.prefix;
    }

    const parent = this.tags[this.tags.length - 1] || this;
    if (tag.ns && parent.ns !== tag.ns) {
      for (const boundPrefix of Object.keys(tag.ns)) {
        this.emitNode('onopennamespace', {
          prefix: boundPrefix,
          uri: tag.ns[boundPrefix],
        });
      }
    }

    // handle deferred onattribute events
    // Note: do not apply default ns to attributes:
    //   http://www.w3.org/TR/REC-xml-names/#defaulting
    const queuedCount = this.attribList.length;
    for (let index = 0; index < queuedCount; index++) {
      const pair = this.attribList[index];
      const name = pair[0];
      const value = pair[1];
      const attrQName = SAX.qname(name, true);
      const prefix = attrQName.prefix;
      const local = attrQName.local;
      const uri = prefix === '' ? '' : tag.ns[prefix] || '';
      const a = {name, value, prefix, local, uri};

      // if there's any attributes with an undefined namespace,
      // then fail on them now.
      if (prefix && prefix !== 'xmlns' && !uri) {
        this.strictFail(
          'Unbound namespace prefix: ' + JSON.stringify(prefix)
        );
        a.uri = prefix;
      }
      this.tag.attributes[name] = a;
      this.emitNode('onattribute', a);
    }
    this.attribList.length = 0;
  }

  this.tag.isSelfClosing = !!selfClosing;

  // process the tag
  this.sawRoot = true;
  this.tags.push(this.tag);
  this.emitNode('onopentag', this.tag);
  if (!selfClosing) {
    // special case for <script> in non-strict mode.
    const entersScript =
      !this.noscript && this.tagName.toLowerCase() === 'script';
    this.state = entersScript ? this.S.SCRIPT : this.S.TEXT;
    this.tag = null;
    this.tagName = '';
  }
  this.attribName = this.attribValue = '';
  this.attribList.length = 0;
}
/**
 * Handles a closing tag whose name has been collected in `tagName`.
 *
 * - An empty name is reported and the literal `</>` is appended to the text.
 * - While script content is buffered, any closing tag other than `script`
 *   is appended to the script buffer; `script` itself emits `onscript`.
 * - Otherwise the open-tag stack is searched from the top for a matching
 *   name (case-folded via `looseCase` when not strict). Every tag from the
 *   top of the stack down to the match is popped, emitting `onclosetag`
 *   and, when `opt.xmlns` is set and the tag has its own namespace object,
 *   `onclosenamespace` for each of its own bindings.
 * - If no open tag matches, the closing tag is reported and appended to the
 *   text verbatim.
 */
private closeTag() {
  if (!this.tagName) {
    this.strictFail('Weird empty close tag.');
    this.textNode += '</>';
    this.state = this.S.TEXT;
    return;
  }

  if (this.script) {
    if (this.tagName !== 'script') {
      this.script += '</' + this.tagName + '>';
      this.tagName = '';
      this.state = this.S.SCRIPT;
      return;
    }
    this.emitNode('onscript', this.script);
    this.script = '';
  }

  // first make sure that the closing tag actually exists.
  // <a><b></c></b></a> will close everything, otherwise.
  let t = this.tags.length;
  let tagName = this.tagName;
  if (!this.strict) {
    tagName = tagName[this.looseCase]();
  }
  while (t--) {
    const close = this.tags[t];
    if (close.name !== tagName) {
      // fail the first time in strict mode
      this.strictFail('Unexpected close tag');
    } else {
      break;
    }
  }

  // didn't find it. we already failed for strict, so just abort.
  if (t < 0) {
    this.strictFail('Unmatched closing tag: ' + this.tagName);
    this.textNode += '</' + this.tagName + '>';
    this.state = this.S.TEXT;
    return;
  }
  this.tagName = tagName;

  // Pop every tag from the top of the stack down to index t (inclusive).
  let s = this.tags.length;
  while (s-- > t) {
    const tag = (this.tag = this.tags.pop());
    this.tagName = this.tag.name;
    this.emitNode('onclosetag', this.tagName);

    const parent = this.tags[this.tags.length - 1] || this;
    if (this.opt.xmlns && tag.ns !== parent.ns) {
      // remove namespace bindings introduced by tag
      Object.keys(tag.ns).forEach(p => {
        const n = tag.ns[p];
        this.emitNode('onclosenamespace', {prefix: p, uri: n});
      });
    }
  }
  if (t === 0) this.closedRoot = true;
  this.tagName = this.attribValue = this.attribName = '';
  this.attribList.length = 0;
  this.state = this.S.TEXT;
}
} // closes the enclosing class, whose header is above this section
/**
 * Parser front end built on `SAX`.
 *
 * Event handlers are plain properties (`ontext`, `onopentag`, ...) that
 * default to no-ops; assign a function to receive the corresponding event.
 */
export class SAXParser extends SAX {
  /**
   * @param strict - enables strict mode; also disables the special
   *   handling of script tags (`noscript`).
   * @param opt - parser options read here:
   *   `MAX_BUFFER_LENGTH` (default `64 * 1024`),
   *   `lowercase` / `lowercasetags` (loose-case names fold to lower case
   *   instead of upper case),
   *   `noscript`, `strictEntities` (start from `XML_ENTITIES` only),
   *   `xmlns` (enable namespace tracking),
   *   `position` (set to `false` to skip position tracking).
   */
  constructor(strict: boolean, opt: any) {
    super();

    this.clearBuffers();
    this.q = this.c = '';
    this.opt = {MAX_BUFFER_LENGTH: 64 * 1024, ...opt};
    this.bufferCheckPosition = this.opt.MAX_BUFFER_LENGTH;
    this.opt.lowercase =
      this.opt.lowercase || this.opt.lowercasetags || false;
    this.looseCase = this.opt.lowercase ? 'toLowerCase' : 'toUpperCase';
    this.tags = [];
    this.closed = this.closedRoot = this.sawRoot = false;
    this.tag = this.error = null;
    this.strict = !!strict;
    this.noscript = !!(strict || this.opt.noscript);
    this.state = this.S.BEGIN;
    this.strictEntities = this.opt.strictEntities;
    // Per-parser entity table that inherits from the shared one, so
    // additions made on this parser do not touch the shared table.
    this.ENTITIES = this.strictEntities
      ? Object.create(this.XML_ENTITIES)
      : Object.create(this.ENTITIES);
    this.attribList = [];

    // namespaces form a prototype chain.
    // it always points at the current tag,
    // which protos to its parent tag.
    if (this.opt.xmlns) {
      this.ns = Object.create(this.rootNS);
    }

    // mostly just for error reporting
    this.trackPosition = this.opt.position !== false;
    if (this.trackPosition) {
      this.position = this.line = this.column = 0;
    }
    this.emit('onready');
  }

  // TODO: try to make it better
  ontext: Function = () => {};
  onprocessinginstruction: Function = () => {};
  onsgmldeclaration: Function = () => {};
  ondoctype: Function = () => {};
  oncomment: Function = () => {};
  onopentagstart: Function = () => {};
  onattribute: Function = () => {};
  onopentag: Function = () => {};
  onclosetag: Function = () => {};
  onopencdata: Function = () => {};
  oncdata: Function = () => {};
  onclosecdata: Function = () => {};
  onerror: Function = () => {};
  onend: Function = () => {};
  onready: Function = () => {};
  onscript: Function = () => {};
  onopennamespace: Function = () => {};
  onclosenamespace: Function = () => {};

  /** Clears the stored error and returns this parser for chaining. */
  resume() {
    this.error = null;
    return this;
  }

  /**
   * Calls `write(null)` and returns its result.
   * NOTE(review): presumably this signals end of input — confirm against
   * `write`.
   */
  close() {
    return this.write(null);
  }

  /** Delegates to `flushBuffers()`. */
  flush() {
    this.flushBuffers();
  }
}