Tokenizer | /src/mod.ts | html_parser@v0.1.3

class Tokenizer

import { Tokenizer } from "https://deno.land/x/html_parser@v0.1.3/src/mod.ts";

Constructors

new

Tokenizer(options: { xmlMode?: boolean; decodeEntities?: boolean; } | null, cbs: Callbacks)

Properties

private

baseState

Some behavior, e.g. when decoding entities, is done while we are in another state. This keeps track of the other state type.

private

buffer: string

The read buffer.

private

bufferOffset: number

Data that has already been processed will be removed from the buffer occasionally. bufferOffset keeps track of how many characters have been removed, to make sure position information is accurate.

private

readonly

cbs: Callbacks

private

readonly

decodeEntities: boolean

private

ended: boolean

Indicates whether the tokenizer has finished running, i.e. whether `.end` has been called.

private

running: boolean

Indicates whether the tokenizer is currently running, i.e. whether it has not been paused.

private

special

For special parsing behavior inside of script and style tags.

private

readonly

xmlMode: boolean

_index: number

The index within the buffer that we are currently looking at.

_state

The current state the tokenizer is in.

sectionStart: number

The beginning of the section that is currently being read.

Methods

private

cleanup()

private

decodeNumericEntity(

offset: number,

base: number,

strict: boolean,

)

private

emitPartial(value: string)

private

emitToken(name: "onopentagname" | "onclosetag" | "onattribdata")

private

finish()

private

getSection(): string

private

handleInAttributeValue(c: string, quote: string)

private

handleTrailingData()

private

isTagStartChar(c: string)

HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.

XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar). We allow anything that wouldn't end the tag.

private

parse()

Iterates through the buffer, calling the function corresponding to the current state.

States that are more likely to be hit are higher up, as a performance improvement.

private

parseFixedEntity(map?: Record<string, string>)

private

parseLegacyEntity()

private

stateAfterAttributeName(c: string)

private

stateAfterCdata1(c: string)

private

stateAfterCdata2(c: string)

private

stateAfterClosingTagName(c: string)

private

stateAfterComment1(c: string)

private

stateAfterComment2(c: string)

private

stateAfterSpecialLast(c: string, sectionStartOffset: number)

private

stateBeforeAttributeName(c: string)

private

stateBeforeAttributeValue(c: string)

private

stateBeforeCdata6(c: string)

private

stateBeforeClosingTagName(c: string)

private

stateBeforeComment(c: string)

private

stateBeforeDeclaration(c: string)

private

stateBeforeSpecialLast(c: string, special: Special)

private

stateBeforeSpecialS(c: string)

private

stateBeforeSpecialSEnd(c: string)

private

stateBeforeTagName(c: string)

private

stateInAttributeName(c: string)

private

stateInAttributeValueDoubleQuotes(c: string)

private

stateInAttributeValueNoQuotes(c: string)

private

stateInAttributeValueSingleQuotes(c: string)

private

stateInCdata(c: string)

private

stateInClosingTagName(c: string)

private

stateInComment(c: string)

private

stateInDeclaration(c: string)

private

stateInHexEntity(c: string)

private

stateInNamedEntity(c: string)

private

stateInNumericEntity(c: string)

private

stateInProcessingInstruction(c: string)

private

stateInSelfClosingTag(c: string)

private

stateInSpecialComment(c: string)

private

stateInTagName(c: string)

private

stateText(c: string)

end(chunk?: string): void

getAbsoluteIndex(): number

The current index within all of the written data.

pause(): void

reset(): void

resume(): void

write(chunk: string): void