123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399 |
- export default createTokenizer
- import assert from 'assert'
- import createDebug from 'debug'
- import assign from '../constant/assign.mjs'
- import codes from '../character/codes.mjs'
- import markdownLineEnding from '../character/markdown-line-ending.mjs'
- import chunkedPush from './chunked-push.mjs'
- import chunkedSplice from './chunked-splice.mjs'
- import miniflat from './miniflat.mjs'
- import resolveAll from './resolve-all.mjs'
- import serializeChunks from './serialize-chunks.mjs'
- import shallow from './shallow.mjs'
- import sliceChunks from './slice-chunks.mjs'
// Debug logger used by the tokenizer below; `'micromark'` is the name handed
// to `createDebug`.
const debug = createDebug('micromark')
// Create a tokenizer.
//
// A tokenizer handles one type of data (e.g., containers, flow, text); the
// parser is the object that ties them all together.
//
// * `parser` is exposed as `context.parser`, and its
//   `constructs.disable.null` list is consulted to skip constructs by name
// * `initialize` acts like a construct, but its `tokenize` is called with only
//   `effects` (no `ok` or `nok`); its resolvers are applied by `write` once the
//   stream has ended
// * `from` optionally sets the point before the first character; when further
//   lines are indented, their start must be registered with `defineSkip`
//
// Returns the tokenize context: feed it chunks through `write`.
function createTokenizer(parser, initialize, from) {
  // Bindings that are reassigned while tokenizing.
  let point = from ? shallow(from) : {line: 1, column: 1, offset: 0}
  let chunks = []
  let stack = []
  let consumed = true
  // The current state function (assigned below, replaced by `go`).
  let state
  // The code most recently handed out by `go`; `consume`, `ok`, and `nok`
  // compare against it to catch bugs.
  let expectedCode

  // Maps a line number to the column that line starts at (see `skip`).
  const columnStart = {}
  // Constructs whose `resolveAll` must run when the stream has ended.
  const resolveAllConstructs = []

  // Tools handed to constructs for tokenizing.
  const effects = {
    consume,
    enter,
    exit,
    attempt: constructFactory(onsuccessfulconstruct),
    check: constructFactory(onsuccessfulcheck),
    interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}),
    lazy: constructFactory(onsuccessfulcheck, {lazy: true})
  }

  // State and tools for resolving and serializing.
  const context = {
    previous: codes.eof,
    events: [],
    parser,
    sliceStream,
    sliceSerialize,
    now,
    defineSkip: skip,
    write
  }

  // The initial state comes from the initializer, which gets no `ok`/`nok`.
  state = initialize.tokenize.call(context, effects)

  if (initialize.resolveAll) {
    resolveAllConstructs.push(initialize)
  }

  // Track where we are in the chunks: `_index` is the chunk, `_bufferIndex`
  // the position inside a string chunk (negative when not inside one).
  point._index = 0
  point._bufferIndex = -1

  return context

  // Add `slice` to the chunks and tokenize as far as possible.
  // Yields `[]` until the last chunk is `codes.eof`; from then on the events
  // are resolved and returned.
  function write(slice) {
    chunks = chunkedPush(chunks, slice)
    main()

    const finalChunk = chunks[chunks.length - 1]

    // Not done yet: resolving might still change things, so expose nothing.
    if (finalChunk !== codes.eof) {
      return []
    }

    addResult(initialize, 0)
    context.events = resolveAll(resolveAllConstructs, context.events, context)
    return context.events
  }

  //
  // Tools.
  //

  // Serialize the chunks spanned by `token`.
  function sliceSerialize(token) {
    return serializeChunks(sliceChunks(chunks, token))
  }

  // Get the chunks spanned by `token`.
  function sliceStream(token) {
    return sliceChunks(chunks, token)
  }

  // Get a copy of the current point.
  function now() {
    return shallow(point)
  }

  // Register that the line `value.line` starts at column `value.column`, and
  // apply that to the current point if it sits at the start of such a line.
  function skip(value) {
    columnStart[value.line] = value.column
    accountForPotentialSkip()
    debug('position: define skip: `%j`', point)
  }

  //
  // State management.
  //

  // Main loop.
  // Walks through the chunks, which are either strings of several characters
  // or numerical character codes, and feeds them to the state machine one code
  // at a time.
  // `consume` (and `restore`) change `point`, including its `_index` and
  // `_bufferIndex`, which is why `point` is read afresh on every iteration.
  // This is a loop rather than a chain of calls so that the stack can drain.
  function main() {
    while (point._index < chunks.length) {
      const chunk = chunks[point._index]

      // A lone character code.
      if (typeof chunk !== 'string') {
        go(chunk)
        continue
      }

      // A buffer chunk: walk through it for as long as we stay inside it.
      // NOTE(review): an empty string chunk would never advance `_index`
      // here; presumably callers never pass one — confirm.
      const chunkIndex = point._index

      if (point._bufferIndex < 0) {
        point._bufferIndex = 0
      }

      while (
        point._index === chunkIndex &&
        point._bufferIndex < chunk.length
      ) {
        go(chunk.charCodeAt(point._bufferIndex))
      }
    }
  }

  // Hand one code to the current state and store the state it yields.
  function go(code) {
    assert.equal(consumed, true, 'expected character to be consumed')
    consumed = undefined
    debug('main: passing `%s` to %s', code, state.name)
    expectedCode = code
    state = state(code)
  }

  // Move one character forward.
  function consume(code) {
    assert.equal(
      code,
      expectedCode,
      'expected given code to equal expected code'
    )
    debug('consume: `%s`', code)
    assert.equal(consumed, undefined, 'expected code to not have been consumed')

    const lastEvent = context.events[context.events.length - 1]

    // `null` may only be consumed outside of tokens, anything else only
    // inside of one.
    assert(
      code === null
        ? !context.events.length || lastEvent[0] === 'exit'
        : lastEvent[0] === 'enter',
      'expected last token to be open'
    )

    // Update the place in the source.
    if (markdownLineEnding(code)) {
      point.line++
      point.column = 1
      point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
      accountForPotentialSkip()
      debug('position: after eol: `%j`', point)
    } else if (code !== codes.virtualSpace) {
      // Virtual spaces move neither the column nor the offset.
      point.column++
      point.offset++
    }

    // Update the place in the chunks.
    if (point._bufferIndex < 0) {
      // Not in a string chunk.
      point._index++
    } else if (++point._bufferIndex === chunks[point._index].length) {
      // At the end of a string chunk: continue with the next chunk.
      point._bufferIndex = -1
      point._index++
    }

    // Expose the previous character, and mark the code as consumed.
    context.previous = code
    consumed = true
  }

  // Start a token of `type`; `fields`, if given, is used as the token object.
  function enter(type, fields) {
    const token = fields || {}

    token.type = type
    token.start = now()
    assert.equal(typeof type, 'string', 'expected string type')
    assert.notEqual(type.length, 0, 'expected non-empty string')
    debug('enter: `%s`', type)
    context.events.push(['enter', token, context])
    stack.push(token)
    return token
  }

  // Stop the token that was opened last, which must be of `type`.
  function exit(type) {
    assert.equal(typeof type, 'string', 'expected string type')
    assert.notEqual(type.length, 0, 'expected non-empty string')
    assert.notEqual(stack.length, 0, 'cannot close w/o open tokens')

    const token = stack.pop()

    token.end = now()
    assert.equal(type, token.type, 'expected exit token to match current token')
    // A token must span at least one code.
    assert(
      token.start._index !== token.end._index ||
        token.start._bufferIndex !== token.end._bufferIndex,
      'expected non-empty token (`' + type + '`)'
    )
    debug('exit: `%s`', token.type)
    context.events.push(['exit', token, context])
    return token
  }

  // An attempt succeeded: keep its events (and run its resolvers).
  function onsuccessfulconstruct(construct, info) {
    addResult(construct, info.from)
  }

  // A check succeeded: throw its events away again.
  function onsuccessfulcheck(construct, info) {
    info.restore()
  }

  // Factory for `attempt`, `check`, `interrupt`, and `lazy`.
  // `onreturn` is called when a construct succeeds; `fields`, if given, are
  // added to a copy of the context that the construct is called on.
  function constructFactory(onreturn, fields) {
    return hook

    // Accepts an object mapping codes to constructs, a list of constructs, or
    // a single construct.
    // Continues in `returnState` when one of them matches, and in
    // `bogusState` when none does.
    function hook(constructs, returnState, bogusState) {
      // The constructs being tried, and how far along we are.
      let candidates
      let candidateIndex
      // The construct being tried right now.
      let activeConstruct
      // What to go back to if that construct fails.
      let snapshot

      if (constructs.tokenize || 'length' in constructs) {
        return handleListOfConstructs(miniflat(constructs))
      }

      return handleMapOfConstructs

      // Pick the constructs for `code` (and those under `null`, if any) from
      // the map and try them.
      function handleMapOfConstructs(code) {
        if (code in constructs || codes.eof in constructs) {
          return handleListOfConstructs(
            constructs.null
              ? /* c8 ignore next */
                miniflat(constructs[code]).concat(miniflat(constructs.null))
              : constructs[code]
          )(code)
        }

        return bogusState(code)
      }

      // Try the constructs in `list`, starting with the first.
      function handleListOfConstructs(list) {
        candidates = list
        candidateIndex = 0
        return handleConstruct(list[candidateIndex])
      }

      // Create the state that tries `construct`.
      function handleConstruct(construct) {
        return start

        function start(code) {
          // To do: probably no need to store when there is no bogus state?
          // That currently doesn’t work, because `inspect` in document does a
          // check w/o a bogus, which doesn’t make sense.
          // Not storing does seem to help perf though.
          snapshot = store()
          activeConstruct = construct

          if (!construct.partial) {
            context.currentConstruct = construct
          }

          // Constructs that are turned off by name never match.
          if (
            construct.name &&
            context.parser.constructs.disable.null.indexOf(construct.name) >= 0
          ) {
            return nok(code)
          }

          return construct.tokenize.call(
            fields ? assign({}, context, fields) : context,
            effects,
            ok,
            nok
          )(code)
        }
      }

      // The current construct matched.
      function ok(code) {
        assert.equal(code, expectedCode, 'expected code')
        consumed = true
        onreturn(activeConstruct, snapshot)
        return returnState
      }

      // The current construct did not match: go back to where it started and
      // try the next one, if there is one.
      function nok(code) {
        assert.equal(code, expectedCode, 'expected code')
        consumed = true
        snapshot.restore()
        candidateIndex++

        return candidateIndex < candidates.length
          ? handleConstruct(candidates[candidateIndex])
          : bogusState
      }
    }
  }

  // Accept the events from index `from` on as the result of `construct`:
  // remember its `resolveAll` for later and run its `resolve` and `resolveTo`
  // now.
  function addResult(construct, from) {
    if (
      construct.resolveAll &&
      resolveAllConstructs.indexOf(construct) === -1
    ) {
      resolveAllConstructs.push(construct)
    }

    // `resolve` only sees (a copy of) the events of this construct.
    if (construct.resolve) {
      chunkedSplice(
        context.events,
        from,
        context.events.length - from,
        construct.resolve(context.events.slice(from), context)
      )
    }

    // `resolveTo` sees all events so far.
    if (construct.resolveTo) {
      context.events = construct.resolveTo(context.events, context)
    }

    assert(
      construct.partial ||
        !context.events.length ||
        context.events[context.events.length - 1][0] === 'exit',
      'expected last token to end'
    )
  }

  // Capture the current state, so that it can be gone back to later.
  // Yields `restore`, which does that, and `from`, the number of events at the
  // time of capturing.
  function store() {
    const savedPoint = now()
    const savedPrevious = context.previous
    const savedConstruct = context.currentConstruct
    const savedEventsLength = context.events.length
    const savedStack = stack.slice()

    return {restore, from: savedEventsLength}

    function restore() {
      point = savedPoint
      context.previous = savedPrevious
      context.currentConstruct = savedConstruct
      // Drop the events that were added since.
      context.events.length = savedEventsLength
      stack = savedStack
      accountForPotentialSkip()
      debug('position: restore: `%j`', point)
    }
  }

  // If the point is at the start of a line (column below 2) for which a start
  // column is registered, move the point to that column.
  function accountForPotentialSkip() {
    if (point.line in columnStart && point.column < 2) {
      const startColumn = columnStart[point.line]

      point.column = startColumn
      point.offset += startColumn - 1
    }
  }
}
|