create-tokenizer.mjs 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. export default createTokenizer
  2. import assert from 'assert'
  3. import createDebug from 'debug'
  4. import assign from '../constant/assign.mjs'
  5. import codes from '../character/codes.mjs'
  6. import markdownLineEnding from '../character/markdown-line-ending.mjs'
  7. import chunkedPush from './chunked-push.mjs'
  8. import chunkedSplice from './chunked-splice.mjs'
  9. import miniflat from './miniflat.mjs'
  10. import resolveAll from './resolve-all.mjs'
  11. import serializeChunks from './serialize-chunks.mjs'
  12. import shallow from './shallow.mjs'
  13. import sliceChunks from './slice-chunks.mjs'
  // Logger for the `micromark` namespace, used by the tokenizer below to trace
  // state transitions and position changes.
  const debug = createDebug('micromark')
  15. // Create a tokenizer.
  16. // Tokenizers deal with one type of data (e.g., containers, flow, text).
  17. // The parser is the object dealing with it all.
  18. // `initialize` works like other constructs, except that only its `tokenize`
  19. // function is used, in which case it doesn’t receive an `ok` or `nok`.
  20. // `from` can be given to set the point before the first character, although
  21. // when further lines are indented, they must be set with `defineSkip`.
  22. function createTokenizer(parser, initialize, from) {
  // Current position: `line`, `column`, and `offset` describe the place in the
  // document, `_index` and `_bufferIndex` (set further down) index `chunks`.
  var point = {line: 1, column: 1, offset: 0}

  if (from) {
    point = shallow(from)
  }

  // Column to start at, per line, as registered through `defineSkip`.
  var columnStart = {}
  // Constructs whose `resolveAll` is handed to `resolveAll` in `write`.
  var resolveAllConstructs = []
  // Everything given to `write` so far.
  var chunks = []
  // Tokens that were entered but not yet exited.
  var stack = []
  // Whether the code last passed to the state function was consumed; starts
  // out `true` so the first `go` passes its assertion.
  var consumed = true

  // Tools handed to every `tokenize` function.
  var effects = {
    consume,
    enter,
    exit,
    attempt: constructFactory(onsuccessfulconstruct),
    check: constructFactory(onsuccessfulcheck),
    interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}),
    lazy: constructFactory(onsuccessfulcheck, {lazy: true})
  }

  // State and tools for resolving and serializing; this is also the value
  // returned to the caller and the `this` of `tokenize` functions.
  var context = {
    previous: codes.eof,
    events: [],
    parser,
    sliceStream,
    sliceSerialize,
    now,
    defineSkip: skip,
    write
  }

  // The current state function; only `tokenize` of `initialize` is used, and
  // it receives no `ok` or `nok`.
  var state = initialize.tokenize.call(context, effects)

  // The code most recently passed to the state function, tracked so `consume`
  // and `ok`/`nok` can assert they were given that same code.
  var expectedCode

  if (initialize.resolveAll) {
    resolveAllConstructs.push(initialize)
  }

  // Start at the first chunk, outside of any string chunk.
  point._index = 0
  point._bufferIndex = -1
  60. return context
  // Add `slice` to the chunks and tokenize as far as possible.
  // While the last chunk is not `codes.eof`, an empty list is returned; once it
  // is, the events are resolved, stored on `context.events`, and returned.
  function write(slice) {
    chunks = chunkedPush(chunks, slice)
    main()

    var ended = chunks[chunks.length - 1] === codes.eof

    if (ended) {
      addResult(initialize, 0)
      context.events = resolveAll(resolveAllConstructs, context.events, context)
      return context.events
    }

    // Not done yet: resolving might still change things, so expose nothing.
    return []
  }
  73. //
  74. // Tools.
  75. //
  // Serialize the chunks spanned by `token` (see `sliceStream`).
  function sliceSerialize(token) {
    var stream = sliceStream(token)
    return serializeChunks(stream)
  }
  // Get the part of `chunks` that `token` covers, as computed by `sliceChunks`.
  function sliceStream(token) {
    var slice = sliceChunks(chunks, token)
    return slice
  }
  // Get a shallow copy of the current point, so that later moves of the
  // tokenizer do not change the returned value.
  function now() {
    var copy = shallow(point)
    return copy
  }
  // Register the column at which the line `value.line` starts (exposed on
  // `context` as `defineSkip`), and apply it right away if the current point
  // is at the start of that line.
  function skip(value) {
    var line = value.line
    var column = value.column

    columnStart[line] = column
    accountForPotentialSkip()
    debug('position: define skip: `%j`', point)
  }
  90. //
  91. // State management.
  92. //
  // Main loop: walk through the chunks, which are either strings of several
  // characters or numerical character codes, and feed them to `go` one code
  // at a time.
  // `point._index` and `point._bufferIndex` are moved by `consume` (and `point`
  // itself can be replaced when a construct is restored), so `point` is read
  // again on every iteration.
  // This is a loop instead of a chain of calls so the stack can drain.
  function main() {
    while (point._index < chunks.length) {
      const chunk = chunks[point._index]

      // A numerical code: pass it on as is.
      if (typeof chunk !== 'string') {
        go(chunk)
        continue
      }

      // A string chunk: walk its characters for as long as we stay inside it.
      const startIndex = point._index

      if (point._bufferIndex < 0) {
        point._bufferIndex = 0
      }

      while (
        point._index === startIndex &&
        point._bufferIndex < chunk.length
      ) {
        go(chunk.charCodeAt(point._bufferIndex))
      }
    }
  }
  // Deal with one code: hand it to the current state function and keep the
  // state it returns as the next one.
  // Asserts that the previous code was consumed (or settled by `ok`/`nok`).
  function go(code) {
    var current = state

    assert.equal(consumed, true, 'expected character to be consumed')
    consumed = undefined
    debug('main: passing `%s` to %s', code, current.name)
    expectedCode = code
    state = current(code)
  }
  // Move one character forward.
  // `code` must be the code that was just passed to the state function, it
  // must not have been consumed yet, and (except for EOF) a token must be open.
  function consume(code) {
    assert.equal(
      code,
      expectedCode,
      'expected given code to equal expected code'
    )
    debug('consume: `%s`', code)
    assert.equal(consumed, undefined, 'expected code to not have been consumed')

    var events = context.events

    assert(
      code === null
        ? !events.length || events[events.length - 1][0] === 'exit'
        : events[events.length - 1][0] === 'enter',
      'expected last token to be open'
    )

    // Update the place in the document.
    if (markdownLineEnding(code)) {
      point.line++
      point.column = 1
      // A CR+LF pair is a single code that spans two characters.
      point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
      accountForPotentialSkip()
      debug('position: after eol: `%j`', point)
    } else if (code !== codes.virtualSpace) {
      // Virtual spaces take up no room in the document.
      point.column++
      point.offset++
    }

    // Update the place in the chunks.
    if (point._bufferIndex < 0) {
      // Not in a string chunk.
      point._index++
    } else if (++point._bufferIndex === chunks[point._index].length) {
      // At the end of a string chunk: move to the next chunk.
      point._bufferIndex = -1
      point._index++
    }

    // Expose the previous character.
    context.previous = code
    // Mark as consumed.
    consumed = true
  }
  // Start a token of `type` at the current point.
  // `fields`, when given, is used as the token itself (it is changed in
  // place); the token is added to the events and to the stack of open tokens,
  // and returned.
  function enter(type, fields) {
    var token = fields ? fields : {}

    token.type = type
    token.start = now()
    assert.equal(typeof type, 'string', 'expected string type')
    assert.notEqual(type.length, 0, 'expected non-empty string')
    debug('enter: `%s`', type)

    var event = ['enter', token, context]

    context.events.push(event)
    stack.push(token)
    return token
  }
  // Stop the token that was opened last, which must be of `type` and must not
  // be empty; its `end` is set to the current point, an exit event is added,
  // and the token is returned.
  function exit(type) {
    assert.equal(typeof type, 'string', 'expected string type')
    assert.notEqual(type.length, 0, 'expected non-empty string')
    assert.notEqual(stack.length, 0, 'cannot close w/o open tokens')

    var token = stack.pop()

    token.end = now()
    assert.equal(type, token.type, 'expected exit token to match current token')

    // A token is empty when it starts and ends at the same place in the
    // chunks.
    var start = token.start
    var end = token.end

    assert(
      start._index !== end._index || start._bufferIndex !== end._bufferIndex,
      'expected non-empty token (`' + type + '`)'
    )
    debug('exit: `%s`', token.type)

    var event = ['exit', token, context]

    context.events.push(event)
    return token
  }
  // Use results: keep the events of a successful construct, and run its
  // resolvers on the events added since `info.from`.
  function onsuccessfulconstruct(construct, info) {
    var eventsStart = info.from

    addResult(construct, eventsStart)
  }
  // Discard results: a successful check only answers whether the construct
  // matches, so everything is put back the way `info` recorded it.
  // `construct` is unused.
  function onsuccessfulcheck(construct, info) {
    info.restore()
  }
  // Factory to create the `attempt`, `check`, `interrupt`, and `lazy` effects.
  // `onreturn(construct, info)` is called when a construct succeeds, where
  // `info` is the value of `store()` taken before the construct started.
  // `fields`, when given, are mixed into a copy of `context` that is used as
  // `this` for the `tokenize` of constructs.
  function constructFactory(onreturn, fields) {
    return hook

    // Handle either an object mapping codes to constructs, a list of
    // constructs, or a single construct.
    // Returns a state function; `returnState` is the state after a construct
    // succeeds, `bogusState` the state after all constructs failed.
    function hook(constructs, returnState, bogusState) {
      var listOfConstructs
      var constructIndex
      var currentConstruct
      var info

      return constructs.tokenize || 'length' in constructs
        ? handleListOfConstructs(miniflat(constructs))
        : handleMapOfConstructs

      // Pick the constructs registered for `code`, followed by those
      // registered for all codes (the `null` key).
      function handleMapOfConstructs(code) {
        var list

        if (code in constructs || codes.eof in constructs) {
          // Always normalize to a list: a map entry can be a single construct,
          // a list, or missing (when only the `null` key is present).
          list = constructs.null
            ? /* c8 ignore next */
              miniflat(constructs[code]).concat(miniflat(constructs.null))
            : miniflat(constructs[code])

          // With nothing to try, fall through to the bogus state instead of
          // starting an undefined construct.
          if (list.length) {
            return handleListOfConstructs(list)(code)
          }
        }

        return bogusState(code)
      }

      // Try the constructs in `list`, starting at the first.
      function handleListOfConstructs(list) {
        listOfConstructs = list
        constructIndex = 0
        return handleConstruct(list[constructIndex])
      }

      // Create the start state of one construct.
      function handleConstruct(construct) {
        return start

        function start(code) {
          // To do: not need to store if there is no bogus state, probably?
          // Currently doesn’t work because `inspect` in document does a check
          // w/o a bogus, which doesn’t make sense. But it does seem to help perf
          // by not storing.
          info = store()
          currentConstruct = construct

          if (!construct.partial) {
            context.currentConstruct = construct
          }

          // Constructs turned off by name fail right away.
          if (
            construct.name &&
            context.parser.constructs.disable.null.indexOf(construct.name) > -1
          ) {
            return nok(code)
          }

          return construct.tokenize.call(
            fields ? assign({}, context, fields) : context,
            effects,
            ok,
            nok
          )(code)
        }
      }

      // The construct succeeded: report it and continue with `returnState`.
      function ok(code) {
        assert.equal(code, expectedCode, 'expected code')
        // The code is passed again to the next state, so mark it as handled.
        consumed = true
        onreturn(currentConstruct, info)
        return returnState
      }

      // The construct failed: restore, and try the next construct, or
      // continue with `bogusState` when there are none left.
      function nok(code) {
        assert.equal(code, expectedCode, 'expected code')
        consumed = true
        info.restore()

        if (++constructIndex < listOfConstructs.length) {
          return handleConstruct(listOfConstructs[constructIndex])
        }

        return bogusState
      }
    }
  }
  // Apply the resolvers of a successful `construct` to the events.
  // `resolveAll` is only registered (once per construct) to run later;
  // `resolve` replaces the events from index `from` on; `resolveTo` replaces
  // all events.
  function addResult(construct, from) {
    if (construct.resolveAll && resolveAllConstructs.indexOf(construct) === -1) {
      resolveAllConstructs.push(construct)
    }

    if (construct.resolve) {
      // Take the list and the number of events to replace before `resolve`
      // runs, then swap that tail for whatever `resolve` returns.
      var events = context.events
      var removeCount = events.length - from
      var resolved = construct.resolve(events.slice(from), context)

      chunkedSplice(events, from, removeCount, resolved)
    }

    if (construct.resolveTo) {
      context.events = construct.resolveTo(context.events, context)
    }

    var total = context.events.length

    assert(
      construct.partial || !total || context.events[total - 1][0] === 'exit',
      'expected last token to end'
    )
  }
  // Take a snapshot of the tokenizer: the point, the previous code, the
  // current construct, the number of events, and a copy of the open tokens.
  // Returns `restore`, which puts all of that back, and `from`, the number of
  // events at the time of the snapshot.
  function store() {
    var snapshot = {
      point: now(),
      previous: context.previous,
      currentConstruct: context.currentConstruct,
      eventsLength: context.events.length,
      stack: Array.from(stack)
    }

    return {restore: restore, from: snapshot.eventsLength}

    function restore() {
      point = snapshot.point
      context.previous = snapshot.previous
      context.currentConstruct = snapshot.currentConstruct
      // Drop the events that were added since the snapshot.
      context.events.length = snapshot.eventsLength
      stack = snapshot.stack
      accountForPotentialSkip()
      debug('position: restore: `%j`', point)
    }
  }
  // If a start column was defined for the current line and the point is still
  // at the start of that line (column below 2), move the point to the defined
  // column and advance the offset by the same distance.
  function accountForPotentialSkip() {
    if (!(point.line in columnStart) || !(point.column < 2)) {
      return
    }

    point.column = columnStart[point.line]
    point.offset += columnStart[point.line] - 1
  }
  327. }