// create-tokenizer.js (8.4 KB)
  1. 'use strict'
  2. var assign = require('../constant/assign.js')
  3. var markdownLineEnding = require('../character/markdown-line-ending.js')
  4. var chunkedPush = require('./chunked-push.js')
  5. var chunkedSplice = require('./chunked-splice.js')
  6. var miniflat = require('./miniflat.js')
  7. var resolveAll = require('./resolve-all.js')
  8. var serializeChunks = require('./serialize-chunks.js')
  9. var shallow = require('./shallow.js')
  10. var sliceChunks = require('./slice-chunks.js')
  /**
   * Create a tokenizer.
   *
   * A tokenizer handles one kind of content (for example containers, flow, or
   * text); the parser is the object that ties the tokenizers together.
   *
   * `initialize` is used like any other construct, except that only its
   * `tokenize` function is called, and it is called with `effects` only (no
   * `ok` / `nok`).
   *
   * @param {Object} parser
   *   Parser object; stored on the context, and
   *   `parser.constructs.disable.null` is consulted for construct names that
   *   must be skipped.
   * @param {Object} initialize
   *   Initial construct: `tokenize` is required, `resolve`, `resolveTo`, and
   *   `resolveAll` are applied when present.
   * @param {Object} [from]
   *   Point (`line`, `column`, `offset`) before the first character. It is
   *   passed through `shallow` rather than used directly. When further lines
   *   are indented, their columns must be registered with `defineSkip`.
   * @returns {Object}
   *   The tokenizer context (`previous`, `events`, `parser`, `sliceStream`,
   *   `sliceSerialize`, `now`, `defineSkip`, `write`).
   */
  function createTokenizer(parser, initialize, from) {
    // Current position. `_index` is the index into `chunks`, `_bufferIndex` the
    // index inside a string chunk (`-1` when not inside a string chunk).
    var point = from ? shallow(from) : {line: 1, column: 1, offset: 0}
    // Column to jump to at the start of a line, keyed by line number.
    var columnStart = {}
    // Constructs whose `resolveAll` has to run once everything is written.
    var resolveAllConstructs = []
    // Everything written so far: strings, numeric codes, and a final `null`.
    var chunks = []
    // Tokens that were entered but not exited yet.
    var stack = []

    // Tools handed to the `tokenize` function of every construct.
    var effects = {
      consume: consume,
      enter: enter,
      exit: exit,
      attempt: constructFactory(onsuccessfulconstruct),
      check: constructFactory(onsuccessfulcheck),
      interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}),
      lazy: constructFactory(onsuccessfulcheck, {lazy: true})
    }

    // State and tools for resolving and serializing; this is what is returned.
    var context = {
      previous: null,
      events: [],
      parser: parser,
      sliceStream: sliceStream,
      sliceSerialize: sliceSerialize,
      now: now,
      defineSkip: skip,
      write: write
    }

    // The current state function: it receives a code and returns the next state.
    var state = initialize.tokenize.call(context, effects)

    if (initialize.resolveAll) {
      resolveAllConstructs.push(initialize)
    }

    // Remember where we are in the input stream.
    point._index = 0
    point._bufferIndex = -1

    return context

    // Add `slice` to the stream and tokenize as far as possible.
    // Yields an empty list until the stream is closed with a trailing `null`
    // chunk; after that, yields the resolved events.
    function write(slice) {
      var done

      chunks = chunkedPush(chunks, slice)
      main()
      done = chunks[chunks.length - 1] === null

      // Not finished yet: resolving now could still change things later.
      if (!done) {
        return []
      }

      addResult(initialize, 0)
      context.events = resolveAll(resolveAllConstructs, context.events, context)
      return context.events
    }

    //
    // Tools.
    //

    // Serialize the chunks spanned by `token`.
    function sliceSerialize(token) {
      return serializeChunks(sliceChunks(chunks, token))
    }

    // Get the chunks spanned by `token`.
    function sliceStream(token) {
      return sliceChunks(chunks, token)
    }

    // Get a copy of the current point.
    function now() {
      return shallow(point)
    }

    // Register the column at which the line of `value` starts (exposed as
    // `defineSkip`), and apply it right away if we are at the start of that line.
    function skip(value) {
      columnStart[value.line] = value.column
      accountForPotentialSkip()
    }

    //
    // State management.
    //

    // Main loop: walk through the chunks, which are either strings of several
    // characters or single numeric codes, and feed every code to the state.
    // `consume` (and `restore`) move `point._index` / `point._bufferIndex`, which
    // is why `point` is read again on every iteration.
    // This is a loop rather than recursion so that the call stack can drain.
    function main() {
      var current
      var enteredAt

      while (point._index < chunks.length) {
        current = chunks[point._index]

        // A single code.
        if (typeof current !== 'string') {
          feed(current)
          continue
        }

        // A string chunk: keep feeding character codes for as long as we stay
        // inside this chunk.
        enteredAt = point._index

        if (point._bufferIndex < 0) {
          point._bufferIndex = 0
        }

        while (
          point._index === enteredAt &&
          point._bufferIndex < current.length
        ) {
          feed(current.charCodeAt(point._bufferIndex))
        }
      }
    }

    // Hand one code to the current state and keep the state it returns.
    function feed(code) {
      state = state(code)
    }

    // Move one character forward.
    function consume(code) {
      if (markdownLineEnding(code)) {
        point.line++
        point.column = 1
        // Code `-3` counts for two in `offset` (presumably a CR+LF pair; verify
        // against the character code table).
        point.offset += code === -3 ? 2 : 1
        accountForPotentialSkip()
      } else if (code !== -1) {
        // Code `-1` leaves `column` and `offset` alone.
        point.column++
        point.offset++
      }

      if (point._bufferIndex < 0) {
        // Not in a string chunk: go to the next chunk.
        point._index++
      } else {
        point._bufferIndex++

        // Reached the end of the string chunk: go to the next chunk.
        if (point._bufferIndex === chunks[point._index].length) {
          point._bufferIndex = -1
          point._index++
        }
      }

      // Expose the character that was just consumed.
      context.previous = code
    }

    // Start a token of `type`, reusing `fields` as the token object if given.
    function enter(type, fields) {
      var opened = fields || {}

      opened.type = type
      opened.start = now()
      context.events.push(['enter', opened, context])
      stack.push(opened)
      return opened
    }

    // Stop the most recently entered token (`type` itself is not inspected).
    function exit(type) {
      var closed = stack.pop()

      closed.end = now()
      context.events.push(['exit', closed, context])
      return closed
    }

    // Successful `attempt`: keep what was tokenized and resolve it.
    function onsuccessfulconstruct(construct, info) {
      addResult(construct, info.from)
    }

    // Successful `check` / `interrupt` / `lazy`: throw the results away again.
    function onsuccessfulcheck(construct, info) {
      info.restore()
    }

    // Factory for `attempt`, `check`, `interrupt`, and `lazy`.
    // `onreturn` is called when a construct succeeds; `fields`, when given, are
    // merged into a copy of the context that constructs are tokenized with.
    function constructFactory(onreturn, fields) {
      return hook

      // `constructs` is either a single construct, a list of constructs, or an
      // object mapping codes to constructs.
      function hook(constructs, returnState, bogusState) {
        // Constructs that are being tried, and which one we are at.
        var candidates
        var cursor
        // The construct being tokenized, and the state stored before it began.
        var active
        var snapshot

        if (constructs.tokenize || 'length' in constructs) {
          return fromList(miniflat(constructs))
        }

        return fromMap

        // Pick the constructs for `code` (plus those under `null`, if any).
        function fromMap(code) {
          var list

          if (!(code in constructs || null in constructs)) {
            return bogusState(code)
          }

          if (constructs.null) {
            list = miniflat(constructs[code]).concat(miniflat(constructs.null))
          } else {
            list = constructs[code]
          }

          return fromList(list)(code)
        }

        // Try the constructs in `list`, starting with the first.
        function fromList(list) {
          candidates = list
          cursor = 0
          return tryConstruct(list[cursor])
        }

        // Create the state that starts tokenizing `construct`.
        function tryConstruct(construct) {
          return start

          function start(code) {
            var disabled

            // To do: there is probably no need to store when there is no bogus
            // state, and skipping it seems to help performance, but that does
            // not work yet: `inspect` in document does a check without a bogus
            // state.
            snapshot = store()
            active = construct

            if (!construct.partial) {
              context.currentConstruct = construct
            }

            // Named constructs can be turned off through the parser.
            if (construct.name) {
              disabled = context.parser.constructs.disable.null

              if (disabled.indexOf(construct.name) > -1) {
                return nok()
              }
            }

            return construct.tokenize.call(
              fields ? assign({}, context, fields) : context,
              effects,
              ok,
              nok
            )(code)
          }
        }

        // The construct matched.
        function ok(code) {
          onreturn(active, snapshot)
          return returnState
        }

        // The construct did not match: rewind, then try the next one, or give up.
        function nok(code) {
          snapshot.restore()
          cursor += 1

          if (cursor < candidates.length) {
            return tryConstruct(candidates[cursor])
          }

          return bogusState
        }
      }
    }

    // Apply the resolvers of `construct` to the events starting at index `from`.
    function addResult(construct, from) {
      var events
      var removeCount

      if (construct.resolveAll && resolveAllConstructs.indexOf(construct) < 0) {
        resolveAllConstructs.push(construct)
      }

      if (construct.resolve) {
        // Swap the events from `from` onwards for their resolved version.
        events = context.events
        removeCount = events.length - from
        chunkedSplice(
          events,
          from,
          removeCount,
          construct.resolve(events.slice(from), context)
        )
      }

      if (construct.resolveTo) {
        context.events = construct.resolveTo(context.events, context)
      }
    }

    // Capture the current state. The result has `restore`, to go back to it,
    // and `from`, the number of events that existed when it was captured.
    function store() {
      var saved = {
        point: now(),
        previous: context.previous,
        currentConstruct: context.currentConstruct,
        eventCount: context.events.length,
        stack: stack.slice()
      }

      return {restore: restore, from: saved.eventCount}

      function restore() {
        point = saved.point
        context.previous = saved.previous
        context.currentConstruct = saved.currentConstruct
        // Drop every event that was added after the capture.
        context.events.length = saved.eventCount
        stack = saved.stack
        accountForPotentialSkip()
      }
    }

    // At the start of a line that has a registered start column, jump there.
    function accountForPotentialSkip() {
      var target

      if (point.line in columnStart && point.column < 2) {
        target = columnStart[point.line]
        point.column = target
        point.offset += target - 1
      }
    }
  }
  266. module.exports = createTokenizer