AbstractLexer.php 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. <?php
  2. declare(strict_types=1);
  3. namespace Doctrine\Common\Lexer;
  4. use ReflectionClass;
  5. use function implode;
  6. use function in_array;
  7. use function preg_split;
  8. use function sprintf;
  9. use function substr;
  10. use const PREG_SPLIT_DELIM_CAPTURE;
  11. use const PREG_SPLIT_NO_EMPTY;
  12. use const PREG_SPLIT_OFFSET_CAPTURE;
  13. /**
  14. * Base class for writing simple lexers, i.e. for creating small DSLs.
  15. *
  16. * @psalm-type Token = array{value: int|string, type:string|int|null, position:int}
  17. */
  18. abstract class AbstractLexer
  19. {
  20. /**
  21. * Lexer original input string.
  22. *
  23. * @var string
  24. */
  25. private $input;
  26. /**
  27. * Array of scanned tokens.
  28. *
  29. * Each token is an associative array containing three items:
  30. * - 'value' : the string value of the token in the input string
  31. * - 'type' : the type of the token (identifier, numeric, string, input
  32. * parameter, none)
  33. * - 'position' : the position of the token in the input string
  34. *
  35. * @var mixed[][]
  36. * @psalm-var list<Token>
  37. */
  38. private $tokens = [];
  39. /**
  40. * Current lexer position in input string.
  41. *
  42. * @var int
  43. */
  44. private $position = 0;
  45. /**
  46. * Current peek of current lexer position.
  47. *
  48. * @var int
  49. */
  50. private $peek = 0;
  51. /**
  52. * The next token in the input.
  53. *
  54. * @var mixed[]|null
  55. * @psalm-var Token|null
  56. */
  57. public $lookahead;
  58. /**
  59. * The last matched/seen token.
  60. *
  61. * @var mixed[]|null
  62. * @psalm-var Token|null
  63. */
  64. public $token;
  65. /**
  66. * Composed regex for input parsing.
  67. *
  68. * @var string|null
  69. */
  70. private $regex;
  71. /**
  72. * Sets the input data to be tokenized.
  73. *
  74. * The Lexer is immediately reset and the new input tokenized.
  75. * Any unprocessed tokens from any previous input are lost.
  76. *
  77. * @param string $input The input to be tokenized.
  78. *
  79. * @return void
  80. */
  81. public function setInput($input)
  82. {
  83. $this->input = $input;
  84. $this->tokens = [];
  85. $this->reset();
  86. $this->scan($input);
  87. }
  88. /**
  89. * Resets the lexer.
  90. *
  91. * @return void
  92. */
  93. public function reset()
  94. {
  95. $this->lookahead = null;
  96. $this->token = null;
  97. $this->peek = 0;
  98. $this->position = 0;
  99. }
  100. /**
  101. * Resets the peek pointer to 0.
  102. *
  103. * @return void
  104. */
  105. public function resetPeek()
  106. {
  107. $this->peek = 0;
  108. }
  109. /**
  110. * Resets the lexer position on the input to the given position.
  111. *
  112. * @param int $position Position to place the lexical scanner.
  113. *
  114. * @return void
  115. */
  116. public function resetPosition($position = 0)
  117. {
  118. $this->position = $position;
  119. }
  120. /**
  121. * Retrieve the original lexer's input until a given position.
  122. *
  123. * @param int $position
  124. *
  125. * @return string
  126. */
  127. public function getInputUntilPosition($position)
  128. {
  129. return substr($this->input, 0, $position);
  130. }
  131. /**
  132. * Checks whether a given token matches the current lookahead.
  133. *
  134. * @param int|string $type
  135. *
  136. * @return bool
  137. */
  138. public function isNextToken($type)
  139. {
  140. return $this->lookahead !== null && $this->lookahead['type'] === $type;
  141. }
  142. /**
  143. * Checks whether any of the given tokens matches the current lookahead.
  144. *
  145. * @param list<int|string> $types
  146. *
  147. * @return bool
  148. */
  149. public function isNextTokenAny(array $types)
  150. {
  151. return $this->lookahead !== null && in_array($this->lookahead['type'], $types, true);
  152. }
  153. /**
  154. * Moves to the next token in the input string.
  155. *
  156. * @return bool
  157. */
  158. public function moveNext()
  159. {
  160. $this->peek = 0;
  161. $this->token = $this->lookahead;
  162. $this->lookahead = isset($this->tokens[$this->position])
  163. ? $this->tokens[$this->position++] : null;
  164. return $this->lookahead !== null;
  165. }
  166. /**
  167. * Tells the lexer to skip input tokens until it sees a token with the given value.
  168. *
  169. * @param string $type The token type to skip until.
  170. *
  171. * @return void
  172. */
  173. public function skipUntil($type)
  174. {
  175. while ($this->lookahead !== null && $this->lookahead['type'] !== $type) {
  176. $this->moveNext();
  177. }
  178. }
  179. /**
  180. * Checks if given value is identical to the given token.
  181. *
  182. * @param mixed $value
  183. * @param int|string $token
  184. *
  185. * @return bool
  186. */
  187. public function isA($value, $token)
  188. {
  189. return $this->getType($value) === $token;
  190. }
  191. /**
  192. * Moves the lookahead token forward.
  193. *
  194. * @return mixed[]|null The next token or NULL if there are no more tokens ahead.
  195. * @psalm-return Token|null
  196. */
  197. public function peek()
  198. {
  199. if (isset($this->tokens[$this->position + $this->peek])) {
  200. return $this->tokens[$this->position + $this->peek++];
  201. }
  202. return null;
  203. }
  204. /**
  205. * Peeks at the next token, returns it and immediately resets the peek.
  206. *
  207. * @return mixed[]|null The next token or NULL if there are no more tokens ahead.
  208. * @psalm-return Token|null
  209. */
  210. public function glimpse()
  211. {
  212. $peek = $this->peek();
  213. $this->peek = 0;
  214. return $peek;
  215. }
  216. /**
  217. * Scans the input string for tokens.
  218. *
  219. * @param string $input A query string.
  220. *
  221. * @return void
  222. */
  223. protected function scan($input)
  224. {
  225. if (! isset($this->regex)) {
  226. $this->regex = sprintf(
  227. '/(%s)|%s/%s',
  228. implode(')|(', $this->getCatchablePatterns()),
  229. implode('|', $this->getNonCatchablePatterns()),
  230. $this->getModifiers()
  231. );
  232. }
  233. $flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
  234. $matches = preg_split($this->regex, $input, -1, $flags);
  235. if ($matches === false) {
  236. // Work around https://bugs.php.net/78122
  237. $matches = [[$input, 0]];
  238. }
  239. foreach ($matches as $match) {
  240. // Must remain before 'value' assignment since it can change content
  241. $type = $this->getType($match[0]);
  242. $this->tokens[] = [
  243. 'value' => $match[0],
  244. 'type' => $type,
  245. 'position' => $match[1],
  246. ];
  247. }
  248. }
  249. /**
  250. * Gets the literal for a given token.
  251. *
  252. * @param int|string $token
  253. *
  254. * @return int|string
  255. */
  256. public function getLiteral($token)
  257. {
  258. $className = static::class;
  259. $reflClass = new ReflectionClass($className);
  260. $constants = $reflClass->getConstants();
  261. foreach ($constants as $name => $value) {
  262. if ($value === $token) {
  263. return $className . '::' . $name;
  264. }
  265. }
  266. return $token;
  267. }
  268. /**
  269. * Regex modifiers
  270. *
  271. * @return string
  272. */
  273. protected function getModifiers()
  274. {
  275. return 'iu';
  276. }
  277. /**
  278. * Lexical catchable patterns.
  279. *
  280. * @return string[]
  281. */
  282. abstract protected function getCatchablePatterns();
  283. /**
  284. * Lexical non-catchable patterns.
  285. *
  286. * @return string[]
  287. */
  288. abstract protected function getNonCatchablePatterns();
  289. /**
  290. * Retrieve token type. Also processes the token value if necessary.
  291. *
  292. * @param string $value
  293. *
  294. * @return int|string|null
  295. */
  296. abstract protected function getType(&$value);
  297. }