Strings.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
  1. <?php
  2. /**
  3. * This file is part of the Nette Framework (https://nette.org)
  4. * Copyright (c) 2004 David Grudl (https://davidgrudl.com)
  5. */
  6. declare(strict_types=1);
  7. namespace Nette\Utils;
  8. use Nette;
  9. use function is_array, is_object, strlen;
  10. /**
  11. * String tools library.
  12. */
  13. class Strings
  14. {
  15. use Nette\StaticClass;
  16. public const TRIM_CHARACTERS = " \t\n\r\0\x0B\u{A0}";
  /**
   * Tells whether the string is valid UTF-8, i.e. whether fixEncoding() would leave it untouched.
   */
  public static function checkEncoding(string $s): bool
  {
      $sanitized = self::fixEncoding($s);
      return $sanitized === $s;
  }
  /**
   * Strips every invalid UTF-8 sequence from the string.
   */
  public static function fixEncoding(string $s): string
  {
      // escaping with ENT_IGNORE removes xD800-xDFFF, x110000 and higher;
      // decoding afterwards turns the escaped entities back into the original characters
      $quoteMode = ENT_NOQUOTES;
      $escaped = htmlspecialchars($s, $quoteMode | ENT_IGNORE, 'UTF-8');
      return htmlspecialchars_decode($escaped, $quoteMode);
  }
  /**
   * Encodes a single Unicode code point (0x0000..D7FF or 0xE000..10FFFF) as a UTF-8 character.
   * @throws Nette\InvalidArgumentException if code point is not in valid range
   * @throws Nette\NotSupportedException if the ICONV extension is not loaded
   */
  public static function chr(int $code): string
  {
      $isSurrogate = $code >= 0xD800 && $code <= 0xDFFF;
      if ($code < 0 || $isSurrogate || $code > 0x10FFFF) {
          throw new Nette\InvalidArgumentException('Code point must be in range 0x0 to 0xD7FF or 0xE000 to 0x10FFFF.');
      }

      if (!extension_loaded('iconv')) {
          throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
      }

      // the code point packed as a 32-bit big-endian integer is exactly its UTF-32BE encoding
      $utf32 = pack('N', $code);
      return iconv('UTF-32BE', 'UTF-8//IGNORE', $utf32);
  }
  /**
   * Does $haystack begin with the prefix $needle? The comparison is byte-wise.
   */
  public static function startsWith(string $haystack, string $needle): bool
  {
      $prefixBytes = strlen($needle);
      return 0 === strncmp($haystack, $needle, $prefixBytes);
  }
  /**
   * Does $haystack finish with the suffix $needle? The comparison is byte-wise.
   */
  public static function endsWith(string $haystack, string $needle): bool
  {
      if ($needle === '') {
          return true; // every string ends with the empty suffix
      }

      $tail = substr($haystack, -strlen($needle));
      return $tail === $needle;
  }
  /**
   * Does $haystack contain $needle? The search is byte-wise.
   * An empty $needle is considered to be contained in every string.
   */
  public static function contains(string $haystack, string $needle): bool
  {
      // the explicit empty-needle check keeps the result consistent with startsWith()/endsWith()
      // and avoids the "Empty needle" warning (and false result) that strpos() produces before PHP 8
      return $needle === '' || strpos($haystack, $needle) !== false;
  }
  /**
   * Extracts a part of a UTF-8 string given by starting position and length (both in characters).
   * A negative start is counted from the end of the string.
   * @throws Nette\NotSupportedException if neither MBSTRING nor ICONV is available
   */
  public static function substring(string $s, int $start, ?int $length = null): string
  {
      if (function_exists('mb_substr')) {
          return mb_substr($s, $start, $length, 'UTF-8'); // MB is much faster
      }

      if (!extension_loaded('iconv')) {
          throw new Nette\NotSupportedException(__METHOD__ . '() requires extension ICONV or MBSTRING, neither is loaded.');
      }

      if ($length === null) {
          $length = self::length($s);
      } elseif ($start < 0 && $length < 0) {
          $start += self::length($s); // unifies iconv_substr behavior with mb_substr
      }

      return iconv_substr($s, $start, $length, 'UTF-8');
  }
  /**
   * Cleans up a text: converts UTF-8 to normal form NFC (when Normalizer is available), unifies line breaks to `\n`,
   * drops control characters, trims trailing spaces and tabs on every line and removes leading and trailing blank lines.
   */
  public static function normalize(string $s): string
  {
      // convert to compressed normal form (NFC); keep the input as is when normalization fails
      if (class_exists('Normalizer', false)) {
          $composed = \Normalizer::normalize($s, \Normalizer::FORM_C);
          if ($composed !== false) {
              $s = $composed;
          }
      }

      $s = self::normalizeNewLines($s);

      // remove control characters; leave \t + \n
      $s = self::pcre('preg_replace', ['#[\x00-\x08\x0B-\x1F\x7F-\x9F]+#u', '', $s]);

      // right trim
      $s = self::pcre('preg_replace', ['#[\t ]+$#m', '', $s]);

      // leading and trailing blank lines
      return trim($s, "\n");
  }
  /**
   * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to the unix-like `\n`.
   */
  public static function normalizeNewLines(string $s): string
  {
      // "\r\n" must come first so that it is not turned into two line breaks
      $foreignBreaks = ["\r\n", "\r"];
      return str_replace($foreignBreaks, "\n", $s);
  }
  /**
   * Transliterates a UTF-8 string to plain ASCII, ie removes diacritics etc.
   * Uses the intl Transliterator when available (a notice is triggered once when it is not)
   * and falls back to iconv; the exact steps depend on the iconv implementation in use.
   */
  public static function toAscii(string $s): string
  {
      static $transliterator = null;

      $iconvImpl = defined('ICONV_IMPL') ? trim(ICONV_IMPL, '"\'') : null;
      $nonAsciiPattern = '#[^\x00-\x7F]++#';

      if ($transliterator === null) {
          if (!class_exists('Transliterator', false)) {
              trigger_error(__METHOD__ . "(): it is recommended to enable PHP extensions 'intl'.", E_USER_NOTICE);
              $transliterator = false;
          } else {
              $transliterator = \Transliterator::create('Any-Latin; Latin-ASCII');
          }
      }

      // remove control characters and check UTF-8 validity
      $s = self::pcre('preg_replace', ['#[^\x09\x0A\x0D\x20-\x7E\xA0-\x{2FF}\x{370}-\x{10FFFF}]#u', '', $s]);

      // transliteration (by Transliterator and iconv) is not optimal, replace some characters directly
      // „ “ ” ‚ ‘ ’ ° Я я Ю ю Ä Ö Ü ẞ ä ö ü ß
      $directReplacements = [
          "\u{201E}" => '"', "\u{201C}" => '"', "\u{201D}" => '"',
          "\u{201A}" => "'", "\u{2018}" => "'", "\u{2019}" => "'",
          "\u{B0}" => '^',
          "\u{42F}" => 'Ya', "\u{44F}" => 'ya', "\u{42E}" => 'Yu', "\u{44E}" => 'yu',
          "\u{c4}" => 'Ae', "\u{d6}" => 'Oe', "\u{dc}" => 'Ue', "\u{1e9e}" => 'Ss',
          "\u{e4}" => 'ae', "\u{f6}" => 'oe', "\u{fc}" => 'ue', "\u{df}" => 'ss',
      ];
      $s = strtr($s, $directReplacements);

      if ($iconvImpl !== 'libiconv') {
          // ® © … « » £ ¥ ² ³ µ ¹ º ¿ ˊ ˍ ˝ ` € ™ ℮ ← ↑ → ↓ ↔
          $symbolReplacements = [
              "\u{AE}" => '(R)', "\u{A9}" => '(c)', "\u{2026}" => '...', "\u{AB}" => '<<', "\u{BB}" => '>>',
              "\u{A3}" => 'lb', "\u{A5}" => 'yen', "\u{B2}" => '^2', "\u{B3}" => '^3', "\u{B5}" => 'u',
              "\u{B9}" => '^1', "\u{BA}" => 'o', "\u{BF}" => '?', "\u{2CA}" => "'", "\u{2CD}" => '_',
              "\u{2DD}" => '"', "\u{1FEF}" => '', "\u{20AC}" => 'EUR', "\u{2122}" => 'TM', "\u{212E}" => 'e',
              "\u{2190}" => '<-', "\u{2191}" => '^', "\u{2192}" => '->', "\u{2193}" => 'V', "\u{2194}" => '<->',
          ];
          $s = strtr($s, $symbolReplacements);
      }

      if ($transliterator) {
          $s = $transliterator->transliterate($s);

          // use iconv because The transliterator leaves some characters out of ASCII, eg → ʾ
          if ($iconvImpl === 'glibc') {
              $s = strtr($s, '?', "\x01"); // temporarily hide ? to distinguish them from the garbage that iconv creates
              $s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
              $s = str_replace(['?', "\x01"], ['', '?'], $s); // remove garbage and restore ? characters
          } elseif ($iconvImpl === 'libiconv') {
              $s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
          } else { // null or 'unknown' (#216)
              $s = self::pcre('preg_replace', [$nonAsciiPattern, '', $s]); // remove non-ascii chars
          }

          return $s;
      }

      if (!in_array($iconvImpl, ['glibc', 'libiconv'], true)) {
          return self::pcre('preg_replace', [$nonAsciiPattern, '', $s]); // remove non-ascii chars
      }

      // temporarily hide these characters to distinguish them from the garbage that iconv creates
      $s = strtr($s, '`\'"^~?', "\x01\x02\x03\x04\x05\x06");

      if ($iconvImpl === 'glibc') {
          // glibc implementation is very limited. transliterate into Windows-1250 and then into ASCII, so most Eastern European characters are preserved
          $s = iconv('UTF-8', 'WINDOWS-1250//TRANSLIT//IGNORE', $s);
          $s = strtr(
              $s,
              "\xa5\xa3\xbc\x8c\xa7\x8a\xaa\x8d\x8f\x8e\xaf\xb9\xb3\xbe\x9c\x9a\xba\x9d\x9f\x9e\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe\x96\xa0\x8b\x97\x9b\xa6\xad\xb7",
              'ALLSSSSTZZZallssstzzzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTsraaaalccceeeeiiddnnooooruuuuyt- <->|-.'
          );
          $s = self::pcre('preg_replace', [$nonAsciiPattern, '', $s]);
      } else {
          $s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
      }

      // remove garbage that iconv creates during transliteration (eg Ý -> Y')
      $s = str_replace(['`', "'", '"', '^', '~', '?'], '', $s);

      // restore temporarily hidden characters
      return strtr($s, "\x01\x02\x03\x04\x05\x06", '`\'"^~?');
  }
  /**
   * Turns a UTF-8 string into a form usable in URLs: transliterates it to ASCII, optionally lowercases it and
   * replaces every run of characters other than English letters, digits and those listed in $charlist with a hyphen.
   * Leading and trailing hyphens are removed.
   */
  public static function webalize(string $s, ?string $charlist = null, bool $lower = true): string
  {
      $s = self::toAscii($s);
      if ($lower) {
          $s = strtolower($s);
      }

      $extraAllowed = $charlist === null ? '' : preg_quote($charlist, '#');
      $s = self::pcre('preg_replace', ['#[^a-z0-9' . $extraAllowed . ']+#i', '-', $s]);

      return trim($s, '-');
  }
  /**
   * Shortens a UTF-8 string to at most $maxLen characters, preferring to cut where the kept part is followed
   * by whitespace or an ASCII non-alphanumeric character. The $append string (an ellipsis by default) is added
   * only when the string was actually shortened, and it counts towards $maxLen.
   */
  public static function truncate(string $s, int $maxLen, string $append = "\u{2026}"): string
  {
      if (self::length($s) <= $maxLen) {
          return $s; // short enough, nothing to do
      }

      $maxLen -= self::length($append);
      if ($maxLen < 1) {
          return $append; // no room left for any part of the original text
      }

      $matches = self::match($s, '#^.{1,' . $maxLen . '}(?=[\s\x00-/:-@\[-`{-~])#us');
      if ($matches) {
          return $matches[0] . $append;
      }

      // no suitable boundary found, cut in the middle of a word
      return self::substring($s, 0, $maxLen) . $append;
  }
  /**
   * Indents every non-empty line of a multiline text from the left. $level says how many times
   * the indentation string $chars (*tab* by default) is repeated; a non-positive level leaves the text unchanged.
   */
  public static function indent(string $s, int $level = 1, string $chars = "\t"): string
  {
      if ($level <= 0) {
          return $s;
      }

      $indentation = str_repeat($chars, $level);
      return self::replace($s, '#(?:^|[\r\n]+)(?=[^\r\n])#', '$0' . $indentation);
  }
  /**
   * Returns the UTF-8 string with all characters converted to lower case.
   */
  public static function lower(string $s): string
  {
      $lowered = mb_strtolower($s, 'UTF-8');
      return $lowered;
  }
  /**
   * Lowercases only the first character of a UTF-8 string; the rest is kept as is.
   */
  public static function firstLower(string $s): string
  {
      $head = self::substring($s, 0, 1);
      $rest = self::substring($s, 1);
      return self::lower($head) . $rest;
  }
  /**
   * Returns the UTF-8 string with all characters converted to upper case.
   */
  public static function upper(string $s): string
  {
      $uppered = mb_strtoupper($s, 'UTF-8');
      return $uppered;
  }
  /**
   * Uppercases only the first character of a UTF-8 string; the rest is kept as is.
   */
  public static function firstUpper(string $s): string
  {
      $head = self::substring($s, 0, 1);
      $rest = self::substring($s, 1);
      return self::upper($head) . $rest;
  }
  /**
   * Title-cases a UTF-8 string: the first character of every word becomes upper case, the others lower case.
   */
  public static function capitalize(string $s): string
  {
      $titled = mb_convert_case($s, MB_CASE_TITLE, 'UTF-8');
      return $titled;
  }
  /**
   * Compares two UTF-8 strings or their parts, without taking character case into account.
   * If $length is null, whole strings are compared; if it is negative, the corresponding number of characters
   * from the end of the strings is compared; otherwise the given number of characters from the beginning is compared.
   * When Normalizer is available, both strings are first converted to normal form NFD; a string that cannot be
   * normalized (invalid UTF-8) is compared as given.
   */
  public static function compare(string $left, string $right, ?int $length = null): bool
  {
      if (class_exists('Normalizer', false)) {
          // form NFD is faster; normalize() returns false on failure, which must not be passed on
          // to the string-typed helpers below (TypeError under strict_types)
          $normalizedLeft = \Normalizer::normalize($left, \Normalizer::FORM_D);
          if ($normalizedLeft !== false) {
              $left = $normalizedLeft;
          }

          $normalizedRight = \Normalizer::normalize($right, \Normalizer::FORM_D);
          if ($normalizedRight !== false) {
              $right = $normalizedRight;
          }
      }

      if ($length !== null) {
          if ($length < 0) {
              // the last |$length| characters
              $left = self::substring($left, $length, -$length);
              $right = self::substring($right, $length, -$length);
          } else {
              // the first $length characters
              $left = self::substring($left, 0, $length);
              $right = self::substring($right, 0, $length);
          }
      }

      return self::lower($left) === self::lower($right);
  }
  /**
   * Finds the longest common prefix of the given strings. Returns an empty string when there is no
   * common prefix or when the list is empty. The prefix never ends in the middle of a multi-byte UTF-8 character.
   * @param string[] $strings
   */
  public static function findPrefix(array $strings): string
  {
      $first = array_shift($strings);
      if ($first === null) {
          return ''; // empty list: array_shift() yields null, there is nothing to take a prefix from
      }

      $firstLen = strlen($first);
      for ($i = 0; $i < $firstLen; $i++) {
          foreach ($strings as $s) {
              if (!isset($s[$i]) || $first[$i] !== $s[$i]) {
                  // the mismatch may lie inside a multi-byte character: step back over its
                  // continuation bytes (0x80..0xBF) so that the prefix ends on a character boundary
                  while ($i && $first[$i - 1] >= "\x80" && $first[$i] >= "\x80" && $first[$i] < "\xC0") {
                      $i--;
                  }

                  return substr($first, 0, $i);
              }
          }
      }

      return $first;
  }
  /**
   * Counts characters (not bytes) in a UTF-8 string.
   * The result is the number of Unicode code points, which may differ from the number of graphemes.
   */
  public static function length(string $s): int
  {
      if (function_exists('mb_strlen')) {
          return mb_strlen($s, 'UTF-8');
      }

      // fallback without mbstring: measure the byte length of the utf8_decode() result
      return strlen(utf8_decode($s));
  }
  /**
   * Strips whitespace (or the characters given in $charlist) from both ends of a UTF-8 encoded string.
   */
  public static function trim(string $s, string $charlist = self::TRIM_CHARACTERS): string
  {
      // one or more of the listed characters, safely quoted for use inside a character class
      $run = '[' . preg_quote($charlist, '#') . ']+';
      return self::replace($s, '#^' . $run . '|' . $run . '$#Du', '');
  }
  /**
   * Pads a UTF-8 string to given length (in characters) by prepending the $pad string to the beginning.
   * The $pad string is repeated as needed and its last repetition is cut to fit. The string is returned
   * unchanged when it is already long enough or when $pad is empty (there is nothing to pad with).
   */
  public static function padLeft(string $s, int $length, string $pad = ' '): string
  {
      $missing = max(0, $length - self::length($s));
      $padLen = self::length($pad);
      if ($missing === 0 || $padLen === 0) {
          return $s; // guards the division and modulo below against an empty $pad
      }

      $wholeCopies = str_repeat($pad, intdiv($missing, $padLen));
      $partialCopy = self::substring($pad, 0, $missing % $padLen);
      return $wholeCopies . $partialCopy . $s;
  }
  /**
   * Pads a UTF-8 string to given length (in characters) by appending the $pad string to the end.
   * The $pad string is repeated as needed and its last repetition is cut to fit. The string is returned
   * unchanged when it is already long enough or when $pad is empty (there is nothing to pad with).
   */
  public static function padRight(string $s, int $length, string $pad = ' '): string
  {
      $missing = max(0, $length - self::length($s));
      $padLen = self::length($pad);
      if ($missing === 0 || $padLen === 0) {
          return $s; // guards the division and modulo below against an empty $pad
      }

      $wholeCopies = str_repeat($pad, intdiv($missing, $padLen));
      $partialCopy = self::substring($pad, 0, $missing % $padLen);
      return $s . $wholeCopies . $partialCopy;
  }
  /**
   * Reverses the order of characters in a UTF-8 string.
   * @throws Nette\NotSupportedException if the ICONV extension is not loaded
   */
  public static function reverse(string $s): string
  {
      if (!extension_loaded('iconv')) {
          throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
      }

      // every character takes exactly four bytes in UTF-32; reversing the big-endian byte stream
      // gives the characters in reverse order, each of them encoded as little-endian
      $utf32be = iconv('UTF-8', 'UTF-32BE', $s);
      return iconv('UTF-32LE', 'UTF-8', strrev($utf32be));
  }
  /**
   * Returns the part of $haystack that precedes the $nth occurrence of $needle, or null if the needle was not found.
   * Negative value of `$nth` means searching from the end.
   */
  public static function before(string $haystack, string $needle, int $nth = 1): ?string
  {
      $offset = self::pos($haystack, $needle, $nth);
      if ($offset === null) {
          return null;
      }

      return substr($haystack, 0, $offset);
  }
  /**
   * Returns the part of $haystack that follows the $nth occurrence of $needle, or null if the needle was not found.
   * Negative value of `$nth` means searching from the end.
   */
  public static function after(string $haystack, string $needle, int $nth = 1): ?string
  {
      $offset = self::pos($haystack, $needle, $nth);
      if ($offset === null) {
          return null;
      }

      // skip the needle itself
      return substr($haystack, $offset + strlen($needle));
  }
  /**
   * Returns the position (in characters) of the $nth occurrence of $needle in $haystack, or null if the $needle was not found.
   * Negative value of `$nth` means searching from the end.
   */
  public static function indexOf(string $haystack, string $needle, int $nth = 1): ?int
  {
      $offset = self::pos($haystack, $needle, $nth);
      if ($offset === null) {
          return null;
      }

      // pos() works with bytes; the character position is the length of the text before the match
      $preceding = substr($haystack, 0, $offset);
      return self::length($preceding);
  }
  /**
   * Returns the byte offset of the $nth occurrence of $needle in $haystack, or null if the needle was not found.
   * A positive `$nth` counts occurrences from the beginning, a negative one from the end; zero always yields null.
   * An empty needle is reported at offset 0 (forward search) or at the end of the haystack (backward search).
   */
  private static function pos(string $haystack, string $needle, int $nth = 1): ?int
  {
      if (!$nth) {
          return null;
      } elseif ($nth > 0) {
          if ($needle === '') {
              return 0;
          }

          $pos = 0;
          // after each hit that is not yet the requested one, continue one byte behind it
          while (($pos = strpos($haystack, $needle, $pos)) !== false && --$nth) {
              $pos++;
          }
      } else {
          $len = strlen($haystack);
          if ($needle === '') {
              return $len;
          } elseif ($len === 0) {
              return null;
          }

          $pos = $len - 1;
          // a negative offset limits how far to the right a match may start
          while (($pos = strrpos($haystack, $needle, $pos - $len)) !== false && ++$nth) {
              if ($pos === 0) {
                  // the match sits at the very beginning, so no earlier occurrence can exist;
                  // continuing would pass an offset outside of the haystack to strrpos()
                  // (ValueError on PHP 8, warning on PHP 7)
                  return null;
              }

              $pos--;
          }
      }

      return Helpers::falseToNull($pos);
  }
  /**
   * Splits a string into an array by the regular expression. Parenthesized expressions in the delimiter
   * are captured and returned as well.
   * Parameter $flags can be any combination of PREG_SPLIT_NO_EMPTY and PREG_OFFSET_CAPTURE flags.
   */
  public static function split(string $subject, string $pattern, int $flags = 0): array
  {
      $flags |= PREG_SPLIT_DELIM_CAPTURE; // delimiters are always captured
      return self::pcre('preg_split', [$pattern, $subject, -1, $flags]);
  }
  /**
   * Searches the string for the first match of a regular expression pattern. Returns an array with the match
   * and each subpattern, or null when nothing matches or when $offset lies behind the end of the subject.
   * Parameter $flags can be any combination of PREG_OFFSET_CAPTURE and PREG_UNMATCHED_AS_NULL flags.
   */
  public static function match(string $subject, string $pattern, int $flags = 0, int $offset = 0): ?array
  {
      if ($offset > strlen($subject)) {
          return null;
      }

      // $m is passed by reference and filled in by preg_match()
      $found = self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset]);
      return $found ? $m : null;
  }
  /**
   * Finds all occurrences matching a regular expression pattern and returns them as a two-dimensional array.
   * The result is an array of matches (ie PREG_SET_ORDER is used unless PREG_PATTERN_ORDER is requested);
   * an empty array is returned when $offset lies behind the end of the subject.
   * Parameter $flags can be any combination of PREG_OFFSET_CAPTURE, PREG_UNMATCHED_AS_NULL and PREG_PATTERN_ORDER flags.
   */
  public static function matchAll(string $subject, string $pattern, int $flags = 0, int $offset = 0): array
  {
      if ($offset > strlen($subject)) {
          return [];
      }

      if (!($flags & PREG_PATTERN_ORDER)) {
          $flags |= PREG_SET_ORDER; // default ordering
      }

      // $m is passed by reference and filled in by preg_match_all()
      self::pcre('preg_match_all', [$pattern, $subject, &$m, $flags, $offset]);
      return $m;
  }
  /**
   * Replaces all occurrences matching the regular expression $pattern, which can be a string
   * or an array in the form `pattern => replacement`. An object or array $replacement is treated as a callback.
   * @param string|array $pattern
   * @param string|callable $replacement
   * @throws Nette\InvalidStateException if an object or array $replacement is not callable
   */
  public static function replace(string $subject, $pattern, $replacement = '', int $limit = -1): string
  {
      $isCallback = is_object($replacement) || is_array($replacement);
      if ($isCallback) {
          if (!is_callable($replacement, false, $textual)) {
              throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
          }

          return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit]);
      }

      if (is_array($pattern) && is_string(key($pattern))) {
          // `pattern => replacement` map: values are the replacements, keys the patterns
          $replacement = array_values($pattern);
          $pattern = array_keys($pattern);
      }

      return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
  }
  /**
   * Invokes the PCRE function $func with arguments $args and converts its failures to RegexpException.
   * @internal
   */
  public static function pcre(string $func, array $args)
  {
      $onError = function (string $message) use ($args): void {
          // compile-time error, not detectable by preg_last_error
          throw new RegexpException($message . ' in pattern: ' . implode(' or ', (array) $args[0]));
      };
      $res = Callback::invokeSafe($func, $args, $onError);

      $code = preg_last_error();
      if (!$code) {
          return $res;
      }

      // run-time error, but preg_last_error & return code are liars:
      // for the replacing functions the error is trusted only when the result is null
      $isReplacing = in_array($func, ['preg_filter', 'preg_replace_callback', 'preg_replace'], true);
      if ($res === null || !$isReplacing) {
          throw new RegexpException((RegexpException::MESSAGES[$code] ?? 'Unknown error')
              . ' (pattern: ' . implode(' or ', (array) $args[0]) . ')', $code);
      }

      return $res;
  }
  462. }