  // unidecode.js
  /**
   * Unidecode takes UTF-8 data and tries to represent it in US-ASCII characters (i.e., the universally displayable characters between 0x00 and 0x7F).
   * The representation is almost always an attempt at transliteration -- i.e., conveying, in Roman letters, the pronunciation expressed by the text in
   * some other writing system.
   *
   * The tables used (in data) are converted from the tables provided in the Perl library Text::Unidecode (http://search.cpan.org/dist/Text-Unidecode/lib/Text/Unidecode.pm)
   * and are distributed under the Perl license.
   *
   * @author Francois-Guillaume Ribreau
   *
   * Based on the port of unidecode for PHP.
   */
  13. 'use strict';
  14. var tr = {};
  15. var utf8_rx = /(?![\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3})./g;
  16. module.exports = function (str) {
  17. return str.replace(utf8_rx, unidecode_internal_replace);
  18. };
  /**
   * Lazy loaders for the transliteration tables, keyed by the two-digit
   * lowercase hex value of a code unit's high byte (as produced by dec2hex).
   *
   * Every require() keeps a literal path, exactly as the original switch
   * statement did (presumably so static bundlers can resolve the data files),
   * and a table is only loaded the first time a character from its page is
   * encountered. Pages without an entry here have no table.
   */
  var unidecode_table_loaders = {
    '00': function () { return require('./data/x00'); },
    '01': function () { return require('./data/x01'); },
    '02': function () { return require('./data/x02'); },
    '03': function () { return require('./data/x03'); },
    '04': function () { return require('./data/x04'); },
    '05': function () { return require('./data/x05'); },
    '06': function () { return require('./data/x06'); },
    '07': function () { return require('./data/x07'); },
    '09': function () { return require('./data/x09'); },
    '0a': function () { return require('./data/x0a'); },
    '0b': function () { return require('./data/x0b'); },
    '0c': function () { return require('./data/x0c'); },
    '0d': function () { return require('./data/x0d'); },
    '0e': function () { return require('./data/x0e'); },
    '0f': function () { return require('./data/x0f'); },
    '10': function () { return require('./data/x10'); },
    '11': function () { return require('./data/x11'); },
    '12': function () { return require('./data/x12'); },
    '13': function () { return require('./data/x13'); },
    '14': function () { return require('./data/x14'); },
    '15': function () { return require('./data/x15'); },
    '16': function () { return require('./data/x16'); },
    '17': function () { return require('./data/x17'); },
    '18': function () { return require('./data/x18'); },
    '1e': function () { return require('./data/x1e'); },
    '1f': function () { return require('./data/x1f'); },
    '20': function () { return require('./data/x20'); },
    '21': function () { return require('./data/x21'); },
    '22': function () { return require('./data/x22'); },
    '23': function () { return require('./data/x23'); },
    '24': function () { return require('./data/x24'); },
    '25': function () { return require('./data/x25'); },
    '26': function () { return require('./data/x26'); },
    '27': function () { return require('./data/x27'); },
    '28': function () { return require('./data/x28'); },
    '2e': function () { return require('./data/x2e'); },
    '2f': function () { return require('./data/x2f'); },
    '30': function () { return require('./data/x30'); },
    '31': function () { return require('./data/x31'); },
    '32': function () { return require('./data/x32'); },
    '33': function () { return require('./data/x33'); },
    '4d': function () { return require('./data/x4d'); },
    '4e': function () { return require('./data/x4e'); },
    '4f': function () { return require('./data/x4f'); },
    '50': function () { return require('./data/x50'); },
    '51': function () { return require('./data/x51'); },
    '52': function () { return require('./data/x52'); },
    '53': function () { return require('./data/x53'); },
    '54': function () { return require('./data/x54'); },
    '55': function () { return require('./data/x55'); },
    '56': function () { return require('./data/x56'); },
    '57': function () { return require('./data/x57'); },
    '58': function () { return require('./data/x58'); },
    '59': function () { return require('./data/x59'); },
    '5a': function () { return require('./data/x5a'); },
    '5b': function () { return require('./data/x5b'); },
    '5c': function () { return require('./data/x5c'); },
    '5d': function () { return require('./data/x5d'); },
    '5e': function () { return require('./data/x5e'); },
    '5f': function () { return require('./data/x5f'); },
    '60': function () { return require('./data/x60'); },
    '61': function () { return require('./data/x61'); },
    '62': function () { return require('./data/x62'); },
    '63': function () { return require('./data/x63'); },
    '64': function () { return require('./data/x64'); },
    '65': function () { return require('./data/x65'); },
    '66': function () { return require('./data/x66'); },
    '67': function () { return require('./data/x67'); },
    '68': function () { return require('./data/x68'); },
    '69': function () { return require('./data/x69'); },
    '6a': function () { return require('./data/x6a'); },
    '6b': function () { return require('./data/x6b'); },
    '6c': function () { return require('./data/x6c'); },
    '6d': function () { return require('./data/x6d'); },
    '6e': function () { return require('./data/x6e'); },
    '6f': function () { return require('./data/x6f'); },
    '70': function () { return require('./data/x70'); },
    '71': function () { return require('./data/x71'); },
    '72': function () { return require('./data/x72'); },
    '73': function () { return require('./data/x73'); },
    '74': function () { return require('./data/x74'); },
    '75': function () { return require('./data/x75'); },
    '76': function () { return require('./data/x76'); },
    '77': function () { return require('./data/x77'); },
    '78': function () { return require('./data/x78'); },
    '79': function () { return require('./data/x79'); },
    '7a': function () { return require('./data/x7a'); },
    '7b': function () { return require('./data/x7b'); },
    '7c': function () { return require('./data/x7c'); },
    '7d': function () { return require('./data/x7d'); },
    '7e': function () { return require('./data/x7e'); },
    '7f': function () { return require('./data/x7f'); },
    '80': function () { return require('./data/x80'); },
    '81': function () { return require('./data/x81'); },
    '82': function () { return require('./data/x82'); },
    '83': function () { return require('./data/x83'); },
    '84': function () { return require('./data/x84'); },
    '85': function () { return require('./data/x85'); },
    '86': function () { return require('./data/x86'); },
    '87': function () { return require('./data/x87'); },
    '88': function () { return require('./data/x88'); },
    '89': function () { return require('./data/x89'); },
    '8a': function () { return require('./data/x8a'); },
    '8b': function () { return require('./data/x8b'); },
    '8c': function () { return require('./data/x8c'); },
    '8d': function () { return require('./data/x8d'); },
    '8e': function () { return require('./data/x8e'); },
    '8f': function () { return require('./data/x8f'); },
    '90': function () { return require('./data/x90'); },
    '91': function () { return require('./data/x91'); },
    '92': function () { return require('./data/x92'); },
    '93': function () { return require('./data/x93'); },
    '94': function () { return require('./data/x94'); },
    '95': function () { return require('./data/x95'); },
    '96': function () { return require('./data/x96'); },
    '97': function () { return require('./data/x97'); },
    '98': function () { return require('./data/x98'); },
    '99': function () { return require('./data/x99'); },
    '9a': function () { return require('./data/x9a'); },
    '9b': function () { return require('./data/x9b'); },
    '9c': function () { return require('./data/x9c'); },
    '9d': function () { return require('./data/x9d'); },
    '9e': function () { return require('./data/x9e'); },
    '9f': function () { return require('./data/x9f'); },
    'a0': function () { return require('./data/xa0'); },
    'a1': function () { return require('./data/xa1'); },
    'a2': function () { return require('./data/xa2'); },
    'a3': function () { return require('./data/xa3'); },
    'a4': function () { return require('./data/xa4'); },
    'ac': function () { return require('./data/xac'); },
    'ad': function () { return require('./data/xad'); },
    'ae': function () { return require('./data/xae'); },
    'af': function () { return require('./data/xaf'); },
    'b0': function () { return require('./data/xb0'); },
    'b1': function () { return require('./data/xb1'); },
    'b2': function () { return require('./data/xb2'); },
    'b3': function () { return require('./data/xb3'); },
    'b4': function () { return require('./data/xb4'); },
    'b5': function () { return require('./data/xb5'); },
    'b6': function () { return require('./data/xb6'); },
    'b7': function () { return require('./data/xb7'); },
    'b8': function () { return require('./data/xb8'); },
    'b9': function () { return require('./data/xb9'); },
    'ba': function () { return require('./data/xba'); },
    'bb': function () { return require('./data/xbb'); },
    'bc': function () { return require('./data/xbc'); },
    'bd': function () { return require('./data/xbd'); },
    'be': function () { return require('./data/xbe'); },
    'bf': function () { return require('./data/xbf'); },
    'c0': function () { return require('./data/xc0'); },
    'c1': function () { return require('./data/xc1'); },
    'c2': function () { return require('./data/xc2'); },
    'c3': function () { return require('./data/xc3'); },
    'c4': function () { return require('./data/xc4'); },
    'c5': function () { return require('./data/xc5'); },
    'c6': function () { return require('./data/xc6'); },
    'c7': function () { return require('./data/xc7'); },
    'c8': function () { return require('./data/xc8'); },
    'c9': function () { return require('./data/xc9'); },
    'ca': function () { return require('./data/xca'); },
    'cb': function () { return require('./data/xcb'); },
    'cc': function () { return require('./data/xcc'); },
    'cd': function () { return require('./data/xcd'); },
    'ce': function () { return require('./data/xce'); },
    'cf': function () { return require('./data/xcf'); },
    'd0': function () { return require('./data/xd0'); },
    'd1': function () { return require('./data/xd1'); },
    'd2': function () { return require('./data/xd2'); },
    'd3': function () { return require('./data/xd3'); },
    'd4': function () { return require('./data/xd4'); },
    'd5': function () { return require('./data/xd5'); },
    'd6': function () { return require('./data/xd6'); },
    'd7': function () { return require('./data/xd7'); },
    'f9': function () { return require('./data/xf9'); },
    'fa': function () { return require('./data/xfa'); },
    'fb': function () { return require('./data/xfb'); },
    'fc': function () { return require('./data/xfc'); },
    'fd': function () { return require('./data/xfd'); },
    'fe': function () { return require('./data/xfe'); },
    'ff': function () { return require('./data/xff'); }
  };

  /**
   * Replacement callback for String.prototype.replace: returns the ASCII
   * transliteration of one matched character.
   *
   * The character's code is split into a high byte (which selects a table
   * from ./data, loaded on first use and cached in `tr`) and a low byte
   * (the index into that table).
   *
   * @param {string} match  The matched non-ASCII character.
   * @returns {string} The transliteration; '_' for codes above 0xFFFF; ''
   *     when the page has no table or the table has no entry for the
   *     character.
   */
  function unidecode_internal_replace(match) {
    var utf16 = utf8_to_utf16(match);

    if (utf16 > 0xFFFF) {
      return '_';
    }

    var h = utf16 >> 8;
    var l = utf16 & 0xFF;

    // Pages 0x19-0x1d sit between tables x18 and x1e and have no data.
    if (h > 24 && h < 30) return '';

    // Pages 0xd8-0xf8 (surrogates and private use) are not supported.
    if (h > 215 && h < 249) return '';

    if (!tr[h]) {
      var load = unidecode_table_loaders[dec2hex(h)];
      if (!load) {
        // No table exists for this page.
        return '';
      }
      tr[h] = load();
    }

    // A table shorter than 256 entries would yield undefined here, and
    // String.prototype.replace would then insert the text "undefined".
    var replacement = tr[h][l];
    return replacement == null ? '' : replacement;
  }
  /**
   * Formats a number as a two-character lowercase hexadecimal string.
   *
   * @param {number} i  Value to format; the caller in this file passes a
   *     byte in the range 0..255.
   * @returns {string} The last two hex digits of i + 0x100, e.g. 10 -> '0a'.
   */
  function dec2hex(i) {
    // Adding 0x100 forces a leading digit, so values below 0x10 keep their
    // zero padding once the string is trimmed to its last two characters.
    return (i + 0x100).toString(16).slice(-2);
  }
  /**
   * Decodes one character to its numeric code.
   *
   * A one-character string is returned as its char code unchanged. Longer
   * strings are treated as a UTF-8 byte sequence stored one byte per
   * character (char codes 0x00-0xFF) and decoded to the code point they
   * encode: two characters as a 2-byte sequence, three as a 3-byte sequence,
   * anything else as a 4-byte sequence. Despite the name, the result is a
   * code point number, not UTF-16 code units.
   *
   * See http://en.wikipedia.org/wiki/UTF-8 for the bit layout.
   *
   * @param {string|Array} raw  The character, or (possibly nested) arrays
   *     whose first element is the character.
   * @returns {number} The decoded code.
   */
  function utf8_to_utf16(raw) {
    var bytes = raw;

    // Unwrap nested arrays down to their first element.
    while (Array.isArray(bytes)) bytes = bytes[0];

    var b1 = bytes.charCodeAt(0);
    var b2 = bytes.charCodeAt(1);
    var b3 = bytes.charCodeAt(2);
    var b4 = bytes.charCodeAt(3);

    switch (bytes.length) {
      case 1:
        return b1;

      case 2:
        // 110xxxxx 10xxxxxx
        return ((b1 & 0x1F) << 6) | (b2 & 0x3F);

      case 3:
        // 1110xxxx 10xxxxxx 10xxxxxx
        return ((b1 & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);

      default:
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // The lead byte contributes bits 18-20. The previous code shifted
        // them three positions too far, corrupting every code point whose
        // lead byte was above 0xF0.
        return ((b1 & 0x07) << 18) | ((b2 & 0x3F) << 12) |
          ((b3 & 0x3F) << 6) | (b4 & 0x3F);
    }
  }
  /* From php.js */
  /**
   * Returns the code point value of the first character of a string.
   *
   * version: 1109.2015
   * discuss at: http://phpjs.org/functions/ord
   * original by: Kevin van Zonneveld (http://kevin.vanzonneveld.net)
   * bugfixed by: Onno Marsman
   * improved by: Brett Zamir (http://brett-zamir.me)
   * input by: incidence
   *
   * example 1: ord('K');             // returns 75
   * example 2: ord('\uD800\uDC00');  // surrogate pair, returns 65536
   *
   * @param {*} string  Value to inspect; it is converted to a string first.
   * @returns {number} The code point of the first character. A surrogate
   *     that is not part of a valid high+low pair is returned as its own
   *     code unit value, and an empty string yields NaN.
   */
  function ord(string) {
    var str = string + '';
    var code = str.charCodeAt(0);

    if (0xD800 <= code && code <= 0xDBFF) {
      // High surrogate: combine only when a real low surrogate follows.
      // Without this check a lone high surrogate followed by an ordinary
      // character produced a meaningless number. (charCodeAt(1) is NaN for
      // a one-character string, which fails the range test.)
      var low = str.charCodeAt(1);
      if (0xDC00 <= low && low <= 0xDFFF) {
        return ((code - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;
      }
    }

    // BMP character, or an unpaired surrogate reported as-is.
    return code;
  }