Pinyin.php 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. <?php
  2. /*
  3. * This file is part of the overtrue/pinyin.
  4. *
  5. * (c) overtrue <i@overtrue.me>
  6. *
  7. * This source file is subject to the MIT license that is bundled
  8. * with this source code in the file LICENSE.
  9. */
  10. namespace Overtrue\Pinyin;
  11. use InvalidArgumentException;
  12. /*
  13. * Chinese to pinyin translator.
  14. *
  15. * @author overtrue <i@overtrue.me>
  16. * @copyright 2015 overtrue <i@overtrue.me>
  17. *
  18. * @link https://github.com/overtrue/pinyin
  19. * @link http://overtrue.me
  20. */
  21. /**
  22. * Class Pinyin.
  23. *
  24. * @author overtrue <i@overtrue.me>
  25. */
  26. class Pinyin
  27. {
  28. /**
  29. * Dict loader.
  30. *
  31. * @var \Overtrue\Pinyin\DictLoaderInterface
  32. */
  33. protected $loader;
  34. /**
  35. * Punctuations map.
  36. *
  37. * @var array
  38. */
  39. protected $punctuations = array(
  40. ',' => ',',
  41. '。' => '.',
  42. '!' => '!',
  43. '?' => '?',
  44. ':' => ':',
  45. '“' => '"',
  46. '”' => '"',
  47. '‘' => "'",
  48. '’' => "'",
  49. '_' => '_',
  50. );
  51. /**
  52. * Constructor.
  53. *
  54. * @param string $loaderName
  55. */
  56. public function __construct($loaderName = null)
  57. {
  58. $this->loader = $loaderName ?: 'Overtrue\\Pinyin\\FileDictLoader';
  59. }
  60. /**
  61. * Convert string to pinyin.
  62. *
  63. * @param string $string
  64. * @param int $option
  65. *
  66. * @return array
  67. */
  68. public function convert($string, $option = PINYIN_DEFAULT)
  69. {
  70. $pinyin = $this->romanize($string, $option);
  71. return $this->splitWords($pinyin, $option);
  72. }
  73. /**
  74. * Convert string (person name) to pinyin.
  75. *
  76. * @param string $stringName
  77. * @param int $option
  78. *
  79. * @return array
  80. */
  81. public function name($stringName, $option = PINYIN_NAME)
  82. {
  83. $option = $option | PINYIN_NAME;
  84. $pinyin = $this->romanize($stringName, $option);
  85. return $this->splitWords($pinyin, $option);
  86. }
  87. /**
  88. * Return a pinyin permalink from string.
  89. *
  90. * @param string $string
  91. * @param string $delimiter
  92. * @param int $option
  93. *
  94. * @return string
  95. */
  96. public function permalink($string, $delimiter = '-', $option = PINYIN_DEFAULT)
  97. {
  98. if (\is_int($delimiter)) {
  99. list($option, $delimiter) = array($delimiter, '-');
  100. }
  101. if (!in_array($delimiter, array('_', '-', '.', ''), true)) {
  102. throw new InvalidArgumentException("Delimiter must be one of: '_', '-', '', '.'.");
  103. }
  104. return implode($delimiter, $this->convert($string, $option | \PINYIN_KEEP_NUMBER | \PINYIN_KEEP_ENGLISH));
  105. }
  106. /**
  107. * Return first letters.
  108. *
  109. * @param string $string
  110. * @param string $delimiter
  111. * @param int $option
  112. *
  113. * @return string
  114. */
  115. public function abbr($string, $delimiter = '', $option = PINYIN_DEFAULT)
  116. {
  117. if (\is_int($delimiter)) {
  118. list($option, $delimiter) = array($delimiter, '');
  119. }
  120. return implode($delimiter, array_map(function ($pinyin) {
  121. return \is_numeric($pinyin) ? $pinyin : mb_substr($pinyin, 0, 1);
  122. }, $this->convert($string, $option)));
  123. }
  124. /**
  125. * Chinese phrase to pinyin.
  126. *
  127. * @param string $string
  128. * @param string $delimiter
  129. * @param int $option
  130. *
  131. * @return string
  132. */
  133. public function phrase($string, $delimiter = ' ', $option = PINYIN_DEFAULT)
  134. {
  135. if (\is_int($delimiter)) {
  136. list($option, $delimiter) = array($delimiter, ' ');
  137. }
  138. return implode($delimiter, $this->convert($string, $option));
  139. }
  140. /**
  141. * Chinese to pinyin sentence.
  142. *
  143. * @param string $string
  144. * @param string $delimiter
  145. * @param int $option
  146. *
  147. * @return string
  148. */
  149. public function sentence($string, $delimiter = ' ', $option = \PINYIN_NO_TONE)
  150. {
  151. if (\is_int($delimiter)) {
  152. list($option, $delimiter) = array($delimiter, ' ');
  153. }
  154. return implode($delimiter, $this->convert($string, $option | \PINYIN_KEEP_PUNCTUATION | \PINYIN_KEEP_ENGLISH | \PINYIN_KEEP_NUMBER));
  155. }
  156. /**
  157. * Loader setter.
  158. *
  159. * @param \Overtrue\Pinyin\DictLoaderInterface $loader
  160. *
  161. * @return $this
  162. */
  163. public function setLoader(DictLoaderInterface $loader)
  164. {
  165. $this->loader = $loader;
  166. return $this;
  167. }
  168. /**
  169. * Return dict loader,.
  170. *
  171. * @return \Overtrue\Pinyin\DictLoaderInterface
  172. */
  173. public function getLoader()
  174. {
  175. if (!($this->loader instanceof DictLoaderInterface)) {
  176. $dataDir = dirname(__DIR__).'/data/';
  177. $loaderName = $this->loader;
  178. $this->loader = new $loaderName($dataDir);
  179. }
  180. return $this->loader;
  181. }
  182. /**
  183. * Convert Chinese to pinyin.
  184. *
  185. * @param string $string
  186. * @param int $option
  187. *
  188. * @return string
  189. */
  190. protected function romanize($string, $option = \PINYIN_DEFAULT)
  191. {
  192. $string = $this->prepare($string, $option);
  193. $dictLoader = $this->getLoader();
  194. if ($this->hasOption($option, \PINYIN_NAME)) {
  195. $string = $this->convertSurname($string, $dictLoader);
  196. }
  197. $dictLoader->map(function ($dictionary) use (&$string) {
  198. $string = strtr($string, $dictionary);
  199. });
  200. return $string;
  201. }
  202. /**
  203. * Convert Chinese Surname to pinyin.
  204. *
  205. * @param string $string
  206. * @param \Overtrue\Pinyin\DictLoaderInterface $dictLoader
  207. *
  208. * @return string
  209. */
  210. protected function convertSurname($string, $dictLoader)
  211. {
  212. $dictLoader->mapSurname(function ($dictionary) use (&$string) {
  213. foreach ($dictionary as $surname => $pinyin) {
  214. if (0 === strpos($string, $surname)) {
  215. $string = $pinyin.mb_substr($string, mb_strlen($surname, 'UTF-8'), mb_strlen($string, 'UTF-8') - 1, 'UTF-8');
  216. break;
  217. }
  218. }
  219. });
  220. return $string;
  221. }
  222. /**
  223. * Split pinyin string to words.
  224. *
  225. * @param string $pinyin
  226. * @param string $option
  227. *
  228. * @return array
  229. */
  230. protected function splitWords($pinyin, $option)
  231. {
  232. $split = array_filter(preg_split('/\s+/i', $pinyin));
  233. if (!$this->hasOption($option, PINYIN_TONE)) {
  234. foreach ($split as $index => $pinyin) {
  235. $split[$index] = $this->formatTone($pinyin, $option);
  236. }
  237. }
  238. return array_values($split);
  239. }
  240. /**
  241. * @param int $option
  242. * @param int $check
  243. *
  244. * @return bool
  245. */
  246. public function hasOption($option, $check)
  247. {
  248. return ($option & $check) === $check;
  249. }
  250. /**
  251. * Pre-process.
  252. *
  253. * @param string $string
  254. * @param int $option
  255. *
  256. * @return string
  257. */
  258. protected function prepare($string, $option = \PINYIN_DEFAULT)
  259. {
  260. $string = preg_replace_callback('~[a-z0-9_-]+~i', function ($matches) {
  261. return "\t".$matches[0];
  262. }, $string);
  263. $regex = array('\p{Han}', '\p{Z}', '\p{M}', "\t");
  264. if ($this->hasOption($option, \PINYIN_KEEP_NUMBER)) {
  265. \array_push($regex, '0-9');
  266. }
  267. if ($this->hasOption($option, \PINYIN_KEEP_ENGLISH)) {
  268. \array_push($regex, 'a-zA-Z');
  269. }
  270. if ($this->hasOption($option, \PINYIN_KEEP_PUNCTUATION)) {
  271. $punctuations = array_merge($this->punctuations, array("\t" => ' ', ' ' => ' '));
  272. $string = trim(str_replace(array_keys($punctuations), $punctuations, $string));
  273. \array_push($regex, preg_quote(implode(array_merge(array_keys($this->punctuations), $this->punctuations)), '~'));
  274. }
  275. return preg_replace(\sprintf('~[^%s]~u', implode($regex)), '', $string);
  276. }
  277. /**
  278. * Format.
  279. *
  280. * @param string $pinyin
  281. * @param int $option
  282. *
  283. * @return string
  284. */
  285. protected function formatTone($pinyin, $option = \PINYIN_NO_TONE)
  286. {
  287. $replacements = array(
  288. 'üē' => array('ue', 1), 'üé' => array('ue', 2), 'üě' => array('ue', 3), 'üè' => array('ue', 4),
  289. 'ā' => array('a', 1), 'ē' => array('e', 1), 'ī' => array('i', 1), 'ō' => array('o', 1), 'ū' => array('u', 1), 'ǖ' => array('yu', 1),
  290. 'á' => array('a', 2), 'é' => array('e', 2), 'í' => array('i', 2), 'ó' => array('o', 2), 'ú' => array('u', 2), 'ǘ' => array('yu', 2),
  291. 'ǎ' => array('a', 3), 'ě' => array('e', 3), 'ǐ' => array('i', 3), 'ǒ' => array('o', 3), 'ǔ' => array('u', 3), 'ǚ' => array('yu', 3),
  292. 'à' => array('a', 4), 'è' => array('e', 4), 'ì' => array('i', 4), 'ò' => array('o', 4), 'ù' => array('u', 4), 'ǜ' => array('yu', 4),
  293. );
  294. foreach ($replacements as $unicode => $replacement) {
  295. if (false !== strpos($pinyin, $unicode)) {
  296. $umlaut = $replacement[0];
  297. // https://zh.wikipedia.org/wiki/%C3%9C
  298. if ($this->hasOption($option, \PINYIN_UMLAUT_V) && 'yu' == $umlaut) {
  299. $umlaut = 'v';
  300. }
  301. $pinyin = str_replace($unicode, $umlaut, $pinyin).($this->hasOption($option, PINYIN_ASCII_TONE) ? $replacement[1] : '');
  302. }
  303. }
  304. return $pinyin;
  305. }
  306. }