| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356 |
- <?php
- /*
- * This file is part of the overtrue/pinyin.
- *
- * (c) overtrue <i@overtrue.me>
- *
- * This source file is subject to the MIT license that is bundled
- * with this source code in the file LICENSE.
- */
- namespace Overtrue\Pinyin;
- use InvalidArgumentException;
- /*
- * Chinese to pinyin translator.
- *
- * @author overtrue <i@overtrue.me>
- * @copyright 2015 overtrue <i@overtrue.me>
- *
- * @link https://github.com/overtrue/pinyin
- * @link http://overtrue.me
- */
/**
 * Class Pinyin.
 *
 * Converts Chinese text to pinyin. The dictionaries used for the conversion
 * are supplied by a DictLoaderInterface implementation, which is created
 * lazily from a class name the first time it is needed.
 *
 * @author overtrue <i@overtrue.me>
 */
class Pinyin
{
    /**
     * Dict loader.
     *
     * Holds either a loader class name (until the first getLoader() call)
     * or the loader instance itself.
     *
     * @var \Overtrue\Pinyin\DictLoaderInterface|string
     */
    protected $loader;

    /**
     * Punctuations map (source character => replacement).
     *
     * NOTE(review): several entries map a character to itself as written
     * here; confirm against the canonical source whether the keys were meant
     * to be full-width punctuation.
     *
     * @var array
     */
    protected $punctuations = array(
        ',' => ',',
        '。' => '.',
        '!' => '!',
        '?' => '?',
        ':' => ':',
        '“' => '"',
        '”' => '"',
        '‘' => "'",
        '’' => "'",
        '_' => '_',
    );

    /**
     * Constructor.
     *
     * @param string|null $loaderName Loader class name; a falsy value selects
     *                                the bundled file based loader.
     */
    public function __construct($loaderName = null)
    {
        $this->loader = $loaderName ?: 'Overtrue\\Pinyin\\FileDictLoader';
    }

    /**
     * Convert string to pinyin.
     *
     * @param string $string
     * @param int    $option Bit mask of PINYIN_* flags.
     *
     * @return array List of pinyin words.
     */
    public function convert($string, $option = PINYIN_DEFAULT)
    {
        $pinyin = $this->romanize($string, $option);

        return $this->splitWords($pinyin, $option);
    }

    /**
     * Convert string (person name) to pinyin.
     *
     * The PINYIN_NAME flag is always added, so the surname dictionary is
     * consulted before the regular dictionaries.
     *
     * @param string $stringName
     * @param int    $option     Bit mask of PINYIN_* flags.
     *
     * @return array List of pinyin words.
     */
    public function name($stringName, $option = PINYIN_NAME)
    {
        $option = $option | PINYIN_NAME;
        $pinyin = $this->romanize($stringName, $option);

        return $this->splitWords($pinyin, $option);
    }

    /**
     * Return a pinyin permalink from string.
     *
     * Numbers and English letters are always kept. An integer passed as
     * $delimiter is treated as the option mask and the default delimiter is
     * used instead.
     *
     * @param string     $string
     * @param string|int $delimiter One of '_', '-', '.', ''.
     * @param int        $option    Bit mask of PINYIN_* flags.
     *
     * @return string
     *
     * @throws InvalidArgumentException When the delimiter is not allowed.
     */
    public function permalink($string, $delimiter = '-', $option = PINYIN_DEFAULT)
    {
        if (\is_int($delimiter)) {
            list($option, $delimiter) = array($delimiter, '-');
        }

        if (!in_array($delimiter, array('_', '-', '.', ''), true)) {
            throw new InvalidArgumentException("Delimiter must be one of: '_', '-', '', '.'.");
        }

        $words = $this->convert($string, $option | \PINYIN_KEEP_NUMBER | \PINYIN_KEEP_ENGLISH);

        return implode($delimiter, $words);
    }

    /**
     * Return first letters.
     *
     * Numeric words are kept whole; every other word contributes its first
     * character. An integer passed as $delimiter is treated as the option
     * mask and the default delimiter is used instead.
     *
     * @param string     $string
     * @param string|int $delimiter
     * @param int        $option    Bit mask of PINYIN_* flags.
     *
     * @return string
     */
    public function abbr($string, $delimiter = '', $option = PINYIN_DEFAULT)
    {
        if (\is_int($delimiter)) {
            list($option, $delimiter) = array($delimiter, '');
        }

        $initials = array_map(function ($pinyin) {
            // Explicit encoding, consistent with the other mb_* calls in this
            // class, so the result does not depend on mb_internal_encoding().
            return \is_numeric($pinyin) ? $pinyin : mb_substr($pinyin, 0, 1, 'UTF-8');
        }, $this->convert($string, $option));

        return implode($delimiter, $initials);
    }

    /**
     * Chinese phrase to pinyin.
     *
     * An integer passed as $delimiter is treated as the option mask and the
     * default delimiter is used instead.
     *
     * @param string     $string
     * @param string|int $delimiter
     * @param int        $option    Bit mask of PINYIN_* flags.
     *
     * @return string
     */
    public function phrase($string, $delimiter = ' ', $option = PINYIN_DEFAULT)
    {
        if (\is_int($delimiter)) {
            list($option, $delimiter) = array($delimiter, ' ');
        }

        return implode($delimiter, $this->convert($string, $option));
    }

    /**
     * Chinese to pinyin sentence.
     *
     * Punctuation, English letters and numbers are always kept. An integer
     * passed as $delimiter is treated as the option mask and the default
     * delimiter is used instead.
     *
     * @param string     $string
     * @param string|int $delimiter
     * @param int        $option    Bit mask of PINYIN_* flags.
     *
     * @return string
     */
    public function sentence($string, $delimiter = ' ', $option = \PINYIN_NO_TONE)
    {
        if (\is_int($delimiter)) {
            list($option, $delimiter) = array($delimiter, ' ');
        }

        $words = $this->convert(
            $string,
            $option | \PINYIN_KEEP_PUNCTUATION | \PINYIN_KEEP_ENGLISH | \PINYIN_KEEP_NUMBER
        );

        return implode($delimiter, $words);
    }

    /**
     * Loader setter.
     *
     * @param \Overtrue\Pinyin\DictLoaderInterface $loader
     *
     * @return $this
     */
    public function setLoader(DictLoaderInterface $loader)
    {
        $this->loader = $loader;

        return $this;
    }

    /**
     * Return dict loader.
     *
     * When only a class name is stored, the loader is instantiated here with
     * the package data directory and kept for subsequent calls.
     *
     * @return \Overtrue\Pinyin\DictLoaderInterface
     */
    public function getLoader()
    {
        if (!($this->loader instanceof DictLoaderInterface)) {
            $dataDir = dirname(__DIR__).'/data/';
            $loaderName = $this->loader;
            $this->loader = new $loaderName($dataDir);
        }

        return $this->loader;
    }

    /**
     * Convert Chinese to pinyin.
     *
     * The string is pre-processed, optionally run through the surname
     * dictionary, and then translated with every dictionary the loader
     * provides.
     *
     * @param string $string
     * @param int    $option Bit mask of PINYIN_* flags.
     *
     * @return string
     */
    protected function romanize($string, $option = \PINYIN_DEFAULT)
    {
        $string = $this->prepare($string, $option);
        $dictLoader = $this->getLoader();

        if ($this->hasOption($option, \PINYIN_NAME)) {
            $string = $this->convertSurname($string, $dictLoader);
        }

        $dictLoader->map(function ($dictionary) use (&$string) {
            $string = strtr($string, $dictionary);
        });

        return $string;
    }

    /**
     * Convert Chinese Surname to pinyin.
     *
     * Only a surname found at the very beginning of the string is replaced;
     * within one dictionary the first matching entry wins.
     *
     * @param string                               $string
     * @param \Overtrue\Pinyin\DictLoaderInterface $dictLoader
     *
     * @return string
     */
    protected function convertSurname($string, $dictLoader)
    {
        $dictLoader->mapSurname(function ($dictionary) use (&$string) {
            foreach ($dictionary as $surname => $pinyin) {
                if (0 !== strpos($string, $surname)) {
                    continue;
                }

                // Keep everything after the surname: ask for exactly the
                // number of characters that remain.
                $surnameLength = mb_strlen($surname, 'UTF-8');
                $remaining = mb_strlen($string, 'UTF-8') - $surnameLength;
                $string = $pinyin.mb_substr($string, $surnameLength, $remaining, 'UTF-8');

                break;
            }
        });

        return $string;
    }

    /**
     * Split pinyin string to words.
     *
     * Unless PINYIN_TONE is requested, each word is passed through
     * formatTone() to remove or transliterate tone marks.
     *
     * @param string $pinyin
     * @param int    $option Bit mask of PINYIN_* flags.
     *
     * @return array
     */
    protected function splitWords($pinyin, $option)
    {
        // PREG_SPLIT_NO_EMPTY drops only empty pieces. A callback-less
        // array_filter() would also discard the word "0", losing digits
        // that were explicitly kept.
        $words = preg_split('/\s+/i', $pinyin, -1, PREG_SPLIT_NO_EMPTY);

        if (!$this->hasOption($option, PINYIN_TONE)) {
            foreach ($words as $index => $word) {
                $words[$index] = $this->formatTone($word, $option);
            }
        }

        return array_values($words);
    }

    /**
     * Tell whether every bit of $check is set in $option.
     *
     * @param int $option
     * @param int $check
     *
     * @return bool
     */
    public function hasOption($option, $check)
    {
        return ($option & $check) === $check;
    }

    /**
     * Pre-process.
     *
     * Prefixes every run of ASCII letters, digits, "_" and "-" with a tab,
     * then strips every character that is not whitelisted. Han characters,
     * separators, marks and tabs are always kept; digits, English letters
     * and punctuation are kept only when the matching flag is set.
     *
     * @param string $string
     * @param int    $option Bit mask of PINYIN_* flags.
     *
     * @return string
     */
    protected function prepare($string, $option = \PINYIN_DEFAULT)
    {
        $string = preg_replace_callback('~[a-z0-9_-]+~i', function ($matches) {
            return "\t".$matches[0];
        }, $string);

        // Fragments of the character class of characters to keep.
        $regex = array('\p{Han}', '\p{Z}', '\p{M}', "\t");

        if ($this->hasOption($option, \PINYIN_KEEP_NUMBER)) {
            \array_push($regex, '0-9');
        }

        if ($this->hasOption($option, \PINYIN_KEEP_ENGLISH)) {
            \array_push($regex, 'a-zA-Z');
        }

        if ($this->hasOption($option, \PINYIN_KEEP_PUNCTUATION)) {
            // NOTE(review): the last entry maps a single space to itself as
            // written; confirm whether collapsing double spaces was intended.
            $punctuations = array_merge($this->punctuations, array("\t" => ' ', ' ' => ' '));
            $string = trim(str_replace(array_keys($punctuations), $punctuations, $string));

            // Whitelist both the source punctuation and its replacements.
            \array_push($regex, preg_quote(implode(array_merge(array_keys($this->punctuations), $this->punctuations)), '~'));
        }

        // preg_replace() yields null when the /u pattern fails (for example
        // on invalid UTF-8); normalise that to an empty string so callers
        // always receive a string.
        return (string) preg_replace(\sprintf('~[^%s]~u', implode($regex)), '', $string);
    }

    /**
     * Format.
     *
     * Replaces tone-marked vowels with plain letters. With PINYIN_ASCII_TONE
     * the tone number is appended to the word; with PINYIN_UMLAUT_V the
     * "yu" transliteration of a tone-marked u-umlaut becomes "v".
     *
     * @param string $pinyin
     * @param int    $option Bit mask of PINYIN_* flags.
     *
     * @return string
     */
    protected function formatTone($pinyin, $option = \PINYIN_NO_TONE)
    {
        // Two-character sequences come first so they are consumed before
        // their second character could match a single-vowel entry.
        $replacements = array(
            'üē' => array('ue', 1), 'üé' => array('ue', 2), 'üě' => array('ue', 3), 'üè' => array('ue', 4),
            'ā' => array('a', 1), 'ē' => array('e', 1), 'ī' => array('i', 1), 'ō' => array('o', 1), 'ū' => array('u', 1), 'ǖ' => array('yu', 1),
            'á' => array('a', 2), 'é' => array('e', 2), 'í' => array('i', 2), 'ó' => array('o', 2), 'ú' => array('u', 2), 'ǘ' => array('yu', 2),
            'ǎ' => array('a', 3), 'ě' => array('e', 3), 'ǐ' => array('i', 3), 'ǒ' => array('o', 3), 'ǔ' => array('u', 3), 'ǚ' => array('yu', 3),
            'à' => array('a', 4), 'è' => array('e', 4), 'ì' => array('i', 4), 'ò' => array('o', 4), 'ù' => array('u', 4), 'ǜ' => array('yu', 4),
        );

        foreach ($replacements as $unicode => $replacement) {
            if (false === strpos($pinyin, $unicode)) {
                continue;
            }

            list($umlaut, $tone) = $replacement;

            // https://zh.wikipedia.org/wiki/%C3%9C
            if ('yu' === $umlaut && $this->hasOption($option, \PINYIN_UMLAUT_V)) {
                $umlaut = 'v';
            }

            $suffix = $this->hasOption($option, PINYIN_ASCII_TONE) ? $tone : '';
            $pinyin = str_replace($unicode, $umlaut, $pinyin).$suffix;
        }

        return $pinyin;
    }
}
|