VicWord.php 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. <?php
  2. /**
  3. * Word segmentation.
  4. */
  5. namespace Lizhichao\Word;
  6. class VicWord
  7. {
  8. private $dict = [];
  9. private $end = '\\';
  10. private $auto = false;
  11. private $count = 0;
  12. /**
  13. * @var string 词性
  14. */
  15. private $x = '\\x';
  16. public function __construct($dictPath = '')
  17. {
  18. if($dictPath === ''){
  19. $dictPath = dirname(__DIR__) . '/Data/dict.json';
  20. }
  21. $type = pathinfo($dictPath)['extension'];
  22. if ( ! \file_exists($dictPath)) {
  23. throw new \Exception("Invalid dict file: {$dictPath}");
  24. }
  25. // check dict type
  26. switch ($type) {
  27. case 'igb':
  28. if ( ! \function_exists('\\igbinary_unserialize')) {
  29. throw new \Exception('Requires igbinary PHP extension.');
  30. }
  31. $this->dict = \igbinary_unserialize(\file_get_contents($dictPath));
  32. break;
  33. case 'json':
  34. $this->dict = \json_decode(\file_get_contents($dictPath), true);
  35. break;
  36. default:
  37. throw new \Exception('Invalid dict type.');
  38. }
  39. }
  40. /**
  41. * @param string $str
  42. */
  43. public function getWord($str)
  44. {
  45. $this->auto = false;
  46. $str = $this->filter($str);
  47. return $this->find($str);
  48. }
  49. /**
  50. * @param string $str
  51. */
  52. public function getShortWord($str)
  53. {
  54. $this->auto = false;
  55. $str = $this->filter($str);
  56. return $this->shortfind($str);
  57. }
  58. /**
  59. * @param string $str
  60. */
  61. public function getAutoWord($str)
  62. {
  63. $this->auto = true;
  64. $str = $this->filter($str);
  65. return $this->autoFind($str, ['long' => 1]);
  66. }
  67. private function filter($str)
  68. {
  69. return \strtolower($str);
  70. }
  71. private function getD(&$str, $i)
  72. {
  73. $o = \ord($str[$i]);
  74. if ($o < 128) {
  75. $d = $str[$i];
  76. } else {
  77. $o = $o >> 4;
  78. if (12 === $o) {
  79. $d = $str[$i] . $str[++$i];
  80. } elseif (14 === $o) {
  81. $d = $str[$i] . $str[++$i] . $str[++$i];
  82. } elseif (15 === $o) {
  83. $d = $str[$i] . $str[++$i] . $str[++$i] . $str[++$i];
  84. } else {
  85. throw new \Exception('Error: unknow charset.');
  86. }
  87. }
  88. return [$d, $i];
  89. }
  90. private function autoFind($str, $autoInfo = [])
  91. {
  92. if ($autoInfo['long']) {
  93. return $this->find($str, $autoInfo);
  94. }
  95. return $this->shortfind($str, $autoInfo);
  96. }
  97. private function reGet(&$r, $autoInfo)
  98. {
  99. $autoInfo['c'] = isset($autoInfo['c']) ? $autoInfo['c']++ : 1;
  100. $l = \count($r) - 1;
  101. $p = [];
  102. $str = '';
  103. for ($i = $l; $i >= 0; --$i) {
  104. $str = $r[$i][0] . $str;
  105. $f = $r[$i][3];
  106. \array_unshift($p, $r[$i]);
  107. unset($r[$i]);
  108. if (1 === (int) $f) {
  109. break;
  110. }
  111. }
  112. ++$this->count;
  113. $l = \strlen($str);
  114. if (isset($r[$i - 1])) {
  115. $w = $r[$i - 1][1];
  116. } else {
  117. $w = 0;
  118. }
  119. if (isset($autoInfo['pl']) && $l === (int) $autoInfo['pl']) {
  120. $r = $p;
  121. return false;
  122. }
  123. if ($str && $autoInfo['c'] < 3) {
  124. $autoInfo['pl'] = $l;
  125. $autoInfo['long'] = ! $autoInfo['long'];
  126. $sr = $this->autoFind($str, $autoInfo);
  127. $sr = \array_map(function ($v) use ($w) {
  128. $v[1] += $w;
  129. return $v;
  130. }, $sr);
  131. $r = \array_merge($r, $this->getGoodWord($p, $sr));
  132. }
  133. }
  134. private function getGoodWord($old, $new)
  135. {
  136. if ( ! $new) {
  137. return $old;
  138. }
  139. if ($this->getUnknowCount($old) > $this->getUnknowCount($new)) {
  140. return $new;
  141. }
  142. return $old;
  143. }
  144. private function getUnknowCount($ar)
  145. {
  146. $i = 0;
  147. foreach ($ar as $v) {
  148. if (0 === (int) $v[3]) {
  149. $i += \strlen($v[0]);
  150. }
  151. }
  152. return $i;
  153. }
  154. private function find($str, $autoInfo = [])
  155. {
  156. $len = \strlen($str);
  157. $s = '';
  158. $n = '';
  159. $j = 0;
  160. $r = [];
  161. $wr = [];
  162. for ($i = 0; $i < $len; ++$i) {
  163. list($d, $i) = $this->getD($str, $i);
  164. if (isset($wr[$d])) {
  165. $s .= $d;
  166. $wr = $wr[$d];
  167. } else {
  168. if (isset($wr[$this->end])) {
  169. $this->addNotFind($r, $n, $s, $j, $autoInfo);
  170. $this->addResult($r, $s, $j, $wr[$this->x]);
  171. $n = '';
  172. }
  173. $wr = $this->dict;
  174. if (isset($wr[$d])) {
  175. $s = $d;
  176. $wr = $wr[$d];
  177. } else {
  178. $s = '';
  179. }
  180. }
  181. $n .= $d;
  182. $j = $i;
  183. }
  184. if (isset($wr[$this->end])) {
  185. $this->addNotFind($r, $n, $s, $i, $autoInfo);
  186. $this->addResult($r, $s, $i, $wr[$this->x]);
  187. } else {
  188. $this->addNotFind($r, $n, '', $i, $autoInfo);
  189. }
  190. return $r;
  191. }
  192. private function addNotFind(&$r, $n, $s, $i, $autoInfo = [])
  193. {
  194. if ($n !== $s) {
  195. $n = \str_replace($s, '', $n);
  196. $this->addResult($r, $n, $i - \strlen($s), null, 0);
  197. if ($this->auto) {
  198. $this->reGet($r, $autoInfo);
  199. }
  200. }
  201. }
  202. private function shortFind($str, $autoInfo = [])
  203. {
  204. $len = \strlen($str);
  205. $s = '';
  206. $n = '';
  207. $r = [];
  208. $wr = [];
  209. for ($i = 0; $i < $len; ++$i) {
  210. $j = $i;
  211. list($d, $i) = $this->getD($str, $i);
  212. if (isset($wr[$d])) {
  213. $s .= $d;
  214. $wr = $wr[$d];
  215. } else {
  216. if (isset($wr[$this->end])) {
  217. $this->addNotFind($r, $n, $s, $j, $autoInfo);
  218. $this->addResult($r, $s, $j, $wr[$this->x]);
  219. $n = '';
  220. }
  221. $wr = $this->dict;
  222. if (isset($wr[$d])) {
  223. $s = $d;
  224. $wr = $wr[$d];
  225. } else {
  226. $s = '';
  227. }
  228. }
  229. $n .= $d;
  230. if (isset($wr[$this->end])) {
  231. $this->addNotFind($r, $n, $s, $i, $autoInfo);
  232. $this->addResult($r, $s, $i, $wr[$this->x]);
  233. $wr = $this->dict;
  234. $s = '';
  235. $n = '';
  236. }
  237. }
  238. if (isset($wr[$this->end])) {
  239. $this->addNotFind($r, $n, $s, $i, $autoInfo);
  240. $this->addResult($r, $s, $i, $wr[$this->x]);
  241. } else {
  242. $this->addNotFind($r, $n, '', $i, $autoInfo);
  243. }
  244. return $r;
  245. }
  246. private function addResult(&$r, $k, $i, $x, $find = 1)
  247. {
  248. $r[] = [$k, $i, $x, $find];
  249. }
  250. }