MpHtmlParser.js 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072
  1. <<<<<<< HEAD
  2. /**
  3. * html 解析器
  4. * @tutorial https://github.com/jin-yufeng/Parser
  5. * @version 20200728
  6. * @author JinYufeng
  7. * @license MIT
  8. */
  // Module-level dependencies. `windowWidth` is read once, when this module is
  // loaded, from `uni.getSystemInfoSync()`.
  9. const cfg = require('./config.js'),
  10. blankChar = cfg.blankChar,
  11. CssHandler = require('./CssHandler.js'),
  12. windowWidth = uni.getSystemInfoSync().windowWidth;
  // Optional emoji helper: `parse()` calls `emoji.parseEmoji(data)` only when this
  // is truthy. It is not assigned anywhere in the visible part of the file.
  13. var emoji;
  /**
   * HTML parser: a character-driven state machine that builds a node tree.
   *
   * The constructor only initialises parser state and installs a few helper
   * closures on the instance; call `parse()` to run the machine.
   *
   * @param {string} data - HTML source text.
   * @param {Object} [options] - Parser options. This constructor reads
   *   `options.tagStyle` (handed to CssHandler) and `options.domain` (base used
   *   by `getUrl`), and WRITES `options.prot` onto the object passed in.
   */
  function MpHtmlParser(data, options = {}) {
    this.attrs = {};
    this.CssHandler = new CssHandler(options.tagStyle, windowWidth);
    this.data = data;
    this.domain = options.domain;
    this.DOM = [];
    this.i = this.start = this.audioNum = this.imgNum = this.videoNum = 0;
    // Protocol used for protocol-relative URLs: taken from `domain`, else 'http'.
    options.prot = (this.domain || '').includes('://') ? this.domain.split('://')[0] : 'http';
    this.options = options;
    this.state = this.Text;
    this.STACK = [];

    // ---- helper closures ----

    /**
     * Walk the open-element stack from the innermost node outwards and set
     * `c = 1` on each node, stopping at the first node listed in
     * `cfg.richOnlyTags`. A `table` hit that way gets `c = 1` only if it has
     * no own `c` property yet.
     * @returns {boolean} false when a rich-only ancestor stopped the walk, true otherwise.
     */
    this.bubble = () => {
      for (var i = this.STACK.length - 1; i >= 0; i--) {
        var item = this.STACK[i];
        if (!item) break;
        if (cfg.richOnlyTags[item.name]) {
          if (item.name == 'table' && !Object.hasOwnProperty.call(item, 'c')) item.c = 1;
          return false;
        }
        item.c = 1;
      }
      return true;
    };

    /**
     * Decode HTML entities in `val`.
     * Numeric entities (`&#65;`, `&#x41;`, `&#X41;`) are always decoded; named
     * entities are decoded when present in `cfg.entities`. An entity whose name
     * equals `amp` (callers pass 'amp') is decoded to '&' even when it is not
     * in `cfg.entities`.
     * @param {string} val - Text to decode.
     * @param {string} [amp] - Extra entity name to decode as '&'.
     * @returns {string} Decoded text.
     */
    this.decode = (val, amp) => {
      var i = -1,
        j, en;
      while (1) {
        if ((i = val.indexOf('&', i + 1)) == -1) break;
        if ((j = val.indexOf(';', i + 2)) == -1) break;
        if (val[i + 1] == '#') {
          // Prefixing '0' turns "x41"/"X41" into "0x41"/"0X41", which parseInt
          // (deliberately called without a radix) reads as hexadecimal.
          var marker = val[i + 2];
          en = parseInt((marker == 'x' || marker == 'X' ? '0' : '') + val.substring(i + 2, j));
          // FIX: fromCodePoint handles code points above U+FFFF (e.g. emoji),
          // which fromCharCode truncated to 16 bits. Out-of-range values are
          // left undecoded because fromCodePoint would throw on them.
          if (!isNaN(en) && en >= 0 && en <= 0x10FFFF)
            val = val.substr(0, i) + String.fromCodePoint(en) + val.substr(j + 1);
        } else {
          en = val.substring(i + 1, j);
          if (cfg.entities[en] || en == amp)
            val = val.substr(0, i) + (cfg.entities[en] || '&') + val.substr(j + 1);
        }
      }
      return val;
    };

    /**
     * Resolve a URL against the configured domain.
     * `//host/x` gets the protocol prefixed; `/x` and relative paths get
     * `domain` prefixed when a domain is set; `data:` URLs and URLs that
     * contain `://` are returned unchanged.
     * @param {string} url
     * @returns {string}
     */
    this.getUrl = url => {
      if (url[0] == '/') {
        if (url[1] == '/') url = this.options.prot + ':' + url;
        else if (this.domain) url = this.domain + url;
      } else if (this.domain && url.indexOf('data:') != 0 && !url.includes('://'))
        url = this.domain + '/' + url;
      return url;
    };

    // True when the cursor is on '>' or on the '/' of '/>'.
    this.isClose = () => this.data[this.i] == '>' || (this.data[this.i] == '/' && this.data[this.i + 1] == '>');
    // Text between the `start` marker and the cursor.
    this.section = () => this.data.substring(this.start, this.i);
    // Innermost open element, or undefined when the stack is empty.
    this.parent = () => this.STACK[this.STACK.length - 1];
    // Child list new nodes are appended to: the innermost open element's children, or the root list.
    this.siblings = () => this.STACK.length ? this.parent().children : this.DOM;
  }
  /**
   * Run the state machine over `this.data` and return the node list.
   *
   * Each character is fed to the current state handler. Handlers may move
   * `this.i` and replace `this.data`, so the character is re-read from
   * `this.data` after every step.
   *
   * @returns {Array} `this.DOM`, the list of top-level nodes.
   */
  MpHtmlParser.prototype.parse = function() {
    if (emoji) {
      this.data = emoji.parseEmoji(this.data);
    }
    let ch = this.data[this.i];
    while (ch) {
      this.state(ch);
      this.i++;
      ch = this.data[this.i];
    }
    // Flush trailing text only if the input ended while in the text state.
    if (this.state == this.Text) {
      this.setText();
    }
    // Close every element still open at end of input.
    while (this.STACK.length) {
      const open = this.STACK.pop();
      this.popNode(open);
    }
    return this.DOM;
  };
  /**
   * Commit the attribute just scanned (`this.attrName` / `this.attrVal`) into
   * `this.attrs`, then skip blanks and either finish the tag or go on to the
   * next attribute name.
   *
   * - Names listed in `cfg.boolAttrs` are stored as 'T'.
   * - `src` (or `data-src` while no `src` is set yet) is entity-decoded and resolved via `getUrl`.
   * - `href` and `style` are entity-decoded.
   * - Any other `data-*` attribute, and any attribute with an empty value, is dropped.
   */
  MpHtmlParser.prototype.setAttr = function() {
    const key = this.attrName.toLowerCase();
    const raw = this.attrVal;
    if (cfg.boolAttrs[key]) {
      this.attrs[key] = 'T';
    } else if (raw) {
      const fillsSrc = key == 'src' || (key == 'data-src' && !this.attrs.src);
      if (fillsSrc) {
        this.attrs.src = this.getUrl(this.decode(raw, 'amp'));
      } else if (key == 'href' || key == 'style') {
        this.attrs[key] = this.decode(raw, 'amp');
      } else if (!key.startsWith('data-')) {
        this.attrs[key] = raw;
      }
    }
    this.attrVal = '';
    while (blankChar[this.data[this.i]]) {
      this.i++;
    }
    if (this.isClose()) {
      this.setNode();
      return;
    }
    this.start = this.i;
    this.state = this.AttrName;
  };
  /**
   * Emit the pending text (between `this.start` and `this.i`) as a text node.
   *
   * If `cfg.onText` is defined it may return replacement text; when it also
   * invokes the callback it is given, the replacement is spliced back into
   * `this.data` and re-parsed instead of being emitted directly.
   *
   * Outside of `pre` mode, runs of blank characters collapse to one space, and
   * text made only of blanks that includes a line break is dropped.
   */
  MpHtmlParser.prototype.setText = function() {
    var back, text = this.section();
    if (!text) return;
    text = (cfg.onText && cfg.onText(text, () => back = true)) || text;
    if (back) {
      // Splice the replacement into the source and run it through the state machine.
      this.data = this.data.substr(0, this.start) + text + this.data.substr(this.i);
      let j = this.start + text.length;
      for (this.i = this.start; this.i < j; this.i++) this.state(this.data[this.i]);
      return;
    }
    if (!this.pre) {
      // Collapse whitespace. The text is scanned from the end and the output is
      // built reversed with push(), then flipped once; the previous
      // unshift()-per-character approach was quadratic on long text nodes.
      var reversed = [],
        hasContent = false,
        blankLineBreak = false;
      for (let k = text.length - 1; k >= 0; k--) {
        const ch = text[k];
        if (!blankChar[ch]) {
          reversed.push(ch);
          hasContent = true;
        } else {
          if (reversed[reversed.length - 1] != ' ') reversed.push(' ');
          if (ch == '\n') blankLineBreak = true;
        }
      }
      // Blank-only text that contains a line break produces no node.
      if (!hasContent && blankLineBreak) return;
      text = reversed.reverse().join('');
    }
    this.siblings().push({
      type: 'text',
      text: this.decode(text)
    });
  };
  /**
   * Build an element node from the tag just scanned (`this.tagName`,
   * `this.attrs`) and attach it to the tree.
   *
   * For tags not listed in `cfg.ignoreTags` this normalises attributes per tag
   * (font, embed, video/audio, td/th, img), merges matched CSS with the inline
   * style into one compacted `style` string, then appends the node. Tags that
   * are not self-closing are also pushed onto the open-element stack. Ignored
   * tags are skipped via `remove()`, except `source` (collected into the
   * parent video/audio) and `base` (supplies `domain` when none was set).
   *
   * NOTE: the `// #ifdef`, `// #ifndef` and `// #endif` comments inside the
   * switch appear to be uni-app conditional-compilation directives. They must
   * stay verbatim and in place.
   */
  MpHtmlParser.prototype.setNode = function() {
    var node = {
        name: this.tagName.toLowerCase(),
        attrs: this.attrs
      },
      close = cfg.selfClosingTags[node.name];
    // FIX: `options` defaults to `{}` in the constructor, so `nodes` may be
    // absent; reading `.length` on it unguarded threw a TypeError.
    if (this.options.nodes && this.options.nodes.length) node.type = 'node';
    this.attrs = {};
    if (!cfg.ignoreTags[node.name]) {
      // Attribute handling
      var attrs = node.attrs,
        style = this.CssHandler.match(node.name, attrs, node) + (attrs.style || ''),
        styleObj = {};
      if (attrs.id) {
        if (this.options.compress & 1) attrs.id = void 0;
        else if (this.options.useAnchor) this.bubble();
      }
      if ((this.options.compress & 2) && attrs.class) attrs.class = void 0;
      switch (node.name) {
        case 'a':
        case 'ad': // #ifdef APP-PLUS
        case 'iframe':
          // #endif
          this.bubble();
          break;
        case 'font':
          // Legacy <font> attributes become CSS properties.
          if (attrs.color) {
            styleObj['color'] = attrs.color;
            attrs.color = void 0;
          }
          if (attrs.face) {
            styleObj['font-family'] = attrs.face;
            attrs.face = void 0;
          }
          if (attrs.size) {
            var size = parseInt(attrs.size);
            if (size < 1) size = 1;
            else if (size > 7) size = 7;
            var map = ['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'];
            styleObj['font-size'] = map[size - 1];
            attrs.size = void 0;
          }
          break;
        case 'embed':
          // #ifndef APP-PLUS
          var src = node.attrs.src || '',
            type = node.attrs.type || '';
          if (type.includes('video') || src.includes('.mp4') || src.includes('.3gp') || src.includes('.m3u8'))
            node.name = 'video';
          else if (type.includes('audio') || src.includes('.m4a') || src.includes('.wav') || src.includes('.mp3') || src.includes('.aac'))
            node.name = 'audio';
          else break;
          if (node.attrs.autostart)
            node.attrs.autoplay = 'T';
          node.attrs.controls = 'T';
          // #endif
          // #ifdef APP-PLUS
          this.bubble();
          break;
          // #endif
        case 'video':
        case 'audio':
          // Make sure every media element has an id, and count it.
          if (!attrs.id) attrs.id = node.name + (++this[`${node.name}Num`]);
          else this[`${node.name}Num`]++;
          if (node.name == 'video') {
            if (this.videoNum > 3)
              node.lazyLoad = 1;
            if (attrs.width) {
              styleObj.width = parseFloat(attrs.width) + (attrs.width.includes('%') ? '%' : 'px');
              attrs.width = void 0;
            }
            if (attrs.height) {
              styleObj.height = parseFloat(attrs.height) + (attrs.height.includes('%') ? '%' : 'px');
              attrs.height = void 0;
            }
          }
          if (!attrs.controls && !attrs.autoplay) attrs.controls = 'T';
          attrs.source = [];
          if (attrs.src) {
            attrs.source.push(attrs.src);
            attrs.src = void 0;
          }
          this.bubble();
          break;
        case 'td':
        case 'th':
          // A spanning cell clears the `c` flag on the nearest enclosing table.
          if (attrs.colspan || attrs.rowspan)
            for (var k = this.STACK.length, item; item = this.STACK[--k];)
              if (item.name == 'table') {
                item.c = void 0;
                break;
              }
      }
      if (attrs.align) {
        styleObj['text-align'] = attrs.align;
        attrs.align = void 0;
      }
      // Compact the style: later declarations override earlier ones unless the
      // earlier one contains 'import' (i.e. !important) and the later does not.
      var styles = style.split(';');
      style = '';
      for (var i = 0, len = styles.length; i < len; i++) {
        var info = styles[i].split(':');
        if (info.length < 2) continue;
        const prop = info[0].trim().toLowerCase(),
          val = info.slice(1).join(':').trim();
        // Values starting with '-' or containing 'safe' are passed through
        // verbatim and do not take part in the override logic.
        if (val[0] == '-' || val.includes('safe'))
          style += `;${prop}:${val}`;
        else if (!styleObj[prop] || val.includes('import') || !styleObj[prop].includes('import'))
          styleObj[prop] = val;
      }
      if (node.name == 'img') {
        if (attrs.src && !attrs.ignore) {
          // Images that bubble to the root get a running index; images under a
          // rich-only ancestor are marked `ignore` instead.
          if (this.bubble())
            attrs.i = (this.imgNum++).toString();
          else attrs.ignore = 'T';
        }
        if (attrs.ignore) {
          style += ';-webkit-touch-callout:none';
          styleObj['max-width'] = '100%';
        }
        var width;
        if (styleObj.width) width = styleObj.width;
        else if (attrs.width) width = attrs.width.includes('%') ? attrs.width : attrs.width + 'px';
        if (width) {
          styleObj.width = width;
          attrs.width = '100%';
          // Wider than the window: drop any fixed height.
          if (parseInt(width) > windowWidth) {
            styleObj.height = '';
            if (attrs.height) attrs.height = void 0;
          }
        }
        if (styleObj.height) {
          attrs.height = styleObj.height;
          styleObj.height = '';
        } else if (attrs.height && !attrs.height.includes('%'))
          attrs.height += 'px';
      }
      for (var key in styleObj) {
        var value = styleObj[key];
        if (!value) continue;
        // FIX: the CSS property is `align-self`; the original only tested the
        // non-existent 'self-align', which is kept for backward compatibility.
        if (key.includes('flex') || key == 'order' || key == 'self-align' || key == 'align-self') node.c = 1;
        // Resolve url(...) references
        if (value.includes('url')) {
          var j = value.indexOf('(');
          if (j++ != -1) {
            while (value[j] == '"' || value[j] == "'" || blankChar[value[j]]) j++;
            value = value.substr(0, j) + this.getUrl(value.substr(j));
          }
        }
        // Convert rpx to px (750rpx == window width)
        else if (value.includes('rpx'))
          value = value.replace(/[0-9.]+\s*rpx/g, $ => parseFloat($) * windowWidth / 750 + 'px');
        else if (key == 'white-space' && value.includes('pre') && !close)
          this.pre = node.pre = true;
        style += `;${key}:${value}`;
      }
      style = style.substr(1);
      if (style) attrs.style = style;
      if (!close) {
        node.children = [];
        if (node.name == 'pre' && cfg.highlight) {
          // remove() runs the highlighter over the <pre> content here.
          this.remove(node);
          this.pre = node.pre = true;
        }
        this.siblings().push(node);
        this.STACK.push(node);
      } else if (!cfg.filter || cfg.filter(node, this) != false)
        this.siblings().push(node);
    } else {
      if (!close) this.remove(node);
      else if (node.name == 'source') {
        var parent = this.parent();
        if (parent && (parent.name == 'video' || parent.name == 'audio') && node.attrs.src)
          parent.attrs.source.push(node.attrs.src);
      } else if (node.name == 'base' && !this.domain) this.domain = node.attrs.href;
    }
    // Step over the '/' of a '/>' terminator, then go back to text mode.
    if (this.data[this.i] == '/') this.i++;
    this.start = this.i + 1;
    this.state = this.Text;
  };
  /**
   * Skip an element's raw content up to its matching closing tag.
   *
   * Called with the cursor on the end of the opening tag. Depending on the tag:
   * - `pre`: the content is replaced in `this.data` by `cfg.highlight(content, attrs)`
   *   and the cursor is put back at the opening tag's end so the result is parsed;
   * - `style`: the content is handed to `CssHandler.getStyle`;
   * - `title`: the content is stored on `this.DOM.title`;
   * - `svg`: the whole element is turned into an `img` node with a data URL;
   * - anything else: the content is discarded.
   *
   * The search stops at the first closing tag with the same name, so nested
   * elements of the same name are not balanced.
   *
   * @param {Object} node - Node for the opening tag (`name`, `attrs`).
   */
  MpHtmlParser.prototype.remove = function(node) {
    var name = node.name,
      j = this.i;
    // Turn the <svg> markup between the opening '<' and the cursor into an img node.
    var handleSvg = () => {
      var src = this.data.substring(j, this.i + 1);
      if (!node.attrs.xmlns) src = ' xmlns="http://www.w3.org/2000/svg"' + src;
      var i = j;
      // Walk back to the '<' of the opening tag (bounded at the start of the data).
      while (j > 0 && this.data[j] != '<') j--;
      src = this.data.substring(j, i).replace("viewbox", "viewBox") + src;
      var parent = this.parent();
      if (node.attrs.width == '100%' && parent && (parent.attrs.style || '').includes('inline'))
        parent.attrs.style = 'width:300px;max-width:100%;' + parent.attrs.style;
      this.siblings().push({
        name: 'img',
        attrs: {
          src: 'data:image/svg+xml;utf8,' + src.replace(/#/g, '%23'),
          style: (/vertical[^;]+/.exec(node.attrs.style) || []).shift(),
          ignore: 'T'
        }
      });
    };
    // Self-closing <svg .../>: step onto the '>' and convert immediately.
    if (node.name == 'svg' && this.data[j] == '/') {
      this.i++;
      return handleSvg();
    }
    while (1) {
      if ((this.i = this.data.indexOf('</', this.i + 1)) == -1) {
        // No closing tag at all: pre/svg resume parsing right after the opening
        // tag, everything else swallows the rest of the input.
        if (name == 'pre' || name == 'svg') this.i = j;
        else this.i = this.data.length;
        return;
      }
      this.start = (this.i += 2);
      // FIX: bound the scan by the data length. On truncated input such as
      // "<style>a</sty" the original loop never terminated, because
      // `blankChar[undefined]` is falsy and `isClose()` is false past the end.
      while (this.i < this.data.length && !blankChar[this.data[this.i]] && !this.isClose()) this.i++;
      if (this.section().toLowerCase() == name) {
        // Code-block highlighting; 5 is the length of '</pre'.
        if (name == 'pre') {
          this.data = this.data.substr(0, j + 1) + cfg.highlight(this.data.substring(j + 1, this.i - 5), node.attrs) + this.data
            .substr(this.i - 5);
          return this.i = j;
        } else if (name == 'style')
          // 7 is the length of '</style' (and of '</title' below).
          this.CssHandler.getStyle(this.data.substring(j + 1, this.i - 7));
        else if (name == 'title')
          this.DOM.title = this.data.substring(j + 1, this.i - 7);
        if ((this.i = this.data.indexOf('>', this.i)) == -1) this.i = this.data.length;
        if (name == 'svg') handleSvg();
        return;
      }
    }
  };
  352. // Post-process a node that has just been popped off the open-element stack
  /**
   * Finalise `node` after it has been removed from `this.STACK`.
   *
   * In order, this: restores the `pre` flag; drops the node for `head` or when
   * `cfg.filter` rejects it; renames the tag (`cfg.blockTags` -> 'div', anything
   * not in `cfg.trustTags` -> 'span'); decorates list items; converts table
   * attributes into inline styles; calls `CssHandler.pop` when it exists; and
   * finally replaces an attribute-less div holding a single div by that child.
   *
   * @param {Object} node - Element node (`name`, `attrs`, `children`, optional `c` / `pre`).
   * @returns {*} The value of `siblings.pop()` when the node is dropped, otherwise undefined.
   */
  353. MpHtmlParser.prototype.popNode = function(node) {
  354. // Whitespace handling: leaving a `pre` node turns pre mode off...
  355. if (node.pre) {
  356. node.pre = this.pre = void 0;
  // ...unless some ancestor still on the stack is itself marked `pre`.
  357. for (let i = this.STACK.length; i--;)
  358. if (this.STACK[i].pre)
  359. this.pre = true;
  360. }
  // `len` is captured now so the last slot can be overwritten at the end.
  361. var siblings = this.siblings(),
  362. len = siblings.length,
  363. childs = node.children;
  // Dropped nodes: remove the last entry of the current sibling list
  // (presumably this node itself; confirm against setNode's push order).
  364. if (node.name == 'head' || (cfg.filter && cfg.filter(node, this) == false))
  365. return siblings.pop();
  366. var attrs = node.attrs;
  367. // Replace some tag names
  368. if (cfg.blockTags[node.name]) node.name = 'div';
  369. else if (!cfg.trustTags[node.name]) node.name = 'span';
  370. // List handling (only for lists flagged with `c`)
  371. if (node.c && (node.name == 'ul' || node.name == 'ol')) {
  // `list-style:none`: li children become plain divs.
  372. if ((node.attrs.style || '').includes('list-style:none')) {
  373. for (let i = 0, child; child = childs[i++];)
  374. if (child.name == 'li')
  375. child.name = 'div';
  376. } else if (node.name == 'ul') {
  // Nesting depth = 1 + number of `ul` ancestors still on the stack; it is
  // stored as `floor` on every child when the list is nested.
  377. var floor = 1;
  378. for (let i = this.STACK.length; i--;)
  379. if (this.STACK[i].name == 'ul') floor++;
  380. if (floor != 1)
  381. for (let i = childs.length; i--;)
  382. childs[i].floor = floor;
  383. } else {
  // Ordered list: each li gets `type = 'ol'` and a label in `num`, built from
  // the list's `type` attribute (a / A / i / I, otherwise decimal) plus '.'.
  384. for (let i = 0, num = 1, child; child = childs[i++];)
  385. if (child.name == 'li') {
  386. child.type = 'ol';
  387. child.num = ((num, type) => {
  // Letters wrap around after 26 items.
  388. if (type == 'a') return String.fromCharCode(97 + (num - 1) % 26);
  389. if (type == 'A') return String.fromCharCode(65 + (num - 1) % 26);
  390. if (type == 'i' || type == 'I') {
  // Roman numerals cover 1..99 and wrap around after that.
  391. num = (num - 1) % 99 + 1;
  392. var one = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX'],
  393. ten = ['X', 'XX', 'XXX', 'XL', 'L', 'LX', 'LXX', 'LXXX', 'XC'],
  394. res = (ten[Math.floor(num / 10) - 1] || '') + (one[num % 10 - 1] || '');
  395. if (type == 'i') return res.toLowerCase();
  396. return res;
  397. }
  398. return num;
  399. })(num++, attrs.type) + '.';
  400. }
  401. }
  402. }
  403. // Table borders, padding and spacing
  404. if (node.name == 'table') {
  405. var padding = attrs.cellpadding,
  406. spacing = attrs.cellspacing,
  407. border = attrs.border;
  // A table flagged with `c` is styled with `display:table`, and padding and
  // spacing default to 2 when not given.
  408. if (node.c) {
  409. this.bubble();
  410. attrs.style = (attrs.style || '') + ';display:table';
  411. if (!padding) padding = 2;
  412. if (!spacing) spacing = 2;
  413. }
  414. if (border) attrs.style = `border:${border}px solid gray;${attrs.style || ''}`;
  415. if (spacing) attrs.style = `border-spacing:${spacing}px;${attrs.style || ''}`;
  // Recursively push border/padding/display styles down to the table's
  // descendants; the recursion does not descend into th/td cells.
  416. if (border || padding || node.c)
  417. (function f(ns) {
  418. for (var i = 0, n; n = ns[i]; i++) {
  419. if (n.type == 'text') continue;
  420. var style = n.attrs.style || '';
  // Within a `c` table, every element whose name starts with 't' gets an
  // explicit table display value.
  421. if (node.c && n.name[0] == 't') {
  422. n.c = 1;
  423. style += ';display:table-' + (n.name == 'th' || n.name == 'td' ? 'cell' : (n.name == 'tr' ? 'row' : 'row-group'));
  424. }
  425. if (n.name == 'th' || n.name == 'td') {
  426. if (border) style = `border:${border}px solid gray;${style}`;
  427. if (padding) style = `padding:${padding}px;${style}`;
  428. } else f(n.children || []);
  429. if (style) n.attrs.style = style;
  430. }
  431. })(childs)
  // `autoscroll`: the node object becomes a scrolling div that wraps a shallow
  // copy of the table.
  432. if (this.options.autoscroll) {
  433. var table = Object.assign({}, node);
  434. node.name = 'div';
  435. node.attrs = {
  436. style: 'overflow:scroll'
  437. }
  438. node.children = [table];
  439. }
  440. }
  441. this.CssHandler.pop && this.CssHandler.pop(node);
  442. // Automatic flattening: an attribute-less div with a single div child is replaced by that child
  // NOTE(review): `attrs` and `childs` were captured before the autoscroll
  // wrapping above, so for a wrapped table they still refer to the table's own
  // attributes and children; confirm that is intended.
  443. if (node.name == 'div' && !Object.keys(attrs).length && childs.length == 1 && childs[0].name == 'div')
  444. siblings[len - 1] = childs[0];
  445. }
  // State machine

  /**
   * Text state: wait for a '<' that starts a tag, an end tag or a
   * comment/declaration; any other character stays part of the pending text.
   *
   * @param {string} c - Current character.
   */
  MpHtmlParser.prototype.Text = function(c) {
    if (c != '<') return;
    const isLetter = ch => (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    const next = this.data[this.i + 1];

    // "<x": opening tag
    if (isLetter(next)) {
      this.setText();
      this.start = this.i + 1;
      this.state = this.TagName;
      return;
    }

    // "</": an end tag when a letter follows, otherwise handled as a comment
    if (next == '/') {
      this.setText();
      this.i++;
      if (isLetter(this.data[this.i + 1])) {
        this.start = this.i + 1;
        this.state = this.EndTag;
      } else {
        this.Comment();
      }
      return;
    }

    // "<!" or "<?": comment, CDATA, doctype or processing instruction
    if (next == '!' || next == '?') {
      this.setText();
      this.Comment();
    }
  };
  /**
   * Skip a comment-like construct starting at the cursor and return to the
   * text state. The terminator is '-->' for '<!--', ']]>' for '<![CDATA[' and
   * '>' for everything else; if it is missing, the rest of the input is skipped.
   */
  MpHtmlParser.prototype.Comment = function() {
    const from = this.i + 2;
    let terminator = '>';
    if (this.data.substring(from, from + 2) == '--') {
      terminator = '-->';
    } else if (this.data.substring(from, from + 7) == '[CDATA[') {
      terminator = ']]>';
    }
    const found = this.data.indexOf(terminator, from);
    // Leave the cursor on the last character of the terminator.
    this.i = found == -1 ? this.data.length : found + terminator.length - 1;
    this.start = this.i + 1;
    this.state = this.Text;
  };
  /**
   * Tag-name state: the name ends at the first blank character or at the tag
   * terminator ('>' or '/>').
   *
   * @param {string} c - Current character.
   */
  MpHtmlParser.prototype.TagName = function(c) {
    if (!blankChar[c]) {
      // Still inside the name unless the tag closes right here.
      if (this.isClose()) {
        this.tagName = this.section();
        this.setNode();
      }
      return;
    }
    this.tagName = this.section();
    while (blankChar[this.data[this.i]]) {
      this.i++;
    }
    if (this.isClose()) {
      this.setNode();
      return;
    }
    this.start = this.i;
    this.state = this.AttrName;
  };
  /**
   * Attribute-name state: the name ends at '=', a blank character or the tag
   * terminator. If an '=' follows (possibly after blanks) the value is scanned
   * next; otherwise the attribute is committed without a value.
   *
   * @param {string} c - Current character.
   */
  MpHtmlParser.prototype.AttrName = function(c) {
    const onBlank = blankChar[c];
    if (!(c == '=' || onBlank || this.isClose())) return;
    this.attrName = this.section();
    if (onBlank) {
      // Skip the run of blanks after the name.
      do {
        this.i++;
      } while (blankChar[this.data[this.i]]);
    }
    if (this.data[this.i] != '=') {
      this.setAttr();
      return;
    }
    // Skip the '=' and any blanks after it.
    do {
      this.i++;
    } while (blankChar[this.data[this.i]]);
    this.start = this.i;
    // Step back one character so the caller's increment lands on the first
    // character of the value.
    this.i--;
    this.state = this.AttrValue;
  };
  /**
   * Attribute-value state: read a quoted or unquoted value into
   * `this.attrVal`, then commit the attribute via `setAttr()`.
   *
   * @param {string} c - First character of the value (a quote for quoted values).
   * @returns {number|undefined} The data length when a quoted value is never
   *   closed (the rest of the input is consumed and nothing is committed);
   *   otherwise undefined.
   */
  MpHtmlParser.prototype.AttrValue = function(c) {
    if (c == '"' || c == "'") {
      // Quoted: the value runs up to the matching quote.
      this.start++;
      if ((this.i = this.data.indexOf(c, this.i + 1)) == -1) return this.i = this.data.length;
      this.attrVal = this.section();
      this.i++;
    } else {
      // Unquoted: the value runs up to a blank or the tag terminator.
      // FIX: also stop at the end of the data. On input such as "<a href=foo"
      // the original loop never terminated, because `blankChar[undefined]` is
      // falsy and `isClose()` is false past the end.
      for (; this.i < this.data.length && !blankChar[this.data[this.i]] && !this.isClose(); this.i++);
      this.attrVal = this.section();
    }
    this.setAttr();
  };
  /**
   * End-tag state: once the tag name is complete, close the matching open
   * element (and everything opened inside it), then skip to the '>'.
   *
   * An end tag with no matching open element is ignored, except `</p>` and
   * `</br>`, which insert an empty element of that name.
   *
   * @param {string} c - Current character.
   */
  MpHtmlParser.prototype.EndTag = function(c) {
    if (!(blankChar[c] || c == '>' || c == '/')) return;
    const name = this.section().toLowerCase();

    // Find the innermost open element with this name.
    let depth = this.STACK.length - 1;
    while (depth >= 0 && this.STACK[depth].name != name) {
      depth--;
    }

    if (depth != -1) {
      // Pop (and finalise) everything down to and including the match.
      let node = this.STACK.pop();
      while (node.name != name) {
        this.popNode(node);
        node = this.STACK.pop();
      }
      this.popNode(node);
    } else if (name == 'p' || name == 'br') {
      this.siblings().push({
        name,
        attrs: {}
      });
    }

    const gt = this.data.indexOf('>', this.i);
    this.i = gt;
    this.start = gt + 1;
    if (gt == -1) {
      // No '>' left: consume the rest of the input and stay in this state.
      this.i = this.data.length;
    } else {
      this.state = this.Text;
    }
  };
  535. =======
  536. /**
  537. * html 解析器
  538. * @tutorial https://github.com/jin-yufeng/Parser
  539. * @version 20200728
  540. * @author JinYufeng
  541. * @license MIT
  542. */
  // Module-level dependencies. `windowWidth` is read once, when this module is
  // loaded, from `uni.getSystemInfoSync()`.
  543. const cfg = require('./config.js'),
  544. blankChar = cfg.blankChar,
  545. CssHandler = require('./CssHandler.js'),
  546. windowWidth = uni.getSystemInfoSync().windowWidth;
  // Optional emoji helper: `parse()` calls `emoji.parseEmoji(data)` only when this
  // is truthy. It is not assigned anywhere in the visible part of the file.
  547. var emoji;
  /**
   * HTML parser: a character-driven state machine that builds a node tree.
   *
   * The constructor only initialises parser state and installs a few helper
   * closures on the instance; call `parse()` to run the machine.
   *
   * @param {string} data - HTML source text.
   * @param {Object} [options] - Parser options. This constructor reads
   *   `options.tagStyle` (handed to CssHandler) and `options.domain` (base used
   *   by `getUrl`), and WRITES `options.prot` onto the object passed in.
   */
  function MpHtmlParser(data, options = {}) {
    this.attrs = {};
    this.CssHandler = new CssHandler(options.tagStyle, windowWidth);
    this.data = data;
    this.domain = options.domain;
    this.DOM = [];
    this.i = this.start = this.audioNum = this.imgNum = this.videoNum = 0;
    // Protocol used for protocol-relative URLs: taken from `domain`, else 'http'.
    options.prot = (this.domain || '').includes('://') ? this.domain.split('://')[0] : 'http';
    this.options = options;
    this.state = this.Text;
    this.STACK = [];

    // ---- helper closures ----

    /**
     * Walk the open-element stack from the innermost node outwards and set
     * `c = 1` on each node, stopping at the first node listed in
     * `cfg.richOnlyTags`. A `table` hit that way gets `c = 1` only if it has
     * no own `c` property yet.
     * @returns {boolean} false when a rich-only ancestor stopped the walk, true otherwise.
     */
    this.bubble = () => {
      for (var i = this.STACK.length - 1; i >= 0; i--) {
        var item = this.STACK[i];
        if (!item) break;
        if (cfg.richOnlyTags[item.name]) {
          if (item.name == 'table' && !Object.hasOwnProperty.call(item, 'c')) item.c = 1;
          return false;
        }
        item.c = 1;
      }
      return true;
    };

    /**
     * Decode HTML entities in `val`.
     * Numeric entities (`&#65;`, `&#x41;`, `&#X41;`) are always decoded; named
     * entities are decoded when present in `cfg.entities`. An entity whose name
     * equals `amp` (callers pass 'amp') is decoded to '&' even when it is not
     * in `cfg.entities`.
     * @param {string} val - Text to decode.
     * @param {string} [amp] - Extra entity name to decode as '&'.
     * @returns {string} Decoded text.
     */
    this.decode = (val, amp) => {
      var i = -1,
        j, en;
      while (1) {
        if ((i = val.indexOf('&', i + 1)) == -1) break;
        if ((j = val.indexOf(';', i + 2)) == -1) break;
        if (val[i + 1] == '#') {
          // Prefixing '0' turns "x41"/"X41" into "0x41"/"0X41", which parseInt
          // (deliberately called without a radix) reads as hexadecimal.
          var marker = val[i + 2];
          en = parseInt((marker == 'x' || marker == 'X' ? '0' : '') + val.substring(i + 2, j));
          // FIX: fromCodePoint handles code points above U+FFFF (e.g. emoji),
          // which fromCharCode truncated to 16 bits. Out-of-range values are
          // left undecoded because fromCodePoint would throw on them.
          if (!isNaN(en) && en >= 0 && en <= 0x10FFFF)
            val = val.substr(0, i) + String.fromCodePoint(en) + val.substr(j + 1);
        } else {
          en = val.substring(i + 1, j);
          if (cfg.entities[en] || en == amp)
            val = val.substr(0, i) + (cfg.entities[en] || '&') + val.substr(j + 1);
        }
      }
      return val;
    };

    /**
     * Resolve a URL against the configured domain.
     * `//host/x` gets the protocol prefixed; `/x` and relative paths get
     * `domain` prefixed when a domain is set; `data:` URLs and URLs that
     * contain `://` are returned unchanged.
     * @param {string} url
     * @returns {string}
     */
    this.getUrl = url => {
      if (url[0] == '/') {
        if (url[1] == '/') url = this.options.prot + ':' + url;
        else if (this.domain) url = this.domain + url;
      } else if (this.domain && url.indexOf('data:') != 0 && !url.includes('://'))
        url = this.domain + '/' + url;
      return url;
    };

    // True when the cursor is on '>' or on the '/' of '/>'.
    this.isClose = () => this.data[this.i] == '>' || (this.data[this.i] == '/' && this.data[this.i + 1] == '>');
    // Text between the `start` marker and the cursor.
    this.section = () => this.data.substring(this.start, this.i);
    // Innermost open element, or undefined when the stack is empty.
    this.parent = () => this.STACK[this.STACK.length - 1];
    // Child list new nodes are appended to: the innermost open element's children, or the root list.
    this.siblings = () => this.STACK.length ? this.parent().children : this.DOM;
  }
  /**
   * Run the state machine over `this.data` and return the node list.
   *
   * Each character is fed to the current state handler. Handlers may move
   * `this.i` and replace `this.data`, so the character is re-read from
   * `this.data` after every step.
   *
   * @returns {Array} `this.DOM`, the list of top-level nodes.
   */
  MpHtmlParser.prototype.parse = function() {
    if (emoji) {
      this.data = emoji.parseEmoji(this.data);
    }
    let ch = this.data[this.i];
    while (ch) {
      this.state(ch);
      this.i++;
      ch = this.data[this.i];
    }
    // Flush trailing text only if the input ended while in the text state.
    if (this.state == this.Text) {
      this.setText();
    }
    // Close every element still open at end of input.
    while (this.STACK.length) {
      const open = this.STACK.pop();
      this.popNode(open);
    }
    return this.DOM;
  };
  /**
   * Commit the attribute just scanned (`this.attrName` / `this.attrVal`) into
   * `this.attrs`, then skip blanks and either finish the tag or go on to the
   * next attribute name.
   *
   * - Names listed in `cfg.boolAttrs` are stored as 'T'.
   * - `src` (or `data-src` while no `src` is set yet) is entity-decoded and resolved via `getUrl`.
   * - `href` and `style` are entity-decoded.
   * - Any other `data-*` attribute, and any attribute with an empty value, is dropped.
   */
  MpHtmlParser.prototype.setAttr = function() {
    const key = this.attrName.toLowerCase();
    const raw = this.attrVal;
    if (cfg.boolAttrs[key]) {
      this.attrs[key] = 'T';
    } else if (raw) {
      const fillsSrc = key == 'src' || (key == 'data-src' && !this.attrs.src);
      if (fillsSrc) {
        this.attrs.src = this.getUrl(this.decode(raw, 'amp'));
      } else if (key == 'href' || key == 'style') {
        this.attrs[key] = this.decode(raw, 'amp');
      } else if (!key.startsWith('data-')) {
        this.attrs[key] = raw;
      }
    }
    this.attrVal = '';
    while (blankChar[this.data[this.i]]) {
      this.i++;
    }
    if (this.isClose()) {
      this.setNode();
      return;
    }
    this.start = this.i;
    this.state = this.AttrName;
  };
  /**
   * Emit the pending text (between `this.start` and `this.i`) as a text node.
   *
   * If `cfg.onText` is defined it may return replacement text; when it also
   * invokes the callback it is given, the replacement is spliced back into
   * `this.data` and re-parsed instead of being emitted directly.
   *
   * Outside of `pre` mode, runs of blank characters collapse to one space, and
   * text made only of blanks that includes a line break is dropped.
   */
  MpHtmlParser.prototype.setText = function() {
    var back, text = this.section();
    if (!text) return;
    text = (cfg.onText && cfg.onText(text, () => back = true)) || text;
    if (back) {
      // Splice the replacement into the source and run it through the state machine.
      this.data = this.data.substr(0, this.start) + text + this.data.substr(this.i);
      let j = this.start + text.length;
      for (this.i = this.start; this.i < j; this.i++) this.state(this.data[this.i]);
      return;
    }
    if (!this.pre) {
      // Collapse whitespace. The text is scanned from the end and the output is
      // built reversed with push(), then flipped once; the previous
      // unshift()-per-character approach was quadratic on long text nodes.
      var reversed = [],
        hasContent = false,
        blankLineBreak = false;
      for (let k = text.length - 1; k >= 0; k--) {
        const ch = text[k];
        if (!blankChar[ch]) {
          reversed.push(ch);
          hasContent = true;
        } else {
          if (reversed[reversed.length - 1] != ' ') reversed.push(' ');
          if (ch == '\n') blankLineBreak = true;
        }
      }
      // Blank-only text that contains a line break produces no node.
      if (!hasContent && blankLineBreak) return;
      text = reversed.reverse().join('');
    }
    this.siblings().push({
      type: 'text',
      text: this.decode(text)
    });
  };
  /**
   * Build an element node from the tag just scanned (`this.tagName`,
   * `this.attrs`) and attach it to the tree.
   *
   * For tags not listed in `cfg.ignoreTags` this normalises attributes per tag
   * (font, embed, video/audio, td/th, img), merges matched CSS with the inline
   * style into one compacted `style` string, then appends the node. Tags that
   * are not self-closing are also pushed onto the open-element stack. Ignored
   * tags are skipped via `remove()`, except `source` (collected into the
   * parent video/audio) and `base` (supplies `domain` when none was set).
   *
   * NOTE: the `// #ifdef`, `// #ifndef` and `// #endif` comments inside the
   * switch appear to be uni-app conditional-compilation directives. They must
   * stay verbatim and in place.
   */
  MpHtmlParser.prototype.setNode = function() {
    var node = {
        name: this.tagName.toLowerCase(),
        attrs: this.attrs
      },
      close = cfg.selfClosingTags[node.name];
    // FIX: `options` defaults to `{}` in the constructor, so `nodes` may be
    // absent; reading `.length` on it unguarded threw a TypeError.
    if (this.options.nodes && this.options.nodes.length) node.type = 'node';
    this.attrs = {};
    if (!cfg.ignoreTags[node.name]) {
      // Attribute handling
      var attrs = node.attrs,
        style = this.CssHandler.match(node.name, attrs, node) + (attrs.style || ''),
        styleObj = {};
      if (attrs.id) {
        if (this.options.compress & 1) attrs.id = void 0;
        else if (this.options.useAnchor) this.bubble();
      }
      if ((this.options.compress & 2) && attrs.class) attrs.class = void 0;
      switch (node.name) {
        case 'a':
        case 'ad': // #ifdef APP-PLUS
        case 'iframe':
          // #endif
          this.bubble();
          break;
        case 'font':
          // Legacy <font> attributes become CSS properties.
          if (attrs.color) {
            styleObj['color'] = attrs.color;
            attrs.color = void 0;
          }
          if (attrs.face) {
            styleObj['font-family'] = attrs.face;
            attrs.face = void 0;
          }
          if (attrs.size) {
            var size = parseInt(attrs.size);
            if (size < 1) size = 1;
            else if (size > 7) size = 7;
            var map = ['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'];
            styleObj['font-size'] = map[size - 1];
            attrs.size = void 0;
          }
          break;
        case 'embed':
          // #ifndef APP-PLUS
          var src = node.attrs.src || '',
            type = node.attrs.type || '';
          if (type.includes('video') || src.includes('.mp4') || src.includes('.3gp') || src.includes('.m3u8'))
            node.name = 'video';
          else if (type.includes('audio') || src.includes('.m4a') || src.includes('.wav') || src.includes('.mp3') || src.includes('.aac'))
            node.name = 'audio';
          else break;
          if (node.attrs.autostart)
            node.attrs.autoplay = 'T';
          node.attrs.controls = 'T';
          // #endif
          // #ifdef APP-PLUS
          this.bubble();
          break;
          // #endif
        case 'video':
        case 'audio':
          // Make sure every media element has an id, and count it.
          if (!attrs.id) attrs.id = node.name + (++this[`${node.name}Num`]);
          else this[`${node.name}Num`]++;
          if (node.name == 'video') {
            if (this.videoNum > 3)
              node.lazyLoad = 1;
            if (attrs.width) {
              styleObj.width = parseFloat(attrs.width) + (attrs.width.includes('%') ? '%' : 'px');
              attrs.width = void 0;
            }
            if (attrs.height) {
              styleObj.height = parseFloat(attrs.height) + (attrs.height.includes('%') ? '%' : 'px');
              attrs.height = void 0;
            }
          }
          if (!attrs.controls && !attrs.autoplay) attrs.controls = 'T';
          attrs.source = [];
          if (attrs.src) {
            attrs.source.push(attrs.src);
            attrs.src = void 0;
          }
          this.bubble();
          break;
        case 'td':
        case 'th':
          // A spanning cell clears the `c` flag on the nearest enclosing table.
          if (attrs.colspan || attrs.rowspan)
            for (var k = this.STACK.length, item; item = this.STACK[--k];)
              if (item.name == 'table') {
                item.c = void 0;
                break;
              }
      }
      if (attrs.align) {
        styleObj['text-align'] = attrs.align;
        attrs.align = void 0;
      }
      // Compact the style: later declarations override earlier ones unless the
      // earlier one contains 'import' (i.e. !important) and the later does not.
      var styles = style.split(';');
      style = '';
      for (var i = 0, len = styles.length; i < len; i++) {
        var info = styles[i].split(':');
        if (info.length < 2) continue;
        const prop = info[0].trim().toLowerCase(),
          val = info.slice(1).join(':').trim();
        // Values starting with '-' or containing 'safe' are passed through
        // verbatim and do not take part in the override logic.
        if (val[0] == '-' || val.includes('safe'))
          style += `;${prop}:${val}`;
        else if (!styleObj[prop] || val.includes('import') || !styleObj[prop].includes('import'))
          styleObj[prop] = val;
      }
      if (node.name == 'img') {
        if (attrs.src && !attrs.ignore) {
          // Images that bubble to the root get a running index; images under a
          // rich-only ancestor are marked `ignore` instead.
          if (this.bubble())
            attrs.i = (this.imgNum++).toString();
          else attrs.ignore = 'T';
        }
        if (attrs.ignore) {
          style += ';-webkit-touch-callout:none';
          styleObj['max-width'] = '100%';
        }
        var width;
        if (styleObj.width) width = styleObj.width;
        else if (attrs.width) width = attrs.width.includes('%') ? attrs.width : attrs.width + 'px';
        if (width) {
          styleObj.width = width;
          attrs.width = '100%';
          // Wider than the window: drop any fixed height.
          if (parseInt(width) > windowWidth) {
            styleObj.height = '';
            if (attrs.height) attrs.height = void 0;
          }
        }
        if (styleObj.height) {
          attrs.height = styleObj.height;
          styleObj.height = '';
        } else if (attrs.height && !attrs.height.includes('%'))
          attrs.height += 'px';
      }
      for (var key in styleObj) {
        var value = styleObj[key];
        if (!value) continue;
        // FIX: the CSS property is `align-self`; the original only tested the
        // non-existent 'self-align', which is kept for backward compatibility.
        if (key.includes('flex') || key == 'order' || key == 'self-align' || key == 'align-self') node.c = 1;
        // Resolve url(...) references
        if (value.includes('url')) {
          var j = value.indexOf('(');
          if (j++ != -1) {
            while (value[j] == '"' || value[j] == "'" || blankChar[value[j]]) j++;
            value = value.substr(0, j) + this.getUrl(value.substr(j));
          }
        }
        // Convert rpx to px (750rpx == window width)
        else if (value.includes('rpx'))
          value = value.replace(/[0-9.]+\s*rpx/g, $ => parseFloat($) * windowWidth / 750 + 'px');
        else if (key == 'white-space' && value.includes('pre') && !close)
          this.pre = node.pre = true;
        style += `;${key}:${value}`;
      }
      style = style.substr(1);
      if (style) attrs.style = style;
      if (!close) {
        node.children = [];
        if (node.name == 'pre' && cfg.highlight) {
          // remove() runs the highlighter over the <pre> content here.
          this.remove(node);
          this.pre = node.pre = true;
        }
        this.siblings().push(node);
        this.STACK.push(node);
      } else if (!cfg.filter || cfg.filter(node, this) != false)
        this.siblings().push(node);
    } else {
      if (!close) this.remove(node);
      else if (node.name == 'source') {
        var parent = this.parent();
        if (parent && (parent.name == 'video' || parent.name == 'audio') && node.attrs.src)
          parent.attrs.source.push(node.attrs.src);
      } else if (node.name == 'base' && !this.domain) this.domain = node.attrs.href;
    }
    // Step over the '/' of a '/>' terminator, then go back to text mode.
    if (this.data[this.i] == '/') this.i++;
    this.start = this.i + 1;
    this.state = this.Text;
  };
  /**
   * Removes a tag: consumes the raw source of an element that is not kept as a
   * regular node, up to and including its matching end tag.
   *
   * On entry `this.i` is on the character that ended the opening tag ('/' for a
   * self-closing tag, '>' otherwise). Depending on the tag name the skipped
   * source is additionally:
   *   - `pre`   : replaced in `this.data` by `cfg.highlight(code, node.attrs)` and
   *               `this.i` is rewound to the end of the opening tag;
   *   - `style` : handed to `this.CssHandler.getStyle`;
   *   - `title` : stored in `this.DOM.title`;
   *   - `svg`   : converted into an `img` sibling whose `src` is a data URI.
   * If no matching end tag exists, `pre`/`svg` rewind to the opening tag and any
   * other tag consumes the rest of the input.
   *
   * @param {Object} node - node being removed; `node.name` and `node.attrs` are read.
   * @returns {number|undefined} the rewound index for a highlighted `pre`, otherwise undefined.
   */
  MpHtmlParser.prototype.remove = function (node) {
    const name = node.name;
    // Index of the character that ended the opening tag; moved back to the
    // tag's '<' by handleSvg.
    let j = this.i;

    // Handle svg: turn the svg source between the opening '<' and this.i into an <img>.
    const handleSvg = () => {
      let src = this.data.substring(j, this.i + 1);
      // A data-URI svg needs the namespace declaration to render.
      if (!node.attrs.xmlns) src = ' xmlns="http://www.w3.org/2000/svg"' + src;
      const openEnd = j;
      while (this.data[j] != '<') j--;
      src = this.data.substring(j, openEnd).replace("viewbox", "viewBox") + src;
      const parent = this.parent();
      if (node.attrs.width == '100%' && parent && (parent.attrs.style || '').includes('inline')) {
        parent.attrs.style = 'width:300px;max-width:100%;' + parent.attrs.style;
      }
      this.siblings().push({
        name: 'img',
        attrs: {
          // '#' would start a URL fragment inside the data URI, so it is escaped.
          src: 'data:image/svg+xml;utf8,' + src.replace(/#/g, '%23'),
          style: (/vertical[^;]+/.exec(node.attrs.style) || []).shift(),
          ignore: 'T'
        }
      });
    };

    // Self-closing <svg .../>: step onto the '>' and convert immediately.
    if (node.name == 'svg' && this.data[j] == '/') {
      this.i++;
      handleSvg();
      return;
    }

    while (1) {
      this.i = this.data.indexOf('</', this.i + 1);
      if (this.i == -1) {
        // No end tag at all.
        if (name == 'pre' || name == 'svg') this.i = j;
        else this.i = this.data.length;
        return;
      }
      this.start = (this.i += 2);
      // Scan the end-tag name. The explicit length bound keeps the scan from
      // running past the buffer when the end tag is cut off at end of input.
      while (this.i < this.data.length && !blankChar[this.data[this.i]] && !this.isClose()) this.i++;
      if (this.section().toLowerCase() != name) continue;

      if (name == 'pre') {
        // Code-block highlighting: this.i - 5 is the start of "</pre".
        this.data = this.data.substr(0, j + 1) + cfg.highlight(this.data.substring(j + 1, this.i - 5), node.attrs) + this.data.substr(this.i - 5);
        return (this.i = j);
      }
      // this.i - 7 is the start of "</style" / "</title".
      if (name == 'style') {
        this.CssHandler.getStyle(this.data.substring(j + 1, this.i - 7));
      } else if (name == 'title') {
        this.DOM.title = this.data.substring(j + 1, this.i - 7);
      }
      this.i = this.data.indexOf('>', this.i);
      if (this.i == -1) this.i = this.data.length;
      if (name == 'svg') handleSvg();
      return;
    }
  };
  /**
   * Formats the 1-based position of an ordered-list item according to the
   * list's `type` attribute.
   *
   * @param {number} num - 1-based item position.
   * @param {string} [type] - 'a'/'A' for letters (wrapping after 26), 'i'/'I' for
   *   roman numerals (wrapping after 99); anything else keeps the number.
   * @returns {string|number} the marker without the trailing dot.
   */
  function formatOrderedListIndex(num, type) {
    if (type == 'a') return String.fromCharCode(97 + (num - 1) % 26);
    if (type == 'A') return String.fromCharCode(65 + (num - 1) % 26);
    if (type == 'i' || type == 'I') {
      num = (num - 1) % 99 + 1;
      const one = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX'];
      const ten = ['X', 'XX', 'XXX', 'XL', 'L', 'LX', 'LXX', 'LXXX', 'XC'];
      const res = (ten[Math.floor(num / 10) - 1] || '') + (one[num % 10 - 1] || '');
      return type == 'i' ? res.toLowerCase() : res;
    }
    return num;
  }

  /**
   * Post-processes a node when it is popped off the open-element stack.
   *
   * Steps, in order: restore the `pre` flag, drop `head` / filtered nodes,
   * rename untrusted tags, annotate list items, expand table attributes into
   * styles (optionally wrapping the table in a scroll container), notify the
   * CSS handler, and finally collapse a bare `div` that only wraps another `div`.
   *
   * @param {Object} node - the node that was just closed; it is mutated in place.
   * @returns {Object|undefined} the node removed from its siblings when it is a
   *   `head` or rejected by `cfg.filter`, otherwise undefined.
   */
  MpHtmlParser.prototype.popNode = function (node) {
    // Whitespace handling: leaving a pre node, so recompute whether an ancestor is still pre.
    if (node.pre) {
      node.pre = this.pre = void 0;
      for (let i = this.STACK.length; i--;) {
        if (this.STACK[i].pre) this.pre = true;
      }
    }
    const siblings = this.siblings();
    const len = siblings.length;
    const childs = node.children;
    if (node.name == 'head' || (cfg.filter && cfg.filter(node, this) == false)) {
      return siblings.pop();
    }
    const attrs = node.attrs;

    // Replace some tag names.
    if (cfg.blockTags[node.name]) node.name = 'div';
    else if (!cfg.trustTags[node.name]) node.name = 'span';

    // Lists.
    if (node.c && (node.name == 'ul' || node.name == 'ol')) {
      if ((node.attrs.style || '').includes('list-style:none')) {
        for (let i = 0, child; child = childs[i++];) {
          if (child.name == 'li') child.name = 'div';
        }
      } else if (node.name == 'ul') {
        // Nesting depth = 1 + number of <ul> ancestors still on the stack.
        let floor = 1;
        for (let i = this.STACK.length; i--;) {
          if (this.STACK[i].name == 'ul') floor++;
        }
        if (floor != 1) {
          for (let i = childs.length; i--;) childs[i].floor = floor;
        }
      } else {
        for (let i = 0, num = 1, child; child = childs[i++];) {
          if (child.name == 'li') {
            child.type = 'ol';
            child.num = formatOrderedListIndex(num++, attrs.type) + '.';
          }
        }
      }
    }

    // Table borders / padding / spacing.
    let wrapped = false;
    if (node.name == 'table') {
      let padding = attrs.cellpadding;
      let spacing = attrs.cellspacing;
      const border = attrs.border;
      if (node.c) {
        this.bubble();
        attrs.style = (attrs.style || '') + ';display:table';
        if (!padding) padding = 2;
        if (!spacing) spacing = 2;
      }
      if (border) attrs.style = `border:${border}px solid gray;${attrs.style || ''}`;
      if (spacing) attrs.style = `border-spacing:${spacing}px;${attrs.style || ''}`;
      if (border || padding || node.c) {
        // Walk the table's descendants, stopping the descent at th/td cells.
        const styleCells = (ns) => {
          for (let i = 0, n; n = ns[i]; i++) {
            if (n.type == 'text') continue;
            let style = n.attrs.style || '';
            if (node.c && n.name[0] == 't') {
              n.c = 1;
              style += ';display:table-' + (n.name == 'th' || n.name == 'td' ? 'cell' : (n.name == 'tr' ? 'row' : 'row-group'));
            }
            if (n.name == 'th' || n.name == 'td') {
              if (border) style = `border:${border}px solid gray;${style}`;
              if (padding) style = `padding:${padding}px;${style}`;
            } else styleCells(n.children || []);
            if (style) n.attrs.style = style;
          }
        };
        styleCells(childs);
      }
      if (this.options.autoscroll) {
        // Turn `node` itself into the scroll container so existing references
        // (siblings array) keep pointing at the outermost element.
        const table = Object.assign({}, node);
        node.name = 'div';
        node.attrs = {
          style: 'overflow:scroll'
        };
        node.children = [table];
        wrapped = true;
      }
    }
    this.CssHandler.pop && this.CssHandler.pop(node);

    // Auto-compress: a bare div that only wraps another div is replaced by it.
    // The scroll wrapper created above must never be collapsed: `attrs` and
    // `childs` still describe the table, not the wrapper.
    if (!wrapped && node.name == 'div' && !Object.keys(attrs).length && childs.length == 1 && childs[0].name == 'div') {
      siblings[len - 1] = childs[0];
    }
  };
  // State machine
  /**
   * Text state: reacts to a '<' by flushing the pending text and switching to
   * the state that matches what follows it.
   *
   *   - `<` + letter        -> TagName state
   *   - `</` + letter       -> EndTag state
   *   - `</` + anything else -> the markup is skipped up to the first '>'
   *   - `<!` / `<?`         -> delegated to Comment()
   * Any other character (including a '<' not followed by one of the above) is
   * left as part of the text.
   *
   * @param {string} c - the character at `this.data[this.i]`.
   */
  MpHtmlParser.prototype.Text = function (c) {
    if (c != '<') return;
    const next = this.data[this.i + 1];
    const isLetter = ch => (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    if (isLetter(next)) {
      this.setText();
      this.start = this.i + 1;
      this.state = this.TagName;
    } else if (next == '/') {
      this.setText();
      this.i++; // now on the '/'
      if (isLetter(this.data[this.i + 1])) {
        this.start = this.i + 1;
        this.state = this.EndTag;
      } else {
        // Malformed end tag such as "</>" or "</3>": skip to the first '>' after
        // "</". Comment() is not used here because its offsets are relative to
        // the '<', and this.i is already on the '/', which made "</>" overlook
        // its own '>' and swallow the following content.
        const end = this.data.indexOf('>', this.i + 1);
        this.i = end == -1 ? this.data.length : end;
        this.start = this.i + 1;
        this.state = this.Text;
      }
    } else if (next == '!' || next == '?') {
      this.setText();
      this.Comment();
    }
  };
  /**
   * Skips non-element markup and returns to the Text state.
   *
   * The terminator is chosen from the characters two positions after `this.i`:
   * "--" selects "-->", "[CDATA[" selects "]]>", anything else selects ">".
   * `this.i` ends on the last character of the terminator, or at the end of the
   * input when the terminator is missing; `this.start` is set just past it.
   */
  MpHtmlParser.prototype.Comment = function () {
    const from = this.i + 2;
    let terminator = '>';
    if (this.data.substring(from, this.i + 4) == '--') {
      terminator = '-->';
    } else if (this.data.substring(from, this.i + 9) == '[CDATA[') {
      terminator = ']]>';
    }
    const found = this.data.indexOf(terminator, from);
    this.i = found == -1 ? this.data.length : found + terminator.length - 1;
    this.start = this.i + 1;
    this.state = this.Text;
  };
  /**
   * TagName state: waits for the end of the tag name.
   *
   * The name ends at a blank character or where `isClose()` reports the end of
   * the tag. After a blank, following blanks are skipped; if the tag does not
   * close there, parsing continues in the AttrName state, otherwise the node is
   * finalized through `setNode()`.
   *
   * @param {string} c - the character at `this.data[this.i]`.
   */
  MpHtmlParser.prototype.TagName = function (c) {
    const endedByBlank = Boolean(blankChar[c]);
    if (!endedByBlank && !this.isClose()) {
      return; // still inside the name
    }
    this.tagName = this.section();
    if (endedByBlank) {
      while (blankChar[this.data[this.i]]) {
        this.i++;
      }
      if (!this.isClose()) {
        this.start = this.i;
        this.state = this.AttrName;
        return;
      }
    }
    this.setNode();
  };
  /**
   * AttrName state: waits for the end of an attribute name.
   *
   * The name ends at '=', a blank character, or where `isClose()` reports the
   * end of the tag. Blanks around an '=' are skipped. With an '=' the parser
   * moves to the AttrValue state, otherwise the attribute is stored without a
   * value via `setAttr()`.
   *
   * @param {string} c - the character at `this.data[this.i]`.
   */
  MpHtmlParser.prototype.AttrName = function (c) {
    const nameEnded = c == '=' || blankChar[c] || this.isClose();
    if (!nameEnded) {
      return;
    }
    this.attrName = this.section();
    if (blankChar[c]) {
      // Advance to the first non-blank character after the name.
      do {
        this.i++;
      } while (blankChar[this.data[this.i]]);
    }
    if (this.data[this.i] != '=') {
      this.setAttr();
      return;
    }
    // Advance to the first non-blank character after '='.
    do {
      this.i++;
    } while (blankChar[this.data[this.i]]);
    this.start = this.i;
    // Step back one position; NOTE(review): presumably the driving loop advances
    // this.i before calling the next state — confirm against the parse loop.
    this.i--;
    this.state = this.AttrValue;
  };
  /**
   * AttrValue state: reads one attribute value starting at `this.i` and stores
   * it through `setAttr()`.
   *
   * A value starting with a quote runs to the matching quote; if that quote is
   * missing the rest of the input is consumed and no attribute is stored. An
   * unquoted value runs to the next blank character, to where `isClose()`
   * reports the end of the tag, or to the end of the input.
   *
   * @param {string} c - the first character of the value.
   * @returns {number|undefined} the input length when a quoted value is
   *   unterminated, otherwise undefined.
   */
  MpHtmlParser.prototype.AttrValue = function (c) {
    if (c == '"' || c == "'") {
      this.start++; // exclude the opening quote from the value
      const closing = this.data.indexOf(c, this.i + 1);
      if (closing == -1) return (this.i = this.data.length);
      this.i = closing;
      this.attrVal = this.section();
      this.i++;
    } else {
      // The explicit length bound stops the scan when the input ends in the
      // middle of an unquoted value (e.g. "<a href=foo").
      while (this.i < this.data.length && !blankChar[this.data[this.i]] && !this.isClose()) {
        this.i++;
      }
      this.attrVal = this.section();
    }
    this.setAttr();
  };
  /**
   * EndTag state: waits for the end of a closing tag's name and then closes the
   * matching open element.
   *
   * If an element with that (lower-cased) name is on `this.STACK`, every element
   * above it is popped and post-processed with `popNode`, followed by the
   * element itself. An unmatched `</p>` or `</br>` is emitted as an empty node.
   * Afterwards the parser skips to the tag's '>' and returns to the Text state;
   * when there is no '>' the rest of the input is consumed and the state is
   * left unchanged.
   *
   * @param {string} c - the character at `this.data[this.i]`.
   */
  MpHtmlParser.prototype.EndTag = function (c) {
    const nameEnded = blankChar[c] || c == '>' || c == '/';
    if (!nameEnded) {
      return;
    }
    const closing = this.section().toLowerCase();
    const isOpen = this.STACK.some(open => open.name == closing);
    if (isOpen) {
      let node = this.STACK.pop();
      while (node.name != closing) {
        this.popNode(node);
        node = this.STACK.pop();
      }
      this.popNode(node);
    } else if (closing == 'p' || closing == 'br') {
      this.siblings().push({
        name: closing,
        attrs: {}
      });
    }
    const gt = this.data.indexOf('>', this.i);
    this.start = gt + 1;
    if (gt == -1) {
      this.i = this.data.length;
    } else {
      this.i = gt;
      this.state = this.Text;
    }
  };
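  // FIXME: a stray merge-conflict end marker for commit 5b465a14bac2c1448cc18a0b08b88844fc895cd5 was left on this line; verify that the matching conflict section earlier in the file has been resolved.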
  // Expose the parser constructor as this module's export.
  module.exports = MpHtmlParser;