123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577 |
- /*
- 将 html 解析为适用于小程序 rich-text 的 DOM 结构
- github:https://github.com/jin-yufeng/Parser
- docs:https://jin-yufeng.github.io/Parser
- author:JinYufeng
- update:2020/04/13
- */
- var cfg = require('./config.js'),
- blankChar = cfg.blankChar,
- CssHandler = require('./CssHandler.js'),
- {
- screenWidth,
- system
- } = wx.getSystemInfoSync();
- // #ifdef MP-BAIDU || MP-ALIPAY || MP-TOUTIAO
- var entities = {
- lt: '<',
- gt: '>',
- amp: '&',
- quot: '"',
- apos: "'",
- nbsp: '\xA0',
- ensp: '\u2002',
- emsp: '\u2003',
- ndash: '–',
- mdash: '—',
- middot: '·',
- lsquo: '‘',
- rsquo: '’',
- ldquo: '“',
- rdquo: '”',
- bull: '•',
- hellip: '…',
- permil: '‰',
- copy: '©',
- reg: '®',
- trade: '™',
- times: '×',
- divide: '÷',
- cent: '¢',
- pound: '£',
- yen: '¥',
- euro: '€',
- sect: '§'
- };
- // #endif
- var emoji; // emoji 补丁包 https://jin-yufeng.github.io/Parser/#/instructions?id=emoji
- class MpHtmlParser {
- constructor(data, options = {}) {
- this.attrs = {};
- this.compress = options.compress;
- this.CssHandler = new CssHandler(options.tagStyle, screenWidth);
- this.data = data;
- this.domain = options.domain;
- this.DOM = [];
- this.i = this.start = this.audioNum = this.imgNum = this.videoNum = 0;
- this.protocol = this.domain && this.domain.includes('://') ? this.domain.split('://')[0] : '';
- this.state = this.Text;
- this.STACK = [];
- this.useAnchor = options.useAnchor;
- this.xml = options.xml;
- }
- parse() {
- if (emoji) this.data = emoji.parseEmoji(this.data);
- for (var c; c = this.data[this.i]; this.i++)
- this.state(c);
- if (this.state == this.Text) this.setText();
- while (this.STACK.length) this.popNode(this.STACK.pop());
- // #ifdef MP-BAIDU || MP-TOUTIAO
- // 将顶层标签的一些样式提取出来给 rich-text
- (function f(ns) {
- for (var i = ns.length, n; n = ns[--i];) {
- if (n.type == 'text') continue;
- if (!n.c) {
- var style = n.attrs.style;
- if (style) {
- var j, k, res;
- if ((j = style.indexOf('display')) != -1)
- res = style.substring(j, (k = style.indexOf(';', j)) == -1 ? style.length : k);
- if ((j = style.indexOf('float')) != -1)
- res += ';' + style.substring(j, (k = style.indexOf(';', j)) == -1 ? style.length : k);
- n.attrs.contain = res;
- }
- } else f(n.children);
- }
- })(this.DOM);
- // #endif
- if (this.DOM.length) {
- this.DOM[0].PoweredBy = 'Parser';
- if (this.title) this.DOM[0].title = this.title;
- }
- return this.DOM;
- }
- // 设置属性
- setAttr() {
- var name = this.getName(this.attrName);
- if (cfg.trustAttrs[name]) {
- if (!this.attrVal) {
- if (cfg.boolAttrs[name]) this.attrs[name] = 'T';
- } else if (name == 'src') this.attrs[name] = this.getUrl(this.attrVal.replace(/&/g, '&'));
- else this.attrs[name] = this.attrVal;
- }
- this.attrVal = '';
- while (blankChar[this.data[this.i]]) this.i++;
- if (this.isClose()) this.setNode();
- else {
- this.start = this.i;
- this.state = this.AttrName;
- }
- }
- // 设置文本节点
- setText() {
- var back, text = this.section();
- if (!text) return;
- text = (cfg.onText && cfg.onText(text, () => back = true)) || text;
- if (back) {
- this.data = this.data.substr(0, this.start) + text + this.data.substr(this.i);
- let j = this.start + text.length;
- for (this.i = this.start; this.i < j; this.i++) this.state(this.data[this.i]);
- return;
- }
- if (!this.pre) {
- // 合并空白符
- var tmp = [];
- for (let i = text.length, c; c = text[--i];)
- if (!blankChar[c] || (!blankChar[tmp[0]] && (c = ' '))) tmp.unshift(c);
- text = tmp.join('');
- if (text == ' ') return;
- }
- // 处理实体
- var siblings = this.siblings(),
- i = -1,
- j, en;
- while (1) {
- if ((i = text.indexOf('&', i + 1)) == -1) break;
- if ((j = text.indexOf(';', i + 2)) == -1) break;
- if (text[i + 1] == '#') {
- en = parseInt((text[i + 2] == 'x' ? '0' : '') + text.substring(i + 2, j));
- if (!isNaN(en)) text = text.substr(0, i) + String.fromCharCode(en) + text.substring(j + 1);
- } else {
- en = text.substring(i + 1, j);
- // #ifdef MP-WEIXIN || MP-QQ || APP-PLUS
- if (en == 'nbsp') text = text.substr(0, i) + '\xA0' + text.substr(j + 1); // 解决 失效
- else if (en != 'lt' && en != 'gt' && en != 'amp' && en != 'ensp' && en != 'emsp' && en != 'quot' && en != 'apos') {
- i && siblings.push({
- type: 'text',
- text: text.substr(0, i)
- })
- siblings.push({
- type: 'text',
- text: `&${en};`,
- en: 1
- })
- text = text.substr(j + 1);
- i = -1;
- }
- // #endif
- // #ifdef MP-BAIDU || MP-ALIPAY || MP-TOUTIAO
- if (entities[en]) text = text.substr(0, i) + entities[en] + text.substr(j + 1);
- // #endif
- }
- }
- text && siblings.push({
- type: 'text',
- text
- })
- }
- // 设置元素节点
- setNode() {
- var node = {
- name: this.tagName.toLowerCase(),
- attrs: this.attrs
- },
- close = cfg.selfClosingTags[node.name] || (this.xml && this.data[this.i] == '/');
- this.attrs = {};
- if (!cfg.ignoreTags[node.name]) {
- this.matchAttr(node);
- if (!close) {
- node.children = [];
- if (node.name == 'pre' && cfg.highlight) {
- this.remove(node);
- this.pre = node.pre = true;
- }
- this.siblings().push(node);
- this.STACK.push(node);
- } else if (!cfg.filter || cfg.filter(node, this) != false)
- this.siblings().push(node);
- } else {
- if (!close) this.remove(node);
- else if (node.name == 'source') {
- var parent = this.STACK[this.STACK.length - 1],
- attrs = node.attrs;
- if (parent && attrs.src)
- if (parent.name == 'video' || parent.name == 'audio')
- parent.attrs.source.push(attrs.src);
- else {
- var i, media = attrs.media;
- if (parent.name == 'picture' && !parent.attrs.src && !(attrs.src.indexOf('.webp') && system.includes('iOS')) &&
- (!media || (media.includes('px') &&
- (((i = media.indexOf('min-width')) != -1 && (i = media.indexOf(':', i + 8)) != -1 && screenWidth > parseInt(
- media.substr(i + 1))) ||
- ((i = media.indexOf('max-width')) != -1 && (i = media.indexOf(':', i + 8)) != -1 && screenWidth < parseInt(
- media.substr(i + 1)))))))
- parent.attrs.src = attrs.src;
- }
- } else if (node.name == 'base' && !this.domain) this.domain = node.attrs.href;
- }
- if (this.data[this.i] == '/') this.i++;
- this.start = this.i + 1;
- this.state = this.Text;
- }
- // 移除标签
- remove(node) {
- var name = node.name,
- j = this.i;
- while (1) {
- if ((this.i = this.data.indexOf('</', this.i + 1)) == -1) {
- if (name == 'pre' || name == 'svg') this.i = j;
- else this.i = this.data.length;
- return;
- }
- this.start = (this.i += 2);
- while (!blankChar[this.data[this.i]] && !this.isClose()) this.i++;
- if (this.getName(this.section()) == name) {
- // 代码块高亮
- if (name == 'pre') {
- this.data = this.data.substr(0, j + 1) + cfg.highlight(this.data.substring(j + 1, this.i - 5), node.attrs) +
- this.data.substr(this.i - 5);
- return this.i = j;
- } else if (name == 'style')
- this.CssHandler.getStyle(this.data.substring(j + 1, this.i - 7));
- else if (name == 'title')
- this.title = this.data.substring(j + 1, this.i - 7);
- if ((this.i = this.data.indexOf('>', this.i)) == -1) this.i = this.data.length;
- // 处理 svg
- if (name == 'svg') {
- var src = this.data.substring(j, this.i + 1);
- if (!node.attrs.xmlns) src = ' xmlns="http://www.w3.org/2000/svg"' + src;
- var i = j;
- while (this.data[j] != '<') j--;
- src = this.data.substring(j, i) + src;
- var parent = this.STACK[this.STACK.length - 1];
- if (node.attrs.width == '100%' && parent && (parent.attrs.style || '').includes('inline'))
- parent.attrs.style = 'width:300px;max-width:100%;' + parent.attrs.style;
- this.siblings().push({
- name: 'img',
- attrs: {
- src: 'data:image/svg+xml;utf8,' + src.replace(/#/g, '%23'),
- ignore: 'T'
- }
- })
- }
- return;
- }
- }
- }
- // 处理属性
- matchAttr(node) {
- var attrs = node.attrs,
- style = this.CssHandler.match(node.name, attrs, node) + (attrs.style || ''),
- styleObj = {};
- if (attrs.id) {
- if (this.compress & 1) attrs.id = void 0;
- else if (this.useAnchor) this.bubble();
- }
- if ((this.compress & 2) && attrs.class) attrs.class = void 0;
- switch (node.name) {
- case 'img':
- if (attrs['data-src']) {
- attrs.src = attrs.src || attrs['data-src'];
- attrs['data-src'] = void 0;
- }
- if (attrs.src && !attrs.ignore) {
- if (this.bubble()) attrs.i = (this.imgNum++).toString();
- else attrs.ignore = 'T';
- }
- break;
- case 'a':
- case 'ad':
- // #ifdef APP-PLUS
- case 'iframe':
- case 'embed':
- // #endif
- this.bubble();
- break;
- case 'font':
- if (attrs.color) {
- styleObj['color'] = attrs.color;
- attrs.color = void 0;
- }
- if (attrs.face) {
- styleObj['font-family'] = attrs.face;
- attrs.face = void 0;
- }
- if (attrs.size) {
- var size = parseInt(attrs.size);
- if (size < 1) size = 1;
- else if (size > 7) size = 7;
- var map = ['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'];
- styleObj['font-size'] = map[size - 1];
- attrs.size = void 0;
- }
- break;
- case 'video':
- case 'audio':
- if (!attrs.id) attrs.id = node.name + (++this[`${node.name}Num`]);
- else this[`${node.name}Num`]++;
- if (node.name == 'video') {
- if (attrs.width) {
- style = `width:${parseFloat(attrs.width) + (attrs.width.includes('%') ? '%' : 'px')};${style}`;
- attrs.width = void 0;
- }
- if (attrs.height) {
- style = `height:${parseFloat(attrs.height) + (attrs.height.includes('%') ? '%' : 'px')};${style}`;
- attrs.height = void 0;
- }
- if (this.videoNum > 3) node.lazyLoad = true;
- }
- attrs.source = [];
- if (attrs.src) attrs.source.push(attrs.src);
- if (!attrs.controls && !attrs.autoplay)
- console.warn(`存在没有 controls 属性的 ${node.name} 标签,可能导致无法播放`, node);
- this.bubble();
- break;
- case 'td':
- case 'th':
- if (attrs.colspan || attrs.rowspan)
- for (var k = this.STACK.length, item; item = this.STACK[--k];)
- if (item.name == 'table') {
- item.c = void 0;
- break;
- }
- }
- if (attrs.align) {
- styleObj['text-align'] = attrs.align;
- attrs.align = void 0;
- }
- // 压缩 style
- var styles = style.replace(/"/g, '"').replace(/&/g, '&').split(';');
- style = '';
- for (var i = 0, len = styles.length; i < len; i++) {
- var info = styles[i].split(':');
- if (info.length < 2) continue;
- let key = info[0].trim().toLowerCase(),
- value = info.slice(1).join(':').trim();
- if (value.includes('-webkit') || value.includes('-moz') || value.includes('-ms') || value.includes('-o') || value
- .includes(
- 'safe'))
- style += `;${key}:${value}`;
- else if (!styleObj[key] || value.includes('import') || !styleObj[key].includes('import'))
- styleObj[key] = value;
- }
- if (node.name == 'img' && parseInt(styleObj.width || attrs.width) > screenWidth)
- styleObj.height = 'auto';
- for (var key in styleObj) {
- var value = styleObj[key];
- if (key.includes('flex') || key == 'order' || key == 'self-align') node.c = 1;
- // 填充链接
- if (value.includes('url')) {
- var j = value.indexOf('(');
- if (j++ != -1) {
- while (value[j] == '"' || value[j] == "'" || blankChar[value[j]]) j++;
- value = value.substr(0, j) + this.getUrl(value.substr(j));
- }
- }
- // 转换 rpx
- else if (value.includes('rpx'))
- value = value.replace(/[0-9.]+\s*rpx/g, $ => parseFloat($) * screenWidth / 750 + 'px');
- else if (key == 'white-space' && value.includes('pre'))
- this.pre = node.pre = true;
- style += `;${key}:${value}`;
- }
- style = style.substr(1);
- if (style) attrs.style = style;
- }
- // 节点出栈处理
- popNode(node) {
- // 空白符处理
- if (node.pre) {
- node.pre = this.pre = void 0;
- for (let i = this.STACK.length; i--;)
- if (this.STACK[i].pre)
- this.pre = true;
- }
- if (node.name == 'head' || (cfg.filter && cfg.filter(node, this) == false))
- return this.siblings().pop();
- var attrs = node.attrs;
- // 替换一些标签名
- if (node.name == 'picture') {
- node.name = 'img';
- if (!attrs.src && (node.children[0] || '').name == 'img')
- attrs.src = node.children[0].attrs.src;
- if (attrs.src && !attrs.ignore)
- attrs.i = (this.imgNum++).toString();
- return node.children = void 0;
- }
- if (cfg.blockTags[node.name]) node.name = 'div';
- else if (!cfg.trustTags[node.name]) node.name = 'span';
- // 处理列表
- if (node.c) {
- if (node.name == 'ul') {
- var floor = 1;
- for (let i = this.STACK.length; i--;)
- if (this.STACK[i].name == 'ul') floor++;
- if (floor != 1)
- for (let i = node.children.length; i--;)
- node.children[i].floor = floor;
- } else if (node.name == 'ol') {
- for (let i = 0, num = 1, child; child = node.children[i++];)
- if (child.name == 'li') {
- child.type = 'ol';
- child.num = ((num, type) => {
- if (type == 'a') return String.fromCharCode(97 + (num - 1) % 26);
- if (type == 'A') return String.fromCharCode(65 + (num - 1) % 26);
- if (type == 'i' || type == 'I') {
- num = (num - 1) % 99 + 1;
- var one = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX'],
- ten = ['X', 'XX', 'XXX', 'XL', 'L', 'LX', 'LXX', 'LXXX', 'XC'],
- res = (ten[Math.floor(num / 10) - 1] || '') + (one[num % 10 - 1] || '');
- if (type == 'i') return res.toLowerCase();
- return res;
- }
- return num;
- })(num++, attrs.type) + '.';
- }
- }
- }
- // 处理表格的边框
- if (node.name == 'table') {
- var padding = attrs.cellpadding,
- spacing = attrs.cellspacing,
- border = attrs.border;
- if (node.c) {
- this.bubble();
- if (!padding) padding = 2;
- if (!spacing) spacing = 2;
- }
- if (border) attrs.style = `border:${border}px solid gray;${attrs.style || ''}`;
- if (spacing) attrs.style = `border-spacing:${spacing}px;${attrs.style || ''}`;
- if (border || padding)
- (function f(ns) {
- for (var i = 0, n; n = ns[i]; i++) {
- if (n.name == 'th' || n.name == 'td') {
- if (border) n.attrs.style = `border:${border}px solid gray;${n.attrs.style}`;
- if (padding) n.attrs.style = `padding:${padding}px;${n.attrs.style}`;
- } else f(n.children || []);
- }
- })(node.children)
- }
- this.CssHandler.pop && this.CssHandler.pop(node);
- // 自动压缩
- if (node.name == 'div' && !Object.keys(attrs).length) {
- var siblings = this.siblings();
- if (node.children.length == 1 && node.children[0].name == 'div')
- siblings[siblings.length - 1] = node.children[0];
- }
- }
- // 工具函数
- bubble() {
- for (var i = this.STACK.length, item; item = this.STACK[--i];) {
- if (cfg.richOnlyTags[item.name]) {
- if (item.name == 'table' && !Object.hasOwnProperty.call(item, 'c')) item.c = 1;
- return false;
- }
- item.c = 1;
- }
- return true;
- }
- getName = val => this.xml ? val : val.toLowerCase();
- getUrl(url) {
- if (url[0] == '/') {
- if (url[1] == '/') url = this.protocol + ':' + url;
- else if (this.domain) url = this.domain + url;
- } else if (this.domain && url.indexOf('data:') != 0 && !url.includes('://'))
- url = this.domain + '/' + url;
- return url;
- }
- isClose = () => this.data[this.i] == '>' || (this.data[this.i] == '/' && this.data[this.i + 1] == '>');
- section = () => this.data.substring(this.start, this.i);
- siblings = () => this.STACK.length ? this.STACK[this.STACK.length - 1].children : this.DOM;
- // 状态机
- Text(c) {
- if (c == '<') {
- var next = this.data[this.i + 1],
- isLetter = c => (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
- if (isLetter(next)) {
- this.setText();
- this.start = this.i + 1;
- this.state = this.TagName;
- } else if (next == '/') {
- this.setText();
- if (isLetter(this.data[++this.i + 1])) {
- this.start = this.i + 1;
- this.state = this.EndTag;
- } else
- this.Comment();
- } else if (next == '!') {
- this.setText();
- this.Comment();
- }
- }
- }
- Comment() {
- var key;
- if (this.data.substring(this.i + 2, this.i + 4) == '--') key = '-->';
- else if (this.data.substring(this.i + 2, this.i + 9) == '[CDATA[') key = ']]>';
- else key = '>';
- if ((this.i = this.data.indexOf(key, this.i + 2)) == -1) this.i = this.data.length;
- else this.i += key.length - 1;
- this.start = this.i + 1;
- this.state = this.Text;
- }
- TagName(c) {
- if (blankChar[c]) {
- this.tagName = this.section();
- while (blankChar[this.data[this.i]]) this.i++;
- if (this.isClose()) this.setNode();
- else {
- this.start = this.i;
- this.state = this.AttrName;
- }
- } else if (this.isClose()) {
- this.tagName = this.section();
- this.setNode();
- }
- }
- AttrName(c) {
- var blank = blankChar[c];
- if (blank) {
- this.attrName = this.section();
- c = this.data[this.i];
- }
- if (c == '=') {
- if (!blank) this.attrName = this.section();
- while (blankChar[this.data[++this.i]]);
- this.start = this.i--;
- this.state = this.AttrValue;
- } else if (blank) this.setAttr();
- else if (this.isClose()) {
- this.attrName = this.section();
- this.setAttr();
- }
- }
- AttrValue(c) {
- if (c == '"' || c == "'") {
- this.start++;
- if ((this.i = this.data.indexOf(c, this.i + 1)) == -1) return this.i = this.data.length;
- this.attrVal = this.section();
- this.i++;
- } else {
- for (; !blankChar[this.data[this.i]] && !this.isClose(); this.i++);
- this.attrVal = this.section();
- }
- this.setAttr();
- }
- EndTag(c) {
- if (blankChar[c] || c == '>' || c == '/') {
- var name = this.getName(this.section());
- for (var i = this.STACK.length; i--;)
- if (this.STACK[i].name == name) break;
- if (i != -1) {
- var node;
- while ((node = this.STACK.pop()).name != name);
- this.popNode(node);
- } else if (name == 'p' || name == 'br')
- this.siblings().push({
- name,
- attrs: {}
- });
- this.i = this.data.indexOf('>', this.i);
- this.start = this.i + 1;
- if (this.i == -1) this.i = this.data.length;
- else this.state = this.Text;
- }
- }
- }
- module.exports = MpHtmlParser;
|