// htmltodom.js (8.2 KB)
  1. "use strict";
  2. const parse5 = require("parse5");
  3. const sax = require("sax");
  4. const attributes = require("../living/attributes");
  5. const DocumentType = require("../living/generated/DocumentType");
  6. const JSDOMParse5Adapter = require("./parse5-adapter-parsing");
  7. const { HTML_NS } = require("../living/helpers/namespaces");
  8. // Horrible monkey-patch to implement https://github.com/inikulin/parse5/issues/237
  9. const OpenElementStack = require("parse5/lib/parser/open_element_stack");
  // Wrap pop() so that the element leaving the stack of open elements gets notified through its
  // optional _poppedOffStackOfOpenElements() hook.
  const originalPop = OpenElementStack.prototype.pop;
  OpenElementStack.prototype.pop = function (...args) {
    // Remember the top entry before delegating; the hook must run on the element that was on top
    // when pop() was called (presumably the original pop() changes stackTop).
    const poppedElement = this.items[this.stackTop];
    originalPop.call(this, ...args);

    // Only elements that define the hook are notified.
    if (!poppedElement._poppedOffStackOfOpenElements) {
      return;
    }
    poppedElement._poppedOffStackOfOpenElements();
  };
  // Wrap push() so that the element now on top of the stack of open elements gets notified through
  // its optional _pushedOnStackOfOpenElements() hook.
  const originalPush = OpenElementStack.prototype.push;
  OpenElementStack.prototype.push = function (...args) {
    originalPush.call(this, ...args);

    // Read the top entry only after delegating, so it reflects the state left by the original push().
    const pushedElement = this.items[this.stackTop];

    // Only elements that define the hook are notified.
    if (!pushedElement._pushedOnStackOfOpenElements) {
      return;
    }
    pushedElement._pushedOnStackOfOpenElements();
  };
  26. module.exports = class HTMLToDOM {
  27. constructor(parsingMode) {
  28. this.parser = parsingMode === "xml" ? sax : parse5;
  29. }
  30. appendToNode(html, node) {
  31. html = String(html);
  32. return this._doParse(html, true, node);
  33. }
  34. appendToDocument(html, documentImpl) {
  35. html = String(html);
  36. return this._doParse(html, false, documentImpl, documentImpl._parseOptions);
  37. }
  38. _doParse(...args) {
  39. return this.parser === parse5 ? this._parseWithParse5(...args) : this._parseWithSax(...args);
  40. }
  41. _parseWithParse5(html, isFragment, contextNode, options = {}) {
  42. const adapter = new JSDOMParse5Adapter(contextNode._ownerDocument || contextNode);
  43. options.treeAdapter = adapter;
  44. if (isFragment) {
  45. const fragment = this.parser.parseFragment(contextNode, html, options);
  46. if (contextNode._templateContents) {
  47. contextNode._templateContents.appendChild(fragment);
  48. } else {
  49. contextNode.appendChild(fragment);
  50. }
  51. } else {
  52. this.parser.parse(html, options);
  53. }
  54. return contextNode;
  55. }
  56. _parseWithSax(html, isFragment, contextNode) {
  57. const SaxParser = this.parser.parser;
  58. const parser = new SaxParser(/* strict = */true, { xmlns: true, strictEntities: true });
  59. parser.noscript = false;
  60. parser.looseCase = "toString";
  61. const openStack = [contextNode];
  62. parser.ontext = text => {
  63. setChildForSax(openStack[openStack.length - 1], {
  64. type: "text",
  65. data: text
  66. });
  67. };
  68. parser.oncdata = cdata => {
  69. setChildForSax(openStack[openStack.length - 1], {
  70. type: "cdata",
  71. data: cdata
  72. });
  73. };
  74. parser.onopentag = arg => {
  75. const attrs = Object.keys(arg.attributes).map(key => {
  76. const rawAttribute = arg.attributes[key];
  77. let { prefix } = rawAttribute;
  78. let localName = rawAttribute.local;
  79. if (prefix === "xmlns" && localName === "") {
  80. // intended weirdness in node-sax, see https://github.com/isaacs/sax-js/issues/165
  81. localName = prefix;
  82. prefix = null;
  83. }
  84. if (prefix === "") {
  85. prefix = null;
  86. }
  87. const namespace = rawAttribute.uri === "" ? null : rawAttribute.uri;
  88. return { name: rawAttribute.name, value: rawAttribute.value, prefix, localName, namespace };
  89. });
  90. const tag = {
  91. type: "tag",
  92. name: arg.local,
  93. prefix: arg.prefix,
  94. namespace: arg.uri,
  95. attributes: attrs
  96. };
  97. if (arg.local === "script" && arg.uri === HTML_NS) {
  98. openStack.push(tag);
  99. } else {
  100. const elem = setChildForSax(openStack[openStack.length - 1], tag);
  101. openStack.push(elem);
  102. }
  103. };
  104. parser.onclosetag = () => {
  105. const elem = openStack.pop();
  106. if (elem.constructor.name === "Object") { // we have an empty script tag
  107. setChildForSax(openStack[openStack.length - 1], elem);
  108. }
  109. };
  110. parser.onscript = scriptText => {
  111. const tag = openStack.pop();
  112. tag.children = [{ type: "text", data: scriptText }];
  113. const elem = setChildForSax(openStack[openStack.length - 1], tag);
  114. openStack.push(elem);
  115. };
  116. parser.oncomment = comment => {
  117. setChildForSax(openStack[openStack.length - 1], {
  118. type: "comment",
  119. data: comment
  120. });
  121. };
  122. parser.onprocessinginstruction = pi => {
  123. setChildForSax(openStack[openStack.length - 1], {
  124. type: "directive",
  125. name: "?" + pi.name,
  126. data: "?" + pi.name + " " + pi.body + "?"
  127. });
  128. };
  129. parser.ondoctype = dt => {
  130. setChildForSax(openStack[openStack.length - 1], {
  131. type: "directive",
  132. name: "!doctype",
  133. data: "!doctype " + dt
  134. });
  135. const entityMatcher = /<!ENTITY ([^ ]+) "([^"]+)">/g;
  136. let result;
  137. while ((result = entityMatcher.exec(dt))) {
  138. const [, name, value] = result;
  139. if (!(name in parser.ENTITIES)) {
  140. parser.ENTITIES[name] = value;
  141. }
  142. }
  143. };
  144. parser.onerror = err => {
  145. throw err;
  146. };
  147. parser.write(html).close();
  148. }
  149. };
  /**
   * Creates the node described by `node` and attaches it under `parentImpl`.
   *
   * @param {object} parentImpl - Parent node impl; its `_ownerDocument` (or the parent itself when
   *   it has none) is used to create the new node.
   * @param {object} node - Descriptor with a `type` of "tag", "script", "style", "root", "text",
   *   "cdata", "comment" or "directive", plus `name`/`data`/`prefix`/`namespace`/`attributes`/
   *   `children` as applicable.
   * @returns {object|null} The created node (or the parent's `_templateContents` for a "root"
   *   descriptor inside an HTML <template>); null when the descriptor produces no node.
   */
  function setChildForSax(parentImpl, node) {
    const ownerDoc = (parentImpl && parentImpl._ownerDocument) || parentImpl;
    const { type } = node;

    let created;
    let reusesTemplateContents = false;

    if (type === "tag" || type === "script" || type === "style") {
      created = ownerDoc._createElementWithCorrectElementInterface(node.name, node.namespace);
      created._prefix = node.prefix || null;
      created._namespaceURI = node.namespace || null;
    } else if (type === "root") {
      // Inside <template>, children go to the parent's _templateContents and the virtual root
      // node itself is skipped.
      if (parentImpl.tagName === "TEMPLATE" && parentImpl._namespaceURI === HTML_NS) {
        created = parentImpl._templateContents;
        reusesTemplateContents = true;
      }
    } else if (type === "text") {
      // The parser has already decoded entities, so the data is used verbatim.
      created = ownerDoc.createTextNode(node.data);
    } else if (type === "cdata") {
      created = ownerDoc.createCDATASection(node.data);
    } else if (type === "comment") {
      created = ownerDoc.createComment(node.data);
    } else if (type === "directive") {
      if (node.name[0] === "?" && node.name.toLowerCase() !== "?xml") {
        // Strip the leading "?name " and the trailing "?" to recover the instruction's data.
        const piData = node.data.slice(node.name.length + 1, -1);
        created = ownerDoc.createProcessingInstruction(node.name.substring(1), piData);
      } else if (node.name.toLowerCase() === "!doctype") {
        created = parseDocType(ownerDoc, "<" + node.data + ">");
      }
    }

    if (!created) {
      return null;
    }

    if (node.attributes) {
      for (const attr of node.attributes) {
        attributes.setAttributeValue(created, attr.localName, attr.value, attr.prefix, attr.namespace);
      }
    }

    // Children are attached to the new node before the node itself is attached to its parent.
    if (node.children) {
      for (const childDescriptor of node.children) {
        setChildForSax(created, childDescriptor);
      }
    }

    if (!reusesTemplateContents) {
      // A parent with _templateContents is a <template> (e.g. innerHTML being set on it), so the
      // new node belongs in its contents rather than in the element itself.
      const appendTarget = parentImpl._templateContents ? parentImpl._templateContents : parentImpl;
      appendTarget.appendChild(created);
    }

    return created;
  }
  // <!doctype html>, tolerating extra whitespace around the name.
  const HTML5_DOCTYPE = /<!doctype\s+html\s*>/i;
  // <!doctype NAME PUBLIC "pubid" ["system"]>; literals may use double or single quotes and may
  // be empty. Groups: 1 = name, 2/3 = public id, 4/5 = system id.
  const PUBLIC_DOCTYPE = /<!doctype\s+([^\s]+)\s+public\s+(?:"([^"]*)"|'([^']*)')(?:\s+(?:"([^"]*)"|'([^']*)'))?/i;
  // <!doctype NAME SYSTEM "system">. Groups: 1 = name, 2/3 = system id.
  const SYSTEM_DOCTYPE = /<!doctype\s+([^\s]+)\s+system\s+(?:"([^"]*)"|'([^']*)')/i;
  // Just the declared name, for doctypes with no external id (e.g. only an internal subset).
  const DOCTYPE_NAME = /<!doctype\s+([^\s>\[]+)/i;

  // Returns whichever quoting alternative of a literal matched, or "" when neither did.
  function doctypeLiteral(doubleQuoted, singleQuoted) {
    if (doubleQuoted !== undefined) {
      return doubleQuoted;
    }
    if (singleQuoted !== undefined) {
      return singleQuoted;
    }
    return "";
  }

  /**
   * Builds a DocumentType from the text of a doctype declaration.
   *
   * @param {object} doc - Owner document for the new DocumentType.
   * @param {string} html - Declaration text, e.g. `<!doctype html>`.
   * @returns {object} The result of createDocumentTypeInternal() for the parsed name, public id
   *   and system id; ids that are not present are passed as "".
   */
  function parseDocType(doc, html) {
    if (HTML5_DOCTYPE.test(html)) {
      return createDocumentTypeInternal(doc, "html", "", "");
    }

    const publicPieces = PUBLIC_DOCTYPE.exec(html);
    if (publicPieces) {
      return createDocumentTypeInternal(
        doc,
        publicPieces[1],
        doctypeLiteral(publicPieces[2], publicPieces[3]),
        doctypeLiteral(publicPieces[4], publicPieces[5])
      );
    }

    const systemPieces = SYSTEM_DOCTYPE.exec(html);
    if (systemPieces) {
      return createDocumentTypeInternal(doc, systemPieces[1], "", doctypeLiteral(systemPieces[2], systemPieces[3]));
    }

    // No external id: keep the declared name instead of assuming "html", so that a declaration
    // such as `<!DOCTYPE root [ ... ]>` is not renamed. "html" remains the last-resort default
    // when no name can be extracted at all.
    const namePieces = DOCTYPE_NAME.exec(html);
    return createDocumentTypeInternal(doc, namePieces ? namePieces[1] : "html", "", "");
  }
  /**
   * Creates a DocumentType through DocumentType.createImpl().
   *
   * @param {object} ownerDocument - Document the doctype is created for.
   * @param {string} name - Doctype name.
   * @param {string} publicId - Public identifier ("" when there is none).
   * @param {string} systemId - System identifier ("" when there is none).
   * @returns {object} Whatever DocumentType.createImpl() returns.
   */
  function createDocumentTypeInternal(ownerDocument, name, publicId, systemId) {
    const implArgs = [];
    const implData = { ownerDocument, name, publicId, systemId };
    return DocumentType.createImpl(implArgs, implData);
  }