workbook-reader.js 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. const fs = require('fs');
  2. const {EventEmitter} = require('events');
  3. const {PassThrough, Readable} = require('readable-stream');
  4. const nodeStream = require('stream');
  5. const unzip = require('unzipper');
  6. const tmp = require('tmp');
  7. const iterateStream = require('../../utils/iterate-stream');
  8. const parseSax = require('../../utils/parse-sax');
  9. const StyleManager = require('../../xlsx/xform/style/styles-xform');
  10. const WorkbookXform = require('../../xlsx/xform/book/workbook-xform');
  11. const RelationshipsXform = require('../../xlsx/xform/core/relationships-xform');
  12. const WorksheetReader = require('./worksheet-reader');
  13. const HyperlinkReader = require('./hyperlink-reader');
  14. tmp.setGracefulCleanup();
  // Archive paths of per-sheet entries; capture group 1 is the sheet number.
  const WORKSHEET_PATH = /xl\/worksheets\/sheet(\d+)[.]xml/;
  const WORKSHEET_RELS_PATH = /xl\/worksheets\/_rels\/sheet(\d+)[.]xml.rels/;

  /**
   * Folds one run-property open tag of a shared string into a font object.
   *
   * @param {Object|null} font - font collected so far for the current run, or null
   * @param {{name: string, attributes: Object}} node - SAX open-tag node
   * @returns {Object|null} the (possibly newly created) font; `font` is
   *   returned untouched when the tag is not a recognised font property
   */
  function applyFontTag(font, node) {
    const attributes = node.attributes || {};
    const result = font || {};
    switch (node.name) {
      case 'b':
        result.bold = true;
        break;
      case 'charset':
        // like family/sz, the number is carried by the `val` attribute
        result.charset = parseInt(attributes.val, 10);
        break;
      case 'color':
        result.color = {};
        if (attributes.rgb) {
          result.color.argb = attributes.rgb;
        }
        if (attributes.val) {
          result.color.argb = attributes.val;
        }
        if (attributes.theme) {
          result.color.theme = attributes.theme;
        }
        break;
      case 'family':
        result.family = parseInt(attributes.val, 10);
        break;
      case 'i':
        result.italic = true;
        break;
      case 'outline':
        result.outline = true;
        break;
      case 'rFont':
        result.name = attributes.val;
        break;
      case 'strike':
        result.strike = true;
        break;
      case 'sz':
        result.size = parseInt(attributes.val, 10);
        break;
      case 'u':
        result.underline = true;
        break;
      case 'vertAlign':
        result.vertAlign = attributes.val;
        break;
      default:
        // not a font property: do not create a font object for it
        return font;
    }
    return result;
  }

  /**
   * Streaming reader for an xlsx workbook.
   *
   * The archive is unzipped entry by entry. Results are exposed either as
   * events (`read()`), as an async iterator of worksheet readers
   * (`for await (const sheet of reader)`), or as the raw
   * `{eventType, value}` stream produced by `parse()`.
   */
  class WorkbookReader extends EventEmitter {
    /**
     * @param {string|Readable} input - file path or readable stream of the xlsx
     * @param {Object} [options] - overrides for the defaults below; the
     *   accepted values are listed in `WorkbookReader.Options`
     */
    constructor(input, options = {}) {
      super();

      this.input = input;
      this.options = {
        worksheets: 'emit',
        sharedStrings: 'cache',
        hyperlinks: 'ignore',
        styles: 'ignore',
        entries: 'ignore',
        ...options,
      };

      this.styles = new StyleManager();
      this.styles.init();
    }

    /**
     * Normalises the supported input kinds to a readable stream.
     *
     * @param {string|Readable} input - a path (opened with fs) or a stream
     * @returns {Readable}
     * @throws {Error} when input is neither a stream nor a string
     */
    _getStream(input) {
      if (input instanceof nodeStream.Readable || input instanceof Readable) {
        return input;
      }
      if (typeof input === 'string') {
        return fs.createReadStream(input);
      }
      throw new Error(`Could not recognise input: ${input}`);
    }

    /**
     * Event-based front end of `parse()`.
     *
     * Re-emits every parsed item under its event type, reads each emitted
     * worksheet to completion before moving on, then emits 'end' and
     * 'finished'. Failures are reported through an 'error' event; the
     * returned promise itself does not reject for them.
     *
     * @param {string|Readable} [input] - forwarded to `parse()`
     * @param {Object} [options] - forwarded to `parse()`
     * @returns {Promise<void>}
     */
    async read(input, options) {
      try {
        for await (const {eventType, value} of this.parse(input, options)) {
          switch (eventType) {
            case 'shared-strings':
            case 'hyperlinks':
              this.emit(eventType, value);
              break;
            case 'worksheet':
              this.emit(eventType, value);
              await value.read();
              break;
            default:
              break;
          }
        }
        this.emit('end');
        this.emit('finished');
      } catch (error) {
        this.emit('error', error);
      }
    }

    /**
     * Iterates the worksheet readers of the workbook given to the constructor.
     */
    async *[Symbol.asyncIterator]() {
      for await (const {eventType, value} of this.parse()) {
        if (eventType === 'worksheet') {
          yield value;
        }
      }
    }

    /**
     * Unzips the workbook and yields `{eventType, value}` items as the
     * entries are met.
     *
     * A worksheet that shows up before the shared strings, the workbook
     * relationships and the workbook definition are all available is spooled
     * to a temp file and parsed once the whole archive has been walked.
     *
     * @param {string|Readable} [input] - defaults to the constructor input
     * @param {Object} [options] - when given, replaces `this.options`
     *   wholesale (the constructor defaults are not re-applied)
     */
    async *parse(input, options) {
      if (options) this.options = options;
      const stream = (this.stream = this._getStream(input || this.input));
      const zip = unzip.Parse({forceStream: true});
      stream.pipe(zip);

      // worksheets, deferred for parsing after shared strings reading
      const waitingWorkSheets = [];

      try {
        for await (const entry of iterateStream(zip)) {
          switch (entry.path) {
            case '_rels/.rels':
              break;
            case 'xl/_rels/workbook.xml.rels':
              await this._parseRels(entry);
              break;
            case 'xl/workbook.xml':
              await this._parseWorkbook(entry);
              break;
            case 'xl/sharedStrings.xml':
              yield* this._parseSharedStrings(entry);
              break;
            case 'xl/styles.xml':
              await this._parseStyles(entry);
              break;
            default: {
              const sheetMatch = entry.path.match(WORKSHEET_PATH);
              const relsMatch = sheetMatch ? null : entry.path.match(WORKSHEET_RELS_PATH);
              if (sheetMatch) {
                const sheetNo = sheetMatch[1];
                // this.model is required too: _parseWorksheet resolves the
                // sheet name and state from it
                if (this.sharedStrings && this.workbookRels && this.model) {
                  yield* this._parseWorksheet(iterateStream(entry), sheetNo);
                } else {
                  await this._deferWorksheet(entry, sheetNo, waitingWorkSheets);
                }
              } else if (relsMatch) {
                yield* this._parseHyperlinks(iterateStream(entry), relsMatch[1]);
              }
              break;
            }
          }
          entry.autodrain();
        }

        while (waitingWorkSheets.length > 0) {
          const {sheetNo, path, tempFileCleanupCallback} = waitingWorkSheets[0];
          let fileStream = fs.createReadStream(path);
          // TODO: Remove once node v8 is deprecated
          // Detect and upgrade old fileStreams
          if (!fileStream[Symbol.asyncIterator]) {
            fileStream = fileStream.pipe(new PassThrough());
          }
          yield* this._parseWorksheet(fileStream, sheetNo);
          waitingWorkSheets.shift();
          tempFileCleanupCallback();
        }
      } finally {
        // Only non-empty after an error or an early exit of the consumer:
        // remove the temp files that were never handed back.
        for (const {tempFileCleanupCallback} of waitingWorkSheets) {
          tempFileCleanupCallback();
        }
      }
    }

    /**
     * Copies a worksheet entry into a fresh temp file and records it in
     * `waitingWorkSheets` for later parsing.
     *
     * @param {Readable} entry - zip entry of the worksheet
     * @param {string} sheetNo - sheet number taken from the entry path
     * @param {Array} waitingWorkSheets - queue receiving
     *   `{sheetNo, path, tempFileCleanupCallback}`
     * @returns {Promise<void>} settles once the temp file is fully written
     */
    _deferWorksheet(entry, sheetNo, waitingWorkSheets) {
      return new Promise((resolve, reject) => {
        tmp.file((err, path, fd, tempFileCleanupCallback) => {
          if (err) {
            reject(err);
            return;
          }
          // queued before writing so a failed write is still cleaned up
          waitingWorkSheets.push({sheetNo, path, tempFileCleanupCallback});

          const tempStream = fs.createWriteStream(path);
          tempStream.on('error', reject);
          tempStream.on('finish', () => resolve());
          entry.pipe(tempStream);
        });
      });
    }

    /**
     * Emits an 'entry' event when the `entries` option is 'emit'.
     *
     * @param {Object} payload - description of the entry being processed
     */
    _emitEntry(payload) {
      if (this.options.entries === 'emit') {
        this.emit('entry', payload);
      }
    }

    /** Parses the workbook relationships into `this.workbookRels`. */
    async _parseRels(entry) {
      const xform = new RelationshipsXform();
      this.workbookRels = await xform.parseStream(iterateStream(entry));
    }

    /** Parses xl/workbook.xml into `this.properties` and `this.model`. */
    async _parseWorkbook(entry) {
      this._emitEntry({type: 'workbook'});

      const workbook = new WorkbookXform();
      await workbook.parseStream(iterateStream(entry));

      this.properties = workbook.map.workbookPr;
      this.model = workbook.model;
    }

    /**
     * Parses the shared strings table.
     *
     * With `sharedStrings: 'cache'` the strings are collected in
     * `this.sharedStrings` and nothing is yielded. With 'emit' each string
     * is yielded as `{eventType: 'shared-strings', value: {index, text}}`.
     * Any other setting skips the entry. A string with formatted runs is
     * represented as `{richText: [{font, text}, ...]}`, a plain one as the
     * text itself.
     *
     * @param {Readable} entry - zip entry of xl/sharedStrings.xml
     */
    async *_parseSharedStrings(entry) {
      this._emitEntry({type: 'shared-strings'});

      const mode = this.options.sharedStrings;
      if (mode !== 'cache' && mode !== 'emit') {
        return;
      }
      if (mode === 'cache') {
        this.sharedStrings = [];
      }

      let text = null;
      let richText = [];
      let font = null;
      let index = 0;

      for await (const events of parseSax(iterateStream(entry))) {
        for (const {eventType, value} of events) {
          if (eventType === 'opentag') {
            switch (value.name) {
              case 'si':
                font = null;
                richText = [];
                text = null;
                break;
              case 't':
                text = null;
                break;
              default:
                font = applyFontTag(font, value);
                break;
            }
          } else if (eventType === 'text') {
            text = text ? text + value : value;
          } else if (eventType === 'closetag') {
            if (value.name === 'r') {
              richText.push({font, text});
              font = null;
              text = null;
            } else if (value.name === 'si') {
              const sharedString = richText.length ? {richText} : text;
              if (mode === 'cache') {
                this.sharedStrings.push(sharedString);
              } else {
                const payload = {index, text: sharedString};
                index += 1;
                // Tagged like every other parse() item so read() can
                // dispatch it; index/text stay at the top level for callers
                // that consumed the previous untagged shape.
                yield {eventType: 'shared-strings', value: payload, ...payload};
              }
              richText = [];
              font = null;
              text = null;
            }
          }
        }
      }
    }

    /** Parses xl/styles.xml into `this.styles` when `styles` is 'cache'. */
    async _parseStyles(entry) {
      this._emitEntry({type: 'styles'});
      if (this.options.styles === 'cache') {
        this.styles = new StyleManager();
        await this.styles.parseStream(iterateStream(entry));
      }
    }

    /**
     * Wraps one worksheet in a WorksheetReader and yields it when the
     * `worksheets` option is 'emit'.
     *
     * The sheet's id, name and state are taken from the workbook model when
     * a relationship targeting `worksheets/sheet<sheetNo>.xml` identifies
     * it; otherwise the reader keeps `sheetNo` as its id.
     *
     * @param {AsyncIterable} iterator - content of the worksheet xml
     * @param {string} sheetNo - sheet number taken from the entry path
     */
    *_parseWorksheet(iterator, sheetNo) {
      this._emitEntry({type: 'worksheet', id: sheetNo});

      const worksheetReader = new WorksheetReader({
        workbook: this,
        id: sheetNo,
        iterator,
        options: this.options,
      });

      const rels = this.workbookRels || [];
      // the model is missing when xl/workbook.xml was not (yet) parsed
      const sheets = (this.model && this.model.sheets) || [];
      const matchingRel = rels.find(rel => rel.Target === `worksheets/sheet${sheetNo}.xml`);
      const matchingSheet = matchingRel && sheets.find(sheet => sheet.rId === matchingRel.Id);
      if (matchingSheet) {
        worksheetReader.id = matchingSheet.id;
        worksheetReader.name = matchingSheet.name;
        worksheetReader.state = matchingSheet.state;
      }

      if (this.options.worksheets === 'emit') {
        yield {eventType: 'worksheet', value: worksheetReader};
      }
    }

    /**
     * Wraps one worksheet relationships entry in a HyperlinkReader and
     * yields it when the `hyperlinks` option is 'emit'.
     *
     * @param {AsyncIterable} iterator - content of the sheet's .rels xml
     * @param {string} sheetNo - sheet number taken from the entry path
     */
    *_parseHyperlinks(iterator, sheetNo) {
      this._emitEntry({type: 'hyperlinks', id: sheetNo});

      const hyperlinksReader = new HyperlinkReader({
        workbook: this,
        id: sheetNo,
        iterator,
        options: this.options,
      });

      if (this.options.hyperlinks === 'emit') {
        yield {eventType: 'hyperlinks', value: hyperlinksReader};
      }
    }
  }

  // for reference - these are the valid values for options
  WorkbookReader.Options = {
    worksheets: ['emit', 'ignore'],
    sharedStrings: ['cache', 'emit', 'ignore'],
    hyperlinks: ['cache', 'emit', 'ignore'],
    styles: ['cache', 'ignore'],
    entries: ['emit', 'ignore'],
  };
  307. module.exports = WorkbookReader;