You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

307 lines
9.3 KiB

11 months ago
  1. const fs = require("fs");
  2. const os = require("os");
  3. const path = require("path");
  4. class OCRLoader {
  5. constructor() {
  6. this.cacheDir = path.resolve(
  7. process.env.STORAGE_DIR
  8. ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
  9. : path.resolve(__dirname, `../../../server/storage/models/tesseract`)
  10. );
  11. }
  12. log(text, ...args) {
  13. console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args);
  14. }
  15. /**
  16. * Loads a PDF file and returns an array of documents.
  17. * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function
  18. * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata.
  19. */
  20. async ocrPDF(
  21. filePath,
  22. { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {}
  23. ) {
  24. if (
  25. !filePath ||
  26. !fs.existsSync(filePath) ||
  27. !fs.statSync(filePath).isFile()
  28. ) {
  29. this.log(`File ${filePath} does not exist. Skipping OCR.`);
  30. return [];
  31. }
  32. const documentTitle = path.basename(filePath);
  33. this.log(`Starting OCR of ${documentTitle}`);
  34. const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
  35. let buffer = fs.readFileSync(filePath);
  36. const pdfDocument = await pdfjs.getDocument({ data: buffer });
  37. const documents = [];
  38. const meta = await pdfDocument.getMetadata().catch(() => null);
  39. const metadata = {
  40. source: filePath,
  41. pdf: {
  42. version: "v2.0.550",
  43. info: meta?.info,
  44. metadata: meta?.metadata,
  45. totalPages: pdfDocument.numPages,
  46. },
  47. };
  48. const pdfSharp = new PDFSharp({
  49. validOps: [
  50. pdfjs.OPS.paintJpegXObject,
  51. pdfjs.OPS.paintImageXObject,
  52. pdfjs.OPS.paintInlineImageXObject,
  53. ],
  54. });
  55. await pdfSharp.init();
  56. const { createWorker, OEM } = require("tesseract.js");
  57. const BATCH_SIZE = batchSize;
  58. const MAX_EXECUTION_TIME = maxExecutionTime;
  59. const NUM_WORKERS = maxWorkers ?? Math.min(os.cpus().length, 4);
  60. const totalPages = pdfDocument.numPages;
  61. const workerPool = await Promise.all(
  62. Array(NUM_WORKERS)
  63. .fill(0)
  64. .map(() =>
  65. createWorker("eng", OEM.LSTM_ONLY, {
  66. cachePath: this.cacheDir,
  67. })
  68. )
  69. );
  70. const startTime = Date.now();
  71. try {
  72. this.log("Bootstrapping OCR completed successfully!", {
  73. MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME,
  74. BATCH_SIZE,
  75. MAX_CONCURRENT_WORKERS: NUM_WORKERS,
  76. TOTAL_PAGES: totalPages,
  77. });
  78. const timeoutPromise = new Promise((_, reject) => {
  79. setTimeout(() => {
  80. reject(
  81. new Error(
  82. `OCR job took too long to complete (${
  83. MAX_EXECUTION_TIME / 1000
  84. } seconds)`
  85. )
  86. );
  87. }, MAX_EXECUTION_TIME);
  88. });
  89. const processPages = async () => {
  90. for (
  91. let startPage = 1;
  92. startPage <= totalPages;
  93. startPage += BATCH_SIZE
  94. ) {
  95. const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages);
  96. const pageNumbers = Array.from(
  97. { length: endPage - startPage + 1 },
  98. (_, i) => startPage + i
  99. );
  100. this.log(`Working on pages ${startPage} - ${endPage}`);
  101. const pageQueue = [...pageNumbers];
  102. const results = [];
  103. const workerPromises = workerPool.map(async (worker, workerIndex) => {
  104. while (pageQueue.length > 0) {
  105. const pageNum = pageQueue.shift();
  106. this.log(
  107. `\x1b[34m[Worker ${
  108. workerIndex + 1
  109. }]\x1b[0m assigned pg${pageNum}`
  110. );
  111. const page = await pdfDocument.getPage(pageNum);
  112. const imageBuffer = await pdfSharp.pageToBuffer({ page });
  113. if (!imageBuffer) continue;
  114. const { data } = await worker.recognize(imageBuffer, {}, "text");
  115. this.log(
  116. `\x1b[34m[Worker ${
  117. workerIndex + 1
  118. }]\x1b[0m completed pg${pageNum}`
  119. );
  120. results.push({
  121. pageContent: data.text,
  122. metadata: {
  123. ...metadata,
  124. loc: { pageNumber: pageNum },
  125. },
  126. });
  127. }
  128. });
  129. await Promise.all(workerPromises);
  130. documents.push(
  131. ...results.sort(
  132. (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber
  133. )
  134. );
  135. }
  136. return documents;
  137. };
  138. await Promise.race([timeoutPromise, processPages()]);
  139. } catch (e) {
  140. this.log(`Error: ${e.message}`, e.stack);
  141. } finally {
  142. global.Image = undefined;
  143. await Promise.all(workerPool.map((worker) => worker.terminate()));
  144. }
  145. this.log(`Completed OCR of ${documentTitle}!`, {
  146. documentsParsed: documents.length,
  147. totalPages: totalPages,
  148. executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
  149. });
  150. return documents;
  151. }
  152. /**
  153. * Loads an image file and returns the OCRed text.
  154. * @param {string} filePath - The path to the image file.
  155. * @param {Object} options - The options for the OCR.
  156. * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
  157. * @returns {Promise<string>} The OCRed text.
  158. */
  159. async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
  160. let content = "";
  161. let worker = null;
  162. if (
  163. !filePath ||
  164. !fs.existsSync(filePath) ||
  165. !fs.statSync(filePath).isFile()
  166. ) {
  167. this.log(`File ${filePath} does not exist. Skipping OCR.`);
  168. return null;
  169. }
  170. const documentTitle = path.basename(filePath);
  171. try {
  172. this.log(`Starting OCR of ${documentTitle}`);
  173. const startTime = Date.now();
  174. const { createWorker, OEM } = require("tesseract.js");
  175. worker = await createWorker("eng", OEM.LSTM_ONLY, {
  176. cachePath: this.cacheDir,
  177. });
  178. // Race the timeout with the OCR
  179. const timeoutPromise = new Promise((_, reject) => {
  180. setTimeout(() => {
  181. reject(
  182. new Error(
  183. `OCR job took too long to complete (${
  184. maxExecutionTime / 1000
  185. } seconds)`
  186. )
  187. );
  188. }, maxExecutionTime);
  189. });
  190. const processImage = async () => {
  191. const { data } = await worker.recognize(filePath, {}, "text");
  192. content = data.text;
  193. };
  194. await Promise.race([timeoutPromise, processImage()]);
  195. this.log(`Completed OCR of ${documentTitle}!`, {
  196. executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
  197. });
  198. return content;
  199. } catch (e) {
  200. this.log(`Error: ${e.message}`);
  201. return null;
  202. } finally {
  203. if (!worker) return;
  204. await worker.terminate();
  205. }
  206. }
  207. }
  208. /**
  209. * Converts a PDF page to a buffer using Sharp.
  210. * @param {Object} options - The options for the Sharp PDF page object.
  211. * @param {Object} options.page - The PDFJS page proxy object.
  212. * @returns {Promise<Buffer>} The buffer of the page.
  213. */
  214. class PDFSharp {
  215. constructor({ validOps = [] } = {}) {
  216. this.sharp = null;
  217. this.validOps = validOps;
  218. }
  219. log(text, ...args) {
  220. console.log(`\x1b[36m[PDFSharp]\x1b[0m ${text}`, ...args);
  221. }
  222. async init() {
  223. this.sharp = (await import("sharp")).default;
  224. }
  225. /**
  226. * Converts a PDF page to a buffer.
  227. * @param {Object} options - The options for the Sharp PDF page object.
  228. * @param {Object} options.page - The PDFJS page proxy object.
  229. * @returns {Promise<Buffer>} The buffer of the page.
  230. */
  231. async pageToBuffer({ page }) {
  232. if (!this.sharp) await this.init();
  233. try {
  234. this.log(`Converting page ${page.pageNumber} to image...`);
  235. const ops = await page.getOperatorList();
  236. const pageImages = ops.fnArray.length;
  237. for (let i = 0; i < pageImages; i++) {
  238. try {
  239. if (!this.validOps.includes(ops.fnArray[i])) continue;
  240. const name = ops.argsArray[i][0];
  241. const img = await page.objs.get(name);
  242. const { width, height } = img;
  243. const size = img.data.length;
  244. const channels = size / width / height;
  245. const targetDPI = 70;
  246. const targetWidth = Math.floor(width * (targetDPI / 72));
  247. const targetHeight = Math.floor(height * (targetDPI / 72));
  248. const image = this.sharp(img.data, {
  249. raw: { width, height, channels },
  250. density: targetDPI,
  251. })
  252. .resize({
  253. width: targetWidth,
  254. height: targetHeight,
  255. fit: "fill",
  256. })
  257. .withMetadata({
  258. density: targetDPI,
  259. resolution: targetDPI,
  260. })
  261. .png();
  262. // For debugging purposes
  263. // await image.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`));
  264. return await image.toBuffer();
  265. } catch (error) {
  266. this.log(`Iteration error: ${error.message}`, error.stack);
  267. continue;
  268. }
  269. }
  270. this.log(`No valid images found on page ${page.pageNumber}`);
  271. return null;
  272. } catch (error) {
  273. this.log(`Error: ${error.message}`, error.stack);
  274. return null;
  275. }
  276. }
  277. }
  278. module.exports = OCRLoader;