chtech-anythingllm/collector/utils/OCRLoader/index.js


								const fs = require("fs");

								const os = require("os");

								const path = require("path");


								class OCRLoader {

								  constructor() {

								    this.cacheDir = path.resolve(

								      process.env.STORAGE_DIR

								        ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)

								        : path.resolve(__dirname, `../../../server/storage/models/tesseract`)

								    );

								  }


								  log(text, ...args) {

								    console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args);

								  }


								  /**

								   * Loads a PDF file and returns an array of documents.

								   * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function

								   * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata.

								   */

								  async ocrPDF(

								    filePath,

								    { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {}

								  ) {

								    if (

								      !filePath ||

								      !fs.existsSync(filePath) ||

								      !fs.statSync(filePath).isFile()

								    ) {

								      this.log(`File ${filePath} does not exist. Skipping OCR.`);

								      return [];

								    }


								    const documentTitle = path.basename(filePath);

								    this.log(`Starting OCR of ${documentTitle}`);

								    const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");

								    let buffer = fs.readFileSync(filePath);


								    const pdfDocument = await pdfjs.getDocument({ data: buffer });


								    const documents = [];

								    const meta = await pdfDocument.getMetadata().catch(() => null);

								    const metadata = {

								      source: filePath,

								      pdf: {

								        version: "v2.0.550",

								        info: meta?.info,

								        metadata: meta?.metadata,

								        totalPages: pdfDocument.numPages,

								      },

								    };


								    const pdfSharp = new PDFSharp({

								      validOps: [

								        pdfjs.OPS.paintJpegXObject,

								        pdfjs.OPS.paintImageXObject,

								        pdfjs.OPS.paintInlineImageXObject,

								      ],

								    });

								    await pdfSharp.init();


								    const { createWorker, OEM } = require("tesseract.js");

								    const BATCH_SIZE = batchSize;

								    const MAX_EXECUTION_TIME = maxExecutionTime;

								    const NUM_WORKERS = maxWorkers ?? Math.min(os.cpus().length, 4);

								    const totalPages = pdfDocument.numPages;

								    const workerPool = await Promise.all(

								      Array(NUM_WORKERS)

								        .fill(0)

								        .map(() =>

								          createWorker("eng", OEM.LSTM_ONLY, {

								            cachePath: this.cacheDir,

								          })

								        )

								    );


								    const startTime = Date.now();

								    try {

								      this.log("Bootstrapping OCR completed successfully!", {

								        MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME,

								        BATCH_SIZE,

								        MAX_CONCURRENT_WORKERS: NUM_WORKERS,

								        TOTAL_PAGES: totalPages,

								      });

								      const timeoutPromise = new Promise((_, reject) => {

								        setTimeout(() => {

								          reject(

								            new Error(

								              `OCR job took too long to complete (${

								                MAX_EXECUTION_TIME / 1000

								              } seconds)`

								            )

								          );

								        }, MAX_EXECUTION_TIME);

								      });


								      const processPages = async () => {

								        for (

								          let startPage = 1;

								          startPage <= totalPages;

								          startPage += BATCH_SIZE

								        ) {

								          const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages);

								          const pageNumbers = Array.from(

								            { length: endPage - startPage + 1 },

								            (_, i) => startPage + i

								          );

								          this.log(`Working on pages ${startPage} - ${endPage}`);


								          const pageQueue = [...pageNumbers];

								          const results = [];

								          const workerPromises = workerPool.map(async (worker, workerIndex) => {

								            while (pageQueue.length > 0) {

								              const pageNum = pageQueue.shift();

								              this.log(

								                `\x1b[34m[Worker ${

								                  workerIndex + 1

								                }]\x1b[0m assigned pg${pageNum}`

								              );

								              const page = await pdfDocument.getPage(pageNum);

								              const imageBuffer = await pdfSharp.pageToBuffer({ page });

								              if (!imageBuffer) continue;

								              const { data } = await worker.recognize(imageBuffer, {}, "text");

								              this.log(

								                `✅ \x1b[34m[Worker ${

								                  workerIndex + 1

								                }]\x1b[0m completed pg${pageNum}`

								              );

								              results.push({

								                pageContent: data.text,

								                metadata: {

								                  ...metadata,

								                  loc: { pageNumber: pageNum },

								                },

								              });

								            }

								          });


								          await Promise.all(workerPromises);

								          documents.push(

								            ...results.sort(

								              (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber

								            )

								          );

								        }

								        return documents;

								      };


								      await Promise.race([timeoutPromise, processPages()]);

								    } catch (e) {

								      this.log(`Error: ${e.message}`, e.stack);

								    } finally {

								      global.Image = undefined;

								      await Promise.all(workerPool.map((worker) => worker.terminate()));

								    }


								    this.log(`Completed OCR of ${documentTitle}!`, {

								      documentsParsed: documents.length,

								      totalPages: totalPages,

								      executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,

								    });

								    return documents;

								  }


								  /**

								   * Loads an image file and returns the OCRed text.

								   * @param {string} filePath - The path to the image file.

								   * @param {Object} options - The options for the OCR.

								   * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.

								   * @returns {Promise<string>} The OCRed text.

								   */

								  async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {

								    let content = "";

								    let worker = null;

								    if (

								      !filePath ||

								      !fs.existsSync(filePath) ||

								      !fs.statSync(filePath).isFile()

								    ) {

								      this.log(`File ${filePath} does not exist. Skipping OCR.`);

								      return null;

								    }


								    const documentTitle = path.basename(filePath);

								    try {

								      this.log(`Starting OCR of ${documentTitle}`);

								      const startTime = Date.now();

								      const { createWorker, OEM } = require("tesseract.js");

								      worker = await createWorker("eng", OEM.LSTM_ONLY, {

								        cachePath: this.cacheDir,

								      });


								      // Race the timeout with the OCR

								      const timeoutPromise = new Promise((_, reject) => {

								        setTimeout(() => {

								          reject(

								            new Error(

								              `OCR job took too long to complete (${

								                maxExecutionTime / 1000

								              } seconds)`

								            )

								          );

								        }, maxExecutionTime);

								      });


								      const processImage = async () => {

								        const { data } = await worker.recognize(filePath, {}, "text");

								        content = data.text;

								      };


								      await Promise.race([timeoutPromise, processImage()]);

								      this.log(`Completed OCR of ${documentTitle}!`, {

								        executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,

								      });


								      return content;

								    } catch (e) {

								      this.log(`Error: ${e.message}`);

								      return null;

								    } finally {

								      if (!worker) return;

								      await worker.terminate();

								    }

								  }

								}


								/**
								 * Helper that turns the embedded image of a PDF page into a PNG buffer using Sharp.
								 * It is constructed with the list of PDF.js operator codes (`validOps`) that count as
								 * image paints; the per-page conversion itself is done by `pageToBuffer({ page })`.
								 */

								class PDFSharp {
								  /**
								   * @param {Object} [options]
								   * @param {Array} [options.validOps] - Operator codes (as found in `ops.fnArray`) that are treated as image paints.
								   */
								  constructor({ validOps = [] } = {}) {
								    this.sharp = null;
								    this.validOps = validOps;
								  }

								  /**
								   * Writes a message to stdout with a colored `[PDFSharp]` prefix.
								   * @param {string} text - The message to log.
								   * @param {...any} args - Extra values forwarded to `console.log`.
								   */
								  log(text, ...args) {
								    console.log(`\x1b[36m[PDFSharp]\x1b[0m ${text}`, ...args);
								  }

								  /**
								   * Dynamically imports `sharp` and stores its default export on `this.sharp`.
								   * @returns {Promise<void>}
								   */
								  async init() {
								    this.sharp = (await import("sharp")).default;
								  }

								  /**
								   * Converts a PDF page to a buffer.
								   * Walks the page's operator list and encodes the FIRST operator in `validOps`
								   * that converts successfully as a PNG; operators that fail are logged and skipped.
								   * @param {Object} options - The options for the Sharp PDF page object.
								   * @param {Object} options.page - The PDFJS page proxy object.
								   * @returns {Promise<Buffer|null>} The buffer of the page, or null when no usable image was found.
								   */
								  async pageToBuffer({ page }) {
								    if (!this.sharp) await this.init();
								    try {
								      this.log(`Converting page ${page.pageNumber} to image...`);
								      const ops = await page.getOperatorList();
								      const opCount = ops.fnArray.length;

								      for (let i = 0; i < opCount; i++) {
								        try {
								          if (!this.validOps.includes(ops.fnArray[i])) continue;

								          // Named image operators carry an object id to look up in `page.objs`;
								          // inline image operators carry the image data object itself, and
								          // passing that object to `objs.get` would throw.
								          const ref = ops.argsArray[i][0];
								          const img =
								            ref !== null && typeof ref === "object"
								              ? ref
								              : await page.objs.get(ref);
								          const { width, height } = img;
								          const size = img.data.length;
								          const channels = size / width / height;
								          const targetDPI = 70;
								          // Clamp to 1px: flooring a tiny dimension to 0 is not a valid resize target.
								          const targetWidth = Math.max(
								            1,
								            Math.floor(width * (targetDPI / 72))
								          );
								          const targetHeight = Math.max(
								            1,
								            Math.floor(height * (targetDPI / 72))
								          );

								          const image = this.sharp(img.data, {
								            raw: { width, height, channels },
								            density: targetDPI,
								          })
								            .resize({
								              width: targetWidth,
								              height: targetHeight,
								              fit: "fill",
								            })
								            .withMetadata({
								              density: targetDPI,
								              resolution: targetDPI,
								            })
								            .png();

								          // For debugging purposes
								          // await image.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`));
								          return await image.toBuffer();
								        } catch (error) {
								          this.log(`Iteration error: ${error.message}`, error.stack);
								          continue;
								        }
								      }
								      this.log(`No valid images found on page ${page.pageNumber}`);
								      return null;
								    } catch (error) {
								      this.log(`Error: ${error.message}`, error.stack);
								      return null;
								    }
								  }
								}


								// Only OCRLoader is exported; PDFSharp is not part of this module's exports.
								module.exports = OCRLoader;