You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

307 lines
9.3 KiB

const fs = require("fs");
const os = require("os");
const path = require("path");
/**
 * Runs Tesseract OCR over scanned PDFs and standalone images.
 * Tesseract language data is cached under `<STORAGE_DIR>/models/tesseract`
 * (or the repository's server storage folder when STORAGE_DIR is unset).
 */
class OCRLoader {
  constructor() {
    this.cacheDir = path.resolve(
      process.env.STORAGE_DIR
        ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
        : path.resolve(__dirname, `../../../server/storage/models/tesseract`)
    );
  }

  /**
   * Writes a message to stdout with a colored `[OCRLoader]` prefix.
   * @param {string} text - The message to print.
   * @param {...any} args - Extra values forwarded to console.log.
   */
  log(text, ...args) {
    console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args);
  }

  /**
   * Creates a cancellable deadline for an OCR job.
   * @param {number} ms - Milliseconds to wait before rejecting.
   * @returns {{promise: Promise<never>, cancel: () => void}} A promise that rejects once the
   * deadline passes and a function that disarms the underlying timer.
   */
  createTimeout(ms) {
    let timer = null;
    const promise = new Promise((_, reject) => {
      timer = setTimeout(() => {
        reject(
          new Error(`OCR job took too long to complete (${ms / 1000} seconds)`)
        );
      }, ms);
    });
    return { promise, cancel: () => clearTimeout(timer) };
  }

  /**
   * Loads a PDF file and returns an array of documents.
   * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function
   * @param {string} filePath - The path to the PDF file.
   * @param {Object} [options] - The options for the OCR.
   * @param {number} [options.maxExecutionTime] - The maximum execution time of the OCR in milliseconds.
   * @param {number} [options.batchSize] - How many pages are handed to the worker pool per batch (minimum 1).
   * @param {number|null} [options.maxWorkers] - Number of Tesseract workers; defaults to min(CPU count, 4), minimum 1.
   * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata.
   * On timeout or error, only the batches that fully completed are returned.
   */
  async ocrPDF(
    filePath,
    { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {}
  ) {
    if (
      !filePath ||
      !fs.existsSync(filePath) ||
      !fs.statSync(filePath).isFile()
    ) {
      this.log(`File ${filePath} does not exist. Skipping OCR.`);
      return [];
    }

    const documentTitle = path.basename(filePath);
    this.log(`Starting OCR of ${documentTitle}`);
    const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
    const buffer = fs.readFileSync(filePath);
    const pdfDocument = await pdfjs.getDocument({ data: buffer });
    const documents = [];
    const meta = await pdfDocument.getMetadata().catch(() => null);
    const metadata = {
      source: filePath,
      pdf: {
        version: "v2.0.550",
        info: meta?.info,
        metadata: meta?.metadata,
        totalPages: pdfDocument.numPages,
      },
    };

    const pdfSharp = new PDFSharp({
      validOps: [
        pdfjs.OPS.paintJpegXObject,
        pdfjs.OPS.paintImageXObject,
        pdfjs.OPS.paintInlineImageXObject,
      ],
    });
    await pdfSharp.init();

    const { createWorker, OEM } = require("tesseract.js");
    // A batch size below 1 would never advance the page cursor, and zero workers
    // would silently produce no output, so both are clamped to at least 1.
    const BATCH_SIZE = Math.max(1, Math.floor(Number(batchSize)) || 1);
    const MAX_EXECUTION_TIME = maxExecutionTime;
    const NUM_WORKERS = Math.max(
      1,
      Math.floor(Number(maxWorkers ?? Math.min(os.cpus().length, 4))) || 1
    );
    const totalPages = pdfDocument.numPages;

    // Wait for every worker to settle so that a single failed bootstrap does not
    // leak the workers that did start.
    const bootResults = await Promise.allSettled(
      Array.from({ length: NUM_WORKERS }, () =>
        createWorker("eng", OEM.LSTM_ONLY, {
          cachePath: this.cacheDir,
        })
      )
    );
    const bootFailure = bootResults.find(
      (result) => result.status === "rejected"
    );
    if (bootFailure) {
      await Promise.allSettled(
        bootResults
          .filter((result) => result.status === "fulfilled")
          .map((result) => result.value.terminate())
      );
      throw bootFailure.reason;
    }
    const workerPool = bootResults.map((result) => result.value);

    const startTime = Date.now();
    let timeout = null;
    // Flipped once the job is over (success, error or timeout) so background
    // loops stop pulling pages and stop touching `documents`.
    let isFinished = false;

    try {
      this.log("Bootstrapping OCR completed successfully!", {
        MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME,
        BATCH_SIZE,
        MAX_CONCURRENT_WORKERS: NUM_WORKERS,
        TOTAL_PAGES: totalPages,
      });

      timeout = this.createTimeout(MAX_EXECUTION_TIME);

      const processPages = async () => {
        for (
          let startPage = 1;
          startPage <= totalPages && !isFinished;
          startPage += BATCH_SIZE
        ) {
          const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages);
          const pageQueue = Array.from(
            { length: endPage - startPage + 1 },
            (_, i) => startPage + i
          );
          this.log(`Working on pages ${startPage} - ${endPage}`);

          const results = [];
          // Every worker drains the shared queue until the batch is exhausted.
          const workerPromises = workerPool.map(async (worker, workerIndex) => {
            while (!isFinished && pageQueue.length > 0) {
              const pageNum = pageQueue.shift();
              this.log(
                `\x1b[34m[Worker ${
                  workerIndex + 1
                }]\x1b[0m assigned pg${pageNum}`
              );

              const page = await pdfDocument.getPage(pageNum);
              const imageBuffer = await pdfSharp.pageToBuffer({ page });
              if (!imageBuffer) continue;

              const { data } = await worker.recognize(imageBuffer, {}, "text");
              this.log(
                `\x1b[34m[Worker ${
                  workerIndex + 1
                }]\x1b[0m completed pg${pageNum}`
              );

              results.push({
                pageContent: data.text,
                metadata: {
                  ...metadata,
                  loc: { pageNumber: pageNum },
                },
              });
            }
          });

          await Promise.all(workerPromises);
          // The caller may already hold `documents` after a timeout; never append late.
          if (isFinished) return;

          documents.push(
            ...results.sort(
              (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber
            )
          );
        }
      };

      await Promise.race([timeout.promise, processPages()]);
    } catch (e) {
      this.log(`Error: ${e.message}`, e.stack);
    } finally {
      isFinished = true;
      // Disarm the deadline so a finished job does not keep the event loop alive.
      timeout?.cancel();
      // NOTE(review): presumably resets a global installed while rendering pages - confirm.
      global.Image = undefined;
      await Promise.all(workerPool.map((worker) => worker.terminate()));
    }

    this.log(`Completed OCR of ${documentTitle}!`, {
      documentsParsed: documents.length,
      totalPages: totalPages,
      executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
    });
    return documents;
  }

  /**
   * Loads an image file and returns the OCRed text.
   * @param {string} filePath - The path to the image file.
   * @param {Object} options - The options for the OCR.
   * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
   * @returns {Promise<string|null>} The OCRed text, or null when the file is missing or OCR fails/times out.
   */
  async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
    if (
      !filePath ||
      !fs.existsSync(filePath) ||
      !fs.statSync(filePath).isFile()
    ) {
      this.log(`File ${filePath} does not exist. Skipping OCR.`);
      return null;
    }

    const documentTitle = path.basename(filePath);
    let worker = null;
    let timeout = null;

    try {
      this.log(`Starting OCR of ${documentTitle}`);
      const startTime = Date.now();
      const { createWorker, OEM } = require("tesseract.js");
      worker = await createWorker("eng", OEM.LSTM_ONLY, {
        cachePath: this.cacheDir,
      });

      // Race the timeout with the OCR
      timeout = this.createTimeout(maxExecutionTime);
      const { data } = await Promise.race([
        timeout.promise,
        worker.recognize(filePath, {}, "text"),
      ]);

      this.log(`Completed OCR of ${documentTitle}!`, {
        executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
      });
      return data.text;
    } catch (e) {
      this.log(`Error: ${e.message}`);
      return null;
    } finally {
      // No `return` in here: returning from `finally` would replace the value
      // produced by the try/catch above (null would turn into undefined).
      timeout?.cancel();
      if (worker) await worker.terminate();
    }
  }
}
/**
 * Extracts an image painted on a PDF page and re-encodes it as a PNG buffer via Sharp.
 * Only operators whose code appears in `validOps` are considered image candidates.
 */
class PDFSharp {
  /**
   * @param {Object} [options] - Construction options.
   * @param {number[]} [options.validOps] - Operator codes (as found in a page operator list's
   * `fnArray`) that should be treated as image paint operations.
   */
  constructor({ validOps = [] } = {}) {
    this.sharp = null;
    this.validOps = validOps;
  }

  /**
   * Prints a message to stdout with a colored `[PDFSharp]` prefix.
   * @param {string} text - The message to print.
   * @param {...any} args - Extra values forwarded to console.log.
   */
  log(text, ...args) {
    const prefix = `\x1b[36m[PDFSharp]\x1b[0m`;
    console.log(`${prefix} ${text}`, ...args);
  }

  /**
   * Dynamically imports `sharp` and stores its default export on the instance.
   * @returns {Promise<void>}
   */
  async init() {
    const sharpModule = await import("sharp");
    this.sharp = sharpModule.default;
  }

  /**
   * Converts a PDF page to a buffer.
   * Walks the page's operator list and returns the PNG encoding of the first valid
   * image operator that converts successfully; failing operators are logged and skipped.
   * @param {Object} options - The options for the Sharp PDF page object.
   * @param {Object} options.page - The PDFJS page proxy object.
   * @returns {Promise<Buffer|null>} The buffer of the page, or null when no image could be converted.
   */
  async pageToBuffer({ page }) {
    if (!this.sharp) await this.init();

    try {
      this.log(`Converting page ${page.pageNumber} to image...`);
      const { fnArray, argsArray } = await page.getOperatorList();

      for (const [opIndex, opCode] of fnArray.entries()) {
        try {
          if (!this.validOps.includes(opCode)) continue;

          const objectName = argsArray[opIndex][0];
          const rawImage = await page.objs.get(objectName);
          const { width, height, data: pixels } = rawImage;
          // Channel count is derived from the raw byte length of the decoded image.
          const channels = pixels.length / width / height;

          const targetDPI = 70;
          const scale = targetDPI / 72;
          const pipeline = this.sharp(pixels, {
            raw: { width, height, channels },
            density: targetDPI,
          })
            .resize({
              width: Math.floor(width * scale),
              height: Math.floor(height * scale),
              fit: "fill",
            })
            .withMetadata({
              density: targetDPI,
              resolution: targetDPI,
            })
            .png();

          // For debugging purposes
          // await pipeline.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`));
          return await pipeline.toBuffer();
        } catch (error) {
          // One bad operator must not abort the page; move on to the next one.
          this.log(`Iteration error: ${error.message}`, error.stack);
        }
      }

      this.log(`No valid images found on page ${page.pageNumber}`);
      return null;
    } catch (error) {
      this.log(`Error: ${error.message}`, error.stack);
      return null;
    }
  }
}
// OCRLoader is the module's export; PDFSharp is only instantiated internally by OCRLoader.ocrPDF.
module.exports = OCRLoader;