You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
307 lines
9.3 KiB
307 lines
9.3 KiB
const fs = require("fs");
|
|
const os = require("os");
|
|
const path = require("path");
|
|
|
|
/**
 * OCR helper built on tesseract.js.
 *
 * Exposes two entry points: `ocrPDF` for scanned PDF files (each page's
 * embedded image is extracted and recognized) and `ocrImage` for single
 * image files. Tesseract language data is cached under `this.cacheDir`.
 */
class OCRLoader {
  constructor() {
    // Prefer the configured storage directory; otherwise fall back to the
    // server storage folder relative to this file.
    this.cacheDir = path.resolve(
      process.env.STORAGE_DIR
        ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
        : path.resolve(__dirname, `../../../server/storage/models/tesseract`)
    );
  }

  /**
   * Writes a prefixed message to stdout.
   * @param {string} text - The message to log.
   * @param {...any} args - Extra values forwarded to console.log.
   */
  log(text, ...args) {
    console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args);
  }

  /**
   * Loads a PDF file and returns an array of documents.
   * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function
   *
   * Pages are handled in batches; within a batch the pages are pulled from a
   * shared queue by a pool of tesseract workers. Pages for which no image
   * could be extracted are skipped. If the time limit is exceeded or a worker
   * fails, the error is logged and the documents of the batches completed so
   * far are returned.
   *
   * @param {string} filePath - The path to the PDF file.
   * @param {Object} options - The options for the OCR.
   * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
   * @param {number} options.batchSize - The number of pages per batch (values below 1 are treated as 1).
   * @param {number|null} options.maxWorkers - The number of workers to spawn (defaults to min(CPU count, 4); values below 1 are treated as 1).
   * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata.
   */
  async ocrPDF(
    filePath,
    { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {}
  ) {
    if (
      !filePath ||
      !fs.existsSync(filePath) ||
      !fs.statSync(filePath).isFile()
    ) {
      this.log(`File ${filePath} does not exist. Skipping OCR.`);
      return [];
    }

    const documentTitle = path.basename(filePath);
    this.log(`Starting OCR of ${documentTitle}`);
    const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
    const buffer = fs.readFileSync(filePath);

    const pdfDocument = await pdfjs.getDocument({ data: buffer });

    const documents = [];
    const meta = await pdfDocument.getMetadata().catch(() => null);
    const metadata = {
      source: filePath,
      pdf: {
        version: "v2.0.550",
        info: meta?.info,
        metadata: meta?.metadata,
        totalPages: pdfDocument.numPages,
      },
    };

    const pdfSharp = new PDFSharp({
      validOps: [
        pdfjs.OPS.paintJpegXObject,
        pdfjs.OPS.paintImageXObject,
        pdfjs.OPS.paintInlineImageXObject,
      ],
    });
    await pdfSharp.init();

    const { createWorker, OEM } = require("tesseract.js");
    // A batch size below 1 would never advance the batch loop, and zero
    // workers would silently process nothing, so both are clamped to >= 1.
    const BATCH_SIZE = Math.max(1, Math.floor(batchSize) || 1);
    const MAX_EXECUTION_TIME = maxExecutionTime;
    const NUM_WORKERS = Math.max(
      1,
      Math.floor(maxWorkers ?? Math.min(os.cpus().length, 4)) || 1
    );
    const totalPages = pdfDocument.numPages;

    // Spawn every worker, but if any of them fails to start make sure the
    // ones that did start are terminated before the error is propagated.
    const spawned = await Promise.allSettled(
      Array.from({ length: NUM_WORKERS }, () =>
        createWorker("eng", OEM.LSTM_ONLY, {
          cachePath: this.cacheDir,
        })
      )
    );
    const failedSpawn = spawned.find((entry) => entry.status === "rejected");
    if (failedSpawn) {
      await Promise.allSettled(
        spawned
          .filter((entry) => entry.status === "fulfilled")
          .map((entry) => entry.value.terminate())
      );
      throw failedSpawn.reason;
    }
    const workerPool = spawned.map((entry) => entry.value);

    const startTime = Date.now();
    let timeoutId = null;
    // Flipped once the job is over (success, failure or timeout) so that page
    // loops still running in the background stop picking up work.
    let finished = false;
    try {
      this.log("Bootstrapping OCR completed successfully!", {
        MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME,
        BATCH_SIZE,
        MAX_CONCURRENT_WORKERS: NUM_WORKERS,
        TOTAL_PAGES: totalPages,
      });
      const timeoutPromise = new Promise((_, reject) => {
        timeoutId = setTimeout(() => {
          reject(
            new Error(
              `OCR job took too long to complete (${
                MAX_EXECUTION_TIME / 1000
              } seconds)`
            )
          );
        }, MAX_EXECUTION_TIME);
      });

      const processPages = async () => {
        for (
          let startPage = 1;
          startPage <= totalPages;
          startPage += BATCH_SIZE
        ) {
          if (finished) break;
          const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages);
          const pageNumbers = Array.from(
            { length: endPage - startPage + 1 },
            (_, i) => startPage + i
          );
          this.log(`Working on pages ${startPage} - ${endPage}`);

          const pageQueue = [...pageNumbers];
          const results = [];
          const workerPromises = workerPool.map(async (worker, workerIndex) => {
            while (!finished && pageQueue.length > 0) {
              const pageNum = pageQueue.shift();
              this.log(
                `\x1b[34m[Worker ${
                  workerIndex + 1
                }]\x1b[0m assigned pg${pageNum}`
              );
              const page = await pdfDocument.getPage(pageNum);
              const imageBuffer = await pdfSharp.pageToBuffer({ page });
              if (!imageBuffer) continue;
              const { data } = await worker.recognize(imageBuffer, {}, "text");
              this.log(
                `✅ \x1b[34m[Worker ${
                  workerIndex + 1
                }]\x1b[0m completed pg${pageNum}`
              );
              results.push({
                pageContent: data.text,
                metadata: {
                  ...metadata,
                  loc: { pageNumber: pageNum },
                },
              });
            }
          });

          await Promise.all(workerPromises);
          // Never touch `documents` once it has been handed back to the caller.
          if (finished) break;
          // Workers finish out of order; restore page order within the batch.
          documents.push(
            ...results.sort(
              (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber
            )
          );
        }
        return documents;
      };

      await Promise.race([timeoutPromise, processPages()]);
    } catch (e) {
      this.log(`Error: ${e.message}`, e.stack);
    } finally {
      finished = true;
      // Without this the pending timer outlives the job for up to
      // MAX_EXECUTION_TIME milliseconds.
      if (timeoutId !== null) clearTimeout(timeoutId);
      global.Image = undefined;
      await Promise.all(workerPool.map((worker) => worker.terminate()));
    }

    this.log(`Completed OCR of ${documentTitle}!`, {
      documentsParsed: documents.length,
      totalPages: totalPages,
      executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
    });
    return documents;
  }

  /**
   * Loads an image file and returns the OCRed text.
   * @param {string} filePath - The path to the image file.
   * @param {Object} options - The options for the OCR.
   * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
   * @returns {Promise<string|null>} The OCRed text, or null when the file is missing or the OCR failed or timed out.
   */
  async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
    let content = "";
    let worker = null;
    let timeoutId = null;
    if (
      !filePath ||
      !fs.existsSync(filePath) ||
      !fs.statSync(filePath).isFile()
    ) {
      this.log(`File ${filePath} does not exist. Skipping OCR.`);
      return null;
    }

    const documentTitle = path.basename(filePath);
    try {
      this.log(`Starting OCR of ${documentTitle}`);
      const startTime = Date.now();
      const { createWorker, OEM } = require("tesseract.js");
      worker = await createWorker("eng", OEM.LSTM_ONLY, {
        cachePath: this.cacheDir,
      });

      // Race the timeout with the OCR
      const timeoutPromise = new Promise((_, reject) => {
        timeoutId = setTimeout(() => {
          reject(
            new Error(
              `OCR job took too long to complete (${
                maxExecutionTime / 1000
              } seconds)`
            )
          );
        }, maxExecutionTime);
      });

      const processImage = async () => {
        const { data } = await worker.recognize(filePath, {}, "text");
        content = data.text;
      };

      await Promise.race([timeoutPromise, processImage()]);
      this.log(`Completed OCR of ${documentTitle}!`, {
        executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
      });

      return content;
    } catch (e) {
      this.log(`Error: ${e.message}`);
      return null;
    } finally {
      // No `return` in here: a return inside `finally` would replace the
      // value returned from the try/catch blocks above.
      if (timeoutId !== null) clearTimeout(timeoutId);
      if (worker) await worker.terminate();
    }
  }
}
|
|
|
|
/**
 * Turns a PDFJS page into a PNG buffer with Sharp by rasterizing the first
 * image on the page whose paint operator is listed in `validOps`.
 */
class PDFSharp {
  /**
   * @param {Object} options - Construction options.
   * @param {number[]} options.validOps - The operator codes (compared against the page operator list) that are treated as images.
   */
  constructor({ validOps = [] } = {}) {
    this.sharp = null;
    this.validOps = validOps;
  }

  /**
   * Writes a prefixed message to stdout.
   * @param {string} text - The message to log.
   * @param {...any} args - Extra values forwarded to console.log.
   */
  log(text, ...args) {
    const prefix = "\x1b[36m[PDFSharp]\x1b[0m";
    console.log(`${prefix} ${text}`, ...args);
  }

  /**
   * Lazily loads the `sharp` module and stores its default export on the instance.
   */
  async init() {
    const sharpModule = await import("sharp");
    this.sharp = sharpModule.default;
  }

  /**
   * Converts a PDF page to a buffer.
   * The operator list of the page is scanned in order and the first operator
   * contained in `validOps` whose image converts successfully wins. An image
   * that fails to convert is logged and the scan moves on to the next operator.
   * @param {Object} options - The options for the Sharp PDF page object.
   * @param {Object} options.page - The PDFJS page proxy object.
   * @returns {Promise<Buffer|null>} The PNG buffer of the page, or null when no image could be produced.
   */
  async pageToBuffer({ page }) {
    if (!this.sharp) await this.init();

    try {
      this.log(`Converting page ${page.pageNumber} to image...`);
      const operatorList = await page.getOperatorList();
      const operatorCount = operatorList.fnArray.length;

      for (let opIndex = 0; opIndex < operatorCount; opIndex += 1) {
        try {
          const isImageOperator = this.validOps.includes(
            operatorList.fnArray[opIndex]
          );
          if (isImageOperator) {
            const objectName = operatorList.argsArray[opIndex][0];
            const img = await page.objs.get(objectName);
            const { width, height } = img;

            // Raw pixel layout: bytes per pixel derived from the buffer size.
            const byteLength = img.data.length;
            const channels = byteLength / width / height;

            // Scale from the 72 DPI base down to the target density.
            const targetDPI = 70;
            const targetWidth = Math.floor(width * (targetDPI / 72));
            const targetHeight = Math.floor(height * (targetDPI / 72));

            const rawInput = this.sharp(img.data, {
              raw: { width, height, channels },
              density: targetDPI,
            });
            const resized = rawInput.resize({
              width: targetWidth,
              height: targetHeight,
              fit: "fill",
            });
            const tagged = resized.withMetadata({
              density: targetDPI,
              resolution: targetDPI,
            });
            const pngImage = tagged.png();

            // For debugging purposes
            // await pngImage.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`));
            return await pngImage.toBuffer();
          }
        } catch (error) {
          this.log(`Iteration error: ${error.message}`, error.stack);
        }
      }

      this.log(`No valid images found on page ${page.pageNumber}`);
      return null;
    } catch (error) {
      this.log(`Error: ${error.message}`, error.stack);
      return null;
    }
  }
}
|
|
|
|
// Only the OCRLoader class is exported from this module.
module.exports = OCRLoader;
|