You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

72 lines
2.1 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const {
  3. createdDate,
  4. trashFile,
  5. writeToServerDocuments,
  6. } = require("../../../utils/files");
  7. const { tokenizeString } = require("../../../utils/tokenizer");
  8. const { default: slugify } = require("slugify");
  9. const PDFLoader = require("./PDFLoader");
  10. const OCRLoader = require("../../../utils/OCRLoader");
  11. async function asPdf({ fullFilePath = "", filename = "" }) {
  12. const pdfLoader = new PDFLoader(fullFilePath, {
  13. splitPages: true,
  14. });
  15. console.log(`-- Working ${filename} --`);
  16. const pageContent = [];
  17. let docs = await pdfLoader.load();
  18. if (docs.length === 0) {
  19. console.log(
  20. `[asPDF] No text content found for ${filename}. Will attempt OCR parse.`
  21. );
  22. docs = await new OCRLoader().ocrPDF(fullFilePath);
  23. }
  24. for (const doc of docs) {
  25. console.log(
  26. `-- Parsing content from pg ${
  27. doc.metadata?.loc?.pageNumber || "unknown"
  28. } --`
  29. );
  30. if (!doc.pageContent || !doc.pageContent.length) continue;
  31. pageContent.push(doc.pageContent);
  32. }
  33. if (!pageContent.length) {
  34. console.error(`[asPDF] Resulting text content was empty for ${filename}.`);
  35. trashFile(fullFilePath);
  36. return {
  37. success: false,
  38. reason: `No text content found in ${filename}.`,
  39. documents: [],
  40. };
  41. }
  42. const content = pageContent.join("");
  43. const data = {
  44. id: v4(),
  45. url: "file://" + fullFilePath,
  46. title: filename,
  47. docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
  48. description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.",
  49. docSource: "pdf file uploaded by the user.",
  50. chunkSource: "",
  51. published: createdDate(fullFilePath),
  52. wordCount: content.split(" ").length,
  53. pageContent: content,
  54. token_count_estimate: tokenizeString(content),
  55. };
  56. const document = writeToServerDocuments(
  57. data,
  58. `${slugify(filename)}-${data.id}`
  59. );
  60. trashFile(fullFilePath);
  61. console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  62. return { success: true, reason: null, documents: [document] };
  63. }
  64. module.exports = asPdf;