You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

57 lines
1.6 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const { DocxLoader } = require("langchain/document_loaders/fs/docx");
  3. const {
  4. createdDate,
  5. trashFile,
  6. writeToServerDocuments,
  7. } = require("../../utils/files");
  8. const { tokenizeString } = require("../../utils/tokenizer");
  9. const { default: slugify } = require("slugify");
  10. async function asDocX({ fullFilePath = "", filename = "" }) {
  11. const loader = new DocxLoader(fullFilePath);
  12. console.log(`-- Working ${filename} --`);
  13. let pageContent = [];
  14. const docs = await loader.load();
  15. for (const doc of docs) {
  16. console.log(`-- Parsing content from docx page --`);
  17. if (!doc.pageContent.length) continue;
  18. pageContent.push(doc.pageContent);
  19. }
  20. if (!pageContent.length) {
  21. console.error(`Resulting text content was empty for ${filename}.`);
  22. trashFile(fullFilePath);
  23. return {
  24. success: false,
  25. reason: `No text content found in ${filename}.`,
  26. documents: [],
  27. };
  28. }
  29. const content = pageContent.join("");
  30. const data = {
  31. id: v4(),
  32. url: "file://" + fullFilePath,
  33. title: filename,
  34. docAuthor: "no author found",
  35. description: "No description found.",
  36. docSource: "pdf file uploaded by the user.",
  37. chunkSource: "",
  38. published: createdDate(fullFilePath),
  39. wordCount: content.split(" ").length,
  40. pageContent: content,
  41. token_count_estimate: tokenizeString(content),
  42. };
  43. const document = writeToServerDocuments(
  44. data,
  45. `${slugify(filename)}-${data.id}`
  46. );
  47. trashFile(fullFilePath);
  48. console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  49. return { success: true, reason: null, documents: [document] };
  50. }
  51. module.exports = asDocX;