You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

55 lines
1.6 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const { EPubLoader } = require("langchain/document_loaders/fs/epub");
  3. const { tokenizeString } = require("../../utils/tokenizer");
  4. const {
  5. createdDate,
  6. trashFile,
  7. writeToServerDocuments,
  8. } = require("../../utils/files");
  9. const { default: slugify } = require("slugify");
  10. async function asEPub({ fullFilePath = "", filename = "" }) {
  11. let content = "";
  12. try {
  13. const loader = new EPubLoader(fullFilePath, { splitChapters: false });
  14. const docs = await loader.load();
  15. docs.forEach((doc) => (content += doc.pageContent));
  16. } catch (err) {
  17. console.error("Could not read epub file!", err);
  18. }
  19. if (!content?.length) {
  20. console.error(`Resulting text content was empty for ${filename}.`);
  21. trashFile(fullFilePath);
  22. return {
  23. success: false,
  24. reason: `No text content found in ${filename}.`,
  25. documents: [],
  26. };
  27. }
  28. console.log(`-- Working ${filename} --`);
  29. const data = {
  30. id: v4(),
  31. url: "file://" + fullFilePath,
  32. title: filename,
  33. docAuthor: "Unknown", // TODO: Find a better author
  34. description: "Unknown", // TODO: Find a better description
  35. docSource: "a epub file uploaded by the user.",
  36. chunkSource: "",
  37. published: createdDate(fullFilePath),
  38. wordCount: content.split(" ").length,
  39. pageContent: content,
  40. token_count_estimate: tokenizeString(content),
  41. };
  42. const document = writeToServerDocuments(
  43. data,
  44. `${slugify(filename)}-${data.id}`
  45. );
  46. trashFile(fullFilePath);
  47. console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  48. return { success: true, reason: null, documents: [document] };
  49. }
  50. module.exports = asEPub;