You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

53 lines
1.5 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const officeParser = require("officeparser");
  3. const {
  4. createdDate,
  5. trashFile,
  6. writeToServerDocuments,
  7. } = require("../../utils/files");
  8. const { tokenizeString } = require("../../utils/tokenizer");
  9. const { default: slugify } = require("slugify");
  /**
   * Reads the text of an office-style file through officeParser.
   * A parsing failure is logged and reported to the caller as an empty string
   * rather than being rethrown.
   *
   * @param {string} path - Location of the file handed to officeParser.
   * @returns {Promise<string>} Whatever officeParser resolves with, or "" when it throws.
   */
  async function extractOfficeText(path) {
    try {
      return await officeParser.parseOfficeAsync(path);
    } catch (parseError) {
      console.error(`Could not parse office or office-like file`, parseError);
      return "";
    }
  }

  /**
   * Converts an uploaded office-style file into a document record.
   *
   * The file's text is extracted with officeParser. If no text comes back
   * (including when parsing failed), the file is passed to trashFile and a
   * failure result is returned. Otherwise a metadata record is built, written
   * with writeToServerDocuments, the source file is passed to trashFile, and
   * the written document is returned.
   *
   * @param {Object} params
   * @param {string} [params.fullFilePath=""] - Path of the file to convert.
   * @param {string} [params.filename=""] - Name used for the title, log lines and output name.
   * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
   *   `success: false` with a reason and no documents when no text was found;
   *   otherwise `success: true` with the single written document.
   */
  async function asOfficeMime({ fullFilePath = "", filename = "" }) {
    console.log(`-- Working ${filename} --`);

    const extractedText = await extractOfficeText(fullFilePath);
    const hasText = Boolean(extractedText.length);

    // Nothing usable came out of the file: discard it and report why.
    if (!hasText) {
      console.error(`Resulting text content was empty for ${filename}.`);
      trashFile(fullFilePath);
      const failure = {
        success: false,
        reason: `No text content found in ${filename}.`,
        documents: [],
      };
      return failure;
    }

    const documentId = v4();
    const record = {
      id: documentId,
      url: `file://${fullFilePath}`,
      title: filename,
      docAuthor: "no author found",
      description: "No description found.",
      docSource: "Office file uploaded by the user.",
      chunkSource: "",
      published: createdDate(fullFilePath),
      // Counts the segments produced by splitting on single space characters.
      wordCount: extractedText.split(" ").length,
      pageContent: extractedText,
      token_count_estimate: tokenizeString(extractedText),
    };

    // The output name combines the slugified filename with the record id.
    const outputName = `${slugify(filename)}-${documentId}`;
    const writtenDocument = writeToServerDocuments(record, outputName);

    // The source file is no longer needed once the record has been written.
    trashFile(fullFilePath);
    console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);

    return { success: true, reason: null, documents: [writtenDocument] };
  }
  48. module.exports = asOfficeMime;