You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

73 lines
2.0 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const {
  3. createdDate,
  4. trashFile,
  5. writeToServerDocuments,
  6. } = require("../../utils/files");
  7. const { tokenizeString } = require("../../utils/tokenizer");
  8. const { default: slugify } = require("slugify");
  9. const { LocalWhisper } = require("../../utils/WhisperProviders/localWhisper");
  10. const { OpenAiWhisper } = require("../../utils/WhisperProviders/OpenAiWhisper");
  11. const WHISPER_PROVIDERS = {
  12. openai: OpenAiWhisper,
  13. local: LocalWhisper,
  14. };
  15. async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
  16. const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
  17. options?.whisperProvider
  18. )
  19. ? WHISPER_PROVIDERS[options?.whisperProvider]
  20. : WHISPER_PROVIDERS.local;
  21. console.log(`-- Working ${filename} --`);
  22. const whisper = new WhisperProvider({ options });
  23. const { content, error } = await whisper.processFile(fullFilePath, filename);
  24. if (!!error) {
  25. console.error(`Error encountered for parsing of ${filename}.`);
  26. trashFile(fullFilePath);
  27. return {
  28. success: false,
  29. reason: error,
  30. documents: [],
  31. };
  32. }
  33. if (!content?.length) {
  34. console.error(`Resulting text content was empty for ${filename}.`);
  35. trashFile(fullFilePath);
  36. return {
  37. success: false,
  38. reason: `No text content found in ${filename}.`,
  39. documents: [],
  40. };
  41. }
  42. const data = {
  43. id: v4(),
  44. url: "file://" + fullFilePath,
  45. title: filename,
  46. docAuthor: "no author found",
  47. description: "No description found.",
  48. docSource: "pdf file uploaded by the user.",
  49. chunkSource: "",
  50. published: createdDate(fullFilePath),
  51. wordCount: content.split(" ").length,
  52. pageContent: content,
  53. token_count_estimate: tokenizeString(content),
  54. };
  55. const document = writeToServerDocuments(
  56. data,
  57. `${slugify(filename)}-${data.id}`
  58. );
  59. trashFile(fullFilePath);
  60. console.log(
  61. `[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n`
  62. );
  63. return { success: true, reason: null, documents: [document] };
  64. }
  65. module.exports = asAudio;