You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

142 lines
3.6 KiB

11 months ago
  1. const fs = require("fs");
  2. const path = require("path");
  3. const { default: slugify } = require("slugify");
  4. const { v4 } = require("uuid");
  5. const { writeToServerDocuments } = require("../../files");
  6. const { tokenizeString } = require("../../tokenizer");
  7. const { YoutubeLoader } = require("./YoutubeLoader");
  /**
   * Checks whether a link points at a single YouTube video.
   *
   * The link is normalised to `protocol//host/path` plus the `v` query
   * parameter (every other query parameter is dropped) and then matched
   * against the short (`youtu.be/<id>`) and full (`youtube.com/watch?v=<id>`)
   * https patterns.
   *
   * @param {string} link - Candidate URL.
   * @returns {boolean} `true` when a video id could be extracted, otherwise
   *   `false` - including when `link` is not a parseable absolute URL.
   */
  function validYoutubeVideoUrl(link) {
    let opts;
    try {
      opts = new URL(link);
    } catch {
      // `new URL` throws on malformed, relative, or non-string input. Such a
      // link can never be a valid video URL, so report it as invalid instead
      // of letting the exception escape to the caller.
      return false;
    }

    // Loaded lazily, and only once the link is known to be a well-formed URL.
    const UrlPattern = require("url-pattern");
    const videoParam = opts.searchParams.has("v")
      ? `?v=${opts.searchParams.get("v")}`
      : "";
    const url = `${opts.protocol}//${opts.host}${opts.pathname}${videoParam}`;

    const shortPatternMatch = new UrlPattern(
      "https\\://(www.)youtu.be/(:videoId)"
    ).match(url);
    const fullPatternMatch = new UrlPattern(
      "https\\://(www.)youtube.com/watch?v=(:videoId)"
    ).match(url);

    return Boolean(shortPatternMatch?.videoId || fullPatternMatch?.videoId);
  }
  /**
   * Retrieves the transcript text and video metadata for a YouTube URL.
   *
   * @param {{ url: string }} params - Object holding the video URL.
   * @returns {Promise<{ success: boolean, reason: (string|null), content: (string|null), metadata: object }>}
   *   On failure `success` is `false`, `reason` says why, `content` is `null`
   *   and `metadata` is an empty object. On success `content` and `metadata`
   *   are taken from the first document the loader produced.
   */
  async function fetchVideoTranscriptContent({ url }) {
    // Every failure path returns the same shape; only the reason differs.
    const failure = (reason) => ({
      success: false,
      reason,
      content: null,
      metadata: {},
    });

    if (!validYoutubeVideoUrl(url)) {
      return failure("Invalid URL. Should be youtu.be or youtube.com/watch.");
    }

    console.log(`-- Working YouTube ${url} --`);
    const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
    // Started outside the try block so that only a rejected load is turned
    // into an error message below.
    const pendingDocs = loader.load();

    let docs = [];
    let error = null;
    try {
      docs = await pendingDocs;
    } catch (e) {
      // Prefer the text following an "Error:" marker when the message has one.
      error = e.message?.split("Error:")?.[1] || e.message;
    }

    if (!docs.length || !!error) {
      return failure(error ?? "No transcript found for that YouTube video.");
    }

    const { metadata, pageContent: content } = docs[0];
    if (!content.length) {
      return failure("No transcript could be parsed for that YouTube video.");
    }

    return { success: true, reason: null, content, metadata };
  }
  /**
   * Fetches the transcript of a YouTube video and passes it to
   * `writeToServerDocuments`, targeting a folder named after the video author.
   *
   * @param {{ url: string }} params - Object holding the video URL.
   * @returns {Promise<{ success: boolean, reason: (string|null), data?: { title: string, author: string, destination: string } }>}
   *   `success: false` with a `reason` when the transcript could not be
   *   retrieved; otherwise `success: true`, `reason: null` and a summary of
   *   the saved document (`destination` is the output folder name).
   */
  async function loadYouTubeTranscript({ url }) {
    const transcriptResults = await fetchVideoTranscriptContent({ url });
    if (!transcriptResults.success) {
      return {
        success: false,
        reason:
          transcriptResults.reason ||
          "An unknown error occurred during transcription retrieval",
      };
    }

    const { content, metadata } = transcriptResults;
    // The metadata may lack a title. Fall back to the URL everywhere a title
    // is needed so that building the file name does not fail on `undefined`.
    const title = metadata.title || url;

    const outFolder = slugify(
      `${metadata.author} YouTube transcripts`
    ).toLowerCase();
    const outFolderPath =
      process.env.NODE_ENV === "development"
        ? path.resolve(
            __dirname,
            `../../../../server/storage/documents/${outFolder}`
          )
        : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

    // With `recursive: true` this does nothing when the folder already exists,
    // so no separate existence check is needed.
    fs.mkdirSync(outFolderPath, { recursive: true });

    const data = {
      id: v4(),
      url: `${url}.youtube`,
      title,
      docAuthor: metadata.author,
      description: metadata.description,
      docSource: url,
      chunkSource: `youtube://${url}`,
      published: new Date().toLocaleString(),
      wordCount: content.split(" ").length,
      pageContent: content,
      token_count_estimate: tokenizeString(content),
    };

    console.log(`[YouTube Loader]: Saving ${title} to ${outFolder}`);
    writeToServerDocuments(data, `${slugify(title)}-${data.id}`, outFolderPath);

    return {
      success: true,
      // No failure to report; matches the success shape returned by
      // fetchVideoTranscriptContent.
      reason: null,
      data: {
        title,
        author: metadata.author,
        destination: outFolder,
      },
    };
  }
  // Functions this module makes available to the rest of the application.
  module.exports = { loadYouTubeTranscript, fetchVideoTranscriptContent };