You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

90 lines
2.9 KiB

11 months ago
/*
 * This is a custom implementation of the LangChain JS YoutubeLoader class.
 * The YoutubeTranscript dependency is quite fickle, and it's a rat race to keep it working.
 * Instead of waiting for upstream patches, we bring this simple script in-house so that we
 * are at least able to patch it ourselves, since it's so flaky. When we have more connectors
 * we can kill this, because it will be a pain to maintain over time.
 */
  8. class YoutubeLoader {
  9. #videoId;
  10. #language;
  11. #addVideoInfo;
  12. constructor({ videoId = null, language = null, addVideoInfo = false } = {}) {
  13. if (!videoId) throw new Error("Invalid video id!");
  14. this.#videoId = videoId;
  15. this.#language = language;
  16. this.#addVideoInfo = addVideoInfo;
  17. }
  18. /**
  19. * Extracts the videoId from a YouTube video URL.
  20. * @param url The URL of the YouTube video.
  21. * @returns The videoId of the YouTube video.
  22. */
  23. static getVideoID(url) {
  24. const match = url.match(
  25. /.*(?:youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/
  26. );
  27. if (match !== null && match[1].length === 11) {
  28. return match[1];
  29. } else {
  30. throw new Error("Failed to get youtube video id from the url");
  31. }
  32. }
  33. /**
  34. * Creates a new instance of the YoutubeLoader class from a YouTube video
  35. * URL.
  36. * @param url The URL of the YouTube video.
  37. * @param config Optional configuration options for the YoutubeLoader instance, excluding the videoId.
  38. * @returns A new instance of the YoutubeLoader class.
  39. */
  40. static createFromUrl(url, config = {}) {
  41. const videoId = YoutubeLoader.getVideoID(url);
  42. return new YoutubeLoader({ ...config, videoId });
  43. }
  44. /**
  45. * Loads the transcript and video metadata from the specified YouTube
  46. * video. It uses the youtube-transcript library to fetch the transcript
  47. * and the youtubei.js library to fetch the video metadata.
  48. * @returns Langchain like doc that is 1 element with PageContent and
  49. */
  50. async load() {
  51. let transcript;
  52. const metadata = {
  53. source: this.#videoId,
  54. };
  55. try {
  56. const { YoutubeTranscript } = require("./youtube-transcript");
  57. transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, {
  58. lang: this.#language,
  59. });
  60. if (!transcript) {
  61. throw new Error("Transcription not found");
  62. }
  63. if (this.#addVideoInfo) {
  64. const { Innertube } = require("youtubei.js");
  65. const youtube = await Innertube.create();
  66. const info = (await youtube.getBasicInfo(this.#videoId)).basic_info;
  67. metadata.description = info.short_description;
  68. metadata.title = info.title;
  69. metadata.view_count = info.view_count;
  70. metadata.author = info.author;
  71. }
  72. } catch (e) {
  73. throw new Error(
  74. `Failed to get YouTube video transcription: ${e?.message}`
  75. );
  76. }
  77. return [
  78. {
  79. pageContent: transcript,
  80. metadata,
  81. },
  82. ];
  83. }
  84. }
  85. module.exports.YoutubeLoader = YoutubeLoader;