You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
const { parse } = require("node-html-parser");const RE_YOUTUBE = /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;const USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
class YoutubeTranscriptError extends Error { constructor(message) { super(`[YoutubeTranscript] ${message}`); }}
/** * Class to retrieve transcript if exist */class YoutubeTranscript { /** * Fetch transcript from YTB Video * @param videoId Video url or video identifier * @param config Object with lang param (eg: en, es, hk, uk) format. * Will just the grab first caption if it can find one, so no special lang caption support. */ static async fetchTranscript(videoId, config = {}) { const identifier = this.retrieveVideoId(videoId); const lang = config?.lang ?? "en"; try { const transcriptUrl = await fetch( `https://www.youtube.com/watch?v=${identifier}`, { headers: { "User-Agent": USER_AGENT, }, } ) .then((res) => res.text()) .then((html) => parse(html)) .then((html) => this.#parseTranscriptEndpoint(html, lang));
if (!transcriptUrl) throw new Error("Failed to locate a transcript for this video!");
// Result is hopefully some XML.
const transcriptXML = await fetch(transcriptUrl) .then((res) => res.text()) .then((xml) => parse(xml));
let transcript = ""; const chunks = transcriptXML.getElementsByTagName("text"); for (const chunk of chunks) { // Add space after each text chunk
transcript += chunk.textContent + " "; }
// Trim extra whitespace
return transcript.trim().replace(/\s+/g, " "); } catch (e) { throw new YoutubeTranscriptError(e); } }
static #parseTranscriptEndpoint(document, langCode = null) { try { // Get all script tags on document page
const scripts = document.getElementsByTagName("script");
// find the player data script.
const playerScript = scripts.find((script) => script.textContent.includes("var ytInitialPlayerResponse = {") );
const dataString = playerScript.textContent ?.split("var ytInitialPlayerResponse = ")?.[1] //get the start of the object {....
?.split("};")?.[0] + // chunk off any code after object closure.
"}"; // add back that curly brace we just cut.
const data = JSON.parse(dataString.trim()); // Attempt a JSON parse
const availableCaptions = data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
// If languageCode was specified then search for it's code, otherwise get the first.
let captionTrack = availableCaptions?.[0]; if (langCode) captionTrack = availableCaptions.find((track) => track.languageCode.includes(langCode) ) ?? availableCaptions?.[0];
return captionTrack?.baseUrl; } catch (e) { console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`); return null; } }
/** * Retrieve video id from url or string * @param videoId video url or video id */ static retrieveVideoId(videoId) { if (videoId.length === 11) { return videoId; } const matchId = videoId.match(RE_YOUTUBE); if (matchId && matchId.length) { return matchId[1]; } throw new YoutubeTranscriptError( "Impossible to retrieve Youtube video ID." ); }}
module.exports = { YoutubeTranscript, YoutubeTranscriptError,};
|