const { v4 } = require("uuid");const { PuppeteerWebBaseLoader,} = require("langchain/document_loaders/web/puppeteer");const { writeToServerDocuments } = require("../../utils/files");const { tokenizeString } = require("../../utils/tokenizer");const { default: slugify } = require("slugify");
/**
 * Scrape a generic URL and return the content in the specified format
 * @param {string} link - The URL to scrape
 * @param {('html' | 'text')} captureAs - The format to capture the page content as
 * @param {boolean} processAsDocument - Whether to process the content into a document or return it directly
 * @returns {Promise<Object>} - { success, reason, documents } when processing as a document, or { success, content } when returning the content directly
 */
async function scrapeGenericUrl(
  link,
  captureAs = "text",
  processAsDocument = true
) {
  console.log(`-- Working URL ${link} => (${captureAs}) --`);
  const content = await getPageContent(link, captureAs);
  // getPageContent returns null when every fetch method fails, so guard
  // against null as well as an empty string.
  if (!content?.length) {
    console.error(`Resulting URL content was empty at ${link}.`);
    return {
      success: false,
      reason: `No URL content found at ${link}.`,
      documents: [],
    };
  }
  if (!processAsDocument) {
    return {
      success: true,
      content,
    };
  }
  const url = new URL(link);
  const decodedPathname = decodeURIComponent(url.pathname);
  const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;
  const data = {
    id: v4(),
    url: "file://" + slugify(filename) + ".html",
    title: slugify(filename) + ".html",
    docAuthor: "no author found",
    description: "No description found.",
    docSource: "URL link uploaded by the user.",
    chunkSource: `link://${link}`,
    published: new Date().toLocaleString(),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };
  const document = writeToServerDocuments(
    data,
    `url-${slugify(filename)}-${data.id}`
  );
  console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}
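// Minimal usage sketch (hypothetical caller, not part of this module);
// assumes an async context and that this file resolves from the caller's path:
//
//   const { scrapeGenericUrl } = require("./scrapeGenericUrl");
//   const { success, documents } = await scrapeGenericUrl(
//     "https://example.com/docs/intro", // URL to scrape
//     "text", // capture the rendered page as plain text
//     true // write the result out as an embeddable document
//   );
//   if (success) console.log(`Produced ${documents.length} document(s).`);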
/**
 * Get the content of a page, rendering it with Puppeteer first and falling
 * back to a plain fetch if the browser-based load fails
 * @param {string} link - The URL to get the content of
 * @param {('html' | 'text')} captureAs - The format to capture the page content as
 * @returns {Promise<string|null>} - The content of the page, or null if it could not be fetched by any method
 */
async function getPageContent(link, captureAs = "text") {
  try {
    let pageContents = [];
    const loader = new PuppeteerWebBaseLoader(link, {
      launchOptions: {
        headless: "new",
        ignoreHTTPSErrors: true,
      },
      gotoOptions: {
        waitUntil: "networkidle2",
      },
      async evaluate(page, browser) {
        // Runs in the page context: innerText for "text", innerHTML for "html".
        const result = await page.evaluate((captureAs) => {
          if (captureAs === "text") return document.body.innerText;
          if (captureAs === "html") return document.documentElement.innerHTML;
          return document.body.innerText;
        }, captureAs);
        await browser.close();
        return result;
      },
    });
    const docs = await loader.load();

    for (const doc of docs) {
      pageContents.push(doc.pageContent);
    }

    return pageContents.join(" ");
  } catch (error) {
    console.error(
      "getPageContent failed to fetch the page with Puppeteer - falling back to fetch!",
      error
    );
  }
  try {
    const pageText = await fetch(link, {
      method: "GET",
      headers: {
        "Content-Type": "text/plain",
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)",
      },
    }).then((res) => res.text());
    return pageText;
  } catch (error) {
    console.error("getPageContent failed to fetch the page by any method.", error);
  }
  return null;
}
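// Internal usage sketch (hypothetical; getPageContent is not exported):
//
//   const html = await getPageContent("https://example.com", "html");
//   // -> rendered innerHTML via Puppeteer, the raw response body via fetch
//   //    on fallback, or null when both attempts fail.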
module.exports = {
  scrapeGenericUrl,
};