const { v4 } = require("uuid");
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");

/**
 * Breadth-first link discovery starting from startUrl, limited to maxDepth
 * levels and capped at maxLinks unique URLs.
 */
async function discoverLinks(startUrl, maxDepth = 1, maxLinks = 20) {
  const baseUrl = new URL(startUrl);
  const discoveredLinks = new Set([startUrl]);
  let queue = [[startUrl, 0]]; // [url, currentDepth]
  const scrapedUrls = new Set();

  for (let currentDepth = 0; currentDepth < maxDepth; currentDepth++) {
    const levelSize = queue.length;
    const nextQueue = [];

    for (let i = 0; i < levelSize && discoveredLinks.size < maxLinks; i++) {
      const [currentUrl, urlDepth] = queue[i];

      if (!scrapedUrls.has(currentUrl)) {
        scrapedUrls.add(currentUrl);
        const newLinks = await getPageLinks(currentUrl, baseUrl);

        for (const link of newLinks) {
          if (!discoveredLinks.has(link) && discoveredLinks.size < maxLinks) {
            discoveredLinks.add(link);
            // Only queue the link for the next level if the depth limit allows it.
            if (urlDepth + 1 < maxDepth) {
              nextQueue.push([link, urlDepth + 1]);
            }
          }
        }
      }
    }

    queue = nextQueue;
    if (queue.length === 0 || discoveredLinks.size >= maxLinks) break;
  }

  return Array.from(discoveredLinks);
}

/**
 * Loads a single page with Puppeteer and returns the links found in its HTML.
 * Returns an empty array if the page cannot be loaded.
 */
async function getPageLinks(url, baseUrl) {
  try {
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: { headless: "new" },
      gotoOptions: { waitUntil: "networkidle2" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
    const links = extractLinks(html, baseUrl);
    return links;
  } catch (error) {
    console.error(`Failed to get page links from ${url}.`, error);
    return [];
  }
}

/**
 * Parses the HTML and collects absolute URLs from <a> tags that share the
 * base URL's origin and parent path, keeping the crawl within the same site section.
 */
function extractLinks(html, baseUrl) {
  const root = parse(html);
  const links = root.querySelectorAll("a");
  const extractedLinks = new Set();

  for (const link of links) {
    const href = link.getAttribute("href");
    if (href) {
      const absoluteUrl = new URL(href, baseUrl.href).href;
      if (
        absoluteUrl.startsWith(
          baseUrl.origin + baseUrl.pathname.split("/").slice(0, -1).join("/")
        )
      ) {
        extractedLinks.add(absoluteUrl);
      }
    }
  }

  return Array.from(extractedLinks);
}

/**
 * Scrapes each discovered link, builds a document record from the page's
 * visible text, and writes it to the output folder. Pages that fail to load
 * or return empty content are skipped.
 */
async function bulkScrapePages(links, outFolderPath) {
  const scrapedData = [];

  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    console.log(`Scraping ${i + 1}/${links.length}: ${link}`);

    try {
      const loader = new PuppeteerWebBaseLoader(link, {
        launchOptions: { headless: "new" },
        gotoOptions: { waitUntil: "networkidle2" },
        async evaluate(page, browser) {
          // Capture the rendered page's visible text rather than its raw HTML.
          const result = await page.evaluate(() => document.body.innerText);
          await browser.close();
          return result;
        },
      });
      const docs = await loader.load();
      const content = docs[0].pageContent;

      if (!content.length) {
        console.warn(`Empty content for ${link}. Skipping.`);
        continue;
      }

      // Derive a filesystem-safe filename from the hostname and decoded pathname.
      const url = new URL(link);
      const decodedPathname = decodeURIComponent(url.pathname);
      const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;

      const data = {
        id: v4(),
        url: "file://" + slugify(filename) + ".html",
        title: slugify(filename) + ".html",
        docAuthor: "no author found",
        description: "No description found.",
        docSource: "URL link uploaded by the user.",
        chunkSource: `link://${link}`,
        published: new Date().toLocaleString(),
        wordCount: content.split(" ").length,
        pageContent: content,
        token_count_estimate: tokenizeString(content),
      };

      writeToServerDocuments(data, data.title, outFolderPath);
      scrapedData.push(data);
      console.log(`Successfully scraped ${link}.`);
    } catch (error) {
      console.error(`Failed to scrape ${link}.`, error);
    }
  }

  return scrapedData;
}

/**
 * Entry point: discovers links from startUrl up to the given depth and link
 * limits, creates the output folder, and bulk-scrapes every discovered page.
 */
async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
  const websiteName = new URL(startUrl).hostname;
  const outFolder = slugify(
    `${slugify(websiteName)}-${v4().slice(0, 4)}`
  ).toLowerCase();
  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../../server/storage/documents/${outFolder}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

  console.log("Discovering links...");
  const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
  console.log(`Found ${linksToScrape.length} links to scrape.`);

  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  console.log("Starting bulk scraping...");
  const scrapedData = await bulkScrapePages(linksToScrape, outFolderPath);
  console.log(`Scraped ${scrapedData.length} pages.`);
  return scrapedData;
}

module.exports = websiteScraper;
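
// Usage sketch (illustrative only): how a caller might drive this module.
// The collector code that actually invokes websiteScraper is not part of this
// file, so the require path and call below are assumptions about typical
// usage, not the project's real entry point.
//
//   const websiteScraper = require("./websiteScraper"); // hypothetical path to this file
//
//   (async () => {
//     // Crawl https://example.com one level deep, collecting at most 10 pages.
//     const docs = await websiteScraper("https://example.com", 1, 10);
//     console.log(`Wrote ${docs.length} documents to server storage.`);
//   })();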