You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

96 lines
3.0 KiB

11 months ago
  1. const { CollectorApi } = require("../../collectorApi");
  2. const { TokenManager } = require("../../helpers/tiktoken");
  3. const Provider = require("../../agents/aibitat/providers/ai-provider");
  4. const { summarizeContent } = require("../../agents/aibitat/utils/summarize");
  5. /**
  6. * Execute a web scraping flow step
  7. * @param {Object} config Flow step configuration
  8. * @param {Object} context Execution context with introspect function
  9. * @returns {Promise<string>} Scraped content
  10. */
  11. async function executeWebScraping(config, context) {
  12. const { url, captureAs = "text" } = config;
  13. const { introspect, model, provider } = context;
  14. if (!url) {
  15. throw new Error("URL is required for web scraping");
  16. }
  17. // Remap the captureAs to the correct mode for the CollectorApi
  18. const captureMode = captureAs === "querySelector" ? "html" : captureAs;
  19. introspect(`Scraping the content of ${url} as ${captureAs}`);
  20. const { success, content } = await new CollectorApi()
  21. .getLinkContent(url, captureMode)
  22. .then((res) => {
  23. if (captureAs !== "querySelector") return res;
  24. return parseHTMLwithSelector(res.content, config.querySelector, context);
  25. });
  26. if (!success) {
  27. introspect(`Could not scrape ${url}. Cannot use this page's content.`);
  28. throw new Error("URL could not be scraped and no content was found.");
  29. }
  30. introspect(`Successfully scraped content from ${url}`);
  31. if (!content || content?.length === 0) {
  32. throw new Error("There was no content to be collected or read.");
  33. }
  34. const tokenCount = new TokenManager(model).countFromString(content);
  35. const contextLimit = Provider.contextLimit(provider, model);
  36. if (tokenCount < contextLimit) {
  37. return content;
  38. }
  39. introspect(
  40. `This page's content is way too long. I will summarize it right now.`
  41. );
  42. const summary = await summarizeContent({
  43. provider,
  44. model,
  45. content,
  46. });
  47. introspect(`Successfully summarized content`);
  48. return summary;
  49. }
  50. /**
  51. * Parse HTML with a CSS selector
  52. * @param {string} html - The HTML to parse
  53. * @param {string|null} selector - The CSS selector to use (as text string)
  54. * @param {{introspect: Function}} context - The context object
  55. * @returns {Object} The parsed content
  56. */
  57. function parseHTMLwithSelector(html, selector = null, context) {
  58. if (!selector || selector.length === 0) {
  59. context.introspect("No selector provided. Returning the entire HTML.");
  60. return { success: true, content: html };
  61. }
  62. const Cheerio = require("cheerio");
  63. const $ = Cheerio.load(html);
  64. const selectedElements = $(selector);
  65. let content;
  66. if (selectedElements.length === 0) {
  67. return { success: false, content: null };
  68. } else if (selectedElements.length === 1) {
  69. content = selectedElements.html();
  70. } else {
  71. context.introspect(
  72. `Found ${selectedElements.length} elements matching selector: ${selector}`
  73. );
  74. content = selectedElements
  75. .map((_, element) => $(element).html())
  76. .get()
  77. .join("\n");
  78. }
  79. return { success: true, content };
  80. }
  81. module.exports = executeWebScraping;