/**
 * @typedef {object} DocumentMetadata
 * @property {string} id - e.g. "123e4567-e89b-12d3-a456-426614174000"
 * @property {string} url - e.g. "file://example.com/index.html"
 * @property {string} title - e.g. "example.com/index.html"
 * @property {string} docAuthor - e.g. "no author found"
 * @property {string} description - e.g. "No description found."
 * @property {string} docSource - e.g. "URL link uploaded by the user."
 * @property {string} chunkSource - e.g. link://https://example.com
 * @property {string} published - ISO 8601 date string
 * @property {number} wordCount - Number of words in the document
 * @property {string} pageContent - The raw text content of the document
 * @property {number} token_count_estimate - Number of tokens in the document
 */

function isNullOrNaN(value) {
  if (value === null) return true;
  return isNaN(value);
}

class TextSplitter {
  #splitter;
  constructor(config = {}) {
    /*
      config can be a ton of things depending on what is required or optional
      by the specific splitter.
      Non-splitter related keys:
      {
        splitByFilename: string, // TODO
      }
      ------
      Default: "RecursiveCharacterTextSplitter"
      Config: {
        chunkSize: number,
        chunkOverlap: number,
        chunkHeaderMeta: object | null, // Gets prepended to the top of each chunk as metadata
      }
      ------
    */
    this.config = config;
    this.#splitter = this.#setSplitter(config);
  }
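
  // Illustrative config (not part of the original file) matching the shape
  // documented above; the chunkHeaderMeta values here are made-up examples:
  //   new TextSplitter({
  //     chunkSize: 1000,
  //     chunkOverlap: 20,
  //     chunkHeaderMeta: { sourceDocument: "example.com/index.html" },
  //   });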

  log(text, ...args) {
    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
  }

  /**
   * Does a quick check to determine the text chunk length limit.
   * Embedder models have hard-set limits that cannot be exceeded, just like an LLM context,
   * so here we want to allow override of the default 1000, but only up to the model's maximum,
   * which is sometimes user defined.
   */
  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
    const prefValue = isNullOrNaN(preferred)
      ? Number(embedderLimit)
      : Number(preferred);
    const limit = Number(embedderLimit);
    if (prefValue > limit)
      console.log(
        `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
      );
    return prefValue > limit ? limit : prefValue;
  }
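
  // Illustrative behavior (not part of the original file), assuming an
  // embedder limit of 8192:
  //   TextSplitter.determineMaxChunkSize(512, 8192)   -> 512  (preference respected)
  //   TextSplitter.determineMaxChunkSize(10000, 8192) -> 8192 (clamped, with a warning)
  //   TextSplitter.determineMaxChunkSize(null, 8192)  -> 8192 (falls back to embedder limit)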

  /**
   * Builds an object of metadata to be prepended to each chunk.
   * @param {DocumentMetadata} metadata - Metadata to be prepended to each chunk.
   * @returns {{[key: ('sourceDocument' | 'published' | 'source')]: string}} Object of metadata that will be prepended to each chunk.
   */
  static buildHeaderMeta(metadata = {}) {
    if (!metadata || Object.keys(metadata).length === 0) return null;
    const PLUCK_MAP = {
      title: {
        as: "sourceDocument",
        pluck: (metadata) => {
          return metadata?.title || null;
        },
      },
      published: {
        as: "published",
        pluck: (metadata) => {
          return metadata?.published || null;
        },
      },
      chunkSource: {
        as: "source",
        pluck: (metadata) => {
          const validPrefixes = ["link://", "youtube://"];
          // If the chunkSource is a link or youtube link, we can add the URL
          // as its source in the metadata so the LLM can use it for context.
          // e.g. prompt: Where did you get this information? -> answer: "from https://example.com"
          if (
            !metadata?.chunkSource || // Must exist
            !metadata?.chunkSource.length || // Must not be empty
            typeof metadata.chunkSource !== "string" || // Must be a string
            !validPrefixes.some(
              (prefix) => metadata.chunkSource.startsWith(prefix) // Must have a valid prefix we respect
            )
          )
            return null;

          // We know a prefix is present, so we can split on it and return the rest.
          // If nothing is found, return null and it will not be added to the metadata.
          let source = null;
          for (const prefix of validPrefixes) {
            source = metadata.chunkSource.split(prefix)?.[1] || null;
            if (source) break;
          }
          return source;
        },
      },
    };

    const pluckedData = {};
    Object.entries(PLUCK_MAP).forEach(([key, value]) => {
      if (!(key in metadata)) return; // Skip if the metadata key is not present.
      const pluckedValue = value.pluck(metadata);
      if (!pluckedValue) return; // Skip if the plucked value is null/empty.
      pluckedData[value.as] = pluckedValue;
    });

    return pluckedData;
  }
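
  // Illustrative result (not part of the original file; values are made up).
  // Recognized keys are plucked and renamed; a chunkSource without a
  // link:// or youtube:// prefix would be dropped:
  //   TextSplitter.buildHeaderMeta({
  //     title: "example.com/index.html",
  //     published: "2024-01-01T00:00:00Z",
  //     chunkSource: "link://https://example.com",
  //   })
  //   -> { sourceDocument: "example.com/index.html",
  //        published: "2024-01-01T00:00:00Z",
  //        source: "https://example.com" }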

  /**
   * Creates a string of metadata to be prepended to each chunk.
   */
  stringifyHeader() {
    if (!this.config.chunkHeaderMeta) return null;
    let content = "";
    Object.entries(this.config.chunkHeaderMeta).forEach(([key, value]) => {
      if (!key || !value) return;
      content += `${key}: ${value}\n`;
    });

    if (!content) return null;
    return `<document_metadata>\n${content}</document_metadata>\n\n`;
  }
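
  // Illustrative output (not part of the original file), assuming
  // chunkHeaderMeta = { sourceDocument: "example.com/index.html", source: "https://example.com" }:
  //   "<document_metadata>\nsourceDocument: example.com/index.html\nsource: https://example.com\n</document_metadata>\n\n"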

  #setSplitter(config = {}) {
    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
    return new RecursiveSplitter({
      chunkSize: isNaN(config?.chunkSize) ? 1_000 : Number(config?.chunkSize),
      chunkOverlap: isNaN(config?.chunkOverlap)
        ? 20
        : Number(config?.chunkOverlap),
      chunkHeader: this.stringifyHeader(),
    });
  }

  async splitText(documentText) {
    return this.#splitter._splitText(documentText);
  }
}

// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
class RecursiveSplitter {
  constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
    const {
      RecursiveCharacterTextSplitter,
    } = require("@langchain/textsplitters");
    this.log(`Will split with`, { chunkSize, chunkOverlap });
    this.chunkHeader = chunkHeader;
    this.engine = new RecursiveCharacterTextSplitter({
      chunkSize,
      chunkOverlap,
    });
  }

  log(text, ...args) {
    console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
  }

  async _splitText(documentText) {
    if (!this.chunkHeader) return this.engine.splitText(documentText);
    const strings = await this.engine.splitText(documentText);
    const documents = await this.engine.createDocuments(strings, [], {
      chunkHeader: this.chunkHeader,
    });
    return documents
      .filter((doc) => !!doc.pageContent)
      .map((doc) => doc.pageContent);
  }
}

module.exports.TextSplitter = TextSplitter;
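
/*
  Usage sketch (illustrative only, not part of the original file).
  The require path, the embedder limit of 8192, and the `doc` shape
  (a DocumentMetadata object) are assumptions:

    const { TextSplitter } = require("./TextSplitter");

    async function chunkDocument(doc) {
      const splitter = new TextSplitter({
        chunkSize: TextSplitter.determineMaxChunkSize(512, 8192),
        chunkOverlap: 20,
        chunkHeaderMeta: TextSplitter.buildHeaderMeta(doc),
      });
      // Resolves to an array of chunk strings, each prefixed with the
      // <document_metadata> header when chunkHeaderMeta is set.
      return await splitter.splitText(doc.pageContent);
    }
*/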