You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

423 lines
15 KiB

11 months ago
  1. const { ChromaClient } = require("chromadb");
  2. const { TextSplitter } = require("../../TextSplitter");
  3. const { SystemSettings } = require("../../../models/systemSettings");
  4. const { storeVectorResult, cachedVectorInformation } = require("../../files");
  5. const { v4: uuidv4 } = require("uuid");
  6. const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
  7. const { parseAuthHeader } = require("../../http");
  8. const { sourceIdentifier } = require("../../chats");
  9. const COLLECTION_REGEX = new RegExp(
  10. /^(?!\d+\.\d+\.\d+\.\d+$)(?!.*\.\.)(?=^[a-zA-Z0-9][a-zA-Z0-9_-]{1,61}[a-zA-Z0-9]$).{3,63}$/
  11. );
  12. const Chroma = {
  13. name: "Chroma",
  14. // Chroma DB has specific requirements for collection names:
  15. // (1) Must contain 3-63 characters
  16. // (2) Must start and end with an alphanumeric character
  17. // (3) Can only contain alphanumeric characters, underscores, or hyphens
  18. // (4) Cannot contain two consecutive periods (..)
  19. // (5) Cannot be a valid IPv4 address
  20. // We need to enforce these rules by normalizing the collection names
  21. // before communicating with the Chroma DB.
  22. normalize: function (inputString) {
  23. if (COLLECTION_REGEX.test(inputString)) return inputString;
  24. let normalized = inputString.replace(/[^a-zA-Z0-9_-]/g, "-");
  25. // Replace consecutive periods with a single period (if any)
  26. normalized = normalized.replace(/\.\.+/g, ".");
  27. // Ensure the name doesn't start with a non-alphanumeric character
  28. if (normalized[0] && !/^[a-zA-Z0-9]$/.test(normalized[0])) {
  29. normalized = "anythingllm-" + normalized.slice(1);
  30. }
  31. // Ensure the name doesn't end with a non-alphanumeric character
  32. if (
  33. normalized[normalized.length - 1] &&
  34. !/^[a-zA-Z0-9]$/.test(normalized[normalized.length - 1])
  35. ) {
  36. normalized = normalized.slice(0, -1);
  37. }
  38. // Ensure the length is between 3 and 63 characters
  39. if (normalized.length < 3) {
  40. normalized = `anythingllm-${normalized}`;
  41. } else if (normalized.length > 63) {
  42. // Recheck the norm'd name if sliced since its ending can still be invalid.
  43. normalized = this.normalize(normalized.slice(0, 63));
  44. }
  45. // Ensure the name is not an IPv4 address
  46. if (/^\d+\.\d+\.\d+\.\d+$/.test(normalized)) {
  47. normalized = "-" + normalized.slice(1);
  48. }
  49. return normalized;
  50. },
  51. connect: async function () {
  52. if (process.env.VECTOR_DB !== "chroma")
  53. throw new Error("Chroma::Invalid ENV settings");
  54. const client = new ChromaClient({
  55. path: process.env.CHROMA_ENDPOINT, // if not set will fallback to localhost:8000
  56. ...(!!process.env.CHROMA_API_HEADER && !!process.env.CHROMA_API_KEY
  57. ? {
  58. fetchOptions: {
  59. headers: parseAuthHeader(
  60. process.env.CHROMA_API_HEADER || "X-Api-Key",
  61. process.env.CHROMA_API_KEY
  62. ),
  63. },
  64. }
  65. : {}),
  66. });
  67. const isAlive = await client.heartbeat();
  68. if (!isAlive)
  69. throw new Error(
  70. "ChromaDB::Invalid Heartbeat received - is the instance online?"
  71. );
  72. return { client };
  73. },
  74. heartbeat: async function () {
  75. const { client } = await this.connect();
  76. return { heartbeat: await client.heartbeat() };
  77. },
  78. totalVectors: async function () {
  79. const { client } = await this.connect();
  80. const collections = await client.listCollections();
  81. var totalVectors = 0;
  82. for (const collectionObj of collections) {
  83. const collection = await client
  84. .getCollection({ name: collectionObj.name })
  85. .catch(() => null);
  86. if (!collection) continue;
  87. totalVectors += await collection.count();
  88. }
  89. return totalVectors;
  90. },
  91. distanceToSimilarity: function (distance = null) {
  92. if (distance === null || typeof distance !== "number") return 0.0;
  93. if (distance >= 1.0) return 1;
  94. if (distance < 0) return 1 - Math.abs(distance);
  95. return 1 - distance;
  96. },
  97. namespaceCount: async function (_namespace = null) {
  98. const { client } = await this.connect();
  99. const namespace = await this.namespace(client, this.normalize(_namespace));
  100. return namespace?.vectorCount || 0;
  101. },
  102. similarityResponse: async function ({
  103. client,
  104. namespace,
  105. queryVector,
  106. similarityThreshold = 0.25,
  107. topN = 4,
  108. filterIdentifiers = [],
  109. }) {
  110. const collection = await client.getCollection({
  111. name: this.normalize(namespace),
  112. });
  113. const result = {
  114. contextTexts: [],
  115. sourceDocuments: [],
  116. scores: [],
  117. };
  118. const response = await collection.query({
  119. queryEmbeddings: queryVector,
  120. nResults: topN,
  121. });
  122. response.ids[0].forEach((_, i) => {
  123. if (
  124. this.distanceToSimilarity(response.distances[0][i]) <
  125. similarityThreshold
  126. )
  127. return;
  128. if (
  129. filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
  130. ) {
  131. console.log(
  132. "Chroma: A source was filtered from context as it's parent document is pinned."
  133. );
  134. return;
  135. }
  136. result.contextTexts.push(response.documents[0][i]);
  137. result.sourceDocuments.push(response.metadatas[0][i]);
  138. result.scores.push(this.distanceToSimilarity(response.distances[0][i]));
  139. });
  140. return result;
  141. },
  142. namespace: async function (client, namespace = null) {
  143. if (!namespace) throw new Error("No namespace value provided.");
  144. const collection = await client
  145. .getCollection({ name: this.normalize(namespace) })
  146. .catch(() => null);
  147. if (!collection) return null;
  148. return {
  149. ...collection,
  150. vectorCount: await collection.count(),
  151. };
  152. },
  153. hasNamespace: async function (namespace = null) {
  154. if (!namespace) return false;
  155. const { client } = await this.connect();
  156. return await this.namespaceExists(client, this.normalize(namespace));
  157. },
  158. namespaceExists: async function (client, namespace = null) {
  159. if (!namespace) throw new Error("No namespace value provided.");
  160. const collection = await client
  161. .getCollection({ name: this.normalize(namespace) })
  162. .catch((e) => {
  163. console.error("ChromaDB::namespaceExists", e.message);
  164. return null;
  165. });
  166. return !!collection;
  167. },
  168. deleteVectorsInNamespace: async function (client, namespace = null) {
  169. await client.deleteCollection({ name: this.normalize(namespace) });
  170. return true;
  171. },
  172. addDocumentToNamespace: async function (
  173. namespace,
  174. documentData = {},
  175. fullFilePath = null,
  176. skipCache = false
  177. ) {
  178. const { DocumentVectors } = require("../../../models/vectors");
  179. try {
  180. const { pageContent, docId, ...metadata } = documentData;
  181. if (!pageContent || pageContent.length == 0) return false;
  182. console.log("Adding new vectorized document into namespace", namespace);
  183. if (skipCache) {
  184. const cacheResult = await cachedVectorInformation(fullFilePath);
  185. if (cacheResult.exists) {
  186. const { client } = await this.connect();
  187. const collection = await client.getOrCreateCollection({
  188. name: this.normalize(namespace),
  189. metadata: { "hnsw:space": "cosine" },
  190. });
  191. const { chunks } = cacheResult;
  192. const documentVectors = [];
  193. for (const chunk of chunks) {
  194. const submission = {
  195. ids: [],
  196. embeddings: [],
  197. metadatas: [],
  198. documents: [],
  199. };
  200. // Before sending to Chroma and saving the records to our db
  201. // we need to assign the id of each chunk that is stored in the cached file.
  202. chunk.forEach((chunk) => {
  203. const id = uuidv4();
  204. const { id: _id, ...metadata } = chunk.metadata;
  205. documentVectors.push({ docId, vectorId: id });
  206. submission.ids.push(id);
  207. submission.embeddings.push(chunk.values);
  208. submission.metadatas.push(metadata);
  209. submission.documents.push(metadata.text);
  210. });
  211. const additionResult = await collection.add(submission);
  212. if (!additionResult)
  213. throw new Error("Error embedding into ChromaDB", additionResult);
  214. }
  215. await DocumentVectors.bulkInsert(documentVectors);
  216. return { vectorized: true, error: null };
  217. }
  218. }
  219. // If we are here then we are going to embed and store a novel document.
  220. // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments`
  221. // because we then cannot atomically control our namespace to granularly find/remove documents
  222. // from vectordb.
  223. const EmbedderEngine = getEmbeddingEngineSelection();
  224. const textSplitter = new TextSplitter({
  225. chunkSize: TextSplitter.determineMaxChunkSize(
  226. await SystemSettings.getValueOrFallback({
  227. label: "text_splitter_chunk_size",
  228. }),
  229. EmbedderEngine?.embeddingMaxChunkLength
  230. ),
  231. chunkOverlap: await SystemSettings.getValueOrFallback(
  232. { label: "text_splitter_chunk_overlap" },
  233. 20
  234. ),
  235. chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
  236. });
  237. const textChunks = await textSplitter.splitText(pageContent);
  238. console.log("Chunks created from document:", textChunks.length);
  239. const documentVectors = [];
  240. const vectors = [];
  241. const vectorValues = await EmbedderEngine.embedChunks(textChunks);
  242. const submission = {
  243. ids: [],
  244. embeddings: [],
  245. metadatas: [],
  246. documents: [],
  247. };
  248. if (!!vectorValues && vectorValues.length > 0) {
  249. for (const [i, vector] of vectorValues.entries()) {
  250. const vectorRecord = {
  251. id: uuidv4(),
  252. values: vector,
  253. // [DO NOT REMOVE]
  254. // LangChain will be unable to find your text if you embed manually and dont include the `text` key.
  255. // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
  256. metadata: { ...metadata, text: textChunks[i] },
  257. };
  258. submission.ids.push(vectorRecord.id);
  259. submission.embeddings.push(vectorRecord.values);
  260. submission.metadatas.push(metadata);
  261. submission.documents.push(textChunks[i]);
  262. vectors.push(vectorRecord);
  263. documentVectors.push({ docId, vectorId: vectorRecord.id });
  264. }
  265. } else {
  266. throw new Error(
  267. "Could not embed document chunks! This document will not be recorded."
  268. );
  269. }
  270. const { client } = await this.connect();
  271. const collection = await client.getOrCreateCollection({
  272. name: this.normalize(namespace),
  273. metadata: { "hnsw:space": "cosine" },
  274. });
  275. if (vectors.length > 0) {
  276. const chunks = [];
  277. console.log("Inserting vectorized chunks into Chroma collection.");
  278. for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
  279. const additionResult = await collection.add(submission);
  280. if (!additionResult)
  281. throw new Error("Error embedding into ChromaDB", additionResult);
  282. await storeVectorResult(chunks, fullFilePath);
  283. }
  284. await DocumentVectors.bulkInsert(documentVectors);
  285. return { vectorized: true, error: null };
  286. } catch (e) {
  287. console.error("addDocumentToNamespace", e.message);
  288. return { vectorized: false, error: e.message };
  289. }
  290. },
  291. deleteDocumentFromNamespace: async function (namespace, docId) {
  292. const { DocumentVectors } = require("../../../models/vectors");
  293. const { client } = await this.connect();
  294. if (!(await this.namespaceExists(client, namespace))) return;
  295. const collection = await client.getCollection({
  296. name: this.normalize(namespace),
  297. });
  298. const knownDocuments = await DocumentVectors.where({ docId });
  299. if (knownDocuments.length === 0) return;
  300. const vectorIds = knownDocuments.map((doc) => doc.vectorId);
  301. await collection.delete({ ids: vectorIds });
  302. const indexes = knownDocuments.map((doc) => doc.id);
  303. await DocumentVectors.deleteIds(indexes);
  304. return true;
  305. },
  306. performSimilaritySearch: async function ({
  307. namespace = null,
  308. input = "",
  309. LLMConnector = null,
  310. similarityThreshold = 0.25,
  311. topN = 4,
  312. filterIdentifiers = [],
  313. }) {
  314. if (!namespace || !input || !LLMConnector)
  315. throw new Error("Invalid request to performSimilaritySearch.");
  316. const { client } = await this.connect();
  317. if (!(await this.namespaceExists(client, this.normalize(namespace)))) {
  318. return {
  319. contextTexts: [],
  320. sources: [],
  321. message: "Invalid query - no documents found for workspace!",
  322. };
  323. }
  324. const queryVector = await LLMConnector.embedTextInput(input);
  325. const { contextTexts, sourceDocuments } = await this.similarityResponse({
  326. client,
  327. namespace,
  328. queryVector,
  329. similarityThreshold,
  330. topN,
  331. filterIdentifiers,
  332. });
  333. const sources = sourceDocuments.map((metadata, i) => {
  334. return { metadata: { ...metadata, text: contextTexts[i] } };
  335. });
  336. return {
  337. contextTexts,
  338. sources: this.curateSources(sources),
  339. message: false,
  340. };
  341. },
  342. "namespace-stats": async function (reqBody = {}) {
  343. const { namespace = null } = reqBody;
  344. if (!namespace) throw new Error("namespace required");
  345. const { client } = await this.connect();
  346. if (!(await this.namespaceExists(client, this.normalize(namespace))))
  347. throw new Error("Namespace by that name does not exist.");
  348. const stats = await this.namespace(client, this.normalize(namespace));
  349. return stats
  350. ? stats
  351. : { message: "No stats were able to be fetched from DB for namespace" };
  352. },
  353. "delete-namespace": async function (reqBody = {}) {
  354. const { namespace = null } = reqBody;
  355. const { client } = await this.connect();
  356. if (!(await this.namespaceExists(client, this.normalize(namespace))))
  357. throw new Error("Namespace by that name does not exist.");
  358. const details = await this.namespace(client, this.normalize(namespace));
  359. await this.deleteVectorsInNamespace(client, this.normalize(namespace));
  360. return {
  361. message: `Namespace ${namespace} was deleted along with ${details?.vectorCount} vectors.`,
  362. };
  363. },
  364. reset: async function () {
  365. const { client } = await this.connect();
  366. await client.reset();
  367. return { reset: true };
  368. },
  369. curateSources: function (sources = []) {
  370. const documents = [];
  371. for (const source of sources) {
  372. const { metadata = {} } = source;
  373. if (Object.keys(metadata).length > 0) {
  374. documents.push({
  375. ...metadata,
  376. ...(source.hasOwnProperty("pageContent")
  377. ? { text: source.pageContent }
  378. : {}),
  379. });
  380. }
  381. }
  382. return documents;
  383. },
  384. };
  385. module.exports.Chroma = Chroma;