You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

335 lines
11 KiB

11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
  1. const fs = require("fs");
  2. const path = require("path");
  3. const { v5: uuidv5 } = require("uuid");
  4. const { Document } = require("../../models/documents");
  5. const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
  // Absolute path of the folder holding the stored source documents.
  // In development it is resolved relative to this file's directory,
  // otherwise it is the `documents` child of STORAGE_DIR.
  const documentsPath = (() => {
    if (process.env.NODE_ENV === "development")
      return path.resolve(__dirname, `../../storage/documents`);
    return path.resolve(process.env.STORAGE_DIR, `documents`);
  })();
  // Absolute path of the folder holding cached embedding results.
  // In development it is resolved relative to this file's directory,
  // otherwise it is the `vector-cache` child of STORAGE_DIR.
  const vectorCachePath = (() => {
    if (process.env.NODE_ENV === "development")
      return path.resolve(__dirname, `../../storage/vector-cache`);
    return path.resolve(process.env.STORAGE_DIR, `vector-cache`);
  })();
  /**
   * Reads and parses a JSON document stored beneath the documents folder.
   * @param {string|null} filePath - location relative to the documents folder,
   * eg: youtube-subject/video-123.json
   * @returns {Promise<any|null>} the parsed JSON content, or null when the file does not
   * exist or resolves to a location that is not inside the documents folder
   * @throws {Error} when no path is provided
   */
  async function fileData(filePath = null) {
    if (!filePath) throw new Error("No docPath provided in request");

    const absolutePath = path.resolve(documentsPath, normalizePath(filePath));
    const isReadable =
      fs.existsSync(absolutePath) && isWithin(documentsPath, absolutePath);
    if (!isReadable) return null;

    return JSON.parse(fs.readFileSync(absolutePath, "utf8"));
  }
  /**
   * Builds the folder/file tree of the documents folder.
   * Only `.json` files that sit directly inside a first-level folder
   * (documents/<folder>/<file>.json) are listed; top-level `.md` entries and
   * top-level plain files are ignored. Every file entry carries the stored JSON
   * metadata (everything except `pageContent`) plus its cache, watch and pin state.
   * Files that cannot be read or parsed are logged and skipped so that one broken
   * document does not prevent the rest from being listed.
   * @returns {Promise<{name: string, type: string, items: object[]}>} the root
   * "documents" folder with `custom-documents` (when present) as its first item
   */
  async function viewLocalFiles() {
    // recursive: a missing parent storage folder must not make the listing throw.
    if (!fs.existsSync(documentsPath))
      fs.mkdirSync(documentsPath, { recursive: true });

    const liveSyncAvailable = await DocumentSyncQueue.enabled();
    const directory = {
      name: "documents",
      type: "folder",
      items: [],
    };

    for (const file of fs.readdirSync(documentsPath)) {
      if (path.extname(file) === ".md") continue;
      const folderPath = path.resolve(documentsPath, file);
      if (!fs.lstatSync(folderPath).isDirectory()) continue;

      const subdocs = {
        name: file,
        type: "folder",
        items: [],
      };
      // Maps the `folder/file.json` key (used for the cache and as docpath) to the bare filename.
      const filenames = {};

      for (const subfile of fs.readdirSync(folderPath)) {
        if (path.extname(subfile) !== ".json") continue;
        const cachefilename = `${file}/${subfile}`;

        let metadata;
        try {
          const rawData = fs.readFileSync(
            path.join(folderPath, subfile),
            "utf8"
          );
          // pageContent is dropped on purpose - only the metadata is listed.
          const { pageContent, ...rest } = JSON.parse(rawData);
          metadata = rest;
        } catch (error) {
          console.error(
            `Skipping unreadable document ${cachefilename}: ${error.message}`
          );
          continue;
        }

        // pinnedWorkspaces and watched are filled in below, once per folder.
        subdocs.items.push({
          name: subfile,
          type: "file",
          ...metadata,
          cached: await cachedVectorInformation(cachefilename, true),
          canWatch: liveSyncAvailable
            ? DocumentSyncQueue.canWatch(metadata)
            : false,
        });
        filenames[cachefilename] = subfile;
      }

      // Grab the pinned workspaces and watched documents for this folder's documents
      // at the time of the query so we don't have to re-query the database for each file
      const pinnedWorkspacesByDocument =
        await getPinnedWorkspacesByDocument(filenames);
      const watchedDocumentsFilenames =
        await getWatchedDocumentFilenames(filenames);
      for (const item of subdocs.items) {
        item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
        item.watched = Object.prototype.hasOwnProperty.call(
          watchedDocumentsFilenames,
          item.name
        );
      }

      directory.items.push(subdocs);
    }

    // Make sure custom-documents is always the first folder in picker
    directory.items = [
      directory.items.find((folder) => folder.name === "custom-documents"),
      ...directory.items.filter((folder) => folder.name !== "custom-documents"),
    ].filter((i) => !!i);

    return directory;
  }
  /**
   * Looks in the vector-cache folder for previously embedded chunks of a document so
   * the document does not have to be embedded again.
   * The cache file is named after the UUIDv5 (URL namespace) digest of `filename`.
   * @param {string|null} filename - the filename to check for cached vector information
   * @param {boolean} checkOnly - when true, only report whether the cache file exists
   * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} a boolean when `checkOnly`
   * is true; otherwise the existence flag together with the cached chunks (an empty
   * array when nothing is cached)
   */
  async function cachedVectorInformation(filename = null, checkOnly = false) {
    if (!filename) {
      if (checkOnly) return false;
      return { exists: false, chunks: [] };
    }

    const cacheKey = uuidv5(filename, uuidv5.URL);
    const cacheFile = path.resolve(vectorCachePath, `${cacheKey}.json`);
    const isCached = fs.existsSync(cacheFile);

    if (checkOnly) return isCached;
    if (!isCached) return { exists: isCached, chunks: [] };

    console.log(
      `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
    );
    const chunks = JSON.parse(fs.readFileSync(cacheFile, "utf8"));
    return { exists: true, chunks };
  }
  /**
   * Writes the vectorized chunks of a document into the vector-cache folder so the same
   * document is not embedded twice. The cache file is named after the UUIDv5
   * (URL namespace) digest of `filename`; the vector-cache folder is created when missing.
   * @param {any[]} vectorData - pre-chunked vectorized data for the file, stored as JSON
   * @param {string|null} filename - path of the document, used as the cache key; nothing is
   * written when it is empty
   * @returns {Promise<void>}
   */
  async function storeVectorResult(vectorData = [], filename = null) {
    if (!filename) return;

    console.log(
      `Caching vectorized results of ${filename} to prevent duplicated embedding.`
    );

    const cacheFolderExists = fs.existsSync(vectorCachePath);
    if (!cacheFolderExists) fs.mkdirSync(vectorCachePath);

    const cacheKey = uuidv5(filename, uuidv5.URL);
    const cacheFile = path.resolve(vectorCachePath, `${cacheKey}.json`);
    const serialized = JSON.stringify(vectorData);
    fs.writeFileSync(cacheFile, serialized, "utf8");
  }
  /**
   * Deletes a single source document from the documents folder.
   * Nothing happens when the name is empty, the target does not exist, it resolves to a
   * location that is not inside the documents folder, or it is not a regular file.
   * @param {string|null} filename - location of the document relative to the documents folder
   * @returns {Promise<void>}
   */
  async function purgeSourceDocument(filename = null) {
    if (!filename) return;

    const target = path.resolve(documentsPath, normalizePath(filename));
    const isRemovableFile =
      fs.existsSync(target) &&
      isWithin(documentsPath, target) &&
      fs.lstatSync(target).isFile();
    if (!isRemovableFile) return;

    console.log(`Purging source document of ${filename}.`);
    fs.rmSync(target);
  }
  /**
   * Deletes the vector-cache file that belongs to a document.
   * The cache file is located through the UUIDv5 (URL namespace) digest of `filename`.
   * Nothing happens when the name is empty or no regular cache file exists for it.
   * @param {string|null} filename - cache key of the document
   * @returns {Promise<void>}
   */
  async function purgeVectorCache(filename = null) {
    if (!filename) return;

    const cacheKey = uuidv5(filename, uuidv5.URL);
    const cacheFile = path.resolve(vectorCachePath, `${cacheKey}.json`);
    const isRemovableFile =
      fs.existsSync(cacheFile) && fs.lstatSync(cacheFile).isFile();
    if (!isRemovableFile) return;

    console.log(`Purging vector-cache of ${filename}.`);
    fs.rmSync(cacheFile);
  }
  /**
   * Searches every first-level folder of the documents folder for a document with the
   * given name and returns the first match.
   * @param {string|null} documentName - file name of the document to look for
   * @returns {Promise<object|null>} the stored metadata of the document (everything except
   * `pageContent`) together with `name`, `type` and its `cached` flag, or null when no
   * name was given or no folder contains the document
   */
  async function findDocumentInDocuments(documentName = null) {
    if (!documentName) return null;

    for (const folder of fs.readdirSync(documentsPath)) {
      const folderStats = fs.lstatSync(path.join(documentsPath, folder));
      if (!folderStats.isDirectory()) continue;

      const targetFilename = normalizePath(documentName);
      const candidate = path.join(documentsPath, folder, targetFilename);
      const isUsable =
        fs.existsSync(candidate) && isWithin(documentsPath, candidate);
      if (!isUsable) continue;

      const rawDocument = fs.readFileSync(candidate, "utf8");
      // pageContent is dropped on purpose - only the metadata is returned.
      const { pageContent, ...metadata } = JSON.parse(rawDocument);
      const cached = await cachedVectorInformation(
        `${folder}/${targetFilename}`,
        true
      );

      return {
        name: targetFilename,
        type: "file",
        ...metadata,
        cached,
      };
    }

    return null;
  }
  /**
   * Checks if a given path is strictly inside another path.
   * The decision is made on the relative path from `outer` to `inner`, using the
   * platform separator so that `..\` on Windows is recognised just like `../` on POSIX.
   * @param {string} outer - The outer path (should be resolved).
   * @param {string} inner - The inner path (should be resolved).
   * @returns {boolean} - Returns true if the inner path is within the outer path, false
   * otherwise (including when both point at the same location).
   */
  function isWithin(outer, inner) {
    if (outer === inner) return false;
    const rel = path.relative(outer, inner);

    // An empty relative path means both arguments name the same location
    // (eg: one of them carries a trailing separator).
    if (rel === "") return false;

    // path.relative can return an absolute path when there is no relative route
    // (eg: a different drive on Windows) - that is never inside `outer`.
    if (path.isAbsolute(rel)) return false;

    // Only a leading `..` segment escapes; names like `..file.json` are still inside.
    return rel !== ".." && !rel.startsWith(`..${path.sep}`);
  }
  /**
   * Normalizes a user supplied relative path and strips every leading `..` segment.
   * @param {string} filepath - the path to clean up
   * @returns {string} the trimmed, normalized path without leading `..` segments
   * @throws {Error} when the cleaned path is exactly `..`, `.` or `/`
   */
  function normalizePath(filepath = "") {
    const leadingParentSegments = /^(\.\.(\/|\\|$))+/;
    const rejected = ["..", ".", "/"];

    const normalized = path.normalize(filepath.trim());
    const cleaned = normalized.replace(leadingParentSegments, "").trim();

    if (rejected.includes(cleaned)) throw new Error("Invalid path.");
    return cleaned;
  }
  /**
   * Tells whether the vector-cache folder currently holds any `.json` cache file.
   * Useful when the user is changing embedders, as that breaks the previous cache.
   * @returns {boolean} true when at least one `.json` file is present; false when there
   * is none or the folder could not be read
   */
  function hasVectorCachedFiles() {
    const isJsonFile = (name) => name.endsWith(".json");
    try {
      const cachedFiles = fs.readdirSync(vectorCachePath)?.filter(isJsonFile);
      if (cachedFiles?.length !== 0) return true;
    } catch {
      // Best effort: a folder that cannot be read is reported as having no cache.
    }
    return false;
  }
  /**
   * Collects, per document, the workspaces in which the document is pinned.
   * @param {Record<string, string>} filenames - record whose keys are the docpaths to query
   * and whose values are the filenames used as keys of the result
   * @returns {Promise<Record<string, Array>>} - a record of filenames and the distinct
   * workspaceIds of the pinned document rows found for them
   */
  async function getPinnedWorkspacesByDocument(filenames = []) {
    const pinnedDocuments = await Document.where(
      {
        docpath: { in: Object.keys(filenames) },
        pinned: true,
      },
      null,
      null,
      null,
      { workspaceId: true, docpath: true }
    );

    const pinnedByFilename = {};
    for (const { workspaceId, docpath } of pinnedDocuments) {
      const filename = filenames[docpath];
      if (!pinnedByFilename[filename]) pinnedByFilename[filename] = [];

      const workspaceIds = pinnedByFilename[filename];
      if (!workspaceIds.includes(workspaceId)) workspaceIds.push(workspaceId);
    }
    return pinnedByFilename;
  }
  /**
   * Finds which of the given documents are watched.
   * The presence of a filename in the result is what marks the document as watched.
   * @param {Record<string, string>} filenames - record whose keys are the docpaths to query
   * and whose values are the filenames used as keys of the result
   * @returns {Promise<Record<string, any>>} - a record of filenames and the workspaceId of a
   * watched document row found for them (when several rows match, the last one wins)
   */
  async function getWatchedDocumentFilenames(filenames = []) {
    const watchedDocuments = await Document.where(
      {
        docpath: { in: Object.keys(filenames) },
        watched: true,
      },
      null,
      null,
      null,
      { workspaceId: true, docpath: true }
    );

    const watchedByFilename = {};
    for (const { workspaceId, docpath } of watchedDocuments) {
      watchedByFilename[filenames[docpath]] = workspaceId;
    }
    return watchedByFilename;
  }
  /**
   * Removes the vector-cache folder with everything in it and creates it again empty.
   * A folder that does not exist yet is not an error (`force: true`).
   * @returns {void}
   */
  function purgeEntireVectorCache() {
    const removalOptions = { recursive: true, force: true };
    fs.rmSync(vectorCachePath, removalOptions);
    fs.mkdirSync(vectorCachePath);
  }
  /**
   * Moves a file into a target directory and gives it a new name.
   * The target directory (including missing parents) is created when needed. When the
   * source and target live on different filesystems a plain rename is not possible, so
   * the file is copied to the target and the source is removed afterwards.
   * @param {string} sourceFilePath - path of the file to move
   * @param {string} targetDirectory - directory the file is moved into
   * @param {string} newFileName - name of the file inside the target directory
   * @returns {void}
   * @throws {Error} when the source file does not exist, or when moving fails for any
   * reason other than crossing a filesystem boundary
   */
  function moveAndRenameFile(sourceFilePath, targetDirectory, newFileName) {
    // 1. The source file must exist.
    if (!fs.existsSync(sourceFilePath)) {
      throw new Error(`源文件不存在: ${sourceFilePath}`);
    }

    // 2. Make sure the target directory exists. With `recursive: true` missing parents
    //    are created and an already existing directory is not an error.
    fs.mkdirSync(targetDirectory, { recursive: true });

    // 3. Full path of the target file (using the new file name).
    const targetFilePath = path.join(targetDirectory, newFileName);

    // 4. Move and rename in one step when possible.
    try {
      fs.renameSync(sourceFilePath, targetFilePath);
    } catch (error) {
      // EXDEV: rename cannot cross filesystem boundaries (eg: a temp folder and a
      // mounted storage volume) - fall back to copy + delete. Anything else is rethrown.
      if (error?.code !== "EXDEV") throw error;
      fs.copyFileSync(sourceFilePath, targetFilePath);
      fs.unlinkSync(sourceFilePath);
    }

    console.log(`文件已移动到: ${targetFilePath}`);
  }
  286. module.exports = {
  287. findDocumentInDocuments,
  288. cachedVectorInformation,
  289. viewLocalFiles,
  290. purgeSourceDocument,
  291. purgeVectorCache,
  292. storeVectorResult,
  293. fileData,
  294. normalizePath,
  295. isWithin,
  296. documentsPath,
  297. hasVectorCachedFiles,
  298. purgeEntireVectorCache,
  299. moveAndRenameFile,
  300. };