You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

491 lines
16 KiB

const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");
const { Document } = require("../../models/documents");
const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
const { userFromSession } = require("../http");
const { DeptDocument } = require("../../models/deptDocument");
// Root folder holding stored document JSON files. In development it lives in
// the repo's storage/ folder; otherwise it resolves under STORAGE_DIR.
const documentsPath =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, `../../storage/documents`)
    : path.resolve(process.env.STORAGE_DIR, `documents`);
// Folder holding cached embedding results; cache files are named by a
// UUIDv5 digest of the source filename (see cachedVectorInformation).
const vectorCachePath =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, `../../storage/vector-cache`)
    : path.resolve(process.env.STORAGE_DIR, `vector-cache`);
/**
 * Reads and parses a stored document JSON file given a path relative to the
 * documents folder, eg: youtube-subject/video-123.json.
 * @param {string} filePath - path relative to the documents folder
 * @returns {Promise<object|null>} parsed document, or null when the file is
 *   missing or resolves outside the documents folder
 * @throws {Error} when no filePath is provided
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const fullFilePath = path.resolve(documentsPath, normalizePath(filePath));
  const readable =
    fs.existsSync(fullFilePath) && isWithin(documentsPath, fullFilePath);
  if (!readable) return null;

  return JSON.parse(fs.readFileSync(fullFilePath, "utf8"));
}
// async function viewLocalFiles() {
// if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
// const liveSyncAvailable = await DocumentSyncQueue.enabled();
// const directory = {
// name: "documents",
// type: "folder",
// items: [],
// };
// for (const file of fs.readdirSync(documentsPath)) {
// // console.log("file:", file);
// if (path.extname(file) === ".md") continue;
// const folderPath = path.resolve(documentsPath, file);
// const isFolder = fs.lstatSync(folderPath).isDirectory();
// if (isFolder) {
// const subdocs = {
// name: file,
// type: "folder",
// items: [],
// };
// const subfiles = fs.readdirSync(folderPath);
// const filenames = {};
// for (const subfile of subfiles) {
// if (path.extname(subfile) !== ".json") continue;
// const filePath = path.join(folderPath, subfile);
// const rawData = fs.readFileSync(filePath, "utf8");
// // console.log("rawData:", rawData);
// const cachefilename = `${file}/${subfile}`;
// const { pageContent, ...metadata } = JSON.parse(rawData);
// subdocs.items.push({
// name: subfile,
// type: "file",
// ...metadata,
// cached: await cachedVectorInformation(cachefilename, true),
// canWatch: liveSyncAvailable
// ? DocumentSyncQueue.canWatch(metadata)
// : false,
// // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
// // watched: false, // boolean to indicate if this document is watched in ANY workspace
// });
// filenames[cachefilename] = subfile;
// }
//
// // Grab the pinned workspaces and watched documents for this folder's documents
// // at the time of the query so we don't have to re-query the database for each file
// const pinnedWorkspacesByDocument =
// await getPinnedWorkspacesByDocument(filenames);
// const watchedDocumentsFilenames =
// await getWatchedDocumentFilenames(filenames);
// for (const item of subdocs.items) {
// item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
// item.watched =
// watchedDocumentsFilenames.hasOwnProperty(item.name) || false;
// }
//
// directory.items.push(subdocs);
// }
// }
//
// // Make sure custom-documents is always the first folder in picker
// directory.items = [
// directory.items.find((folder) => folder.name === "custom-documents"),
// ...directory.items.filter((folder) => folder.name !== "custom-documents"),
// ].filter((i) => !!i);
//
// return directory;
// }
// async function viewLocalFiles(deptId) {
// const directory = {
// name: "documents",
// type: "folder",
// items: [],
// };
// if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
// const liveSyncAvailable = await DocumentSyncQueue.enabled();
//
// // 查询 deptDocuments
// const deptDocuments = await DeptDocument.where({ deptId: deptId, delTag: false });
// if (!deptDocuments || deptDocuments.length === 0) {
// return directory;
// }
//
// // 初始化分类对象
// const publicd = {
// name: "公开",
// type: "folder",
// items: [],
// };
// const privated = {
// name: "私有",
// type: "folder",
// items: [],
// };
// const temp = {
// name: "临时",
// type: "folder",
// items: [],
// };
//
// // 遍历 deptDocuments
// for (const doc of deptDocuments) {
// try {
// const filePath = doc.parsedFilePath; // 获取文件路径
// if (!fs.existsSync(filePath)) continue; // 如果文件不存在,跳过
//
// // 读取文件内容
// const rawData = fs.readFileSync(filePath, 'utf8');
// const { pageContent, ...metadata } = JSON.parse(rawData);
//
// // 构造文件信息对象(保持与原方法一致的字段)
// const fileInfo = {
// name: path.basename(filePath), // 文件名
// type: "file",
// ...metadata,
// cached: await cachedVectorInformation(filePath, true),
// canWatch: liveSyncAvailable
// ? DocumentSyncQueue.canWatch(metadata)
// : false,
// pinnedWorkspaces: [], // 初始化为空数组
// watched: false, // 初始化为 false
// };
//
// // 根据 isPublic 属性分类
// if (doc.isPublic === 0) {
// publicd.items.push(fileInfo);
// } else if (doc.isPublic === 1) {
// privated.items.push(fileInfo);
// } else {
// temp.items.push(fileInfo);
// }
// } catch (error) {
// console.error(`Error processing file ${doc.parsedFilePath}:`, error);
// }
// }
//
// directory.items = [publicd, privated, temp];
// // 返回嵌套结构
// return directory;
// }
/**
 * Builds the document-picker tree for a department's documents.
 * Documents come from the DeptDocument table (delTag=false for deptId) and
 * are grouped into three fixed folders by their `isPublic` flag:
 * 0 -> "公开" (public), 1 -> "私有" (private), anything else -> "临时" (temp).
 * @param {number|string} deptId - department id to list documents for
 * @returns {Promise<{name: string, type: string, items: object[]}>} nested folder tree
 */
async function viewLocalFiles(deptId) {
  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };
  if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  const liveSyncAvailable = await DocumentSyncQueue.enabled();

  // Fetch all non-deleted documents for this department.
  const deptDocuments = await DeptDocument.where({
    deptId: deptId,
    delTag: false,
  });
  if (!deptDocuments || deptDocuments.length === 0) return directory;

  // Fixed category folders; names are user-facing and must stay as-is.
  const publicd = { name: "公开", type: "folder", items: [] };
  const privated = { name: "私有", type: "folder", items: [] };
  const temp = { name: "临时", type: "folder", items: [] };

  for (const doc of deptDocuments) {
    try {
      let filePath = doc.parsedFilePath;
      // parsedFilePath may be stored relative to the documents folder in
      // production; fall back to resolving against STORAGE_DIR/documents.
      if (!fs.existsSync(filePath)) {
        filePath =
          process.env.NODE_ENV === "development"
            ? filePath
            : path.resolve(process.env.STORAGE_DIR, "documents", filePath);
        // Fix: skip records whose file still cannot be located rather than
        // letting readFileSync throw ENOENT into the catch/error-log below.
        if (!fs.existsSync(filePath)) continue;
      }

      const rawData = fs.readFileSync(filePath, "utf8");
      const { pageContent, ...metadata } = JSON.parse(rawData);
      // Relative path normalized to `/` separators so the frontend is OS-agnostic.
      const relativePath = path
        .relative(documentsPath, filePath)
        .replace(/\\/g, "/");

      const fileInfo = {
        name: path.basename(filePath),
        type: "file",
        ...metadata,
        // NOTE(review): other call sites key the vector cache on a
        // folder-relative name; the absolute path used here may never
        // produce cache hits — verify against cachedVectorInformation.
        cached: await cachedVectorInformation(filePath, true),
        canWatch: liveSyncAvailable
          ? DocumentSyncQueue.canWatch(metadata)
          : false,
        pinnedWorkspaces: [], // never populated for dept documents
        watched: false, // never populated for dept documents
        relativePath: relativePath,
      };

      // Categorize by isPublic: 0 = public, 1 = private, other = temporary.
      if (doc.isPublic === 0) publicd.items.push(fileInfo);
      else if (doc.isPublic === 1) privated.items.push(fileInfo);
      else temp.items.push(fileInfo);
    } catch (error) {
      console.error(`Error processing file ${doc.parsedFilePath}:`, error);
    }
  }

  directory.items = [publicd, privated, temp];
  return directory;
}
/**
 * Searches the vector-cache folder for existing information so we dont have to re-embed a
 * document and can instead push directly to vector db. Cache files are keyed
 * by a deterministic UUIDv5 digest of the filename.
 * @param {string} filename - the filename to check for cached vector information
 * @param {boolean} checkOnly - if true, only check if the file exists, do not return the cached data
 * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} a boolean when
 *   checkOnly is true, otherwise the existence flag and cached chunks
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

  const digest = uuidv5(filename, uuidv5.URL);
  const file = path.resolve(vectorCachePath, `${digest}.json`);
  const exists = fs.existsSync(file);

  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  // Fix: log message previously contained a garbled "$(unknown)" placeholder
  // instead of interpolating the filename.
  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const rawData = fs.readFileSync(file, "utf8");
  return { exists: true, chunks: JSON.parse(rawData) };
}
/**
 * Caches pre-chunked vectorized data for a file so later ingests can skip
 * re-embedding. The chunks include metadata and respect the chunk-size limit
 * so they can be iterated and dumped into Pinecone, etc.
 * @param {any[]} vectorData - pre-chunked vectorized data for the file
 * @param {string} filename - fullpath to the doc, used as the cache key
 * @returns {Promise<void>}
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  // Fix: log message previously contained a garbled "$(unknown)" placeholder
  // instead of interpolating the filename.
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );
  if (!fs.existsSync(vectorCachePath)) fs.mkdirSync(vectorCachePath);
  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
  return;
}
/**
 * Purges a file from the documents/ folder.
 * No-ops when the filename is missing, resolves outside the documents folder,
 * or does not point at a regular file.
 * @param {string} filename - document path relative to the documents folder
 * @returns {Promise<void>}
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;
  const filePath = path.resolve(documentsPath, normalizePath(filename));
  if (
    !fs.existsSync(filePath) ||
    !isWithin(documentsPath, filePath) ||
    !fs.lstatSync(filePath).isFile()
  )
    return;
  // Fix: log message previously contained a garbled "$(unknown)" placeholder
  // instead of interpolating the filename.
  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
/**
 * Purges a vector-cache file from the vector-cache/ folder.
 * The cache entry is located by the UUIDv5 digest of the filename.
 * @param {string} filename - source filename whose cache entry to remove
 * @returns {Promise<void>}
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;
  const digest = uuidv5(filename, uuidv5.URL);
  const filePath = path.resolve(vectorCachePath, `${digest}.json`);
  if (!fs.existsSync(filePath) || !fs.lstatSync(filePath).isFile()) return;
  // Fix: log message previously contained a garbled "$(unknown)" placeholder
  // instead of interpolating the filename.
  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
/**
 * Searches every subfolder of `documents/` for a file with the given unique
 * name and returns its metadata (without pageContent) when found.
 * @param {string} documentName - unique document filename to look for
 * @returns {Promise<object|null>} file record, or null when not found
 */
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;
  const targetFilename = normalizePath(documentName);

  for (const folder of fs.readdirSync(documentsPath)) {
    const folderPath = path.join(documentsPath, folder);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const candidatePath = path.join(folderPath, targetFilename);
    const usable =
      fs.existsSync(candidatePath) && isWithin(documentsPath, candidatePath);
    if (!usable) continue;

    const { pageContent, ...metadata } = JSON.parse(
      fs.readFileSync(candidatePath, "utf8")
    );
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      // Cache lookups are keyed on the folder-relative name.
      cached: await cachedVectorInformation(
        `${folder}/${targetFilename}`,
        true
      ),
    };
  }
  return null;
}
/**
 * Checks if a given path is strictly contained within another path.
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
 */
function isWithin(outer, inner) {
  // A path is never "within" itself.
  if (outer === inner) return false;
  const relative = path.relative(outer, inner);
  // Escaping the outer path shows up as a relative path that climbs upward.
  const escapesOuter = relative === ".." || relative.startsWith("../");
  return !escapesOuter;
}
/**
 * Normalizes a user-supplied path and strips any leading parent-directory
 * traversal segments so the result cannot climb out of its base folder.
 * @param {string} filepath - raw path to sanitize
 * @returns {string} sanitized path
 * @throws {Error} when the sanitized result is "..", "." or "/"
 */
function normalizePath(filepath = "") {
  const cleaned = path
    .normalize(filepath.trim())
    .replace(/^(\.\.(\/|\\|$))+/, "")
    .trim();
  const forbidden = ["..", ".", "/"];
  if (forbidden.includes(cleaned)) throw new Error("Invalid path.");
  return cleaned;
}
/**
 * Checks whether the vector-cache folder contains any cached (.json) files.
 * Useful when the user is changing embedders, since that invalidates the
 * previous cache. Returns false if the folder cannot be read at all.
 * @returns {boolean}
 */
function hasVectorCachedFiles() {
  try {
    const entries = fs.readdirSync(vectorCachePath) ?? [];
    return entries.some((name) => name.endsWith(".json"));
  } catch {}
  return false;
}
/**
 * Maps document filenames to the workspace ids that have pinned them.
 * @param {Record<string, string>} filenames - map of docpath -> display
 *   filename (the code reads both keys and values, not a plain array)
 * @returns {Promise<Record<string, string[]>>} - a record of filenames and their corresponding workspaceIds
 */
async function getPinnedWorkspacesByDocument(filenames = []) {
  const pinnedRecords = await Document.where(
    {
      docpath: { in: Object.keys(filenames) },
      pinned: true,
    },
    null,
    null,
    null,
    {
      workspaceId: true,
      docpath: true,
    }
  );

  const result = {};
  for (const { workspaceId, docpath } of pinnedRecords) {
    const filename = filenames[docpath];
    if (!result[filename]) result[filename] = [];
    // Deduplicate workspace ids per filename.
    if (!result[filename].includes(workspaceId))
      result[filename].push(workspaceId);
  }
  return result;
}
/**
 * Get a record of filenames and a workspaceId that has watched each document,
 * used to determine if a document should be displayed in the watched
 * documents sidebar.
 * @param {Record<string, string>} filenames - map of docpath -> display filename
 * @returns {Promise<Record<string, string>>} - filename -> workspaceId (last
 *   matching record wins when multiple workspaces watch the same document)
 */
async function getWatchedDocumentFilenames(filenames = []) {
  const watchedRecords = await Document.where(
    {
      docpath: { in: Object.keys(filenames) },
      watched: true,
    },
    null,
    null,
    null,
    { workspaceId: true, docpath: true }
  );

  const result = {};
  for (const { workspaceId, docpath } of watchedRecords) {
    result[filenames[docpath]] = workspaceId;
  }
  return result;
}
/**
 * Purges the entire vector-cache folder and recreates it empty.
 * @returns {void}
 */
function purgeEntireVectorCache() {
  // Remove everything (force: no error if already absent), then recreate.
  fs.rmSync(vectorCachePath, { recursive: true, force: true });
  fs.mkdirSync(vectorCachePath);
}
/**
 * Moves a file into a target directory under a new name, creating the
 * target directory chain when it does not exist yet.
 * @param {string} sourceFilePath - path of the file to move
 * @param {string} targetDirectory - directory to move the file into
 * @param {string} newFileName - filename to use at the destination
 * @throws {Error} when the source file does not exist
 */
function moveAndRenameFile(sourceFilePath, targetDirectory, newFileName) {
  // Refuse to proceed without a real source file.
  if (!fs.existsSync(sourceFilePath)) {
    throw new Error(`源文件不存在: ${sourceFilePath}`);
  }

  // Ensure the destination directory chain exists before the rename.
  if (!fs.existsSync(targetDirectory)) {
    fs.mkdirSync(targetDirectory, { recursive: true });
  }

  const targetFilePath = path.join(targetDirectory, newFileName);
  fs.renameSync(sourceFilePath, targetFilePath);
  console.log(`文件已移动到: ${targetFilePath}`);
}
// Public API: document lookup/read/purge, vector-cache management, and the
// path-safety helpers (normalizePath/isWithin) used by document endpoints
// and vector DB providers.
module.exports = {
  findDocumentInDocuments,
  cachedVectorInformation,
  viewLocalFiles,
  purgeSourceDocument,
  purgeVectorCache,
  storeVectorResult,
  fileData,
  normalizePath,
  isWithin,
  documentsPath,
  hasVectorCachedFiles,
  purgeEntireVectorCache,
  moveAndRenameFile,
};