You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

192 lines
5.9 KiB

11 months ago
  1. const fs = require("fs");
  2. const path = require("path");
  3. const { MimeDetector } = require("./mime");
  /**
   * Decides whether a file should be treated as text.
   *
   * The mime type is consulted first. Only when that lookup is inconclusive
   * (reason "generic") do we fall back to inspecting the file's leading bytes,
   * so unknown-but-textual files are accepted without maintaining a growing
   * list of mime type overrides.
   * @param {string} filepath - The path to the file.
   * @returns {boolean} - True when the file is considered text; false otherwise,
   * including when the file does not exist.
   */
  function isTextType(filepath) {
    if (!fs.existsSync(filepath)) return false;

    const { valid, reason } = isKnownTextMime(filepath);
    if (valid) return true; // The mime type is a recognised text type.

    // Any definitive rejection stands; only an inconclusive lookup is
    // re-examined through buffer inspection.
    return reason === "generic" ? parseableAsText(filepath) : false;
  }
  /**
   * Classifies a file by the mime type reported by `MimeDetector`.
   *
   * Possible `reason` values:
   * - "valid_mime": the mime type is neither denied nor of a non-text type.
   * - "bad_mime": the full mime type is listed in the detector's `badMimes`.
   * - "non_text_mime": the top-level type (the part before "/") is listed in
   *   the detector's `nonTextTypes`.
   * - "generic": no usable mime string was obtained, or the lookup threw.
   * @param {string} filepath - The path to the file.
   * @returns {{valid: boolean, reason: string}} - `valid` is true only for
   * "valid_mime"; `reason` explains the outcome.
   */
  function isKnownTextMime(filepath) {
    try {
      const detector = new MimeDetector();
      const mime = detector.getType(filepath);

      // A non-string result (e.g. null for an unrecognised file) is the
      // inconclusive case; report it explicitly instead of relying on
      // `mime.split` throwing a TypeError.
      if (typeof mime !== "string") return { valid: false, reason: "generic" };

      if (detector.badMimes.includes(mime))
        return { valid: false, reason: "bad_mime" };

      const [type] = mime.split("/");
      if (detector.nonTextTypes.includes(type))
        return { valid: false, reason: "non_text_mime" };

      return { valid: true, reason: "valid_mime" };
    } catch {
      // Any failure while detecting is treated as inconclusive.
      return { valid: false, reason: "generic" };
    }
  }
  /**
   * Checks if a file is parseable as text by decoding its first 1KB as utf8 and
   * counting characters that suggest binary content.
   *
   * The file is rejected when the number of NUL plus control characters reaches
   * 10% of the bytes read. NUL matches both patterns, so each NUL counts twice.
   * An empty file (zero bytes read) is reported as not parseable.
   * @param {string} filepath - The path to the file.
   * @returns {boolean} - Returns true if the file is parseable as text, false
   * otherwise (including when the file cannot be opened or read).
   */
  function parseableAsText(filepath) {
    let fd = null;
    try {
      fd = fs.openSync(filepath, "r");
      const buffer = Buffer.alloc(1024); // Only the first 1KB is inspected.
      const bytesRead = fs.readSync(fd, buffer, 0, buffer.length, 0);
      const content = buffer.subarray(0, bytesRead).toString("utf8");

      const nullCount = (content.match(/\0/g) || []).length;
      const controlCount = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || [])
        .length;
      const threshold = bytesRead * 0.1;
      return nullCount + controlCount < threshold;
    } catch {
      return false;
    } finally {
      // Always release the descriptor, even when the read itself failed.
      if (fd !== null) {
        try {
          fs.closeSync(fd);
        } catch {
          // Closing is best-effort; the verdict above has already been decided.
        }
      }
    }
  }
  /**
   * Deletes a single file from disk.
   *
   * Does nothing when the path does not exist, when it cannot be stat'ed, or
   * when it is a directory. Errors raised by the removal itself propagate to
   * the caller.
   * @param {string} filepath - The path of the file to remove.
   * @returns {void}
   */
  function trashFile(filepath) {
    if (!fs.existsSync(filepath)) return;

    let pointsAtDirectory;
    try {
      pointsAtDirectory = fs.lstatSync(filepath).isDirectory();
    } catch {
      return;
    }

    if (pointsAtDirectory) return; // Directories are never removed here.
    fs.rmSync(filepath);
  }
  /**
   * Returns the creation (birth) time of a file as a locale-formatted string.
   * @param {string} filepath - The path to the file.
   * @returns {string} - The locale string of the file's birthtime, or "unknown"
   * when the file cannot be stat'ed or reports a birthtime of 0.
   */
  function createdDate(filepath) {
    try {
      const stats = fs.statSync(filepath);
      // A zero birthtime is treated as an invalid stat.
      return stats.birthtimeMs === 0
        ? "unknown"
        : stats.birthtime.toLocaleString();
    } catch {
      return "unknown";
    }
  }
  /**
   * Serializes `data` as pretty-printed JSON into the documents folder and
   * returns the data together with its relative storage location.
   * @param {Object} [data={}] - The document payload to persist.
   * @param {string} filename - File name without extension; ".json" is appended.
   * @param {string|null} [destinationOverride=null] - Folder to write into.
   * Defaults to the server's `custom-documents` folder resolved relative to
   * this module.
   * @returns {Object} - A shallow copy of `data` plus a `location` property of
   * the form "<folder>/<file>.json", always using forward slashes.
   */
  function writeToServerDocuments(
    data = {},
    filename,
    destinationOverride = null
  ) {
    const destination = destinationOverride
      ? path.resolve(destinationOverride)
      : path.resolve(
          __dirname,
          "../../../server/storage/documents/custom-documents"
        );

    if (!fs.existsSync(destination))
      fs.mkdirSync(destination, { recursive: true });

    const destinationFilePath = path.resolve(destination, filename) + ".json";
    fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), {
      encoding: "utf-8",
    });

    // Relative location string that can be passed into the /update-embeddings
    // api. Only the last two segments (folder/file) are kept, which is enough
    // because only 1-level deep folders are allowed. The resolved path is split
    // on the platform separator so the result is also "folder/file.json" on
    // Windows, where the resolved path uses backslashes.
    const location = destinationFilePath.split(path.sep).slice(-2).join("/");

    return { ...data, location };
  }
  /**
   * Best-effort removal of every entry directly inside `directory`, except the
   * one named `keepName`. A directory that cannot be listed is skipped.
   * @param {string} directory - Absolute path of the directory to clean.
   * @param {string} keepName - Entry name that must be left in place.
   * @returns {Promise<void>}
   */
  async function removeDirectoryEntries(directory, keepName) {
    let entries;
    try {
      entries = await fs.promises.readdir(directory);
    } catch {
      // Nothing to clean when the directory is missing or unreadable.
      return;
    }

    for (const entry of entries) {
      if (entry === keepName) continue;
      try {
        fs.rmSync(path.join(directory, entry));
      } catch {
        // Deliberately ignored: entries that cannot be removed (for example
        // sub-directories, since removal is not recursive) are left in place.
      }
    }
  }

  /**
   * Wipes the collector hot directory and tmp storage, keeping their marker
   * files ("__HOTDIR__.md" and ".placeholder"). Intended for cases where large
   * files failed to process and could not be removed at the time; running this
   * (for example on reboot) force-removes them.
   * @returns {Promise<void>}
   */
  async function wipeCollectorStorage() {
    await Promise.all([
      removeDirectoryEntries(
        path.resolve(__dirname, "../../hotdir"),
        "__HOTDIR__.md"
      ),
      removeDirectoryEntries(
        path.resolve(__dirname, "../../storage/tmp"),
        ".placeholder"
      ),
    ]);
    console.log(`Collector hot directory and tmp storage wiped!`);
    return;
  }
  /**
   * Checks if a given path is strictly inside another path.
   *
   * The same location is never considered "within" itself, even when the two
   * strings differ only by a trailing separator.
   * @param {string} outer - The outer path (should be resolved).
   * @param {string} inner - The inner path (should be resolved).
   * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
   */
  function isWithin(outer, inner) {
    if (outer === inner) return false;

    const rel = path.relative(outer, inner);
    if (rel === "") return false; // Both paths point at the same location.
    if (path.isAbsolute(rel)) return false; // No relative route exists (e.g. another Windows drive).

    // Use the platform separator so "..\" prefixes are caught on Windows while
    // names that merely begin with ".." (like "..foo") are still allowed.
    return rel !== ".." && !rel.startsWith(`..${path.sep}`);
  }
  /**
   * Normalizes a path and strips any leading parent-directory segments so the
   * result can no longer start by climbing out of its base folder.
   * @param {string} [filepath=""] - The path to normalize; surrounding
   * whitespace is ignored.
   * @returns {string} - The normalized path without leading ".." segments. This
   * can be an empty string when the input consisted only of such segments.
   * @throws {Error} - "Invalid path." when the result is "..", "." or "/".
   */
  function normalizePath(filepath = "") {
    const normalized = path.normalize(filepath.trim());
    const stripped = normalized.replace(/^(\.\.(\/|\\|$))+/, "").trim();

    const isRejected = stripped === ".." || stripped === "." || stripped === "/";
    if (isRejected) throw new Error("Invalid path.");

    return stripped;
  }
  /**
   * Removes characters that are unsafe in file names: < > : " / \ | ? *
   * @param {string} fileName - The file name to clean.
   * @returns {string} - The file name without the characters above. Falsy
   * input (empty string, null, undefined) is returned unchanged.
   */
  function sanitizeFileName(fileName) {
    const forbiddenCharacters = /[<>:"\/\\|?*]/g;
    return fileName ? fileName.replace(forbiddenCharacters, "") : fileName;
  }
  161. module.exports = {
  162. trashFile,
  163. isTextType,
  164. createdDate,
  165. writeToServerDocuments,
  166. wipeCollectorStorage,
  167. normalizePath,
  168. isWithin,
  169. sanitizeFileName,
  170. };