You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

257 lines
6.2 KiB

11 months ago
  1. const fs = require("fs");
  2. const path = require("path");
  3. const { default: slugify } = require("slugify");
  4. const { v4 } = require("uuid");
  5. const { writeToServerDocuments, sanitizeFileName } = require("../../files");
  6. const { tokenizeString } = require("../../tokenizer");
  7. const { ConfluencePagesLoader } = require("./ConfluenceLoader");
  /**
   * Load the pages of a Confluence space and store each non-empty page as a
   * server document inside a freshly named output folder.
   *
   * Validation problems and loader failures are reported through the returned
   * object (`success: false` plus a `reason`) instead of being thrown.
   *
   * @param {object} args - forwarded request body params
   * @param {string|null} [args.baseUrl] - Confluence URL; only its origin is handed to the loader
   * @param {string|null} [args.spaceKey] - key of the space to load
   * @param {string|null} [args.username] - username, required together with `accessToken` when no PAT is given
   * @param {string|null} [args.accessToken] - access token, required together with `username` when no PAT is given
   * @param {boolean} [args.cloud=true] - forwarded unchanged to the loader
   * @param {string|null} [args.personalAccessToken] - personal access token, accepted instead of username + access token
   * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
   * @returns {Promise<{success: boolean, reason: string|null, data?: {spaceKey: string, destination: string}}>}
   */
  async function loadConfluence(
    {
      baseUrl = null,
      spaceKey = null,
      username = null,
      accessToken = null,
      cloud = true,
      personalAccessToken = null,
    },
    response
  ) {
    if (!personalAccessToken && (!username || !accessToken)) {
      return {
        success: false,
        reason:
          "You need either a personal access token (PAT), or a username and access token to use the Confluence connector.",
      };
    }

    if (!baseUrl || !validBaseUrl(baseUrl)) {
      return {
        success: false,
        reason: "Provided base URL is not a valid URL.",
      };
    }

    if (!spaceKey) {
      return {
        success: false,
        reason: "You need to provide a Confluence space key.",
      };
    }

    const { origin, hostname } = new URL(baseUrl);
    console.log(`-- Working Confluence ${origin} --`);
    const loader = new ConfluencePagesLoader({
      baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc.
      spaceKey,
      username,
      accessToken,
      cloud,
      personalAccessToken,
    });

    let docs = [];
    let error = null;
    try {
      docs = await loader.load();
    } catch (e) {
      // Keep only the text after an "Error:" prefix when the message has one.
      error = e.message?.split("Error:")?.[1] || e.message;
    }

    if (!docs?.length || !!error) {
      return {
        success: false,
        reason: error ?? "No pages found for that Confluence space.",
      };
    }

    // A short random suffix keeps repeated imports of the same host in separate folders.
    const outFolder = slugify(
      `confluence-${hostname}-${v4().slice(0, 4)}`
    ).toLowerCase();
    const outFolderPath =
      process.env.NODE_ENV === "development"
        ? path.resolve(
            __dirname,
            `../../../../server/storage/documents/${outFolder}`
          )
        : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

    // With `recursive: true` this is a no-op when the folder already exists.
    fs.mkdirSync(outFolderPath, { recursive: true });

    for (const doc of docs) {
      if (!doc.pageContent) continue;

      // Pages without a title fall back to their source for display and file naming.
      const title = doc.metadata.title || doc.metadata.source;
      const data = {
        id: v4(),
        url: `${doc.metadata.url}.page`,
        title,
        docAuthor: origin,
        description: doc.metadata.title,
        docSource: `${origin} Confluence`,
        chunkSource: generateChunkSource(
          {
            doc,
            baseUrl: origin,
            spaceKey,
            accessToken,
            username,
            cloud,
            // Pass along every supplied credential, including the PAT.
            personalAccessToken,
          },
          response.locals.encryptionWorker
        ),
        published: new Date().toLocaleString(),
        wordCount: doc.pageContent.split(" ").length,
        pageContent: doc.pageContent,
        token_count_estimate: tokenizeString(doc.pageContent),
      };

      console.log(`[Confluence Loader]: Saving ${title} to ${outFolder}`);

      // Always hand slugify a string so an untitled page cannot abort the whole import.
      const fileName = sanitizeFileName(
        `${slugify(String(title || "untitled"))}-${data.id}`
      );
      writeToServerDocuments(data, fileName, outFolderPath);
    }

    return {
      success: true,
      reason: null,
      data: {
        spaceKey,
        destination: outFolder,
      },
    };
  }
  /**
   * Gets the page content from a specific Confluence page, not all pages in a workspace.
   *
   * The whole space is loaded through ConfluencePagesLoader and the page whose
   * `metadata.url` equals `pageUrl` (and which has content) is returned.
   * Failures are reported through the returned object rather than thrown.
   *
   * @param {object} args
   * @param {string} args.pageUrl - URL of the page, compared against each loaded document's `metadata.url`
   * @param {string} args.baseUrl - passed to the loader as-is; should be the origin of the Confluence URL
   * @param {string} args.spaceKey - key of the space containing the page
   * @param {string} [args.username] - username, required together with `accessToken` when no PAT is given
   * @param {string} [args.accessToken] - access token, required together with `username` when no PAT is given
   * @param {boolean} [args.cloud=true] - forwarded unchanged to the loader
   * @param {string|null} [args.personalAccessToken] - personal access token, accepted instead of username + access token
   * @returns {Promise<{success: boolean, reason: string|null, content: string|null}>}
   */
  async function fetchConfluencePage({
    pageUrl,
    baseUrl,
    spaceKey,
    username,
    accessToken,
    cloud = true,
    personalAccessToken = null,
  }) {
    // Each precondition has its own message so a missing URL or space key is
    // no longer reported as a credentials problem.
    if (!personalAccessToken && (!username || !accessToken)) {
      return {
        success: false,
        content: null,
        reason:
          "You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.",
      };
    }

    if (!pageUrl) {
      return {
        success: false,
        content: null,
        reason: "You need to provide a Confluence page URL.",
      };
    }

    if (!baseUrl || !validBaseUrl(baseUrl)) {
      return {
        success: false,
        content: null,
        reason: "Provided base URL is not a valid URL.",
      };
    }

    if (!spaceKey) {
      return {
        success: false,
        content: null,
        reason: "You need to provide a Confluence space key.",
      };
    }

    console.log(`-- Working Confluence Page ${pageUrl} --`);
    const loader = new ConfluencePagesLoader({
      baseUrl, // Should be the origin of the baseUrl
      spaceKey,
      username,
      accessToken,
      cloud,
      personalAccessToken,
    });

    let docs = [];
    let error = null;
    try {
      docs = await loader.load();
    } catch (e) {
      // Keep only the text after an "Error:" prefix when the message has one.
      error = e.message?.split("Error:")?.[1] || e.message;
    }

    if (!docs?.length || !!error) {
      return {
        success: false,
        reason: error ?? "No pages found for that Confluence space.",
        content: null,
      };
    }

    const targetDocument = docs.find(
      (doc) => doc.pageContent && doc.metadata.url === pageUrl
    );
    if (!targetDocument) {
      return {
        success: false,
        reason: "Target page could not be found in Confluence space.",
        content: null,
      };
    }

    return {
      success: true,
      reason: null,
      content: targetDocument.pageContent,
    };
  }
  /**
   * Reports whether `baseUrl` can be parsed by the `URL` constructor.
   * No restriction is placed on scheme or host; anything the parser accepts counts.
   * @param {string} baseUrl - candidate URL string
   * @returns {boolean} `true` when parsing succeeds, `false` when the constructor throws
   */
  function validBaseUrl(baseUrl) {
    let isParseable = true;
    try {
      // Only the constructor's throw/no-throw outcome matters; the instance is discarded.
      void new URL(baseUrl);
    } catch {
      isParseable = false;
    }
    return isParseable;
  }
  /**
   * Generate the full chunkSource for a specific Confluence page so that we can resync it later.
   * The connection details are serialized to JSON, encrypted with the supplied
   * worker and attached as a single `payload` query param so credentials can be
   * replayed later.
   *
   * The personal access token is part of the payload when one is supplied, so
   * PAT-only imports do not end up with a payload that holds no credential.
   * When it is not supplied the key is `undefined` and `JSON.stringify` omits
   * it, leaving the payload identical to the username/token-only form.
   *
   * @param {object} chunkSourceInformation
   * @param {object} chunkSourceInformation.doc - loaded page; only `doc.metadata.url` is read
   * @param {string} chunkSourceInformation.baseUrl
   * @param {string} chunkSourceInformation.spaceKey
   * @param {string|null} chunkSourceInformation.accessToken - stored under the `token` key
   * @param {string|null} chunkSourceInformation.username
   * @param {boolean} chunkSourceInformation.cloud
   * @param {string|null} [chunkSourceInformation.personalAccessToken]
   * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
   * @returns {string}
   */
  function generateChunkSource(
    { doc, baseUrl, spaceKey, accessToken, username, cloud, personalAccessToken },
    encryptionWorker
  ) {
    const payload = {
      baseUrl,
      spaceKey,
      token: accessToken,
      username,
      cloud,
      personalAccessToken,
    };
    const encryptedPayload = encryptionWorker.encrypt(JSON.stringify(payload));
    return `confluence://${doc.metadata.url}?payload=${encryptedPayload}`;
  }
  // Public surface of this module: only the two loaders are exported;
  // validBaseUrl and generateChunkSource are not.
  module.exports = { loadConfluence, fetchConfluencePage };